In [2]:
import pandas as pd
import numpy as np
import patsy
from mlxtend.frequent_patterns import apriori, association_rules

In [2]:
# Show long product descriptions without truncating them.
pd.set_option("display.max_colwidth", 150)

# The workbook holds transactions for every country; only UK rows are kept below.
f = "https://github.com/cs6220/cs6220.spring2019/raw/master/data/Online%20Retail.xlsx"
df = pd.read_excel(f)

# One row per invoice, one column per product description, cell = summed quantity
# (0 where the product does not appear on the invoice).
uk_rows = df[df["Country"] == "United Kingdom"]
quantity_per_item = uk_rows.groupby(["InvoiceNo", "Description"])["Quantity"].sum()
basket = (
    quantity_per_item
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index("InvoiceNo")
)

# Every non-zero total becomes True.
# NOTE(review): a negative total is also truthy; confirm that is intended if
# Quantity can be negative.
basket_sets = basket.astype(bool)

In [3]:
# Preview the first five invoices of the boolean basket matrix.
basket_sets.head(n=5)

Description,20713,4 PURPLE FLOCK DINNER CANDLES,50'S CHRISTMAS GIFT BAG LARGE,DOLLY GIRL BEAKER,I LOVE LONDON MINI BACKPACK,NINE DRAWER OFFICE TIDY,OVAL WALL MIRROR DIAMANTE,RED SPOT GIFT BAG LARGE,SET 2 TEA TOWELS I LOVE LONDON,SPACEBOY BABY GIFT SET,...,wrongly coded 20713,wrongly coded 23343,wrongly coded-23343,wrongly marked,wrongly marked 23343,wrongly marked carton 22804,wrongly marked. 23343 in box,wrongly sold (22719) barcode,wrongly sold as sets,wrongly sold sets
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
536365,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
536366,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
536367,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
536368,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
536369,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


## 1. Online Retail Market Basket Analysis 

Task 1. What are the top 5 1-itemsets with the highest support?

Answer:\
The top 5 1-itemsets are:
1. WHITE HANGING HEART T-LIGHT HOLDER : support = 0.100091
2. JUMBO BAG RED RETROSPOT : support = 0.089746
3. REGENCY CAKESTAND 3 TIER : support = 0.083122
4. PARTY BUNTING : support = 0.073185	
5. LUNCH BAG RED RETROSPOT : support = 0.064882	

In [4]:
# Task 1: rank single items by support.
# max_len=1 restricts the search to 1-itemsets, so the ranking stays correct even if
# min_support is lowered far enough for larger itemsets to become frequent.
# min_support=0.063 leaves exactly the five items shown below.
t5_one_list = apriori(basket_sets, min_support=0.063, max_len=1, use_colnames=True)
t5_one_list.sort_values(by='support', ascending=False).head()

Unnamed: 0,support,itemsets
4,0.100091,(WHITE HANGING HEART T-LIGHT HOLDER)
0,0.089746,(JUMBO BAG RED RETROSPOT)
3,0.083122,(REGENCY CAKESTAND 3 TIER)
2,0.073185,(PARTY BUNTING)
1,0.064882,(LUNCH BAG RED RETROSPOT)


Task 2. What are the top 5 2-itemsets with the highest support?

Answer:\
The top 5 2-itemsets are:
1. JUMBO BAG RED RETROSPOT, JUMBO BAG PINK POLKADOT : support = 0.035980
2. GREEN REGENCY TEACUP AND SAUCER, ROSES REGENCY TEACUP AND SAUCER : support = 0.032350
3. JUMBO STORAGE BAG SUKI, JUMBO BAG RED RETROSPOT : support = 0.032078
4. JUMBO BAG RED RETROSPOT, JUMBO SHOPPER VINTAGE RED PAISLEY : support = 0.029946	
5. LUNCH BAG RED RETROSPOT, LUNCH BAG BLACK SKULL. : support = 0.027858	

In [5]:
# Task 2: rank pairs of items by support.
# max_len=2 stops the search at pairs; larger itemsets are not needed here.
t5_two_list = apriori(basket_sets, min_support=0.02, max_len=2, use_colnames=True)
t5_two_list['item_num'] = t5_two_list['itemsets'].apply(len)
# Boolean indexing removes the non-pair rows. DataFrame.where() would instead keep
# them as all-NaN rows and turn item_num into a float column.
t5_two_list = t5_two_list[t5_two_list['item_num'] == 2]
t5_two_list.sort_values(by='support', ascending=False).head()


Unnamed: 0,support,itemsets,item_num
215,0.03598,"(JUMBO BAG PINK POLKADOT, JUMBO BAG RED RETROSPOT)",2.0
211,0.03235,"(GREEN REGENCY TEACUP AND SAUCER, ROSES REGENCY TEACUP AND SAUCER )",2.0
222,0.032078,"(JUMBO BAG RED RETROSPOT, JUMBO STORAGE BAG SUKI)",2.0
221,0.029946,"(JUMBO BAG RED RETROSPOT, JUMBO SHOPPER VINTAGE RED PAISLEY)",2.0
228,0.027858,"(LUNCH BAG RED RETROSPOT, LUNCH BAG BLACK SKULL.)",2.0


In [6]:
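# TODO: simplify the 2-itemset filter used above. The draft below does not work yet:
# `itemsets.count() == 2` compares the number of rows with 2 instead of testing each itemset's length.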
# t5_two_list_try = apriori(basket_sets, min_support=0.02, use_colnames=True)
# t5_two_list_try  = t5_two_list_try.where(t5_two_list_try.itemsets.count() == 2)
# t5_two_list_try.sort_values(by='support', ascending=False)

Task 3. What is the highest support value for the 1-itemsets?\
Answer: The highest support among the 1-itemsets is 0.100091, for WHITE HANGING HEART T-LIGHT HOLDER.

Task 4. What is the highest support value for the 2-itemsets? \
Answer: The highest support among the 2-itemsets is 0.035980, for {JUMBO BAG RED RETROSPOT, JUMBO BAG PINK POLKADOT}.

## 1.2 Association Rule Generation

Task 1. What are the top 5 association rules?

In [7]:
# Itemsets that occur in at least 0.8% of the UK invoices; used as input for rule generation.
frequent_itemsets = apriori(
    basket_sets,
    min_support=0.008,
    use_colnames=True,
)
frequent_itemsets.head(n=5)

Unnamed: 0,support,itemsets
0,0.011615,( SET 2 TEA TOWELS I LOVE LONDON )
1,0.01275,(10 COLOUR SPACEBOY PEN)
2,0.01098,(12 MESSAGE CARDS WITH ENVELOPES)
3,0.014927,(12 PENCIL SMALL TUBE WOODLAND)
4,0.015563,(12 PENCILS SMALL TUBE RED RETROSPOT)


In [8]:
# Keep rules with confidence of at least 0.95, then show the five most confident ones.
high_confidence_rules = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=0.95,
)
high_confidence_rules.sort_values(by='confidence', ascending=False).head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(BEADED CRYSTAL HEART PINK ON STICK),(DOTCOM POSTAGE),0.009347,0.032169,0.00912,0.975728,30.331521,0.008819,39.874646
9,"(HERB MARKER CHIVES , HERB MARKER THYME)",(HERB MARKER PARSLEY),0.008394,0.01039,0.008122,0.967568,93.123097,0.008034,30.512969
7,"(HERB MARKER CHIVES , HERB MARKER ROSEMARY)",(HERB MARKER PARSLEY),0.008348,0.01039,0.008076,0.967391,93.106133,0.007989,30.348034
10,"(HERB MARKER CHIVES , HERB MARKER ROSEMARY)",(HERB MARKER THYME),0.008348,0.010481,0.008076,0.967391,92.300019,0.007989,30.345251
15,"(HERB MARKER BASIL, HERB MARKER MINT, HERB MARKER THYME)",(HERB MARKER ROSEMARY),0.008303,0.010526,0.008031,0.967213,91.885246,0.007943,30.178947


Task 2. What items make up one of the top association rules? Search online for the
items (or at least items with the same name). Do you think they are likely to be bought
together?

The dominant items are the products whose names begin with "Herb Marker": \
Herb Marker Chives \
Herb Marker Rosemary \
Herb Marker Thyme \
Herb Marker Parsley \
Herb Marker Basil \
Herb Marker Mint 

They are very likely to be bought together: the herbs they name are commonly used in combination when cooking, so a customer is more likely to buy markers for several herbs than for only one, and this buying behavior produces the high confidence values.


## 2 Association Rule Mining U.S. Census Data 

Task 1. 
Transform the raw dataset into a format appropriate for association rule mining
by dropping all continuous columns and one-hot encoding the remaining columns. The
values for each resulting column should be binary, represented by a 1 or 0.

In [3]:
path = "https://raw.githubusercontent.com/cs6220/cs6220.spring2019/master/data/adult/"


def parse_cols(x):
    """Return the text before the first ':' of every string in the Series `x`."""
    return x.str.split(":", expand=True).iloc[:, 0]


# adult.names is read as one text column; rows 92-107 are sliced out and the part
# before ':' of each row is used as a column name.
names = pd.read_csv(path + "adult.names", sep="\t", header=None)
attribute_lines = names.iloc[92:108, 0]
# Rotate by one position so the first parsed entry (">50K, <=50K.") becomes the
# name of the last column.
columns = np.roll(parse_cols(attribute_lines), shift=-1)

# NOTE(review): the encoded column labels further down (e.g. "workclass[ Private]")
# suggest the values keep the space that follows each comma; skipinitialspace=True
# would strip it, but would also change every downstream label.
df_adult = pd.read_csv(path + "adult.data", sep=",", header=None, index_col=False)
df_adult.columns = columns

In [4]:
# Preview the first five census records with their parsed column names.
df_adult.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,">50K, <=50K."
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
# Identify the continuous (numeric) attributes through the public pandas API
# rather than the private DataFrame._get_numeric_data() helper.
continuous_df = df_adult.select_dtypes(include="number")
continuous_df.columns

Index(['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',
       'hours-per-week'],
      dtype='object')

In [6]:
# Keep only the categorical attributes. Selecting by dtype removes the same six
# numeric columns as before without repeating their names by hand.
# The three renames give the columns plain identifiers; presumably this is needed
# because the names are later joined into a patsy formula, where '-' and '>' act
# as operators.
without_continuous_adult_df = (
    df_adult
    .select_dtypes(exclude="number")
    .dropna()
    .rename(columns={
        "marital-status": "marriage",
        "native-country": "country",
        ">50K, <=50K.": "Threshold",
    })
)
# NOTE(review): dropna() only removes real NaN values. The " ?" placeholder is an
# ordinary string and survives (it shows up later as the "workclass[ ?]" column).
without_continuous_adult_df.head()

Unnamed: 0,workclass,education,marriage,occupation,relationship,race,sex,country,Threshold
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States,<=50K
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,<=50K
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States,<=50K
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States,<=50K
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba,<=50K


In [7]:
# Build a patsy formula "col_a + col_b + ... -1"; the trailing "-1" removes the intercept.
formula_terms = list(without_continuous_adult_df.columns)
e = ' + '.join(formula_terms) + ' -1'

# One-hot encode, then cast the 0/1 design matrix to booleans in the same step.
# NOTE(review): columns labelled "[T. ...]" appear to be treatment-coded, so one level
# of each of those attributes has no column of its own (e.g. only "Threshold[T. >50K]"
# exists). Confirm this is acceptable for the rule mining below.
encoding_adult_df = patsy.dmatrix(
    e,
    without_continuous_adult_df,
    return_type='dataframe',
).astype(bool)
encoding_adult_df.head()

Unnamed: 0,workclass[ ?],workclass[ Federal-gov],workclass[ Local-gov],workclass[ Never-worked],workclass[ Private],workclass[ Self-emp-inc],workclass[ Self-emp-not-inc],workclass[ State-gov],workclass[ Without-pay],education[T. 11th],...,country[T. Puerto-Rico],country[T. Scotland],country[T. South],country[T. Taiwan],country[T. Thailand],country[T. Trinadad&Tobago],country[T. United-States],country[T. Vietnam],country[T. Yugoslavia],Threshold[T. >50K]
0,False,False,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,True,False,False,False
1,False,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,True,False,False,False
2,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
3,False,False,False,False,True,False,False,False,False,True,...,False,False,False,False,False,False,True,False,False,False
4,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


Task 2. Use confidence for the rule interestingness (metric="confidence") and gen-
erate rules up to a depth of 3 (max len=3). Generate rules and flag at least 5 examples
that you find interesting. Comment on your findings and share a few notes on what you
learned from the rules you are highlighting. Note that you can choose your support and
confidence thresholds yourself in this analysis.



1. Depth of 1 Highest Support: 0.895857 for (country[T. United-States]). However, no association rules are produced at any confidence threshold, because a rule needs at least two items (an antecedent and a consequent) and every itemset here contains only one.
2. Depth of 2 Highest Support: 0.786862 for (race[T. White], country[T. United-States]); the highest confidence is 0.921089, for the rule (race[T. White]) --> (country[T. United-States]).
3. Depth of 3 Highest Support: Case 1. When max_len is set at 3: the highest support is 0.895857 for (country[T. United-States]); the third-highest, 0.786862 for (country[T. United-States], race[T. White]), yields the highest confidence, 0.921089, for the rule (race[T. White]) --> (country[T. United-States]).
4. Depth of 3 (fixed 3): Case 2. When the itemset length is fixed at exactly 3: the highest support is 0.544455. Conclusion: when every itemset must contain exactly 3 items, the highest support is almost 40% lower than when itemsets of 1 to 3 items are allowed (0.895857).
5. Depth of 2: When minimum support is set at 0.4, the maximum itemset length is set at 2, and confidence is the metric, the highest confidence is 0.921089. It means that 92% of the records containing (race[T. White]) also contain (country[T. United-States]).

Interesting Rule #1:
When max_len is set at 1, no rules are generated at any confidence threshold: every frequent itemset holds a single item, so it cannot be split into an antecedent and a consequent.

In [8]:
# Single attributes that are present in at least 40% of the records, most frequent first.
adult_fre1_itemsets = apriori(
    encoding_adult_df,
    min_support=0.4,
    max_len=1,
    use_colnames=True,
)
adult_fre1_itemsets.sort_values(by='support', ascending=False).head(n=5)

Unnamed: 0,support,itemsets
4,0.895857,(country[T. United-States])
2,0.854274,(race[T. White])
0,0.69703,(workclass[ Private])
3,0.669205,(sex[T. Male])
1,0.459937,(marriage[T. Married-civ-spouse])


In [9]:
# Attempt to derive rules from 1-itemsets only; the result is displayed as-is.
rules_from_single_items = association_rules(
    adult_fre1_itemsets,
    metric='confidence',
    min_threshold=0.008,
)
rules_from_single_items

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction


Interesting Rule #2: \
When the maximum itemset length is set to 2, the highest support among the 2-itemsets is 0.786862 for (race[T. White], country[T. United-States]), and the highest confidence is 0.921089 for the rule (race[T. White]) --> (country[T. United-States]).

In [10]:
# adult_freq2_itemsets = apriori(encoding_adult_df, min_support=0.3, max_len = 2, use_colnames=True)
# adult_freq2_itemsets ['item_num'] = adult_freq2_itemsets ['itemsets'].apply(len)
# adult_freq2_itemsets  = adult_freq2_itemsets .where(adult_freq2_itemsets['item_num'] == 2)
# adult_freq2_itemsets.sort_values(by='support', ascending=False).head()

In [11]:
adult_fre2_itemsets = apriori(encoding_adult_df, min_support=0.5, max_len = 2, use_colnames=True)
# Pick the pairs by itemset length instead of relying on them being the last rows of
# the apriori result. adult_fre2_itemsets itself is left unfiltered so that it can
# still be used as input for rule generation.
is_pair = adult_fre2_itemsets['itemsets'].apply(len) == 2
# Sort ascending, then keep the last five rows: the five best-supported pairs.
adult_fre2_itemsets[is_pair].sort_values(by="support", ascending=True).tail()

Unnamed: 0,support,itemsets
6,0.588864,"(race[T. White], sex[T. Male])"
4,0.595928,"(workclass[ Private], race[T. White])"
8,0.598507,"(country[T. United-States], sex[T. Male])"
5,0.618378,"(country[T. United-States], workclass[ Private])"
7,0.786862,"(country[T. United-States], race[T. White])"


In [12]:
# Rules from the 1- and 2-itemsets with confidence of at least 0.8, most confident first.
adult_fre2_rules = association_rules(
    adult_fre2_itemsets,
    metric="confidence",
    min_threshold=0.8,
)
adult_fre2_rules.sort_values(by='confidence', ascending=False).head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
4,(race[T. White]),(country[T. United-States]),0.854274,0.895857,0.786862,0.921089,1.028165,0.021555,1.319746
5,(sex[T. Male]),(country[T. United-States]),0.669205,0.895857,0.598507,0.894355,0.998324,-0.001005,0.985784
1,(workclass[ Private]),(country[T. United-States]),0.69703,0.895857,0.618378,0.887161,0.990293,-0.006062,0.922932
2,(sex[T. Male]),(race[T. White]),0.669205,0.854274,0.588864,0.879945,1.030051,0.017179,1.21383
3,(country[T. United-States]),(race[T. White]),0.895857,0.854274,0.786862,0.878334,1.028165,0.021555,1.197758


Interesting Rule #3: \
Here I did two cases: \
Case 1. When max_len is set at 3: the highest support is 0.895857 for (country[T. United-States]); the third-highest, 0.786862 for (country[T. United-States], race[T. White]), yields the highest confidence, 0.921089, for the rule (race[T. White]) --> (country[T. United-States]).


In [13]:
# Case 1, apriori step: itemsets of one to three items with support of at least 0.5.
a_fre3_itemsets = apriori(
    encoding_adult_df,
    min_support=0.5,
    max_len=3,
    use_colnames=True,
)
a_fre3_itemsets.sort_values(by='support', ascending=False).head(n=5)

Unnamed: 0,support,itemsets
3,0.895857,(country[T. United-States])
1,0.854274,(race[T. White])
7,0.786862,"(country[T. United-States], race[T. White])"
0,0.69703,(workclass[ Private])
2,0.669205,(sex[T. Male])


In [14]:
# Case 1, rule step: a very low confidence threshold (0.08) keeps practically every
# rule; only the five most confident ones are shown.
a_fre3_rules = association_rules(
    a_fre3_itemsets,
    metric="confidence",
    min_threshold=0.08,
)
a_fre3_rules.sort_values(by='confidence', ascending=False).head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
7,(race[T. White]),(country[T. United-States]),0.854274,0.895857,0.786862,0.921089,1.028165,0.021555,1.319746
18,"(race[T. White], sex[T. Male])",(country[T. United-States]),0.588864,0.895857,0.542152,0.920674,1.027702,0.014614,1.312845
12,"(workclass[ Private], race[T. White])",(country[T. United-States]),0.595928,0.895857,0.544455,0.913626,1.019835,0.010589,1.205722
17,"(country[T. United-States], sex[T. Male])",(race[T. White]),0.598507,0.854274,0.542152,0.905839,1.060362,0.030863,1.547639
9,(sex[T. Male]),(country[T. United-States]),0.669205,0.895857,0.598507,0.894355,0.998324,-0.001005,0.985784


Interesting Rule #4: \
Case 2. When the itemset length is fixed at exactly 3: the highest support is 0.544455 \
Conclusion: when every itemset must contain exactly 3 items, the highest support is almost 40% lower than when itemsets of 1 to 3 items are allowed (0.895857).

In [15]:
#Case 2 Apriori Algorithm: keep only the itemsets that contain exactly 3 items
# max_len=3 stops the search early; longer itemsets would be discarded below anyway.
adult_fre3_itemsets = apriori(encoding_adult_df, min_support=0.3, max_len=3, use_colnames=True)
adult_fre3_itemsets['item_num'] = adult_fre3_itemsets['itemsets'].apply(len)
# Boolean indexing removes the other rows. DataFrame.where() would instead keep them
# as all-NaN rows and turn item_num into a float column.
adult_fre3_itemsets = adult_fre3_itemsets[adult_fre3_itemsets['item_num'] == 3]
adult_fre3_itemsets.sort_values(by='support', ascending=False).head()

Unnamed: 0,support,itemsets,item_num
17,0.544455,"(country[T. United-States], workclass[ Private...",3.0
22,0.542152,"(country[T. United-States], race[T. White], se...",3.0
18,0.405669,"(country[T. United-States], workclass[ Private...",3.0
16,0.403028,"(workclass[ Private], race[T. White], sex[T. M...",3.0
20,0.379872,"(marriage[T. Married-civ-spouse], country[T. U...",3.0


In [39]:
# #Case 2 Association_Rules
# #------------------------------------------
# association_rules(adult_fre3_itemsets, metric="confidence", min_threshold=0.08).sort_values(by='confidence', ascending=False).head()
# #------------------------------------------


Interesting Rule #5: \
When minimum support is set at 0.4, the maximum itemset length is set at 2, and confidence is the metric, the highest confidence is 0.921089. It means that 92% of the records containing (race[T. White]) also contain (country[T. United-States]).


In [47]:
# Itemsets of at most two items with support of at least 0.4.
adult_general_itemsets = apriori(
    encoding_adult_df,
    min_support=0.4,
    max_len=2,
    use_colnames=True,
)
# Rules with confidence of at least 0.4, most confident first.
confidence_rules = association_rules(adult_general_itemsets, metric='confidence', min_threshold=0.4)
confidence_rules.sort_values(by='confidence', ascending=False).head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
15,(race[T. White]),(country[T. United-States]),0.854274,0.895857,0.786862,0.921089,1.028165,0.021555,1.319746
6,(marriage[T. Married-civ-spouse]),(race[T. White]),0.459937,0.854274,0.411842,0.895433,1.04818,0.018931,1.393614
17,(sex[T. Male]),(country[T. United-States]),0.669205,0.895857,0.598507,0.894355,0.998324,-0.001005,0.985784
10,(marriage[T. Married-civ-spouse]),(country[T. United-States]),0.459937,0.895857,0.410553,0.892628,0.996396,-0.001485,0.969929
8,(marriage[T. Married-civ-spouse]),(sex[T. Male]),0.459937,0.669205,0.409048,0.889356,1.328973,0.101255,2.989728


Task 3. Use lift for the rule interestingness (metric="lift") and generate rules up to
a depth of 3 (max len=3). Generate rules and flag at least 5 examples that you find
interesting. Comment on your findings and share a few notes on what you learned from
the rules you are highlighting. Note that you can choose your support and confidence
thresholds yourself in this analysis.

Please note, the insights are drawn from the interpretations of lift in an association rule made by IBM: https://www.ibm.com/docs/en/db2/11.1?topic=SSEPGG_11.1.0/com.ibm.im.model.doc/c_lift_in_an_association_rule.html

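$$\text{lift}(A \Rightarrow B) = \frac{\text{confidence}(A \Rightarrow B)}{\text{support}(B)}$$

where $B$, the head of the rule, is the consequent (its support is the "consequent support" column) and $A$, the body of the rule, is the antecedent (its support is the "antecedent support" column).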


1. When max_len = 3: When minimum support is set at 0.02, the highest lift value is 3.882057. A lift well above 1 indicates that the antecedent and the consequent appear together more often than expected: holding a master's degree is strongly associated with having a professional-specialty occupation and being white. Shown in rules #1253 and #1256.
2. When max_len = 3: When the minimum support is set at 0.003, the lowest lift value is 0.140238. A lift well below 1 indicates a negative association: having an other-service occupation co-occurs with being white and earning more than 50K far less often than would be expected if they were independent. Shown in rules #10634 and #10631.
3. When max_len = 2: When minimum support is set at 0.02, the highest lift value is 3.852606, which indicates that a professional-specialty occupation is strongly associated with holding a master's degree, as shown in rules #122 and #123: (occupation[T. Prof-specialty]) --> (education[T. Masters]) and (education[T. Masters]) --> (occupation[T. Prof-specialty])
4. When max_len = 2: When minimum support is set at 0.005, the highest lift value reaches 29.756020. A lift value greater than 1 indicates that the antecedent and the consequent appear together more frequently than expected, i.e. the occurrence of the antecedent has a positive effect on the occurrence of the consequent. In rules #496 and #497, a person whose native country is the Philippines is very likely to be Asian-Pacific-Islander (confidence 0.949495); the reverse rule has the same lift but a much lower confidence (0.180943), because lift is symmetric while confidence is not.
5. When max_len = 2: When min_support is set at 0.03, the lowest lift among the rules with a lift of at least 2 is 2.009944. It indicates that a person in an executive-managerial role is about twice as likely as average to earn more than 50K. Shown in rules #8 and #9: (occupation[T. Exec-managerial]) --> (Threshold[T. >50K]) and (Threshold[T. >50K]) --> (occupation[T. Exec-managerial])


Interesting Rule#1:\
When minimum support is set at 0.02 and max length is set at 3, the highest lift value is 3.882057, which indicates that holding a master's degree is strongly associated with having a professional-specialty occupation and being white, as shown in rules #1253 and #1256: (occupation[T. Prof-specialty], race[T. White]) --> (education[T. Masters]) and (education[T. Masters]) --> (occupation[T. Prof-specialty], race[T. White])


In [18]:
# Itemsets of up to three items that occur in at least 2% of the records.
adult_general_itemsets = apriori(
    encoding_adult_df,
    min_support=0.02,
    max_len=3,
    use_colnames=True,
)
# Rules with lift of at least 1, highest lift first.
lift_rules = association_rules(adult_general_itemsets, metric='lift', min_threshold=1)
lift_rules.sort_values(by='lift', ascending=False).head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
1253,"(occupation[T. Prof-specialty], race[T. White])",(education[T. Masters]),0.112128,0.052916,0.023034,0.205423,3.882057,0.0171,1.191935
1256,(education[T. Masters]),"(occupation[T. Prof-specialty], race[T. White])",0.052916,0.112128,0.023034,0.435287,3.882057,0.0171,1.572254
1259,"(country[T. United-States], education[T. Maste...",(occupation[T. Prof-specialty]),0.046897,0.127146,0.022972,0.489849,3.852654,0.01701,1.710973
1260,(occupation[T. Prof-specialty]),"(country[T. United-States], education[T. Maste...",0.127146,0.046897,0.022972,0.180676,3.852654,0.01701,1.163281
122,(occupation[T. Prof-specialty]),(education[T. Masters]),0.127146,0.052916,0.025921,0.203865,3.852606,0.019193,1.189602


Interesting Rule#2: \
When max_len = 3 and the minimum support is set at 0.003, the lowest lift value is 0.140238. A lift well below 1 indicates a negative association: having an other-service occupation co-occurs with being white and earning more than 50K far less often than would be expected if they were independent. Shown in rules #10634 and #10631.

In [19]:
# Itemsets of up to three items that occur in at least 0.3% of the records.
adult_general_itemsets = apriori(
    encoding_adult_df,
    min_support=0.003,
    max_len=3,
    use_colnames=True,
)
# Rules with lift of at least 0.01, lowest lift first.
lift_rules = association_rules(adult_general_itemsets, metric='lift', min_threshold=0.01)
lift_rules.sort_values(by='lift', ascending=True).head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
10634,(occupation[T. Other-service]),"(Threshold[T. >50K], race[T. White])",0.101195,0.218574,0.003102,0.030653,0.140238,-0.019017,0.806136
10631,"(Threshold[T. >50K], race[T. White])",(occupation[T. Other-service]),0.218574,0.101195,0.003102,0.014191,0.140238,-0.019017,0.911744
10642,"(country[T. United-States], Threshold[T. >50K])",(occupation[T. Other-service]),0.220233,0.101195,0.003255,0.014782,0.146072,-0.019031,0.912291
10647,(occupation[T. Other-service]),"(country[T. United-States], Threshold[T. >50K])",0.101195,0.220233,0.003255,0.03217,0.146072,-0.019031,0.805686
7351,(occupation[T. Prof-specialty]),"(education[T. HS-grad], sex[T. Male])",0.127146,0.21839,0.004085,0.032126,0.147102,-0.023683,0.807553


Interesting Rule#3: \
When minimum support is set at 0.02 and max length is set at 2, the highest lift value is 3.852606, which indicates that a professional-specialty occupation is strongly associated with holding a master's degree, as shown in rules #122 and #123: (occupation[T. Prof-specialty]) --> (education[T. Masters]) and (education[T. Masters]) --> (occupation[T. Prof-specialty])

In [20]:
# Itemsets of at most two items that occur in at least 2% of the records.
adult_general_itemsets = apriori(
    encoding_adult_df,
    min_support=0.02,
    max_len=2,
    use_colnames=True,
)
# Rules with lift of at least 1, highest lift first.
lift_rules = association_rules(adult_general_itemsets, metric='lift', min_threshold=1)
lift_rules.sort_values(by='lift', ascending=False).head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
122,(occupation[T. Prof-specialty]),(education[T. Masters]),0.127146,0.052916,0.025921,0.203865,3.852606,0.019193,1.189602
123,(education[T. Masters]),(occupation[T. Prof-specialty]),0.052916,0.127146,0.025921,0.489843,3.852606,0.019193,1.710953
174,(marriage[T. Never-married]),(relationship[T. Own-child]),0.328092,0.155646,0.137741,0.419826,2.697307,0.086675,1.455345
175,(relationship[T. Own-child]),(marriage[T. Never-married]),0.155646,0.328092,0.137741,0.884964,2.697307,0.086675,5.840875
8,(occupation[T. Prof-specialty]),(workclass[ Local-gov]),0.127146,0.064279,0.021652,0.17029,2.649215,0.013479,1.127768


Interesting Rule#4:\
When max length is set at 2 and minimum support is set at 0.005, the highest lift value reaches 29.756020. A lift value greater than 1 indicates that the antecedent and the consequent appear together more frequently than expected, i.e. the occurrence of the antecedent has a positive effect on the occurrence of the consequent. In rules #496 and #497, a person whose native country is the Philippines is very likely to be Asian-Pacific-Islander (confidence 0.949495); the reverse rule has the same lift but a much lower confidence (0.180943), because lift is symmetric while confidence is not.

In [21]:
# Itemsets of at most two items that occur in at least 0.5% of the records.
adult_general_itemsets = apriori(
    encoding_adult_df,
    min_support=0.005,
    max_len=2,
    use_colnames=True,
)
# Rules with lift of at least 1, highest lift first.
lift_rules = association_rules(adult_general_itemsets, metric='lift', min_threshold=1)
lift_rules.sort_values(by='lift', ascending=False).head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
496,(race[T. Asian-Pac-Islander]),(country[T. Philippines]),0.031909,0.006081,0.005774,0.180943,29.75602,0.00558,1.213492
497,(country[T. Philippines]),(race[T. Asian-Pac-Islander]),0.006081,0.031909,0.005774,0.949495,29.75602,0.00558,19.168195
42,(occupation[T. Protective-serv]),(workclass[ Local-gov]),0.019932,0.064279,0.009336,0.468413,7.287145,0.008055,1.76024
43,(workclass[ Local-gov]),(occupation[T. Protective-serv]),0.064279,0.019932,0.009336,0.145246,7.287145,0.008055,1.146609
287,(education[T. Prof-school]),(occupation[T. Prof-specialty]),0.01769,0.127146,0.013882,0.784722,6.171821,0.011632,4.054548


Interesting Rule#5: \
When max_len = 2 and min_support is set at 0.03, the lowest lift among the rules with a lift of at least 2 is 2.009944. It indicates that a person in an executive-managerial role is about twice as likely as average to earn more than 50K. Shown in rules #8 and #9: (occupation[T. Exec-managerial]) --> (Threshold[T. >50K]) and (Threshold[T. >50K]) --> (occupation[T. Exec-managerial])

In [22]:
# Itemsets of at most two items that occur in at least 3% of the records.
adult_general_itemsets = apriori(
    encoding_adult_df,
    min_support=0.03,
    max_len=2,
    use_colnames=True,
)
# Only rules with lift of at least 2 are kept; sorting ascending shows the weakest of them first.
lift_rules = association_rules(adult_general_itemsets, metric='lift', min_threshold=2)
lift_rules.sort_values(by='lift', ascending=True).head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
8,(occupation[T. Exec-managerial]),(Threshold[T. >50K]),0.124873,0.24081,0.06044,0.484014,2.009944,0.03037,1.471339
9,(Threshold[T. >50K]),(occupation[T. Exec-managerial]),0.24081,0.124873,0.06044,0.250988,2.009944,0.03037,1.168375
0,(occupation[T. Exec-managerial]),(education[T. Bachelors]),0.124873,0.164461,0.042044,0.336695,2.047266,0.021507,1.25966
1,(education[T. Bachelors]),(occupation[T. Exec-managerial]),0.164461,0.124873,0.042044,0.255649,2.047266,0.021507,1.175691
4,(marriage[T. Married-civ-spouse]),(relationship[T. Wife]),0.459937,0.048156,0.047787,0.1039,2.157573,0.025639,1.062207


Task 4. Compare the top rules using the two interestingness measures for the same levels
of support (use at least two different levels of support) and comment on your findings.

Case 1 
* Controlled Variables: Top Rule Threshold: When min_support is at 0.5, max_len is at 2 the lift level is at the highest.\
min_support at 0.5 \
max_len = 2 
* Differences: \
Result A uses confidence as metric \
Result B uses lift as metric 
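* Conclusion: \
With the above conditions applied, result A contains 6 rules whereas result B contains 10. \
Both calls rely on the default `min_threshold` of 0.8, which is applied to the chosen metric: result A keeps the rules with a confidence of at least 0.8, while result B keeps the rules with a lift of at least 0.8, which here is every rule. \
The 6 rules of result A appear in result B with identical values; the 4 additional rules in result B all have a confidence below 0.8. \
At this support level the top rules are therefore the same under both metrics.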

In [52]:
# Result A: confidence as the metric.
adult_general_itemsets = apriori(
    encoding_adult_df,
    min_support=0.5,
    max_len=2,
    use_colnames=True,
)
# min_threshold=0.8 is the library default, spelled out so the cut-off is visible.
result_a = association_rules(adult_general_itemsets, metric="confidence", min_threshold=0.8)
result_a.sort_values(by='confidence', ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
4,(race[T. White]),(country[T. United-States]),0.854274,0.895857,0.786862,0.921089,1.028165,0.021555,1.319746
5,(sex[T. Male]),(country[T. United-States]),0.669205,0.895857,0.598507,0.894355,0.998324,-0.001005,0.985784
1,(workclass[ Private]),(country[T. United-States]),0.69703,0.895857,0.618378,0.887161,0.990293,-0.006062,0.922932
2,(sex[T. Male]),(race[T. White]),0.669205,0.854274,0.588864,0.879945,1.030051,0.017179,1.21383
3,(country[T. United-States]),(race[T. White]),0.895857,0.854274,0.786862,0.878334,1.028165,0.021555,1.197758
0,(workclass[ Private]),(race[T. White]),0.69703,0.854274,0.595928,0.854952,1.000795,0.000473,1.004681


In [53]:
# Result B: lift as the metric.
adult_general_itemsets = apriori(
    encoding_adult_df,
    min_support=0.5,
    max_len=2,
    use_colnames=True,
)
# min_threshold=0.8 is the library default, spelled out so the cut-off is visible;
# here it applies to lift, not to confidence.
result_b = association_rules(adult_general_itemsets, metric="lift", min_threshold=0.8)
result_b.sort_values(by='confidence', ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
7,(race[T. White]),(country[T. United-States]),0.854274,0.895857,0.786862,0.921089,1.028165,0.021555,1.319746
9,(sex[T. Male]),(country[T. United-States]),0.669205,0.895857,0.598507,0.894355,0.998324,-0.001005,0.985784
3,(workclass[ Private]),(country[T. United-States]),0.69703,0.895857,0.618378,0.887161,0.990293,-0.006062,0.922932
5,(sex[T. Male]),(race[T. White]),0.669205,0.854274,0.588864,0.879945,1.030051,0.017179,1.21383
6,(country[T. United-States]),(race[T. White]),0.895857,0.854274,0.786862,0.878334,1.028165,0.021555,1.197758
0,(workclass[ Private]),(race[T. White]),0.69703,0.854274,0.595928,0.854952,1.000795,0.000473,1.004681
1,(race[T. White]),(workclass[ Private]),0.854274,0.69703,0.595928,0.697584,1.000795,0.000473,1.001832
2,(country[T. United-States]),(workclass[ Private]),0.895857,0.69703,0.618378,0.690264,0.990293,-0.006062,0.978155
4,(race[T. White]),(sex[T. Male]),0.854274,0.669205,0.588864,0.689316,1.030051,0.017179,1.064728
8,(country[T. United-States]),(sex[T. Male]),0.895857,0.669205,0.598507,0.668084,0.998324,-0.001005,0.99662


Case 2. 
Top Rule Threshold: When min_support is at 0.005, the lift level is at the highest. 
* Controlled Variables: \
min_support at 0.005 \
max_len = 2 
* Differences: \
Result A uses lift as metric \
Result B uses confidence as metric 
* Conclusion: \
With the above conditions applied, result A contains 756 rules whereas result B contains 96. \
At the highest lift of 29.756, result A contains both directions of the rule, with confidences 0.180943 and 0.949495, while result B keeps only the direction with confidence 0.949495. \
In general, result A (lift metric) also includes rules with lower lift and lower confidence than result B does. \
Filtering on lift is helpful because it takes the support of the consequent into account, which confidence alone ignores.

In [36]:
# Result A: lift as the metric, rules ordered from highest to lowest lift.
adult_general_itemsets = apriori(
    encoding_adult_df,
    min_support=0.005,
    max_len=2,
    use_colnames=True,
)
result_a = association_rules(adult_general_itemsets, metric='lift')
result_a.sort_values(by='lift', ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
734,(race[T. Asian-Pac-Islander]),(country[T. Philippines]),0.031909,0.006081,0.005774,0.180943,29.756020,0.005580,1.213492
735,(country[T. Philippines]),(race[T. Asian-Pac-Islander]),0.006081,0.031909,0.005774,0.949495,29.756020,0.005580,19.168195
62,(occupation[T. Protective-serv]),(workclass[ Local-gov]),0.019932,0.064279,0.009336,0.468413,7.287145,0.008055,1.760240
63,(workclass[ Local-gov]),(occupation[T. Protective-serv]),0.064279,0.019932,0.009336,0.145246,7.287145,0.008055,1.146609
409,(education[T. Prof-school]),(occupation[T. Prof-specialty]),0.017690,0.127146,0.013882,0.784722,6.171821,0.011632,4.054548
...,...,...,...,...,...,...,...,...,...
696,(race[T. Asian-Pac-Islander]),(relationship[T. Not-in-family]),0.031909,0.255060,0.006572,0.205967,0.807526,-0.001567,0.938173
217,(relationship[T. Own-child]),(workclass[ State-gov]),0.155646,0.039864,0.005006,0.032163,0.806815,-0.001199,0.992043
216,(workclass[ State-gov]),(relationship[T. Own-child]),0.039864,0.155646,0.005006,0.125578,0.806815,-0.001199,0.965613
121,(workclass[ Private]),(occupation[T. Prof-specialty]),0.697030,0.127146,0.071036,0.101912,0.801537,-0.017589,0.971903


In [37]:
# Result B: confidence as the metric, rules still ordered by lift for comparison with result A.
adult_general_itemsets = apriori(
    encoding_adult_df,
    min_support=0.005,
    max_len=2,
    use_colnames=True,
)
result_b = association_rules(adult_general_itemsets, metric='confidence')
result_b.sort_values(by='lift', ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
86,(country[T. Philippines]),(race[T. Asian-Pac-Islander]),0.006081,0.031909,0.005774,0.949495,29.756020,0.005580,19.168195
46,(relationship[T. Own-child]),(marriage[T. Never-married]),0.155646,0.328092,0.137741,0.884964,2.697307,0.086675,5.840875
41,(relationship[T. Wife]),(marriage[T. Married-civ-spouse]),0.048156,0.459937,0.047787,0.992347,2.157573,0.025639,70.568267
45,(Threshold[T. >50K]),(marriage[T. Married-civ-spouse]),0.240810,0.459937,0.205522,0.853463,1.855609,0.094765,3.685497
55,(occupation[T. Craft-repair]),(sex[T. Male]),0.125887,0.669205,0.119069,0.945840,1.413378,0.034825,6.107778
...,...,...,...,...,...,...,...,...,...
52,(occupation[T. Adm-clerical]),(race[T. White]),0.115783,0.854274,0.094715,0.818037,0.957582,-0.004196,0.800858
65,(occupation[T. Machine-op-inspct]),(race[T. White]),0.061485,0.854274,0.049476,0.804695,0.941964,-0.003048,0.746149
22,(education[T. 12th]),(country[T. United-States]),0.013298,0.895857,0.011210,0.842956,0.940949,-0.000703,0.663146
67,(occupation[T. Other-service]),(country[T. United-States]),0.101195,0.895857,0.085286,0.842792,0.940766,-0.005370,0.662454
