## Association Rule Learning Practice

In [1]:
# import mlxtend to detect associations

import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

### Test on Apriori

In [2]:
# Hand-made toy data set: one row per transaction, one 0/1 flag per item.

data = {
    'ID': [1, 2, 3, 4, 5, 6],
    'Onion': [1, 0, 0, 1, 1, 1],
    'Potato': [1, 1, 0, 1, 1, 1],
    'Burger': [1, 1, 0, 0, 1, 1],
    'Milk': [0, 1, 1, 1, 0, 1],
    'Beer': [0, 0, 1, 0, 1, 0],
}

# Passing `columns` pins the column order while the frame is being built.
df = pd.DataFrame(
    data,
    columns=['ID', 'Onion', 'Potato', 'Burger', 'Milk', 'Beer'],
)

In [3]:
# Display the toy transaction table (one row per ID, one 0/1 column per item).
df

Unnamed: 0,ID,Onion,Potato,Burger,Milk,Beer
0,1,1,1,1,0,0
1,2,0,1,1,1,0
2,3,0,0,0,1,1
3,4,1,1,0,1,0
4,5,1,1,1,0,1
5,6,1,1,1,1,0


### Minimum support to discover rules

In [4]:
# Run apriori directly on the item columns (ID is not an item) to list every
# item combination that appears in at least half of the transactions.
# The 0/1 flags are cast to bool because mlxtend warns that non-bool frames
# are slower and that support for them may be discontinued.

frequent_itemsets = apriori(
    df[['Onion', 'Potato', 'Burger', 'Milk', 'Beer']].astype(bool),
    min_support=0.50,
    use_colnames=True,
)

In [5]:
# Display the frequent itemsets together with their support.
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.666667,(Onion)
1,0.833333,(Potato)
2,0.666667,(Burger)
3,0.666667,(Milk)
4,0.666667,"(Potato, Onion)"
5,0.5,"(Burger, Onion)"
6,0.666667,"(Burger, Potato)"
7,0.5,"(Milk, Potato)"
8,0.5,"(Burger, Potato, Onion)"


### Present association rule metrics

In [6]:
# Derive association rules from the frequent itemsets, using lift as the
# selection metric with a threshold of 1. The resulting table also reports
# the other rule metrics (support, confidence, leverage, conviction).

rules = association_rules(
    frequent_itemsets,
    metric='lift',
    min_threshold=1,
)

In [7]:
# Display every rule that passed the lift threshold.
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Potato),(Onion),0.833333,0.666667,0.666667,0.8,1.2,0.111111,1.666667
1,(Onion),(Potato),0.666667,0.833333,0.666667,1.0,1.2,0.111111,inf
2,(Burger),(Onion),0.666667,0.666667,0.5,0.75,1.125,0.055556,1.333333
3,(Onion),(Burger),0.666667,0.666667,0.5,0.75,1.125,0.055556,1.333333
4,(Burger),(Potato),0.666667,0.833333,0.666667,1.0,1.2,0.111111,inf
5,(Potato),(Burger),0.833333,0.666667,0.666667,0.8,1.2,0.111111,1.666667
6,"(Burger, Potato)",(Onion),0.666667,0.666667,0.5,0.75,1.125,0.055556,1.333333
7,"(Burger, Onion)",(Potato),0.5,0.833333,0.5,1.0,1.2,0.083333,inf
8,"(Potato, Onion)",(Burger),0.666667,0.666667,0.5,0.75,1.125,0.055556,1.333333
9,(Burger),"(Potato, Onion)",0.666667,0.666667,0.5,0.75,1.125,0.055556,1.333333


In [8]:
# Keep only the strong rules: lift above 1.125 and confidence above 0.8.
# What remains shows that Onion and Burger are each associated with Potato,
# and that the pair (Onion, Burger) is associated with Potato as well.

strong_lift = rules['lift'] > 1.125
strong_confidence = rules['confidence'] > 0.8
rules[strong_lift & strong_confidence]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
1,(Onion),(Potato),0.666667,0.833333,0.666667,1.0,1.2,0.111111,inf
4,(Burger),(Potato),0.666667,0.833333,0.666667,1.0,1.2,0.111111,inf
7,"(Burger, Onion)",(Potato),0.5,0.833333,0.5,1.0,1.2,0.083333,inf


### In reality, we face lists of baskets...

In [9]:
# Build a toy retail data set in which each transaction is a list of items.

retail_shopping_basket = {
    'ID': [1, 2, 3, 4, 5, 6],
    'Basket': [
        ['Beer', 'Diaper', 'Pretzels', 'Chips', 'Aspirin'],
        ['Diaper', 'Beer', 'Chips', 'Lotion', 'Juice', 'BabyFood', 'Milk'],
        ['Soda', 'Chips', 'Milk'],
        ['Soup', 'Beer', 'Diaper', 'Milk', 'IceCream'],
        ['Soda', 'Coffee', 'Milk', 'Bread'],
        ['Beer', 'Chips'],
    ],
}

# Widen the column display so that whole baskets are shown.
pd.set_option('display.max_colwidth', 100)

# `columns` fixes the column order as ID first, Basket second.
retail = pd.DataFrame(retail_shopping_basket, columns=['ID', 'Basket'])

In [10]:
# Each basket is stored as a Python list inside a single column, so the lists
# have to be split into per-item columns before the items can be counted.

retail

Unnamed: 0,ID,Basket
0,1,"[Beer, Diaper, Pretzels, Chips, Aspirin]"
1,2,"[Diaper, Beer, Chips, Lotion, Juice, BabyFood, Milk]"
2,3,"[Soda, Chips, Milk]"
3,4,"[Soup, Beer, Diaper, Milk, IceCream]"
4,5,"[Soda, Coffee, Milk, Bread]"
5,6,"[Beer, Chips]"


In [11]:
# Keep only the ID column by dropping 'Basket'; the IDs are joined back onto
# the one-hot encoded items in a later cell.
# `columns=` is used because the positional axis argument of DataFrame.drop
# was deprecated in pandas 1.3 and removed in pandas 2.0.

retail_id = retail.drop(columns='Basket')
retail_id

Unnamed: 0,ID
0,1
1,2
2,3
3,4
4,5
5,6


In [12]:
# One-hot encoding, step 1: flatten every basket list into one
# comma-separated string so that it can be split into indicator columns.

retail_Basket = retail['Basket'].str.join(',')
retail_Basket

0              Beer,Diaper,Pretzels,Chips,Aspirin
1    Diaper,Beer,Chips,Lotion,Juice,BabyFood,Milk
2                                 Soda,Chips,Milk
3                  Soup,Beer,Diaper,Milk,IceCream
4                          Soda,Coffee,Milk,Bread
5                                      Beer,Chips
Name: Basket, dtype: object

In [13]:
# One-hot encoding, step 2: str.get_dummies splits each string on the
# separator and returns one 0/1 column per distinct item.

retail_Basket = retail_Basket.str.get_dummies(sep=',')
retail_Basket

Unnamed: 0,Aspirin,BabyFood,Beer,Bread,Chips,Coffee,Diaper,IceCream,Juice,Lotion,Milk,Pretzels,Soda,Soup
0,1,0,1,0,1,0,1,0,0,0,0,1,0,0
1,0,1,1,0,1,0,1,0,1,1,1,0,0,0
2,0,0,0,0,1,0,0,0,0,0,1,0,1,0
3,0,0,1,0,0,0,1,1,0,0,1,0,0,1
4,0,0,0,1,0,1,0,0,0,0,1,0,1,0
5,0,0,1,0,1,0,0,0,0,0,0,0,0,0


In [14]:
# Put the ID column back in front of the item indicators; the two frames are
# aligned on their row index.
# Note: this rebinds `retail` from the raw baskets to the one-hot table.

retail = retail_id.join(retail_Basket, how='left')
retail

Unnamed: 0,ID,Aspirin,BabyFood,Beer,Bread,Chips,Coffee,Diaper,IceCream,Juice,Lotion,Milk,Pretzels,Soda,Soup
0,1,1,0,1,0,1,0,1,0,0,0,0,1,0,0
1,2,0,1,1,0,1,0,1,0,1,1,1,0,0,0
2,3,0,0,0,0,1,0,0,0,0,0,1,0,1,0
3,4,0,0,1,0,0,0,1,1,0,0,1,0,0,1
4,5,0,0,0,1,0,1,0,0,0,0,1,0,1,0
5,6,0,0,1,0,1,0,0,0,0,0,0,0,0,0


In [15]:
# Run apriori the same way as on the toy data: drop the ID column so that only
# item indicators remain, and keep itemsets present in at least half of the
# baskets (0.5 is also apriori's default min_support, spelled out here).
# `drop(columns=...)` replaces the positional axis argument, which pandas 2.0
# removed; the cast to bool avoids mlxtend's non-bool DataFrame warning.

frequent_itemsets_2 = apriori(
    retail.drop(columns='ID').astype(bool),
    min_support=0.5,
    use_colnames=True,
)

In [16]:
# Display the frequent itemsets found in the retail baskets.
frequent_itemsets_2

Unnamed: 0,support,itemsets
0,0.666667,(Beer)
1,0.666667,(Chips)
2,0.5,(Diaper)
3,0.666667,(Milk)
4,0.5,"(Chips, Beer)"
5,0.5,"(Beer, Diaper)"


In [17]:
# Only Beer, Chips, Diaper and Milk are frequent enough to appear in rules.
# The higher confidence and lift show that Diaper is related to Beer.
# min_threshold is set explicitly: without it, association_rules falls back to
# its default of 0.8, which is meant for confidence. A lift of 1 is the
# "better than chance" cut-off, and it matches the earlier lift-based call.

association_rules(frequent_itemsets_2, metric='lift', min_threshold=1)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Chips),(Beer),0.666667,0.666667,0.5,0.75,1.125,0.055556,1.333333
1,(Beer),(Chips),0.666667,0.666667,0.5,0.75,1.125,0.055556,1.333333
2,(Beer),(Diaper),0.666667,0.5,0.5,0.75,1.5,0.166667,2.0
3,(Diaper),(Beer),0.5,0.666667,0.5,1.0,1.5,0.166667,inf


### Real practice on Movie Genres

In [18]:
# Load the MovieLens "latest-small" movie list.
# Source: https://grouplens.org/datasets/movielens/

movies = pd.read_csv(filepath_or_buffer='ml-latest-small/movies.csv')

In [19]:
# Preview the first ten movies; `genres` holds pipe-separated genre names.
movies.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [20]:
# Allow up to 100 columns in the display so every genre column is visible.
pd.options.display.max_columns = 100

# One-hot encode the pipe-separated `genres` string into one 0/1 column per
# genre and place those columns next to movieId and title.
# `drop(columns=...)` replaces the positional axis argument, which was
# deprecated in pandas 1.3 and removed in pandas 2.0; the separator is spelled
# out even though '|' is the get_dummies default.
movies_ohe = movies.drop(columns='genres').join(
    movies['genres'].str.get_dummies(sep='|')
)
movies_ohe.head()

Unnamed: 0,movieId,title,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [21]:
# Check the size of the encoded table: 9125 rows (movies) and 22 columns
# (movieId, title and the 20 genre indicator columns).

movies_ohe.shape

(9125, 22)

In [22]:
# Move movieId and title into a two-level row index so that only the genre
# indicator columns remain as data columns.
movies_ohe = movies_ohe.set_index(['movieId', 'title'])

In [23]:
# Preview the re-indexed table: (movieId, title) index, genre flags as columns.
movies_ohe.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,title,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,Jumanji (1995),0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,Grumpier Old Men (1995),0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,Waiting to Exhale (1995),0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
5,Father of the Bride Part II (1995),0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [24]:
# Find genre combinations that occur in at least 2.5% of the movies.
# The 0/1 genre flags are cast to bool because mlxtend warns that non-bool
# frames are slower and that support for them may be discontinued.
frequent_itemsets_movies = apriori(
    movies_ohe.astype(bool),
    use_colnames=True,
    min_support=0.025,
)

In [25]:
# Display the frequent genre itemsets together with their support.
frequent_itemsets_movies

Unnamed: 0,support,itemsets
0,0.169315,(Action)
1,0.122411,(Adventure)
2,0.048986,(Animation)
3,0.06389,(Children)
4,0.363288,(Comedy)
5,0.120548,(Crime)
6,0.054247,(Documentary)
7,0.478356,(Drama)
8,0.071671,(Fantasy)
9,0.09611,(Horror)


In [26]:
# Compute association metrics between genre groups, selecting rules by lift
# with a threshold of 1.25.
rules_movies = association_rules(
    frequent_itemsets_movies,
    metric='lift',
    min_threshold=1.25,
)

In [27]:
# Display the genre rules that passed the lift threshold.
rules_movies

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Action),(Adventure),0.169315,0.122411,0.058301,0.344337,2.812955,0.037575,1.338475
1,(Adventure),(Action),0.122411,0.169315,0.058301,0.476276,2.812955,0.037575,1.586111
2,(Crime),(Action),0.120548,0.169315,0.038247,0.317273,1.87386,0.017836,1.216716
3,(Action),(Crime),0.169315,0.120548,0.038247,0.22589,1.87386,0.017836,1.136081
4,(Sci-Fi),(Action),0.086795,0.169315,0.040986,0.472222,2.789015,0.026291,1.573929
5,(Action),(Sci-Fi),0.169315,0.086795,0.040986,0.242071,2.789015,0.026291,1.20487
6,(Thriller),(Action),0.189479,0.169315,0.062904,0.331984,1.960746,0.030822,1.24351
7,(Action),(Thriller),0.169315,0.189479,0.062904,0.371521,1.960746,0.030822,1.289654
8,(Children),(Adventure),0.06389,0.122411,0.02926,0.457976,3.741299,0.021439,1.619096
9,(Adventure),(Children),0.122411,0.06389,0.02926,0.239033,3.741299,0.021439,1.230158


In [28]:
# Rules with lift above 4, strongest first: Children and Animation stand out
# as associated genres.
high_lift = rules_movies['lift'] > 4
rules_movies.loc[high_lift].sort_values('lift', ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
14,(Children),(Animation),0.06389,0.048986,0.027068,0.423671,8.648758,0.023939,1.650122
15,(Animation),(Children),0.048986,0.06389,0.027068,0.552573,8.648758,0.023939,2.092205


In [29]:
# List the movies tagged Children but NOT Animation.
# Despite the high lift, the rule Children -> Animation has a confidence of
# only about 0.42, so more than half of the Children movies land in this list.
# Lift says the pairing is far more common than chance, not that the two
# genres always occur together.
# regex=False treats the genre names as literal substrings.
is_children = movies['genres'].str.contains('Children', regex=False)
is_animation = movies['genres'].str.contains('Animation', regex=False)
movies[is_children & ~is_animation]

Unnamed: 0,movieId,title,genres
1,2,Jumanji (1995),Adventure|Children|Fantasy
7,8,Tom and Huck (1995),Adventure|Children
26,27,Now and Then (1995),Children|Drama
32,34,Babe (1995),Children|Drama
36,38,It Takes Two (1995),Children|Comedy
...,...,...,...
8918,135268,Zenon: Z3 (2004),Adventure|Children|Comedy
8960,139620,Everything's Gonna Be Great (1998),Adventure|Children|Comedy|Drama
8967,140152,Dreamcatcher (2015),Children|Crime|Documentary
8981,140747,16 Wishes (2010),Children|Drama|Fantasy
