In [44]:
import pandas as pd
import numpy as np
from itertools import permutations 
# Load transactions from pandas.
books = pd.read_csv("datasets/bookstore_transactions.csv")

# Print the header
print(books.head(2))

        Transaction
0  History,Bookmark
1  History,Bookmark


In [13]:
books.Transaction.value_counts()

Biography,Bookmark    40
History,Bookmark      25
Fiction,Bookmark      25
Poetry,Bookmark        9
Name: Transaction, dtype: int64

In [7]:
# Split transaction strings into lists.
transactions = books['Transaction'].apply(lambda t: t.split(','))
# Convert DataFrame into list of strings.
transactions = list(transactions)

In [14]:
transactions.count(['Biography','Bookmark'])

40

In [17]:
transactions.count(['Poetry','Bookmark'])

9

In [19]:
# Import the transaction encoder function from mlxtend
from mlxtend.preprocessing import TransactionEncoder
import pandas as pd

# Instantiate transaction encoder and identify unique items
encoder = TransactionEncoder().fit(transactions)

# One-hot encode transactions
onehot = encoder.transform(transactions)

# Convert one-hot encoded data to DataFrame
onehot = pd.DataFrame(onehot, columns = encoder.columns_)

# Print the one-hot encoded transaction dataset
print(onehot)

    Biography  Bookmark  Fiction  History  Poetry
0       False      True    False     True   False
1       False      True    False     True   False
2       False      True     True    False   False
3        True      True    False    False   False
4       False      True    False     True   False
..        ...       ...      ...      ...     ...
94       True      True    False    False   False
95      False      True    False    False    True
96      False      True    False     True   False
97       True      True    False    False   False
98       True      True    False    False   False

[99 rows x 5 columns]


In [20]:
# Compute the support
support = onehot.mean()

# Print the support
print(support)

Biography    0.404040
Bookmark     1.000000
Fiction      0.252525
History      0.252525
Poetry       0.090909
dtype: float64


In [23]:
# Add a jam+bread column to the DataFrame onehot
onehot['Biography+History'] = np.logical_and(onehot['Biography'], onehot['History'])

# Compute the support
support = onehot.mean()

# Print the support values
print(support)

Biography            0.404040
Bookmark             1.000000
Fiction              0.252525
History              0.252525
Poetry               0.090909
Biography+History    0.000000
dtype: float64


In [25]:
# Load transactions from pandas.
movies = pd.read_csv("datasets/movielens_movies.csv")

# Print the header
print(movies.head(2))

   movieId             title                                       genres
0        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy
1        2    Jumanji (1995)                   Adventure|Children|Fantasy


In [26]:
# Split transaction strings into lists.
transactions = movies['genres'].apply(lambda t: t.split('|'))
# Convert DataFrame into list of strings.
transactions = list(transactions)

In [27]:
# Import the transaction encoder function from mlxtend
from mlxtend.preprocessing import TransactionEncoder
import pandas as pd

# Instantiate transaction encoder and identify unique items
encoder = TransactionEncoder().fit(transactions)

# One-hot encode transactions
onehot = encoder.transform(transactions)

# Convert one-hot encoded data to DataFrame
onehot = pd.DataFrame(onehot, columns = encoder.columns_)

# Print the one-hot encoded transaction dataset
print(onehot)

       (no genres listed)  Action  Adventure  Animation  Children  Comedy  \
0                   False   False       True       True      True    True   
1                   False   False       True      False      True   False   
2                   False   False      False      False     False    True   
3                   False   False      False      False     False    True   
4                   False   False      False      False     False    True   
...                   ...     ...        ...        ...       ...     ...   
27273               False   False      False      False     False    True   
27274               False   False      False      False     False    True   
27275               False   False       True      False     False   False   
27276                True   False      False      False     False   False   
27277               False   False       True      False     False   False   

       Crime  Documentary  Drama  Fantasy  Film-Noir  Horror   IMAX  Musica

In [30]:
supportCD = np.logical_and(onehot['Crime'], onehot['Documentary']).mean()
supportFW = np.logical_and(onehot['Fantasy'], onehot['War']).mean()
supportCC = np.logical_and(onehot['Comedy'], onehot['Children']).mean()

# Print support values
print("Hunger Games and Harry Potter: %.2f" % supportCD)
print("Hunger Games and Twilight: %.2f" % supportFW)
print("Hunger Games and Twilight: %.2f" % supportCC)

Hunger Games and Harry Potter: 0.00
Hunger Games and Twilight: 0.00
Hunger Games and Twilight: 0.02


In [31]:
# Compute support for Potter and Twilight
supportCC = np.logical_and(onehot['Comedy'], onehot['Children']).mean()

# Compute support for Potter
supportCh = onehot['Children'].mean()

# Compute support for Twilight
supportCo = onehot['Comedy'].mean()

# Compute confidence for both rules
confidenceCh_Co = supportCC / supportCh
confidenceCo_Ch = supportCC / supportCo

# Print results
print('{0:.2f}, {1:.2f}'.format(confidenceCh_Co, confidenceCo_Ch))

0.46, 0.06


In [34]:
# Compute support for Comedy and Children
supportCC = np.logical_and(onehot['Comedy'], onehot['Children']).mean()

# Compute support for Children
supportCh = onehot['Children'].mean()

# Compute support for Comedy
supportCo = onehot['Comedy'].mean()

# Compute lift
lift = supportCC / (supportCh * supportCo)

# Print lift
print("Lift: %.2f" % lift)

Lift: 1.50


# Lift is more than 1 hence relationship between movies of Children genre and Comedy is not random

In [35]:
# Compute support for Comedy and Children
supportCC = np.logical_and(onehot['Comedy'], onehot['Children']).mean()

# Compute support for Children
supportCh = onehot['Children'].mean()


# Compute support for NOT Hunger
supportnCo = 1.0 - onehot['Comedy'].mean()

# Compute support for Potter and NOT Hunger
supportChnCo = supportCh - supportCC

# Compute and print conviction for Potter -> Hunger
conviction = supportCh * supportnCo / supportChnCo
print("Conviction: %.2f" % conviction)

Conviction: 1.29


# Conviction

In [36]:
def conviction(antecedent, consequent):
	# Compute support for antecedent AND consequent
	supportAC = np.logical_and(antecedent, consequent).mean()

	# Compute support for antecedent
	supportA = antecedent.mean()

	# Compute support for NOT consequent
	supportnC = 1.0 - consequent.mean()

	# Compute support for antecedent and NOT consequent
	supportAnC = supportA - supportAC

    # Return conviction
	return supportA * supportnC / supportAnC

In [37]:
onehot.columns

Index(['(no genres listed)', 'Action', 'Adventure', 'Animation', 'Children',
       'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
       'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller',
       'War', 'Western'],
      dtype='object')

In [38]:
# Compute conviction for Documentary -> Sci-Fi and Sci-Fi -> Documentary
convictionDS = conviction(onehot['Documentary'], onehot['Sci-Fi'])
convictionSD = conviction(onehot['Sci-Fi'], onehot['Documentary'])


# Print results
print('Documentary -> Sci-Fi: ', convictionDS)
print('Sci-Fi -> Documentary: ', convictionSD)

Documentary -> Sci-Fi:  0.9402881770770929
Sci-Fi -> Documentary:  0.9151899050864022


In [39]:
# Compute conviction for Children -> Animation and Animation -> Children
convictionCA = conviction(onehot['Children'], onehot['Animation'])
convictionAC = conviction(onehot['Animation'], onehot['Children'])


# Print results
print('Children -> Animation: ', convictionCA)
print('Animation -> Children: ', convictionAC)

Children -> Animation:  1.6384414758039654
Animation -> Children:  1.7668174996640087


# Notice that the value of conviction was less than 1, suggesting that the rule `if Potter then Hunger'' is not supported.

# Zhang metric

In [41]:
# Zhang metric Comdefy --> Children (range +1 -1)

# Compute support for Children
supportCh = onehot['Children'].mean()


# Compute support for NOT Hunger
supportCo = onehot['Comedy'].mean()

supportCC = np.logical_and(onehot['Comedy'], onehot['Children']).mean()

# Complete the expressions for the numerator and denominator
numerator = supportCC - supportCh*supportCo
denominator = max(supportCC*(1-supportCh), supportCh*(1-supportCo))

# Compute and print Zhang's metric
zhang = numerator / denominator
print(zhang)

0.22340325333081665


In [42]:
# Define a function to compute Zhang's metric
def zhang(antecedent, consequent):
	# Compute the support of each book
	supportA = antecedent.mean()
	supportC = consequent.mean()

	# Compute the support of both books
	supportAC = np.logical_and(antecedent, consequent).mean()

	# Complete the expressions for the numerator and denominator
	numerator = supportAC - supportA*supportC
	denominator = max(supportAC*(1-supportC), supportA*(1-supportC))

	# Return Zhang's metric
	return numerator / denominator

In [64]:
itemsets = permutations(['Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'IMAX',
       'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War',
       'Western'], 2) 

In [69]:
# Define an empty list for Zhang's metric
zhangs_metric = []
antecedent_l = []
consequent_l = []

itemsets = permutations(['Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'IMAX',
       'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War',
       'Western'], 2) 

# Loop over lists in itemsets
for itemset in list(itemsets):
    print(itemset)
    # Extract the antecedent and consequent columns
    antecedent = onehot[itemset[0]]
    consequent = onehot[itemset[1]]

    # Complete Zhang's metric and append it to the list
    zhangs_metric.append(zhang(antecedent, consequent))
    antecedent_l.append(itemset[0])
    consequent_l.append(itemset[1])

# Print results
rules = pd.DataFrame({'antecedent' : antecedent_l, 'consequent' : consequent_l, 'zhangs_metric' : zhangs_metric})
print(rules)

('Action', 'Adventure')
('Action', 'Animation')
('Action', 'Children')
('Action', 'Comedy')
('Action', 'Crime')
('Action', 'Documentary')
('Action', 'Drama')
('Action', 'Fantasy')
('Action', 'Film-Noir')
('Action', 'Horror')
('Action', 'IMAX')
('Action', 'Musical')
('Action', 'Mystery')
('Action', 'Romance')
('Action', 'Sci-Fi')
('Action', 'Thriller')
('Action', 'War')
('Action', 'Western')
('Adventure', 'Action')
('Adventure', 'Animation')
('Adventure', 'Children')
('Adventure', 'Comedy')
('Adventure', 'Crime')
('Adventure', 'Documentary')
('Adventure', 'Drama')
('Adventure', 'Fantasy')
('Adventure', 'Film-Noir')
('Adventure', 'Horror')
('Adventure', 'IMAX')
('Adventure', 'Musical')
('Adventure', 'Mystery')
('Adventure', 'Romance')
('Adventure', 'Sci-Fi')
('Adventure', 'Thriller')
('Adventure', 'War')
('Adventure', 'Western')
('Animation', 'Action')
('Animation', 'Adventure')
('Animation', 'Children')
('Animation', 'Comedy')
('Animation', 'Crime')
('Animation', 'Documentary')
('Animat

In [76]:
rules.sort_values('zhangs_metric')

Unnamed: 0,antecedent,consequent,zhangs_metric
114,Documentary,Drama,-0.861795
42,Animation,Drama,-0.704134
187,Horror,Drama,-0.620260
60,Children,Drama,-0.579532
205,IMAX,Drama,-0.568124
...,...,...,...
169,Film-Noir,Drama,0.436432
249,Mystery,Thriller,0.439206
198,IMAX,Action,0.443494
167,Film-Noir,Crime,0.490567


# Zhang metric: Closer to -1 -- more  dissociated, while closer to +1 more associated

# Aggregation

In [77]:
import pandas as pd

In [79]:
df = pd.read_csv('datasets/online_retail.csv')

In [80]:
df

Unnamed: 0,InvoiceNo,StockCode,Description
0,562583,35637A,IVORY STRING CURTAIN WITH POLE
1,562583,35638A,PINK AND BLACK STRING CURTAIN
2,562583,84927F,PSYCHEDELIC TILE HOOK
3,562583,22425,ENAMEL COLANDER CREAM
4,562583,16008,SMALL FOLDING SCISSOR(POINTED EDGE)
...,...,...,...
227755,C581229,23158,SET OF 5 LUCKY CAT MAGNETS
227756,C581229,22712,CARD DOLLY GIRL
227757,C581229,22027,TEA PARTY BIRTHDAY CARD
227758,C581229,21508,VINTAGE KID DOLLY CARD


In [83]:
onehot = pd.get_dummies(df.Description)

In [84]:
# Select the column headers for sign items
sign_headers = [i for i in onehot.columns if i.lower().find('sign')>=0]

# Select columns of sign items
sign_columns = onehot[sign_headers]

# Perform aggregation of sign items into sign category
signs = sign_columns.sum(axis = 1) >= 1.0

# Print support for signs
print('Share of Signs: %.2f' % signs.mean())

Share of Signs: 0.08


In [86]:
def aggregate(item):
	# Select the column headers for sign items
	item_headers = [i for i in onehot.columns if i.lower().find(item)>=0]

	# Select columns of sign items
	item_columns = onehot[item_headers]

	# Return category of aggregated items
	return item_columns.sum(axis = 1) >= 1.0

# Aggregate items for the bags, boxes, and candles categories  
bags = aggregate('bag')
boxes = aggregate('boxes')
candles = aggregate('candles')

In [88]:
print('Share of bags: %.2f' % bags.mean())
print('Share of boxes: %.2f' % boxes.mean())
print('Share of candles: %.2f' % candles.mean())

Share of bags: 0.09
Share of boxes: 0.00
Share of candles: 0.01


In [90]:
# Import apriori from mlxtend
from mlxtend.frequent_patterns import apriori

# Compute frequent itemsets using the Apriori algorithm
frequent_itemsets = apriori(onehot, 
                            min_support = 0.002, 
                            max_len = 3, 
                            use_colnames = True)

# Print the number of freqeuent itemsets
print(frequent_itemsets)

     support                               itemsets
0   0.002129      (60 CAKE CASES VINTAGE CHRISTMAS)
1   0.002107           (ALARM CLOCK BAKELIKE GREEN)
2   0.002661        (ASSORTED COLOUR BIRD ORNAMENT)
3   0.002099        (BAKING SET 9 PIECE RETROSPOT )
4   0.002397           (CHOCOLATE HOT WATER BOTTLE)
5   0.002178   (GARDENERS KNEELING PAD CUP OF TEA )
6   0.002612    (GARDENERS KNEELING PAD KEEP CALM )
7   0.002415               (HAND WARMER OWL DESIGN)
8   0.002094        (HAND WARMER SCOTTY DOG DESIGN)
9   0.003451           (HOT WATER BOTTLE KEEP CALM)
10  0.002331    (HOT WATER BOTTLE TEA AND SYMPATHY)
11  0.002806            (JUMBO BAG 50'S CHRISTMAS )
12  0.002121                   (JUMBO BAG ALPHABET)
13  0.003556              (JUMBO BAG RED RETROSPOT)
14  0.002972             (JUMBO BAG VINTAGE DOILY )
15  0.002112               (JUMBO BAG VINTAGE LEAF)
16  0.002112                (LOVE HOT WATER BOTTLE)
17  0.002503              (LUNCH BAG  BLACK SKULL.)
18  0.002064

In [91]:
print(len(frequent_itemsets))

41


In [92]:
# Import apriori from mlxtend
from mlxtend.frequent_patterns import apriori

# Compute frequent itemsets using a support of 0.003 and length of 3
frequent_itemsets_1 = apriori(onehot, min_support = 0.003, 
                            max_len=3, use_colnames = True)

# Compute frequent itemsets using a support of 0.001 and length of 3
frequent_itemsets_2 = apriori(onehot, min_support = 0.001, 
                            max_len=3, use_colnames = True)

# Print the number of freqeuent itemsets
print(len(frequent_itemsets_1), len(frequent_itemsets_2))

5 199
