# Recommendation Systems

In [1]:
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori, association_rules

import warnings

# Ignore all warnings
warnings.filterwarnings("ignore")

## 1. Content based recommender

In [2]:
users = pd.read_json('renttherunway_final_data.json', lines=True)
#data = data.iloc[:10000, :]
users = users.dropna()

In [3]:
# 1. Count the frequency of each user_id
user_counts = users['user_id'].value_counts()

In [4]:
# 2. Filter the dataset to only include user_id's with count >= 5
frequent_users = user_counts[user_counts >= 50].index
users = users[users['user_id'].isin(frequent_users)]
users = users.reset_index(drop=True)
# Display the filtered dataset
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2713 entries, 0 to 2712
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   fit             2713 non-null   object 
 1   user_id         2713 non-null   int64  
 2   bust size       2713 non-null   object 
 3   item_id         2713 non-null   int64  
 4   weight          2713 non-null   object 
 5   rating          2713 non-null   float64
 6   rented for      2713 non-null   object 
 7   review_text     2713 non-null   object 
 8   body type       2713 non-null   object 
 9   review_summary  2713 non-null   object 
 10  category        2713 non-null   object 
 11  height          2713 non-null   object 
 12  size            2713 non-null   int64  
 13  age             2713 non-null   float64
 14  review_date     2713 non-null   object 
dtypes: float64(2), int64(3), object(10)
memory usage: 318.1+ KB


In [5]:
users = users.drop_duplicates(subset=['item_id'])

In [6]:
print(users['item_id'].duplicated().any())

False


In [7]:
users = users.reset_index(drop=True)

In [8]:
#Import TfIdfVectorizer from the scikit-learn librarydata.head()
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all english stopwords
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
users['review_text'] = users['review_text'].fillna('')

#Construct the required TF-IDF matrix by applying the fit_transform method on the overview feature
tfidf_matrix = tfidf.fit_transform(users['review_text'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

(2014, 3710)

In [9]:
# Import linear_kernel to compute the dot product
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [10]:
#Construct a reverse mapping of indices and item titles, and drop duplicate titles, if any
indices = pd.Series(users.index, index=users['item_id']).drop_duplicates()
indices

item_id
1923283       0
259136        1
985087        2
1310167       3
2958376       4
           ... 
608649     2009
607186     2010
1363651    2011
451969     2012
2747774    2013
Length: 2014, dtype: int64

In [11]:
# Function that takes in item title as input and gives recommendations 
def content_recommender(item_id, cosine_sim=cosine_sim, df=users, indices=indices):
    # Obtain the index of the item that matches the title
    idx = indices[item_id]

    # Get the pairwsie similarity scores of all movies with that movie
    # And convert it into a list of tuples as described above
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the items clothes based on the cosine similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies. Ignore the first movie.
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    clothes_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return users['item_id'].iloc[clothes_indices]

In [12]:
print(cosine_sim.shape)

(2014, 2014)


In [13]:
content_recommender(2747774)

1983     222467
1556    1861964
1669    2083596
484     1300112
1008     683251
1998    1967750
683     2389457
168     2609834
491     1953967
814     1129763
Name: item_id, dtype: int64

### The framework

In [14]:
users.head(5)

Unnamed: 0,fit,user_id,bust size,item_id,weight,rating,rented for,review_text,body type,review_summary,category,height,size,age,review_date
0,fit,742630,32a,1923283,110lbs,10.0,date,This dress is extremely flattering and comfort...,straight & narrow,STYLIST REVIEW,dress,"5' 6""",4,24.0,"July 13, 2017"
1,fit,269712,34d,259136,125lbs,8.0,work,I was wavering between true to size and large....,full bust,Great work dress,sheath,"5' 2""",8,32.0,"March 1, 2016"
2,fit,32925,34b,985087,122lbs,10.0,other,This is one of my favorite gowns on the site. ...,straight & narrow,Stylist Review!,gown,"5' 7""",4,26.0,"April 21, 2015"
3,fit,45337,34b,1310167,130lbs,10.0,formal affair,Can't go wrong with a Badgley Mischka gown! I ...,pear,STYLIST REVIEW!,gown,"5' 7""",8,26.0,"November 12, 2015"
4,fit,491589,38dd,2958376,210lbs,8.0,party,"Super cute. Loved this Romper, even though we ...",full bust,Stylist Review,romper,"5' 9""",24,24.0,"May 14, 2016"


In [15]:
ratings = users[['user_id','item_id','rating']]
ratings.head()

Unnamed: 0,user_id,item_id,rating
0,742630,1923283,10.0
1,269712,259136,8.0
2,32925,985087,10.0
3,45337,1310167,10.0
4,491589,2958376,8.0


In [16]:
#Import the train_test_split function
from sklearn.model_selection import train_test_split

#Assign X as the original ratings dataframe and y as the user_id column of ratings.
X = ratings.copy()
y = ratings['user_id']

In [17]:
#Split into training and test datasets, stratified along user_id
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, stratify=y, random_state=42)

In [18]:
#Import the mean_squared_error function
from sklearn.metrics import mean_squared_error

#Function that computes the root mean squared error (or RMSE)
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [19]:
#Define the baseline model to always return 5.
def baseline(user_id, item_id):
    return 5.0

In [20]:
#Function to compute the RMSE score obtained on the testing set by a model
def score(cf_model):
    
    #Construct a list of user-item tuples from the testing dataset
    id_pairs = zip(X_test['user_id'], X_test['item_id'].values)
    
    #Predict the rating for every user-item tuple
    y_pred = np.array([cf_model(user, item) for (user, item) in id_pairs])
    
    #Extract the actual ratings given by the users in the test data
    y_true = np.array(X_test['rating'].values)
    
    #Return the final RMSE score
    return rmse(y_true, y_pred)

In [21]:
score(baseline)

4.596409917205397

Our baseline score is  4.5. For the models build in the subsequent sections we are going to aim to get a lower score.

## 2. Collaborative Filtering

## 2.1 User filtering

#### Ratings Matrix

In [22]:
#Build the ratings matrix using pivot_table function
r_matrix = X_train.pivot_table(values='rating', index='user_id', columns='item_id')

#### Mean

In [23]:
#User Based Collaborative Filter using Mean Ratings
def cf_user_mean(user_id, item_id):
    
    #Check if movie_id exists in r_matrix
    if item_id in r_matrix:
        #Compute the mean of all the ratings given to the movie
        mean_rating = r_matrix[item_id].mean()
    
    else:
        #Default to a rating of 5.0 in the absence of any information
        mean_rating = 5.0
    
    return mean_rating

In [24]:
#Compute RMSE for the Mean model
score(cf_user_mean)

4.596409917205397

The score obtained on the user based collaborative filtering is lower, this is good and means that is better than the baseline model.

#### Weighted Mean

In [25]:
#Create a dummy ratings matrix with all null values imputed to 0
r_matrix = r_matrix.copy().fillna(0)

In [26]:
# Import cosine_score 
from sklearn.metrics.pairwise import cosine_similarity

#Compute the cosine similarity matrix using the dummy ratings matrix
cosine_sim = cosine_similarity(r_matrix, r_matrix)

In [27]:
#Convert into pandas dataframe 
cosine_sim = pd.DataFrame(cosine_sim, index=r_matrix.index, columns=r_matrix.index)

cosine_sim.head(10)

user_id,32925,45337,45387,77034,88342,113975,215971,252311,257847,269712,...,570030,576202,630850,667260,676222,691468,733207,742630,833675,958548
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
32925,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
45337,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
45387,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
77034,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
88342,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
113975,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
215971,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
252311,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
257847,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
269712,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
#User Based Collaborative Filter using Weighted Mean Ratings
def cf_user_wmean(user_id, item_id):
    
    #Check if item_id exists in r_matrix
    if item_id in r_matrix:
        
        #Get the similarity scores for the user in question with every other user
        sim_scores = cosine_sim[user_id]
        
        #Get the user ratings for the item in question
        m_ratings = r_matrix[item_id]
        
        #Extract the indices containing NaN in the m_ratings series
        idx = m_ratings[m_ratings.isnull()].index
        
        #Drop the NaN values from the m_ratings Series
        m_ratings = m_ratings.dropna()
        
        #Drop the corresponding cosine scores from the sim_scores series
        sim_scores = sim_scores.drop(idx)
        
        #Compute the final weighted mean
        wmean_rating = np.dot(sim_scores, m_ratings)/ sim_scores.sum()
    
    else:
        #Default to a rating of 3.0 in the absence of any information
        wmean_rating = 5.0
    
    return wmean_rating

In [29]:
score(cf_user_wmean)

4.596409917205397

#### Demographics

In [30]:
#Set the index of the users dataframe to the user_id
users = users[['user_id','fit','body type']]

users.head()

Unnamed: 0,user_id,fit,body type
0,742630,fit,straight & narrow
1,269712,fit,full bust
2,32925,fit,straight & narrow
3,45337,fit,pear
4,491589,fit,full bust


In [31]:
X_train.head()

Unnamed: 0,user_id,item_id,rating
1030,32925,2605640,10.0
614,833675,570147,10.0
1048,77034,1494967,10.0
1975,733207,2216229,8.0
212,472391,2953707,10.0


In [32]:
merged_df = pd.merge(X_train, users)
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180722 entries, 0 to 180721
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   user_id    180722 non-null  int64  
 1   item_id    180722 non-null  int64  
 2   rating     180722 non-null  float64
 3   fit        180722 non-null  object 
 4   body type  180722 non-null  object 
dtypes: float64(1), int64(2), object(2)
memory usage: 6.9+ MB


In [33]:
#Compute the mean rating of every movie by gender
bt_mean = merged_df[['item_id', 'body type', 'rating']].groupby(['item_id', 'body type'])['rating'].mean()
bt_mean.head()

item_id  body type
125424   full bust    10.0
128730   athletic      8.0
145906   athletic     10.0
152998   athletic      8.0
153475   pear         10.0
Name: rating, dtype: float64

In [34]:
users = users.set_index('user_id')
users.head()

Unnamed: 0_level_0,fit,body type
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
742630,fit,straight & narrow
269712,fit,full bust
32925,fit,straight & narrow
45337,fit,pear
491589,fit,full bust


In [35]:
r_matrix.head()

item_id,125424,128730,145906,152998,153475,162634,164051,168012,172027,187164,...,2953707,2954118,2955092,2955734,2956453,2956720,2958376,2959486,2960025,2960969
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
32925,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0
45337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0
45387,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
77034,0.0,0.0,0.0,8.0,0.0,10.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0
88342,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
#Gender Based Collaborative Filter using Mean Ratings
def cf_bt(user_id, item_id):
    
    #Check if movie_id exists in r_matrix (or training set)
    if item_id in r_matrix:
        #Identify the gender of the user
        bt = str(users.loc[user_id]['body type'])
        if bt in bt_mean[item_id]:
            bt_rating = bt_mean[item_id][bt]
        #bt = users.loc[user_id]['body type']
        
        #Check if the gender has rated the movie
        #if bt in bt_mean[item_id]:
            
            #Compute the mean rating given by that gender to the movie
            #bt_rating = bt_mean[item_id][bt]
        
        else:
            bt_rating = 5.0
    
    else:
        #Default to a rating of 3.0 in the absence of any information
        bt_rating = 5.0
    
    return bt_rating

In [37]:
score(cf_bt)

4.596409917205397

## 2.2 Item Collaborative  System

In [38]:
items = pd.read_json('renttherunway_final_data.json', lines=True)
#data = data.iloc[:10000, :]
items = items.dropna()

In [39]:
# 1. Count the frequency of each user_id
items_counts = items['item_id'].value_counts()

In [40]:
# 2. Filter the dataset to only include user_id's with count >= 5
frequent_items = items_counts[items_counts >= 10].index
items = items[items['item_id'].isin(frequent_items)]
items = items.reset_index(drop=True)
# Display the filtered dataset
items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 135425 entries, 0 to 135424
Data columns (total 15 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   fit             135425 non-null  object 
 1   user_id         135425 non-null  int64  
 2   bust size       135425 non-null  object 
 3   item_id         135425 non-null  int64  
 4   weight          135425 non-null  object 
 5   rating          135425 non-null  float64
 6   rented for      135425 non-null  object 
 7   review_text     135425 non-null  object 
 8   body type       135425 non-null  object 
 9   review_summary  135425 non-null  object 
 10  category        135425 non-null  object 
 11  height          135425 non-null  object 
 12  size            135425 non-null  int64  
 13  age             135425 non-null  float64
 14  review_date     135425 non-null  object 
dtypes: float64(2), int64(3), object(10)
memory usage: 15.5+ MB


In [41]:
ratings = items[['user_id','item_id','rating']]
ratings.head()

Unnamed: 0,user_id,item_id,rating
0,420272,2260466,10.0
1,273551,153475,10.0
2,909926,126335,8.0
3,151944,616682,10.0
4,734848,364092,8.0


In [42]:
#Import the train_test_split function
from sklearn.model_selection import train_test_split

#Assign X as the original ratings dataframe and y as the user_id column of ratings.
X = ratings.copy()
y = ratings['item_id']

In [43]:
#Split into training and test datasets, stratified along user_id
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, stratify=y, random_state=42)

In [44]:
#Import the mean_squared_error function
from sklearn.metrics import mean_squared_error

#Function that computes the root mean squared error (or RMSE)
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [45]:
#Import the mean_squared_error function
from sklearn.metrics import mean_squared_error

#Function that computes the root mean squared error (or RMSE)
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [46]:
#Define the baseline model to always return 5.
def baseline(user_id, item_id):
    return 5.0

In [47]:
#Function to compute the RMSE score obtained on the testing set by a model
def score(cf_model):
    
    #Construct a list of user-item tuples from the testing dataset
    id_pairs = zip(X_test['user_id'], X_test['item_id'].values)
    
    #Predict the rating for every user-item tuple
    y_pred = np.array([cf_model(user, item) for (user, item) in id_pairs])
    
    #Extract the actual ratings given by the users in the test data
    y_true = np.array(X_test['rating'].values)
    
    #Return the final RMSE score
    return rmse(y_true, y_pred)

In [48]:
score(baseline)

4.343170526721022

In [49]:
#Build the ratings matrix using pivot_table function
r_matrix = X_train.pivot_table(values='rating', index='item_id', columns='user_id')
r_matrix

user_id,9,25,47,82,92,97,155,207,214,224,...,999843,999865,999892,999910,999913,999922,999952,999971,999987,999997
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
123373,,,,,,,,,,,...,,,,,,,8.0,,,
123793,,,,,,,,,,,...,,,,,,,,,,
124204,,,,,,,,,,,...,,,,,,,,,,
124553,,,,,,,,,,,...,,,,,,,,,,
125424,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2955734,,,,,,,,,,,...,,,,,,,,,,
2957481,,,,,,,,,,,...,,,,,,,,,,
2958376,,,,,,,,,,,...,,,,,,,,,,
2960969,,,,,,,,,,,...,,,,,,,,,,


In [50]:
#User Based Collaborative Filter using Mean Ratings
def cf_item_mean(user_id, item_id):
    
    #Check if user_id exists in r_matrix
    if item_id in r_matrix:
        #Compute the mean of all the ratings given to the movie
        mean_rating = r_matrix[item_id].mean()
    
    else:
        #Default to a rating of 5.0 in the absence of any information
        mean_rating = 5.0
    
    return mean_rating

In [51]:
score(cf_item_mean)

4.290639555449915

# 3. Market basket analysis

In [52]:
data = pd.read_csv('bread basket.csv')

In [53]:
data.head()

Unnamed: 0,Transaction,Item,date_time,period_day,weekday_weekend
0,1,Bread,30-10-2016 09:58,morning,weekend
1,2,Scandinavian,30-10-2016 10:05,morning,weekend
2,2,Scandinavian,30-10-2016 10:05,morning,weekend
3,3,Hot chocolate,30-10-2016 10:07,morning,weekend
4,3,Jam,30-10-2016 10:07,morning,weekend


In [54]:
data = data.drop(['date_time','period_day','weekday_weekend'], axis=1)

In [55]:
data

Unnamed: 0,Transaction,Item
0,1,Bread
1,2,Scandinavian
2,2,Scandinavian
3,3,Hot chocolate
4,3,Jam
...,...,...
20502,9682,Coffee
20503,9682,Tea
20504,9683,Coffee
20505,9683,Pastry


In [56]:
data = data.groupby('Transaction')['Item'].apply(list)
data

Transaction
1                                                 [Bread]
2                            [Scandinavian, Scandinavian]
3                           [Hot chocolate, Jam, Cookies]
4                                                [Muffin]
5                                 [Coffee, Pastry, Bread]
                              ...                        
9680                                              [Bread]
9681    [Truffles, Tea, Spanish Brunch, Christmas common]
9682                  [Muffin, Tacos/Fajita, Coffee, Tea]
9683                                     [Coffee, Pastry]
9684                                          [Smoothies]
Name: Item, Length: 9465, dtype: object

In [57]:
data.info()

<class 'pandas.core.series.Series'>
Index: 9465 entries, 1 to 9684
Series name: Item
Non-Null Count  Dtype 
--------------  ----- 
9465 non-null   object
dtypes: object(1)
memory usage: 147.9+ KB


In [83]:
transactions = data.tolist()

In [84]:
items = list(set(item for transaction in transactions for item in transaction))

In [60]:
encoded_data = pd.DataFrame(
    [[item in transaction for item in items] for transaction in transactions],
    columns=items
)


encoded_data.head(50)

Unnamed: 0,Tiffin,Hot chocolate,Vegan mincepie,Art Tray,Coffee granules,Cherry me Dried fruit,Adjustment,Tartine,Spanish Brunch,Frittata,...,Cookies,Bread,Postcard,Kids biscuit,Caramel bites,Dulce de Leche,Juice,Chicken sand,Tacos/Fajita,Pastry
0,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,True,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,True
5,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
6,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
7,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,True
8,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
9,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


## 3.1 Apriori Algorithm

In [61]:
import time

start_time = time.time()

frequent_itemsets = apriori(encoded_data, min_support=0.01, use_colnames=True)
frequent_itemsets

# End timing
end_time = time.time()

# Calculate the time taken
calculation_time = end_time - start_time

print("Frequent Itemsets calculated in {:.2f} seconds.".format(calculation_time))

Frequent Itemsets calculated in 0.03 seconds.


In [80]:
rules = association_rules(frequent_itemsets, metric = "support", min_threshold = 0.01, num_itemsets=0)
rules.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74 entries, 0 to 73
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   antecedents         74 non-null     object 
 1   consequents         74 non-null     object 
 2   antecedent support  74 non-null     float64
 3   consequent support  74 non-null     float64
 4   support             74 non-null     float64
 5   confidence          74 non-null     float64
 6   lift                74 non-null     float64
 7   representativity    74 non-null     float64
 8   leverage            74 non-null     float64
 9   conviction          74 non-null     float64
 10  zhangs_metric       74 non-null     float64
 11  jaccard             74 non-null     float64
 12  certainty           74 non-null     float64
 13  kulczynski          74 non-null     float64
dtypes: float64(12), object(2)
memory usage: 8.2+ KB


In [81]:
# Display the results
from IPython.display import display

print("Frequent Itemsets:")
display(frequent_itemsets.head())

Frequent Itemsets:


Unnamed: 0,support,itemsets
0,0.015425,(Tiffin)
1,0.05832,(Hot chocolate)
2,0.018172,(Spanish Brunch)
3,0.103856,(Cake)
4,0.034443,(Soup)


In [82]:
print("\nAssociation Rules:")
rules


Association Rules:


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(Hot chocolate),(Cake),0.058320,0.103856,0.011410,0.195652,1.883874,1.0,0.005354,1.114125,0.498236,0.075683,0.102434,0.152760
1,(Cake),(Hot chocolate),0.103856,0.058320,0.011410,0.109868,1.883874,1.0,0.005354,1.057910,0.523553,0.075683,0.054740,0.152760
2,(Hot chocolate),(Coffee),0.058320,0.478394,0.029583,0.507246,1.060311,1.0,0.001683,1.058553,0.060403,0.058333,0.055314,0.284542
3,(Coffee),(Hot chocolate),0.478394,0.058320,0.029583,0.061837,1.060311,1.0,0.001683,1.003749,0.109048,0.058333,0.003735,0.284542
4,(Bread),(Hot chocolate),0.327205,0.058320,0.013418,0.041007,0.703144,1.0,-0.005665,0.981947,-0.385564,0.036059,-0.018385,0.135540
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69,"(Bread, Coffee)",(Pastry),0.090016,0.086107,0.011199,0.124413,1.444872,1.0,0.003448,1.043749,0.338354,0.067905,0.041916,0.127237
70,"(Pastry, Coffee)",(Bread),0.047544,0.327205,0.011199,0.235556,0.719901,1.0,-0.004357,0.880109,-0.290026,0.030805,-0.136223,0.134891
71,(Bread),"(Pastry, Coffee)",0.327205,0.047544,0.011199,0.034227,0.719901,1.0,-0.004357,0.986211,-0.366409,0.030805,-0.013982,0.134891
72,(Pastry),"(Bread, Coffee)",0.086107,0.090016,0.011199,0.130061,1.444872,1.0,0.003448,1.046033,0.336907,0.067905,0.044007,0.127237


In [79]:
print(encoded_data['Bread'].sum())
print(encoded_data['Coffee'].sum())

3097
4528


## 3.2 FP Growth Algorithm

In [66]:
from mlxtend.frequent_patterns import fpgrowth
start_time = time.time()


# Evaluate the frequent item rules for fpgrowth function
frequent_itemsets_fp = fpgrowth(encoded_data, min_support = 0.01, use_colnames = True)

print(frequent_itemsets_fp)

# End timing
end_time = time.time()

# Calculate the time taken
calculation_time = end_time - start_time

print("Frequent Itemsets calculated in {:.2f} seconds.".format(calculation_time))

     support                  itemsets
0   0.327205                   (Bread)
1   0.029054            (Scandinavian)
2   0.058320           (Hot chocolate)
3   0.054411                 (Cookies)
4   0.015003                     (Jam)
..       ...                       ...
56  0.019651         (Coffee, Brownie)
57  0.010777          (Bread, Brownie)
58  0.023666           (Toast, Coffee)
59  0.018067           (Coffee, Scone)
60  0.010882  (Coffee, Spanish Brunch)

[61 rows x 2 columns]
Frequent Itemsets calculated in 0.75 seconds.


In [67]:
from mlxtend.frequent_patterns import association_rules

# Display the reles due to fp-growth algorithm
rules_fp = association_rules(frequent_itemsets_fp, metric = "confidence", min_threshold = 0.3, num_itemsets=0)

rules_fp

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(Hot chocolate),(Coffee),0.05832,0.478394,0.029583,0.507246,1.060311,1.0,0.001683,1.058553,0.060403,0.058333,0.055314,0.284542
1,(Cookies),(Coffee),0.054411,0.478394,0.028209,0.518447,1.083723,1.0,0.002179,1.083174,0.0817,0.055905,0.076787,0.288707
2,(Muffin),(Coffee),0.038457,0.478394,0.018806,0.489011,1.022193,1.0,0.000408,1.020777,0.022579,0.03776,0.020354,0.264161
3,(Pastry),(Coffee),0.086107,0.478394,0.047544,0.552147,1.154168,1.0,0.006351,1.164682,0.146161,0.091968,0.141396,0.325764
4,(Pastry),(Bread),0.086107,0.327205,0.02916,0.33865,1.034977,1.0,0.000985,1.017305,0.03698,0.075908,0.017011,0.213884
5,"(Bread, Pastry)",(Coffee),0.02916,0.478394,0.011199,0.384058,0.802807,1.0,-0.002751,0.846843,-0.20192,0.022563,-0.180857,0.203734
6,(Medialuna),(Coffee),0.061807,0.478394,0.035182,0.569231,1.189878,1.0,0.005614,1.210871,0.170091,0.069665,0.174148,0.321387
7,(Tea),(Coffee),0.142631,0.478394,0.049868,0.34963,0.73084,1.0,-0.018366,0.802014,-0.300482,0.08731,-0.246862,0.226935
8,(Juice),(Coffee),0.038563,0.478394,0.020602,0.534247,1.11675,1.0,0.002154,1.119919,0.108738,0.041507,0.107078,0.288656
9,(Soup),(Coffee),0.034443,0.478394,0.015848,0.460123,0.961807,1.0,-0.000629,0.966156,-0.039502,0.031888,-0.035029,0.246625
