# Recommendation Systems

In [1]:
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori, association_rules

import warnings

# Ignore all warnings
warnings.filterwarnings("ignore")

## 1. Content based recommender

In [2]:
users = pd.read_json('renttherunway_final_data.json', lines=True)
#data = data.iloc[:10000, :]
users = users.dropna()

In [3]:
# 1. Count the frequency of each user_id
user_counts = users['user_id'].value_counts()

In [4]:
# 2. Filter the dataset to only include user_id's with count >= 5
frequent_users = user_counts[user_counts >= 50].index
users = users[users['user_id'].isin(frequent_users)]
users = users.reset_index(drop=True)
# Display the filtered dataset
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2713 entries, 0 to 2712
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   fit             2713 non-null   object 
 1   user_id         2713 non-null   int64  
 2   bust size       2713 non-null   object 
 3   item_id         2713 non-null   int64  
 4   weight          2713 non-null   object 
 5   rating          2713 non-null   float64
 6   rented for      2713 non-null   object 
 7   review_text     2713 non-null   object 
 8   body type       2713 non-null   object 
 9   review_summary  2713 non-null   object 
 10  category        2713 non-null   object 
 11  height          2713 non-null   object 
 12  size            2713 non-null   int64  
 13  age             2713 non-null   float64
 14  review_date     2713 non-null   object 
dtypes: float64(2), int64(3), object(10)
memory usage: 318.1+ KB


In [5]:
#Import TfIdfVectorizer from the scikit-learn librarydata.head()
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all english stopwords
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
users['review_text'] = users['review_text'].fillna('')

#Construct the required TF-IDF matrix by applying the fit_transform method on the overview feature
tfidf_matrix = tfidf.fit_transform(users['review_text'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

(2713, 4255)

In [6]:
# Import linear_kernel to compute the dot product
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [7]:
#Construct a reverse mapping of indices and item titles, and drop duplicate titles, if any
indices = pd.Series(users.index, index=users['item_id']).drop_duplicates()

In [8]:
indices.head()

item_id
1923283    0
259136     1
985087     2
1310167    3
2958376    4
dtype: int64

In [9]:
# Function that takes in item title as input and gives recommendations 
def content_recommender(item_id, cosine_sim=cosine_sim):
    # Obtain the index of the item that matches the title
    idx = indices[item_id]

    # Get the pairwsie similarity scores of all movies with that movie
    # And convert it into a list of tuples as described above
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the cosine similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies. Ignore the first movie.
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    clothes_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return users['item_id'].iloc[clothes_indices]

In [10]:
#Get recommendations for item: 2260466
content_recommender(2260466)

2415    2150854
2313    2730831
1180     295072
436      597613
873     2649640
2068    2036606
2605    1448326
71      1820237
2696     726174
1462    1933787
Name: item_id, dtype: int64

### The framework

In [11]:
users.head(5)

Unnamed: 0,fit,user_id,bust size,item_id,weight,rating,rented for,review_text,body type,review_summary,category,height,size,age,review_date
0,fit,742630,32a,1923283,110lbs,10.0,date,This dress is extremely flattering and comfort...,straight & narrow,STYLIST REVIEW,dress,"5' 6""",4,24.0,"July 13, 2017"
1,fit,269712,34d,259136,125lbs,8.0,work,I was wavering between true to size and large....,full bust,Great work dress,sheath,"5' 2""",8,32.0,"March 1, 2016"
2,fit,32925,34b,985087,122lbs,10.0,other,This is one of my favorite gowns on the site. ...,straight & narrow,Stylist Review!,gown,"5' 7""",4,26.0,"April 21, 2015"
3,fit,45337,34b,1310167,130lbs,10.0,formal affair,Can't go wrong with a Badgley Mischka gown! I ...,pear,STYLIST REVIEW!,gown,"5' 7""",8,26.0,"November 12, 2015"
4,fit,491589,38dd,2958376,210lbs,8.0,party,"Super cute. Loved this Romper, even though we ...",full bust,Stylist Review,romper,"5' 9""",24,24.0,"May 14, 2016"


In [12]:
ratings = users[['user_id','item_id','rating']]
ratings.head()

Unnamed: 0,user_id,item_id,rating
0,742630,1923283,10.0
1,269712,259136,8.0
2,32925,985087,10.0
3,45337,1310167,10.0
4,491589,2958376,8.0


In [13]:
#Import the train_test_split function
from sklearn.model_selection import train_test_split

#Assign X as the original ratings dataframe and y as the user_id column of ratings.
X = ratings.copy()
y = ratings['user_id']

In [14]:
#Split into training and test datasets, stratified along user_id
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, stratify=y, random_state=42)

In [15]:
#Import the mean_squared_error function
from sklearn.metrics import mean_squared_error

#Function that computes the root mean squared error (or RMSE)
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [16]:
#Define the baseline model to always return 5.
def baseline(user_id, item_id):
    return 5.0

In [17]:
#Function to compute the RMSE score obtained on the testing set by a model
def score(cf_model):
    
    #Construct a list of user-item tuples from the testing dataset
    id_pairs = zip(X_test['user_id'], X_test['item_id'].values)
    
    #Predict the rating for every user-item tuple
    y_pred = np.array([cf_model(user, item) for (user, item) in id_pairs])
    
    #Extract the actual ratings given by the users in the test data
    y_true = np.array(X_test['rating'].values)
    
    #Return the final RMSE score
    return rmse(y_true, y_pred)

In [18]:
score(baseline)

4.602458202114293

Our baseline score is  4.2. For the models build in the subsequent sections we are going to aim to get a lower score.

## 2. Collaborative Filtering

## 2.1 User filtering

#### Ratings Matrix

In [19]:
#Build the ratings matrix using pivot_table function
r_matrix = X_train.pivot_table(values='rating', index='user_id', columns='item_id')

#### Mean

In [20]:
#User Based Collaborative Filter using Mean Ratings
def cf_user_mean(user_id, item_id):
    
    #Check if movie_id exists in r_matrix
    if item_id in r_matrix:
        #Compute the mean of all the ratings given to the movie
        mean_rating = r_matrix[item_id].mean()
    
    else:
        #Default to a rating of 5.0 in the absence of any information
        mean_rating = 5.0
    
    return mean_rating

In [21]:
#Compute RMSE for the Mean model
score(cf_user_mean)

3.8011282979862173

The score obtained on the user based collaborative filtering is lower, this is good and means that is better than the baseline model.

#### Weighted Mean

In [22]:
#Create a dummy ratings matrix with all null values imputed to 0
r_matrix = r_matrix.copy().fillna(0)

In [23]:
# Import cosine_score 
from sklearn.metrics.pairwise import cosine_similarity

#Compute the cosine similarity matrix using the dummy ratings matrix
cosine_sim = cosine_similarity(r_matrix, r_matrix)

In [24]:
#Convert into pandas dataframe 
cosine_sim = pd.DataFrame(cosine_sim, index=r_matrix.index, columns=r_matrix.index)

cosine_sim.head(10)

user_id,32925,45337,45387,77034,88342,113975,215971,252311,257847,269712,...,570030,576202,630850,667260,676222,691468,733207,742630,833675,958548
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
32925,1.0,0.038841,0.009304,0.049749,0.0,0.049695,0.016341,0.0,0.042142,0.021258,...,0.0106,0.0,0.017649,0.065376,0.009863,0.065903,0.010631,0.007007,0.020992,0.011644
45337,0.038841,1.0,0.008479,0.0,0.0,0.0,0.019857,0.013035,0.023044,0.0,...,0.016101,0.016874,0.0,0.040342,0.0,0.048712,0.016148,0.019158,0.0,0.0
45387,0.009304,0.008479,1.0,0.0,0.027919,0.047464,0.0,0.013115,0.021466,0.031584,...,0.0,0.0,0.022475,0.020814,0.056522,0.04533,0.044407,0.014277,0.019095,0.078292
77034,0.049749,0.0,0.0,1.0,0.0,0.073563,0.031737,0.020326,0.0,0.046993,...,0.0,0.0,0.0209,0.0,0.046721,0.017895,0.02518,0.016596,0.092334,0.027578
88342,0.0,0.0,0.027919,0.0,1.0,0.0,0.015692,0.0,0.0,0.012758,...,0.0,0.0,0.0,0.024523,0.0,0.058447,0.0,0.016822,0.0,0.0
113975,0.049695,0.0,0.047464,0.073563,0.0,1.0,0.0,0.019458,0.01911,0.061053,...,0.0,0.0,0.060023,0.037058,0.0,0.106976,0.0,0.015887,0.016998,0.0
215971,0.016341,0.019857,0.0,0.031737,0.015692,0.0,1.0,0.042997,0.015081,0.048182,...,0.018967,0.0,0.01579,0.014623,0.010589,0.076612,0.015218,0.012538,0.045611,0.020835
252311,0.0,0.013035,0.013115,0.020326,0.0,0.019458,0.042997,1.0,0.01584,0.026636,...,0.0,0.0,0.0,0.0,0.023172,0.023667,0.0,0.013169,0.01409,0.0
257847,0.042142,0.023044,0.021466,0.0,0.0,0.01911,0.015081,0.01584,1.0,0.013079,...,0.0,0.0,0.0,0.03771,0.0,0.041839,0.0,0.012934,0.0,0.0
269712,0.021258,0.0,0.031584,0.046993,0.012758,0.061053,0.048182,0.026636,0.013079,1.0,...,0.0,0.0,0.044506,0.015852,0.066008,0.050158,0.041246,0.010874,0.02385,0.013552


In [25]:
#User Based Collaborative Filter using Weighted Mean Ratings
def cf_user_wmean(user_id, item_id):
    
    #Check if item_id exists in r_matrix
    if item_id in r_matrix:
        
        #Get the similarity scores for the user in question with every other user
        sim_scores = cosine_sim[user_id]
        
        #Get the user ratings for the item in question
        m_ratings = r_matrix[item_id]
        
        #Extract the indices containing NaN in the m_ratings series
        idx = m_ratings[m_ratings.isnull()].index
        
        #Drop the NaN values from the m_ratings Series
        m_ratings = m_ratings.dropna()
        
        #Drop the corresponding cosine scores from the sim_scores series
        sim_scores = sim_scores.drop(idx)
        
        #Compute the final weighted mean
        wmean_rating = np.dot(sim_scores, m_ratings)/ sim_scores.sum()
    
    else:
        #Default to a rating of 3.0 in the absence of any information
        wmean_rating = 5.0
    
    return wmean_rating

In [26]:
score(cf_user_wmean)

6.617243670512864

#### Demographics

In [27]:
#Set the index of the users dataframe to the user_id
users = users[['user_id','fit','body type']]

users.head()

Unnamed: 0,user_id,fit,body type
0,742630,fit,straight & narrow
1,269712,fit,full bust
2,32925,fit,straight & narrow
3,45337,fit,pear
4,491589,fit,full bust


In [28]:
X_train.head()

Unnamed: 0,user_id,item_id,rating
1057,45387,2660685,10.0
1783,833675,547674,6.0
233,676222,931157,10.0
2452,733207,2859490,10.0
348,269712,986324,10.0


In [29]:
merged_df = pd.merge(X_train, users)
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 328035 entries, 0 to 328034
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   user_id    328035 non-null  int64  
 1   item_id    328035 non-null  int64  
 2   rating     328035 non-null  float64
 3   fit        328035 non-null  object 
 4   body type  328035 non-null  object 
dtypes: float64(1), int64(2), object(2)
memory usage: 12.5+ MB


In [30]:
#Compute the mean rating of every movie by gender
bt_mean = merged_df[['item_id', 'body type', 'rating']].groupby(['item_id', 'body type'])['rating'].mean()
bt_mean.head()

item_id  body type        
134393   hourglass             8.0
145417   straight & narrow    10.0
152510   full bust             4.0
         hourglass            10.0
153475   pear                 10.0
Name: rating, dtype: float64

In [31]:
users = users.set_index('user_id')
users.head()

Unnamed: 0_level_0,fit,body type
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
742630,fit,straight & narrow
269712,fit,full bust
32925,fit,straight & narrow
45337,fit,pear
491589,fit,full bust


In [32]:
r_matrix.head()

item_id,134393,145417,152510,153475,157448,160346,164051,172027,174799,178058,...,2949219,2949937,2950886,2953707,2955092,2955734,2956453,2958376,2960969,2962646
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
32925,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
45337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0
45387,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
77034,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
88342,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
#Gender Based Collaborative Filter using Mean Ratings
def cf_bt(user_id, item_id):
    
    #Check if movie_id exists in r_matrix (or training set)
    if item_id in r_matrix:
        #Identify the gender of the user
        bt = str(users.loc[user_id]['body type'])
        if bt in bt_mean[item_id]:
            bt_rating = bt_mean[item_id][bt]
        #bt = users.loc[user_id]['body type']
        
        #Check if the gender has rated the movie
        #if bt in bt_mean[item_id]:
            
            #Compute the mean rating given by that gender to the movie
            #bt_rating = bt_mean[item_id][bt]
        
        else:
            bt_rating = 5.0
    
    else:
        #Default to a rating of 3.0 in the absence of any information
        bt_rating = 5.0
    
    return bt_rating

In [34]:
score(cf_bt)

4.602458202114293

## 2.2 Item Collaborative  System

In [35]:
items = pd.read_json('renttherunway_final_data.json', lines=True)
#data = data.iloc[:10000, :]
items = items.dropna()

In [36]:
# 1. Count the frequency of each user_id
items_counts = items['item_id'].value_counts()

In [37]:
# 2. Filter the dataset to only include user_id's with count >= 5
frequent_items = items_counts[items_counts >= 10].index
items = items[items['item_id'].isin(frequent_items)]
items = items.reset_index(drop=True)
# Display the filtered dataset
items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 135425 entries, 0 to 135424
Data columns (total 15 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   fit             135425 non-null  object 
 1   user_id         135425 non-null  int64  
 2   bust size       135425 non-null  object 
 3   item_id         135425 non-null  int64  
 4   weight          135425 non-null  object 
 5   rating          135425 non-null  float64
 6   rented for      135425 non-null  object 
 7   review_text     135425 non-null  object 
 8   body type       135425 non-null  object 
 9   review_summary  135425 non-null  object 
 10  category        135425 non-null  object 
 11  height          135425 non-null  object 
 12  size            135425 non-null  int64  
 13  age             135425 non-null  float64
 14  review_date     135425 non-null  object 
dtypes: float64(2), int64(3), object(10)
memory usage: 15.5+ MB


In [38]:
ratings = items[['user_id','item_id','rating']]
ratings.head()

Unnamed: 0,user_id,item_id,rating
0,420272,2260466,10.0
1,273551,153475,10.0
2,909926,126335,8.0
3,151944,616682,10.0
4,734848,364092,8.0


In [39]:
#Import the train_test_split function
from sklearn.model_selection import train_test_split

#Assign X as the original ratings dataframe and y as the user_id column of ratings.
X = ratings.copy()
y = ratings['item_id']

In [40]:
#Split into training and test datasets, stratified along user_id
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, stratify=y, random_state=42)

In [41]:
#Import the mean_squared_error function
from sklearn.metrics import mean_squared_error

#Function that computes the root mean squared error (or RMSE)
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [42]:
#Import the mean_squared_error function
from sklearn.metrics import mean_squared_error

#Function that computes the root mean squared error (or RMSE)
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [43]:
#Define the baseline model to always return 5.
def baseline(user_id, item_id):
    return 5.0

In [44]:
#Function to compute the RMSE score obtained on the testing set by a model
def score(cf_model):
    
    #Construct a list of user-item tuples from the testing dataset
    id_pairs = zip(X_test['user_id'], X_test['item_id'].values)
    
    #Predict the rating for every user-item tuple
    y_pred = np.array([cf_model(user, item) for (user, item) in id_pairs])
    
    #Extract the actual ratings given by the users in the test data
    y_true = np.array(X_test['rating'].values)
    
    #Return the final RMSE score
    return rmse(y_true, y_pred)

In [45]:
score(baseline)

4.343170526721022

# 3. Market basket analysis

In [46]:
data = pd.read_csv('bread basket.csv')

In [47]:
data.head()

Unnamed: 0,Transaction,Item,date_time,period_day,weekday_weekend
0,1,Bread,30-10-2016 09:58,morning,weekend
1,2,Scandinavian,30-10-2016 10:05,morning,weekend
2,2,Scandinavian,30-10-2016 10:05,morning,weekend
3,3,Hot chocolate,30-10-2016 10:07,morning,weekend
4,3,Jam,30-10-2016 10:07,morning,weekend


In [48]:
data = data.drop(['date_time','period_day','weekday_weekend'], axis=1)

In [49]:
data

Unnamed: 0,Transaction,Item
0,1,Bread
1,2,Scandinavian
2,2,Scandinavian
3,3,Hot chocolate
4,3,Jam
...,...,...
20502,9682,Coffee
20503,9682,Tea
20504,9683,Coffee
20505,9683,Pastry


In [50]:
data = data.groupby('Transaction')['Item'].apply(list)
data

Transaction
1                                                 [Bread]
2                            [Scandinavian, Scandinavian]
3                           [Hot chocolate, Jam, Cookies]
4                                                [Muffin]
5                                 [Coffee, Pastry, Bread]
                              ...                        
9680                                              [Bread]
9681    [Truffles, Tea, Spanish Brunch, Christmas common]
9682                  [Muffin, Tacos/Fajita, Coffee, Tea]
9683                                     [Coffee, Pastry]
9684                                          [Smoothies]
Name: Item, Length: 9465, dtype: object

In [51]:
data.info()

<class 'pandas.core.series.Series'>
Index: 9465 entries, 1 to 9684
Series name: Item
Non-Null Count  Dtype 
--------------  ----- 
9465 non-null   object
dtypes: object(1)
memory usage: 147.9+ KB


In [52]:
transactions = data.tolist()
transactions

[['Bread'],
 ['Scandinavian', 'Scandinavian'],
 ['Hot chocolate', 'Jam', 'Cookies'],
 ['Muffin'],
 ['Coffee', 'Pastry', 'Bread'],
 ['Medialuna', 'Pastry', 'Muffin'],
 ['Medialuna', 'Pastry', 'Coffee', 'Tea'],
 ['Pastry', 'Bread'],
 ['Bread', 'Muffin'],
 ['Scandinavian', 'Medialuna'],
 ['Bread', 'Medialuna', 'Bread'],
 ['Jam', 'Coffee', 'Tartine', 'Pastry', 'Tea'],
 ['Basket', 'Bread', 'Coffee'],
 ['Bread', 'Medialuna', 'Pastry'],
 ['Mineral water', 'Scandinavian'],
 ['Bread', 'Medialuna', 'Coffee'],
 ['Hot chocolate'],
 ['Farm House'],
 ['Farm House', 'Bread'],
 ['Bread', 'Medialuna'],
 ['Coffee', 'Coffee', 'Medialuna', 'Bread'],
 ['Jam'],
 ['Scandinavian', 'Muffin'],
 ['Bread'],
 ['Scandinavian'],
 ['Fudge'],
 ['Scandinavian'],
 ['Coffee', 'Bread'],
 ['Bread', 'Jam'],
 ['Bread'],
 ['Basket'],
 ['Scandinavian', 'Muffin'],
 ['Coffee'],
 ['Coffee', 'Muffin'],
 ['Muffin', 'Scandinavian'],
 ['Tea', 'Bread'],
 ['Coffee', 'Bread'],
 ['Bread', 'Tea'],
 ['Scandinavian'],
 ['Juice', 'Tartine', 

In [53]:
items = list(set(item for transaction in transactions for item in transaction))

items

['Spread',
 'Tartine',
 'Sandwich',
 'Scone',
 "Ella's Kitchen Pouches",
 'Tacos/Fajita',
 'The BART',
 'Soup',
 'Fairy Doors',
 'Pick and Mix Bowls',
 'Art Tray',
 'Jam',
 'Coffee',
 'Spanish Brunch',
 'Bread Pudding',
 'Tea',
 'Half slice Monster ',
 'Bowl Nic Pitt',
 'Cherry me Dried fruit',
 'Victorian Sponge',
 'Bakewell',
 'Duck egg',
 'Vegan Feast',
 'Coke',
 'Postcard',
 'Brioche and salami',
 'The Nomad',
 'Vegan mincepie',
 'Muffin',
 'Cookies',
 'Crisps',
 'Smoothies',
 'Adjustment',
 'Salad',
 'Bread',
 'Keeping It Local',
 'Hearty & Seasonal',
 'Alfajores',
 'Gingerbread syrup',
 'Raw bars',
 'Medialuna',
 'Mighty Protein',
 'Polenta',
 'My-5 Fruit Shoot',
 'Caramel bites',
 'Drinking chocolate spoons ',
 'Baguette',
 'Tshirt',
 'Brownie',
 'Chicken sand',
 'Panatone',
 'Truffles',
 'Frittata',
 'Argentina Night',
 'Gift voucher',
 'Lemon and coconut',
 'Mineral water',
 'Jammie Dodgers',
 "Valentine's card",
 'Coffee granules ',
 'Crepes',
 'Pintxos',
 'Dulce de Leche',
 

In [54]:
encoded_data = pd.DataFrame(
    [[item in transaction for item in items] for transaction in transactions],
    columns=items
)


encoded_data.head()

Unnamed: 0,Spread,Tartine,Sandwich,Scone,Ella's Kitchen Pouches,Tacos/Fajita,The BART,Soup,Fairy Doors,Pick and Mix Bowls,...,Olum & polenta,Chocolates,Afternoon with the baker,Chimichurri Oil,Toast,Scandinavian,Hot chocolate,Chicken Stew,Hack the stack,Bare Popcorn
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


## 3.1 Apriori Algorithm

In [55]:
import time

start_time = time.time()

frequent_itemsets = apriori(encoded_data, min_support=0.01, use_colnames=True)
frequent_itemsets

# End timing
end_time = time.time()

# Calculate the time taken
calculation_time = end_time - start_time

print("Frequent Itemsets calculated in {:.2f} seconds.".format(calculation_time))

Frequent Itemsets calculated in 0.02 seconds.


In [56]:
rules = association_rules(frequent_itemsets, metric = "antecedent support", min_threshold = 0.2, num_itemsets=0)
rules.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31 entries, 0 to 30
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   antecedents         31 non-null     object 
 1   consequents         31 non-null     object 
 2   antecedent support  31 non-null     float64
 3   consequent support  31 non-null     float64
 4   support             31 non-null     float64
 5   confidence          31 non-null     float64
 6   lift                31 non-null     float64
 7   representativity    31 non-null     float64
 8   leverage            31 non-null     float64
 9   conviction          31 non-null     float64
 10  zhangs_metric       31 non-null     float64
 11  jaccard             31 non-null     float64
 12  certainty           31 non-null     float64
 13  kulczynski          31 non-null     float64
dtypes: float64(12), object(2)
memory usage: 3.5+ KB


In [57]:
# Display the results
from IPython.display import display

print("Frequent Itemsets:")
display(frequent_itemsets.head())

Frequent Itemsets:


Unnamed: 0,support,itemsets
0,0.071844,(Sandwich)
1,0.034548,(Scone)
2,0.034443,(Soup)
3,0.015003,(Jam)
4,0.478394,(Coffee)


In [58]:
print("\nAssociation Rules:")
rules


Association Rules:


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(Coffee),(Sandwich),0.478394,0.071844,0.038246,0.079947,1.112792,1.0,0.003877,1.008807,0.194321,0.074701,0.008731,0.30615
1,(Bread),(Sandwich),0.327205,0.071844,0.01701,0.051986,0.723596,1.0,-0.006498,0.979053,-0.362147,0.044524,-0.021395,0.144375
2,(Coffee),(Scone),0.478394,0.034548,0.018067,0.037765,1.093107,1.0,0.001539,1.003343,0.163296,0.036507,0.003332,0.28035
3,(Coffee),(Soup),0.478394,0.034443,0.015848,0.033127,0.961807,1.0,-0.000629,0.998639,-0.070744,0.031888,-0.001362,0.246625
4,(Coffee),(Spanish Brunch),0.478394,0.018172,0.010882,0.022747,1.251766,1.0,0.002189,1.004682,0.385594,0.022406,0.00466,0.310792
5,(Coffee),(Tea),0.478394,0.142631,0.049868,0.10424,0.73084,1.0,-0.018366,0.957142,-0.413856,0.08731,-0.044777,0.226935
6,(Coffee),(Muffin),0.478394,0.038457,0.018806,0.039311,1.022193,1.0,0.000408,1.000888,0.041623,0.03776,0.000888,0.264161
7,(Coffee),(Cookies),0.478394,0.054411,0.028209,0.058966,1.083723,1.0,0.002179,1.004841,0.14811,0.055905,0.004818,0.288707
8,(Bread),(Coffee),0.327205,0.478394,0.090016,0.275105,0.575059,1.0,-0.066517,0.719561,-0.523431,0.125794,-0.389737,0.231634
9,(Coffee),(Bread),0.478394,0.327205,0.090016,0.188163,0.575059,1.0,-0.066517,0.828731,-0.58621,0.125794,-0.206665,0.231634


In [59]:
print(encoded_data['Bread'].sum())
print(encoded_data['Coffee'].sum())

3097
4528


## 3.2 FP Growth Algorithm

In [60]:
from mlxtend.frequent_patterns import fpgrowth
start_time = time.time()


# Evaluate the frequent item rules for fpgrowth function
frequent_itemsets_fp = fpgrowth(encoded_data, min_support = 0.01, use_colnames = True)

print(frequent_itemsets_fp)

# End timing
end_time = time.time()

# Calculate the time taken
calculation_time = end_time - start_time

print("Frequent Itemsets calculated in {:.2f} seconds.".format(calculation_time))

     support                  itemsets
0   0.327205                   (Bread)
1   0.029054            (Scandinavian)
2   0.058320           (Hot chocolate)
3   0.054411                 (Cookies)
4   0.015003                     (Jam)
..       ...                       ...
56  0.019651         (Brownie, Coffee)
57  0.010777          (Bread, Brownie)
58  0.023666           (Coffee, Toast)
59  0.018067           (Scone, Coffee)
60  0.010882  (Spanish Brunch, Coffee)

[61 rows x 2 columns]
Frequent Itemsets calculated in 0.74 seconds.


In [61]:
from mlxtend.frequent_patterns import association_rules

# Display the reles due to fp-growth algorithm
rules_fp = association_rules(frequent_itemsets_fp, metric = "confidence", min_threshold = 0.3, num_itemsets=0)

rules_fp

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(Hot chocolate),(Coffee),0.05832,0.478394,0.029583,0.507246,1.060311,1.0,0.001683,1.058553,0.060403,0.058333,0.055314,0.284542
1,(Cookies),(Coffee),0.054411,0.478394,0.028209,0.518447,1.083723,1.0,0.002179,1.083174,0.0817,0.055905,0.076787,0.288707
2,(Muffin),(Coffee),0.038457,0.478394,0.018806,0.489011,1.022193,1.0,0.000408,1.020777,0.022579,0.03776,0.020354,0.264161
3,(Pastry),(Coffee),0.086107,0.478394,0.047544,0.552147,1.154168,1.0,0.006351,1.164682,0.146161,0.091968,0.141396,0.325764
4,(Pastry),(Bread),0.086107,0.327205,0.02916,0.33865,1.034977,1.0,0.000985,1.017305,0.03698,0.075908,0.017011,0.213884
5,"(Pastry, Bread)",(Coffee),0.02916,0.478394,0.011199,0.384058,0.802807,1.0,-0.002751,0.846843,-0.20192,0.022563,-0.180857,0.203734
6,(Medialuna),(Coffee),0.061807,0.478394,0.035182,0.569231,1.189878,1.0,0.005614,1.210871,0.170091,0.069665,0.174148,0.321387
7,(Tea),(Coffee),0.142631,0.478394,0.049868,0.34963,0.73084,1.0,-0.018366,0.802014,-0.300482,0.08731,-0.246862,0.226935
8,(Juice),(Coffee),0.038563,0.478394,0.020602,0.534247,1.11675,1.0,0.002154,1.119919,0.108738,0.041507,0.107078,0.288656
9,(Soup),(Coffee),0.034443,0.478394,0.015848,0.460123,0.961807,1.0,-0.000629,0.966156,-0.039502,0.031888,-0.035029,0.246625
