# Recommendation System

In [1]:
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori, association_rules

### Content based recommender

In [2]:
data = pd.read_json('renttherunway_final_data.json', lines=True)
#data = data.iloc[:10000, :]
data = data.dropna()

In [3]:
# 1. Count the frequency of each user_id
user_counts = data['user_id'].value_counts()

In [4]:
# 2. Filter the dataset to only include user_id's with count >= 5
frequent_users = user_counts[user_counts >= 5].index
filtered_data = data[data['user_id'].isin(frequent_users)]

# Display the filtered dataset
filtered_data.head()


Unnamed: 0,fit,user_id,bust size,item_id,weight,rating,rented for,review_text,body type,review_summary,category,height,size,age,review_date
0,fit,420272,34d,2260466,137lbs,10.0,vacation,An adorable romper! Belt and zipper were a lit...,hourglass,So many compliments!,romper,"5' 8""",14,28.0,"April 20, 2016"
4,fit,151944,34b,616682,145lbs,10.0,wedding,I have always been petite in my upper body and...,athletic,Was in love with this dress !!!,gown,"5' 9""",12,27.0,"September 26, 2016"
6,fit,336066,34c,568429,112lbs,10.0,everyday,This dress is so sweet. I loved the print. The...,hourglass,LITERALLY THE CUTEST DRESS EVER,dress,"5' 3""",4,27.0,"December 7, 2017"
9,fit,154309,32b,1729232,114lbs,10.0,formal affair,The dress was very flattering and fit perfectl...,petite,This dress was everything! It was perfect for ...,gown,"5' 3""",1,33.0,"October 17, 2016"
17,fit,339899,34d,1622747,143lbs,10.0,party,"Little tight, but loved this dress!!!",athletic,LOVED,dress,"5' 5""",12,26.0,"November 13, 2017"


In [5]:
filtered_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 41912 entries, 0 to 192541
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   fit             41912 non-null  object 
 1   user_id         41912 non-null  int64  
 2   bust size       41912 non-null  object 
 3   item_id         41912 non-null  int64  
 4   weight          41912 non-null  object 
 5   rating          41912 non-null  float64
 6   rented for      41912 non-null  object 
 7   review_text     41912 non-null  object 
 8   body type       41912 non-null  object 
 9   review_summary  41912 non-null  object 
 10  category        41912 non-null  object 
 11  height          41912 non-null  object 
 12  size            41912 non-null  int64  
 13  age             41912 non-null  float64
 14  review_date     41912 non-null  object 
dtypes: float64(2), int64(3), object(10)
memory usage: 5.1+ MB


In [6]:
grouped_reviews = filtered_data.groupby('item_id')['review_text'].apply(' '.join).reset_index()
grouped_reviews

Unnamed: 0,item_id,review_text
0,123373,This dress was perfect for my black and white ...
1,123793,This dress is STUNNING! Everyone loved this dr...
2,124204,Wore this w/ red patent pumps to match the red...
3,124553,I went a size down on this dress and it fit pe...
4,125424,This dress was very flattering and comfortable...
...,...,...
5293,2963850,This skirt is super cute. The waistline hits a...
5294,2964470,I loved this sweater from Tory Burch. I didn't...
5295,2965009,"This fur coat was SO fun to wear, and actually..."
5296,2965924,I ordered a size 10 and should have gone down ...


In [7]:
#Import TfIdfVectorizer from the scikit-learn librarydata.head()
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all english stopwords
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
grouped_reviews['review_text'] = grouped_reviews['review_text'].fillna('')

#Construct the required TF-IDF matrix by applying the fit_transform method on the overview feature
tfidf_matrix = tfidf.fit_transform(grouped_reviews['review_text'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

(5298, 15406)

In [8]:
# Import linear_kernel to compute the dot product
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
#cosine_sim = linear_kernel(tfidf_matrix[:10000], tfidf_matrix[:10000])
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [9]:
cosine_sim.shape

(5298, 5298)

In [10]:
cosine_sim[1]

array([0.61765174, 1.        , 0.62895881, ..., 0.06885026, 0.09520279,
       0.1844313 ])

In [11]:
#Construct a reverse mapping of indices and item titles, and drop duplicate titles, if any
indices = pd.Series(grouped_reviews.index, index=grouped_reviews['item_id']).drop_duplicates()
indices.info

<bound method Series.info of item_id
123373        0
123793        1
124204        2
124553        3
125424        4
           ... 
2963850    5293
2964470    5294
2965009    5295
2965924    5296
2966087    5297
Length: 5298, dtype: int64>

In [12]:
indices[:10]

item_id
123373    0
123793    1
124204    2
124553    3
125424    4
125465    5
125564    6
126335    7
127081    8
127495    9
dtype: int64

In [13]:
# Function that takes in item title as input and gives recommendations 
def content_recommender(item_id, cosine_sim=cosine_sim):
    # Obtain the index of the item that matches the title
    idx = indices[item_id]

    # Get the pairwsie similarity scores of all movies with that movie
    # And convert it into a list of tuples as described above
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the cosine similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies. Ignore the first movie.
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    clothes_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return grouped_reviews['item_id'].iloc[clothes_indices]

In [14]:
grouped_reviews.shape

(5298, 2)

In [15]:
#Get recommendations for item: 2260466
content_recommender(123793)

20       132738
7        126335
10       127865
47       145906
91       168592
17       131533
16       131117
37       141688
2927    1687082
1119     709832
Name: item_id, dtype: int64

## Collaborative Filters

In [16]:
filtered_data.head(5)

Unnamed: 0,fit,user_id,bust size,item_id,weight,rating,rented for,review_text,body type,review_summary,category,height,size,age,review_date
0,fit,420272,34d,2260466,137lbs,10.0,vacation,An adorable romper! Belt and zipper were a lit...,hourglass,So many compliments!,romper,"5' 8""",14,28.0,"April 20, 2016"
4,fit,151944,34b,616682,145lbs,10.0,wedding,I have always been petite in my upper body and...,athletic,Was in love with this dress !!!,gown,"5' 9""",12,27.0,"September 26, 2016"
6,fit,336066,34c,568429,112lbs,10.0,everyday,This dress is so sweet. I loved the print. The...,hourglass,LITERALLY THE CUTEST DRESS EVER,dress,"5' 3""",4,27.0,"December 7, 2017"
9,fit,154309,32b,1729232,114lbs,10.0,formal affair,The dress was very flattering and fit perfectl...,petite,This dress was everything! It was perfect for ...,gown,"5' 3""",1,33.0,"October 17, 2016"
17,fit,339899,34d,1622747,143lbs,10.0,party,"Little tight, but loved this dress!!!",athletic,LOVED,dress,"5' 5""",12,26.0,"November 13, 2017"


In [17]:
data = filtered_data.drop(['fit','bust size','weight','rented for','review_text','body type','review_summary','category','height','size','age','review_date'], axis=1)

In [18]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 41912 entries, 0 to 192541
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   user_id  41912 non-null  int64  
 1   item_id  41912 non-null  int64  
 2   rating   41912 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 1.3 MB


In [19]:
#Import the train_test_split function
from sklearn.model_selection import train_test_split

#Assign X as the original ratings dataframe and y as the user_id column of ratings.
X = data.copy()
y = data['user_id']

In [20]:
#Split into training and test datasets, stratified along user_id
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, stratify=y, random_state=42)

In [21]:
# #Split into training and test datasets, stratified along user_id
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=42)

In [22]:
print(y)


0         420272
4         151944
6         336066
9         154309
17        339899
           ...  
192531    136279
192534    582401
192535    431280
192539     66386
192541     47002
Name: user_id, Length: 41912, dtype: int64


In [23]:
#Import the mean_squared_error function
from sklearn.metrics import mean_squared_error

#Function that computes the root mean squared error (or RMSE)
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [24]:
#Define the baseline model to always return 5.
def baseline(user_id, item_id):
    return 5.0

In [25]:
#Function to compute the RMSE score obtained on the testing set by a model
def score(cf_model):
    
    #Construct a list of user-item tuples from the testing dataset
    id_pairs = zip(X_test['user_id'], X_test['item_id'])
    
    #Predict the rating for every user-item tuple
    y_pred = np.array([cf_model(user, item) for (user, item) in id_pairs])
    
    #Extract the actual ratings given by the users in the test data
    y_true = np.array(X_test['rating'])
    
    #Return the final RMSE score
    return rmse(y_true, y_pred)

In [26]:
score(baseline)

4.264203554318523

## User based Collaborative Filtering

### Ratings Matrix

In [36]:
#Build the ratings matrix using pivot_table function
r_matrix = X_train.pivot_table(values='rating', index='user_id', columns='item_id')

### Mean

In [28]:
#User Based Collaborative Filter using Mean Ratings
def cf_user_mean(user_id, item_id):
    
    #Check if movie_id exists in r_matrix
    if item_id in r_matrix:
        #Compute the mean of all the ratings given to the movie
        mean_rating = r_matrix[item_id].mean()
    
    else:
        #Default to a rating of 5.0 in the absence of any information
        mean_rating = 5.0
    
    return mean_rating

In [29]:
#Compute RMSE for the Mean model
score(cf_user_mean)

1.6912718628825594

### Weighted Mean

In [40]:
#Create a dummy ratings matrix with all null values imputed to 0
r_matrix = r_matrix.copy().fillna(0)

In [41]:
# Import cosine_score 
from sklearn.metrics.pairwise import cosine_similarity

#Compute the cosine similarity matrix using the dummy ratings matrix
cosine_sim = cosine_similarity(r_matrix, r_matrix)

In [42]:
#Convert into pandas dataframe 
cosine_sim = pd.DataFrame(cosine_sim, index=r_matrix.index, columns=r_matrix.index)

cosine_sim.head(10)

user_id,47,332,657,1023,1089,1171,1231,1328,1384,1434,...,998267,998336,998470,998563,999183,999274,999425,999431,999561,999910
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
47,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
332,0.0,1.0,0.202031,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
657,0.0,0.202031,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1023,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1089,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.117202,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1171,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1231,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1328,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1384,0.0,0.0,0.0,0.0,0.117202,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1434,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [43]:
#User Based Collaborative Filter using Weighted Mean Ratings
def cf_user_wmean(user_id, item_id):
    
    #Check if item_id exists in r_matrix
    if item_id in r_matrix:
        
        #Get the similarity scores for the user in question with every other user
        sim_scores = cosine_sim[user_id]
        
        #Get the user ratings for the item in question
        m_ratings = r_matrix[item_id]
        
        #Extract the indices containing NaN in the m_ratings series
        idx = m_ratings[m_ratings.isnull()].index
        
        #Drop the NaN values from the m_ratings Series
        m_ratings = m_ratings.dropna()
        
        #Drop the corresponding cosine scores from the sim_scores series
        sim_scores = sim_scores.drop(idx)
        
        #Compute the final weighted mean
        wmean_rating = np.dot(sim_scores, m_ratings)/ sim_scores.sum()
    
    else:
        #Default to a rating of 3.0 in the absence of any information
        wmean_rating = 5.0
    
    return wmean_rating

In [44]:
score(cf_user_wmean)

8.978087992027666

## Demographics

In [45]:
#Merge the original users dataframe with the training set 
merged_df = pd.merge(X_train, filtered_data)

merged_df.head()

Unnamed: 0,user_id,item_id,rating,fit,bust size,weight,rented for,review_text,body type,review_summary,category,height,size,age,review_date
0,472391,699783,10.0,small,34c,130lbs,wedding,QUALITY! The quality of this dress is absolute...,athletic,RTR Team Review,dress,"5' 6""",8,25.0,"September 27, 2017"
1,726335,1845435,10.0,large,34a,160lbs,other,This dress does run somewhat big and small in ...,hourglass,Perfect dress for any formal Event!,gown,"5' 4""",12,35.0,"June 2, 2017"
2,128764,1859039,10.0,fit,34d,140lbs,party,"Love this dress! Great fit, very comfortable, ...",hourglass,Love the lace back,dress,"5' 8""",14,30.0,"March 10, 2014"
3,248933,147594,8.0,fit,34d,110lbs,wedding,This is my first rent the runway experience an...,hourglass,Black tie optional wedding,shift,"5' 0""",8,26.0,"November 23, 2015"
4,520935,1090219,10.0,fit,34b,130lbs,work,This dress was PERFECT. Material was thick and...,athletic,Great Work Outfit,dress,"5' 6""",4,28.0,"September 6, 2016"


In [46]:
merged_df.shape

(31608, 15)

In [57]:
#Compute the mean rating by fit and weight
gen_occ_mean = merged_df[['fit', 'rating', 'item_id', 'weight']].pivot_table(
    values='rating', index='item_id', columns=['fit', 'weight'], aggfunc='mean')

gen_occ_mean.head()

fit,fit,fit,fit,fit,fit,fit,fit,fit,fit,fit,...,small,small,small,small,small,small,small,small,small,small
weight,100lbs,101lbs,102lbs,103lbs,104lbs,105lbs,106lbs,107lbs,108lbs,109lbs,...,220lbs,225lbs,230lbs,250lbs,255lbs,260lbs,265lbs,89lbs,95lbs,98lbs
item_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
123373,,,,,,,,,,,...,,,,,,,,,,
123793,,10.0,,,,,,,,,...,,,,,,,,,,
124204,,,8.0,,,,,,,,...,,,,,,,,,,
124553,,,,,,,,,8.0,,...,,,,,,,,,,
125424,,,,,,,,,10.0,,...,,,,,,,,,,


In [58]:
#Gender and Occupation Based Collaborative Filter using Mean Ratings
def cf_gen_occ(user_id, item_id):
    
    #Check if movie_id exists in gen_occ_mean
    if item_id in gen_occ_mean.index:
        
        #Identify the user
        user = filtered_data.loc[user_id]
        
        #Identify the gender and occupation
        fit = user['fit']
        occ = user['weight']
        
        #Check if the occupation has rated the movie
        if occ in gen_occ_mean.loc[item_id]:
            
            #Check if the gender has rated the movie
            if fit in gen_occ_mean.loc[item_id][occ]:
                
                #Extract the required rating
                rating = gen_occ_mean.loc[item_id][occ][fit]
                
                #Default to 5.0 if the rating is null
                if np.isnan(rating):
                    rating = 5.0
                
                return rating
            
    #Return the default rating    
    return 5.0

In [59]:
score(cf_gen_occ)

  right=ast.Str(s=sentinel),
  return Constant(*args, **kwargs)
  right=ast.Str(s=sentinel),
  return Constant(*args, **kwargs)
  right=ast.Str(s=sentinel),
  return Constant(*args, **kwargs)
  right=ast.Str(s=sentinel),
  return Constant(*args, **kwargs)


TypeError: unhashable type: 'Series'

In [54]:
#Compute the mean rating of every movie by gender
fit_mean = merged_df[['item_id', 'body type', 'rating']].groupby(['item_id', 'body type'])['rating'].mean()

In [55]:
fit_mean

item_id  body type
123373   apple         6.000000
         athletic      9.333333
         full bust     8.666667
         hourglass     9.111111
         pear          7.500000
                        ...    
2965924  hourglass     9.000000
         petite       10.000000
2966087  athletic      8.000000
         hourglass    10.000000
         petite       10.000000
Name: rating, Length: 15810, dtype: float64

In [56]:
filtered_data = filtered_data.set_index('user_id')

KeyError: "None of ['user_id'] are in the columns"

In [52]:
#Fit Based Collaborative Filter using Mean Ratings
def cf_fit(user_id, item_id):
    
    #Check if movie_id exists in r_matrix (or training set)
    if item_id in r_matrix:
        #Identify the fit of the user
        fit = users.loc[user_id]['fit']
        
        #Check if the fit has rated the movie
        if fit in fit_mean[_id]:item
            
            #Compute the mean rating given by that fit to the item
            fit_rating = fit_mean[item_id][fit]
        
        else:
            fit_rating = 5.0
    
    else:
        #Default to a rating of 3.0 in the absence of any information
        fit_rating = 5.0
    
    return fit_rating

IndentationError: unexpected indent (1185798008.py, line 13)

# Market basket analysis

In [None]:
data = pd.read_csv('bread basket.csv')

In [None]:
data.head()

In [None]:
data = data.drop(['date_time','period_day','weekday_weekend'], axis=1)

In [None]:
data

In [None]:
data = data.groupby('Transaction')['Item'].apply(list)
data

In [None]:
data.info()

In [None]:
transactions = data.tolist()
transactions

In [None]:
items = list(set(item for transaction in transactions for item in transaction))

items

In [None]:
encoded_data = pd.DataFrame(
    [[item in transaction for item in items] for transaction in transactions],
    columns=items
)


encoded_data.head()

## Apriori Algorithm

In [None]:
import time

start_time = time.time()

frequent_itemsets = apriori(encoded_data, min_support=0.01, use_colnames=True)
frequent_itemsets

# End timing
end_time = time.time()

# Calculate the time taken
calculation_time = end_time - start_time

print("Frequent Itemsets calculated in {:.2f} seconds.".format(calculation_time))

In [None]:
rules = association_rules(frequent_itemsets, metric = "antecedent support", min_threshold = 0.2, num_itemsets=0)
rules.info()

In [None]:
# Display the results
from IPython.display import display

print("Frequent Itemsets:")
display(frequent_itemsets.head())

In [None]:
print("\nAssociation Rules:")
rules

In [None]:
print(encoded_data['Bread'].sum())
print(encoded_data['Coffee'].sum())

## FP Growth Algorithm

In [None]:
from mlxtend.frequent_patterns import fpgrowth
start_time = time.time()


# Evaluate the frequent item rules for fpgrowth function
frequent_itemsets_fp = fpgrowth(encoded_data, min_support = 0.01, use_colnames = True)

print(frequent_itemsets_fp)

# End timing
end_time = time.time()

# Calculate the time taken
calculation_time = end_time - start_time

print("Frequent Itemsets calculated in {:.2f} seconds.".format(calculation_time))

In [None]:
from mlxtend.frequent_patterns import association_rules

# Display the reles due to fp-growth algorithm
rules_fp = association_rules(frequent_itemsets_fp, metric = "confidence", min_threshold = 0.3, num_itemsets=0)

rules_fp