# Recommendation System

In [1]:
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori, association_rules

### Content based recommender

In [2]:
data = pd.read_json('renttherunway_final_data.json', lines=True)
data = data.iloc[:10000, :]
data = data.dropna()

In [3]:
# Converting 'employee_id' to string
#data['item_id'] = data['item_id'].astype(str)
# Displaying the types of data after conversion
print("\nTypes of data after conversion:\n", data.dtypes)


Types of data after conversion:
 fit                object
user_id             int64
bust size          object
item_id             int64
weight             object
rating            float64
rented for         object
review_text        object
body type          object
review_summary     object
category           object
height             object
size                int64
age               float64
review_date        object
dtype: object


In [4]:
data = data.dropna().drop_duplicates()
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7600 entries, 0 to 9999
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   fit             7600 non-null   object 
 1   user_id         7600 non-null   int64  
 2   bust size       7600 non-null   object 
 3   item_id         7600 non-null   int64  
 4   weight          7600 non-null   object 
 5   rating          7600 non-null   float64
 6   rented for      7600 non-null   object 
 7   review_text     7600 non-null   object 
 8   body type       7600 non-null   object 
 9   review_summary  7600 non-null   object 
 10  category        7600 non-null   object 
 11  height          7600 non-null   object 
 12  size            7600 non-null   int64  
 13  age             7600 non-null   float64
 14  review_date     7600 non-null   object 
dtypes: float64(2), int64(3), object(10)
memory usage: 950.0+ KB


In [5]:
data['user_id'].unique()

array([420272, 273551, 909926, ..., 737193, 330374, 240023], dtype=int64)

In [6]:
data.head()

Unnamed: 0,fit,user_id,bust size,item_id,weight,rating,rented for,review_text,body type,review_summary,category,height,size,age,review_date
0,fit,420272,34d,2260466,137lbs,10.0,vacation,An adorable romper! Belt and zipper were a lit...,hourglass,So many compliments!,romper,"5' 8""",14,28.0,"April 20, 2016"
1,fit,273551,34b,153475,132lbs,10.0,other,I rented this dress for a photo shoot. The the...,straight & narrow,I felt so glamourous!!!,gown,"5' 6""",12,36.0,"June 18, 2013"
3,fit,909926,34c,126335,135lbs,8.0,formal affair,I rented this for my company's black tie award...,pear,Dress arrived on time and in perfect condition.,dress,"5' 5""",8,34.0,"February 12, 2014"
4,fit,151944,34b,616682,145lbs,10.0,wedding,I have always been petite in my upper body and...,athletic,Was in love with this dress !!!,gown,"5' 9""",12,27.0,"September 26, 2016"
5,fit,734848,32b,364092,138lbs,8.0,date,Didn't actually wear it. It fit perfectly. The...,athletic,Traditional with a touch a sass,dress,"5' 8""",8,45.0,"April 30, 2016"


In [7]:
grouped_reviews = data.groupby('item_id')['review_text'].apply(' '.join).reset_index()
grouped_reviews

Unnamed: 0,item_id,review_text
0,123373,The dress was beautiful and very comfortable. ...
1,123793,"Fit great, super flattering Limited range of a..."
2,124204,"This dress is a ""WOW."" It steals the show, sp..."
3,124553,Loved the dress. A little short in the front ...
4,125424,The dress would have been perfect in a size 10...
...,...,...
2737,2957481,"perfect fit all the way around, I would keep i..."
2738,2958376,"Super cute. Loved this Romper, even though we ..."
2739,2960025,"It could be interesting, but this dress didn't..."
2740,2960969,"I got this through Unlimited, so didn't have t..."


In [8]:
#Import TfIdfVectorizer from the scikit-learn librarydata.head()
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all english stopwords
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
grouped_reviews['review_text'] = grouped_reviews['review_text'].fillna('')

#Construct the required TF-IDF matrix by applying the fit_transform method on the overview feature
tfidf_matrix = tfidf.fit_transform(grouped_reviews['review_text'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

(2742, 7701)

In [9]:
# Import linear_kernel to compute the dot product
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
#cosine_sim = linear_kernel(tfidf_matrix[:10000], tfidf_matrix[:10000])
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [10]:
cosine_sim.shape

(2742, 2742)

In [11]:
cosine_sim[1]

array([0.57584662, 1.        , 0.48359436, ..., 0.17127339, 0.11695978,
       0.04557235])

In [12]:
#Construct a reverse mapping of indices and movie titles, and drop duplicate titles, if any
indices = pd.Series(grouped_reviews.index, index=grouped_reviews['item_id']).drop_duplicates()
indices.info

<bound method Series.info of item_id
123373        0
123793        1
124204        2
124553        3
125424        4
           ... 
2957481    2737
2958376    2738
2960025    2739
2960969    2740
2963850    2741
Length: 2742, dtype: int64>

In [13]:
indices[:10]

item_id
123373    0
123793    1
124204    2
124553    3
125424    4
125465    5
125564    6
126335    7
127081    8
127495    9
dtype: int64

In [14]:
# Function that takes in movie title as input and gives recommendations 
def content_recommender(item_id, cosine_sim=cosine_sim):
    # Obtain the index of the movie that matches the title
    idx = indices[item_id]

    # Get the pairwsie similarity scores of all movies with that movie
    # And convert it into a list of tuples as described above
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the cosine similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies. Ignore the first movie.
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    clothes_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return grouped_reviews['item_id'].iloc[clothes_indices]

In [15]:
grouped_reviews.shape

(2742, 2)

In [16]:
#Get recommendations for item: 2260466
content_recommender(123793)

20    132738
17    131533
10    127865
47    145906
16    131117
7     126335
5     125465
30    138431
65    154002
14    130259
Name: item_id, dtype: int64

## Collaborative Filters

In [17]:
data.head(5)

Unnamed: 0,fit,user_id,bust size,item_id,weight,rating,rented for,review_text,body type,review_summary,category,height,size,age,review_date
0,fit,420272,34d,2260466,137lbs,10.0,vacation,An adorable romper! Belt and zipper were a lit...,hourglass,So many compliments!,romper,"5' 8""",14,28.0,"April 20, 2016"
1,fit,273551,34b,153475,132lbs,10.0,other,I rented this dress for a photo shoot. The the...,straight & narrow,I felt so glamourous!!!,gown,"5' 6""",12,36.0,"June 18, 2013"
3,fit,909926,34c,126335,135lbs,8.0,formal affair,I rented this for my company's black tie award...,pear,Dress arrived on time and in perfect condition.,dress,"5' 5""",8,34.0,"February 12, 2014"
4,fit,151944,34b,616682,145lbs,10.0,wedding,I have always been petite in my upper body and...,athletic,Was in love with this dress !!!,gown,"5' 9""",12,27.0,"September 26, 2016"
5,fit,734848,32b,364092,138lbs,8.0,date,Didn't actually wear it. It fit perfectly. The...,athletic,Traditional with a touch a sass,dress,"5' 8""",8,45.0,"April 30, 2016"


In [18]:
data = data.drop(['fit','bust size','weight','rented for','review_text','body type','review_summary','category','height','size','age','review_date'], axis=1)

In [19]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7600 entries, 0 to 9999
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   user_id  7600 non-null   int64  
 1   item_id  7600 non-null   int64  
 2   rating   7600 non-null   float64
dtypes: float64(1), int64(2)
memory usage: 237.5 KB


In [22]:
data['rating'].unique()

array([10.,  8.,  4.,  6.,  2.])

In [23]:
#Import the train_test_split function
from sklearn.model_selection import train_test_split

#Assign X as the original ratings dataframe and y as the user_id column of ratings.
X = data.copy()
y = data['user_id']

#Split into training and test datasets, stratified along user_id
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=42)

In [24]:
#Import the mean_squared_error function
from sklearn.metrics import mean_squared_error

#Function that computes the root mean squared error (or RMSE)
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [25]:
#Define the baseline model to always return 5.
def baseline(user_id, item_id):
    return 5.0

In [36]:
print(X_test)

      user_id  item_id  rating
9356   733876  1190600    10.0
1352   873225  1057664     8.0
3761    51267   126335    10.0
1919   683657   182578     8.0
613    327702   172027    10.0
...       ...      ...     ...
9318   852729  1336309     4.0
1369   679959   145906     8.0
9774   881184  2275614    10.0
8370   455575   162634    10.0
2894   229447   479018    10.0

[1900 rows x 3 columns]


In [26]:
#Function to compute the RMSE score obtained on the testing set by a model
def score(cf_model):
    
    #Construct a list of user-item tuples from the testing dataset
    id_pairs = zip(X_test['user_id'], X_test['item_id'])
    
    #Predict the rating for every user-movie tuple
    y_pred = np.array([cf_model(user, item) for (user, item) in id_pairs])
    
    #Extract the actual ratings given by the users in the test data
    y_true = np.array(X_test['rating'])
    
    #Return the final RMSE score
    return rmse(y_true, y_pred)

In [27]:
score(baseline)

4.331281565541543

## User based Collaborative Filtering

### Ratings Matrix

In [28]:
#Build the ratings matrix using pivot_table function
r_matrix = X_train.pivot_table(values='rating', index='user_id', columns='item_id')

r_matrix.head()

item_id,123373,123793,124204,124553,125424,125465,125564,126335,127081,127495,...,2934869,2937389,2946611,2949937,2953681,2954118,2955734,2958376,2960025,2960969
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
321,,,,,,,,,,,...,,,,,,,,,,
611,,,,,,,,,,,...,,,,,,,,,,
657,,,,,,,,,,,...,,,,,,,,,,
772,,,,,,,,,,,...,,,,,,,,,,
1066,,,,,,,10.0,,,,...,,,,,,,,,,


### Mean

In [29]:
#User Based Collaborative Filter using Mean Ratings
def cf_user_mean(user_id, item_id):
    
    #Check if movie_id exists in r_matrix
    if item_id in r_matrix:
        #Compute the mean of all the ratings given to the movie
        mean_rating = r_matrix[item_id].mean()
    
    else:
        #Default to a rating of 5.0 in the absence of any information
        mean_rating = 5.0
    
    return mean_rating

In [30]:
#Compute RMSE for the Mean model
score(cf_user_mean)

2.4907621546385093

### Weighted Mean

In [31]:
#Create a dummy ratings matrix with all null values imputed to 0
r_matrix_dummy = r_matrix.copy().fillna(0)

In [32]:
# Import cosine_score 
from sklearn.metrics.pairwise import cosine_similarity

#Compute the cosine similarity matrix using the dummy ratings matrix
cosine_sim = cosine_similarity(r_matrix_dummy, r_matrix_dummy)

In [33]:
#Convert into pandas dataframe 
cosine_sim = pd.DataFrame(cosine_sim, index=r_matrix.index, columns=r_matrix.index)

cosine_sim.head(10)

user_id,321,611,657,772,1066,1070,1089,1231,1296,1384,...,998380,998431,998459,998673,999016,999183,999274,999621,999726,999910
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
321,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
611,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
657,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
772,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1066,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1070,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1089,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1231,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1296,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1384,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
#User Based Collaborative Filter using Weighted Mean Ratings
def cf_user_wmean(user_id, item_id):
    if user_id not in cosine_sim.index or item_id not in r_matrix:
        return 5.0
    
    #Check if movie_id exists in r_matrix
    if item_id in r_matrix:
        
        #Get the similarity scores for the user in question with every other user
        sim_scores = cosine_sim[user_id]
        
        #Get the user ratings for the movie in question
        m_ratings = r_matrix[item_id]
        
        #Extract the indices containing NaN in the m_ratings series
        idx = m_ratings[m_ratings.isnull()].index
        
        #Drop the NaN values from the m_ratings Series
        m_ratings = m_ratings.dropna()
        
        #Drop the corresponding cosine scores from the sim_scores series
        sim_scores = sim_scores.drop(idx)
        
        #Compute the final weighted mean
        wmean_rating = np.dot(sim_scores, m_ratings)/ sim_scores.sum()
    
    else:
        #Default to a rating of 3.0 in the absence of any information
        wmean_rating = 5.0
    
    return wmean_rating

In [38]:
score(cf_user_wmean)

  wmean_rating = np.dot(sim_scores, m_ratings)/ sim_scores.sum()
  right=ast.Str(s=sentinel),
  return Constant(*args, **kwargs)
  right=ast.Str(s=sentinel),
  return Constant(*args, **kwargs)


ValueError: Input contains NaN.

# Market basket analysis

In [None]:
data = pd.read_csv('bread basket.csv')

In [None]:
data.head()

In [None]:
data = data.drop(['date_time','period_day','weekday_weekend'], axis=1)

In [None]:
data

In [None]:
data = data.groupby('Transaction')['Item'].apply(list)
data

In [None]:
data.info()

In [None]:
transactions = data.tolist()
transactions

In [None]:
items = list(set(item for transaction in transactions for item in transaction))

items

In [None]:
encoded_data = pd.DataFrame(
    [[item in transaction for item in items] for transaction in transactions],
    columns=items
)


encoded_data.head()

## Apriori Algorithm

In [None]:
import time

start_time = time.time()

frequent_itemsets = apriori(encoded_data, min_support=0.01, use_colnames=True)
frequent_itemsets

# End timing
end_time = time.time()

# Calculate the time taken
calculation_time = end_time - start_time

print("Frequent Itemsets calculated in {:.2f} seconds.".format(calculation_time))

In [None]:
rules = association_rules(frequent_itemsets, metric = "antecedent support", min_threshold = 0.2, num_itemsets=0)
rules.info()

In [None]:
# Display the results
from IPython.display import display

print("Frequent Itemsets:")
display(frequent_itemsets.head())

In [None]:
print("\nAssociation Rules:")
rules

In [None]:
print(encoded_data['Bread'].sum())
print(encoded_data['Coffee'].sum())

## FP Growth Algorithm

In [None]:
from mlxtend.frequent_patterns import fpgrowth
start_time = time.time()


# Evaluate the frequent item rules for fpgrowth function
frequent_itemsets_fp = fpgrowth(encoded_data, min_support = 0.01, use_colnames = True)

print(frequent_itemsets_fp)

# End timing
end_time = time.time()

# Calculate the time taken
calculation_time = end_time - start_time

print("Frequent Itemsets calculated in {:.2f} seconds.".format(calculation_time))

In [None]:
from mlxtend.frequent_patterns import association_rules

# Display the reles due to fp-growth algorithm
rules_fp = association_rules(frequent_itemsets_fp, metric = "confidence", min_threshold = 0.3, num_itemsets=0)

rules_fp