# Recommendation System

In [1]:
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori, association_rules

### Content based recommender

In [39]:
data = pd.read_json('renttherunway_final_data.json', lines=True)
data = data.iloc[:10000, :]
data = data.dropna()

In [40]:
# Converting 'employee_id' to string
#data['item_id'] = data['item_id'].astype(str)
# Displaying the types of data after conversion
print("\nTypes of data after conversion:\n", data.dtypes)


Types of data after conversion:
 fit                object
user_id             int64
bust size          object
item_id             int64
weight             object
rating            float64
rented for         object
review_text        object
body type          object
review_summary     object
category           object
height             object
size                int64
age               float64
review_date        object
dtype: object


In [41]:
data = data.dropna().drop_duplicates()
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7600 entries, 0 to 9999
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   fit             7600 non-null   object 
 1   user_id         7600 non-null   int64  
 2   bust size       7600 non-null   object 
 3   item_id         7600 non-null   int64  
 4   weight          7600 non-null   object 
 5   rating          7600 non-null   float64
 6   rented for      7600 non-null   object 
 7   review_text     7600 non-null   object 
 8   body type       7600 non-null   object 
 9   review_summary  7600 non-null   object 
 10  category        7600 non-null   object 
 11  height          7600 non-null   object 
 12  size            7600 non-null   int64  
 13  age             7600 non-null   float64
 14  review_date     7600 non-null   object 
dtypes: float64(2), int64(3), object(10)
memory usage: 950.0+ KB


In [42]:
data['user_id'].unique()

array([420272, 273551, 909926, ..., 737193, 330374, 240023], dtype=int64)

In [43]:
data.head()

Unnamed: 0,fit,user_id,bust size,item_id,weight,rating,rented for,review_text,body type,review_summary,category,height,size,age,review_date
0,fit,420272,34d,2260466,137lbs,10.0,vacation,An adorable romper! Belt and zipper were a lit...,hourglass,So many compliments!,romper,"5' 8""",14,28.0,"April 20, 2016"
1,fit,273551,34b,153475,132lbs,10.0,other,I rented this dress for a photo shoot. The the...,straight & narrow,I felt so glamourous!!!,gown,"5' 6""",12,36.0,"June 18, 2013"
3,fit,909926,34c,126335,135lbs,8.0,formal affair,I rented this for my company's black tie award...,pear,Dress arrived on time and in perfect condition.,dress,"5' 5""",8,34.0,"February 12, 2014"
4,fit,151944,34b,616682,145lbs,10.0,wedding,I have always been petite in my upper body and...,athletic,Was in love with this dress !!!,gown,"5' 9""",12,27.0,"September 26, 2016"
5,fit,734848,32b,364092,138lbs,8.0,date,Didn't actually wear it. It fit perfectly. The...,athletic,Traditional with a touch a sass,dress,"5' 8""",8,45.0,"April 30, 2016"


In [44]:
grouped_reviews = data.groupby('item_id')['review_text'].apply(' '.join).reset_index()
grouped_reviews

Unnamed: 0,item_id,review_text
0,123373,The dress was beautiful and very comfortable. ...
1,123793,"Fit great, super flattering Limited range of a..."
2,124204,"This dress is a ""WOW."" It steals the show, sp..."
3,124553,Loved the dress. A little short in the front ...
4,125424,The dress would have been perfect in a size 10...
...,...,...
2737,2957481,"perfect fit all the way around, I would keep i..."
2738,2958376,"Super cute. Loved this Romper, even though we ..."
2739,2960025,"It could be interesting, but this dress didn't..."
2740,2960969,"I got this through Unlimited, so didn't have t..."


In [45]:
#Import TfIdfVectorizer from the scikit-learn librarydata.head()
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all english stopwords
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
grouped_reviews['review_text'] = grouped_reviews['review_text'].fillna('')

#Construct the required TF-IDF matrix by applying the fit_transform method on the overview feature
tfidf_matrix = tfidf.fit_transform(grouped_reviews['review_text'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

(2742, 7701)

In [46]:
# Import linear_kernel to compute the dot product
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
#cosine_sim = linear_kernel(tfidf_matrix[:10000], tfidf_matrix[:10000])
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [47]:
cosine_sim.shape

(2742, 2742)

In [48]:
cosine_sim[1]

array([0.57584662, 1.        , 0.48359436, ..., 0.17127339, 0.11695978,
       0.04557235])

In [49]:
#Construct a reverse mapping of indices and movie titles, and drop duplicate titles, if any
indices = pd.Series(grouped_reviews.index, index=grouped_reviews['item_id']).drop_duplicates()
indices.info

<bound method Series.info of item_id
123373        0
123793        1
124204        2
124553        3
125424        4
           ... 
2957481    2737
2958376    2738
2960025    2739
2960969    2740
2963850    2741
Length: 2742, dtype: int64>

In [50]:
indices[:10]

item_id
123373    0
123793    1
124204    2
124553    3
125424    4
125465    5
125564    6
126335    7
127081    8
127495    9
dtype: int64

In [51]:
# Function that takes in movie title as input and gives recommendations 
def content_recommender(item_id, cosine_sim=cosine_sim):
    # Obtain the index of the movie that matches the title
    idx = indices[item_id]

    # Get the pairwsie similarity scores of all movies with that movie
    # And convert it into a list of tuples as described above
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the cosine similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies. Ignore the first movie.
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    clothes_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return grouped_reviews['item_id'].iloc[clothes_indices]

In [52]:
grouped_reviews.shape

(2742, 2)

In [55]:
#Get recommendations for item: 2260466
content_recommender(123793)

20    132738
17    131533
10    127865
47    145906
16    131117
7     126335
5     125465
30    138431
65    154002
14    130259
Name: item_id, dtype: int64

In [None]:
para el kernel

In [None]:
nan_count_per_column = filtered_data.isna().sum()
print("\nNumber of NaN values in each column:")
print(nan_count_per_column)

In [None]:
filtered_data = filtered_data.dropna()


In [None]:
filtered_data.shape

In [None]:
filtered_data.head()

user_id  item_id  rating

In [None]:
filtered_data = filtered_data[['user_id', 'item_id', 'rating']]

In [None]:
from scipy.sparse import csr_matrix

n_users =  filtered_data["user_id"].nunique()
n_items =  filtered_data["item_id"].nunique()
n_items = filtered_data['item_id'].max()


In [None]:
print(n_users)
print(n_items)

In [None]:
print(filtered_data.head())
print(filtered_data.describe())


In [None]:
A = np.zeros((n_users,n_items))
for line in filtered_data.itertuples():
    A[line[1]-1,line[2]-1] = line[3]
print("Original rating matrix : ",A)

In [None]:


#sparse_matrix list
rows = filtered_data["user_id"].values
cols = filtered_data["item_id"].values
data = np.where(filtered_data["rating"] >= 3, 1, 0) 

sparse_matrix = csr_matrix((data, (rows, cols)), shape=(n_users,n_items))

In [None]:
n_users = filtered_data.user_id.unique().shape[0]
n_items = filtered_data.item_id.unique().shape[0]
n_items = filtered_data['item_id'].max()
A = np.zeros((n_users,n_items))
for line in dataset.itertuples():
    A[line[1]-1,line[2]-1] = line[3]
print("Original rating matrix : ",A)

In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer


data = data.dropna(subset=['rating', 'user_id', 'item_id'])

# Crear matriz de utilidad (usuario-ítem)
utility_matrix = data.pivot_table(index='item_id', columns='user_id', values='rating', fill_value=0)

# Calcular la similitud entre ítems
item_similarity = cosine_similarity(utility_matrix)
item_similarity_df = pd.DataFrame(item_similarity, index=utility_matrix.index, columns=utility_matrix.index)

# Ejemplo de ítems similares
item_id = 123  # Cambiar por un ID válido
similar_items = item_similarity_df[item_id].sort_values(ascending=False).head(5)
print(similar_items)


In [None]:
n_items

In [None]:
data.describe()

In [None]:
counts = data['item_id'].value_counts()
counts

In [None]:
para

In [None]:
data.head()

# Market basket analysis

In [None]:
data = pd.read_csv('bread basket.csv')

In [None]:
data.head()

In [None]:
data = data.drop(['date_time','period_day','weekday_weekend'], axis=1)

In [None]:
data

In [None]:
data = data.groupby('Transaction')['Item'].apply(list)
data

In [None]:
data.info()

In [None]:
transactions = data.tolist()
transactions

In [None]:
items = list(set(item for transaction in transactions for item in transaction))

items

In [None]:
encoded_data = pd.DataFrame(
    [[item in transaction for item in items] for transaction in transactions],
    columns=items
)


encoded_data.head()

# Apriori Algorithm

In [None]:
import time

start_time = time.time()

frequent_itemsets = apriori(encoded_data, min_support=0.01, use_colnames=True)
frequent_itemsets

# End timing
end_time = time.time()

# Calculate the time taken
calculation_time = end_time - start_time

print("Frequent Itemsets calculated in {:.2f} seconds.".format(calculation_time))

In [None]:
rules = association_rules(frequent_itemsets, metric = "antecedent support", min_threshold = 0.2, num_itemsets=0)
rules.info()

In [None]:
# Display the results
from IPython.display import display

print("Frequent Itemsets:")
display(frequent_itemsets.head())

In [None]:
print("\nAssociation Rules:")
rules

In [None]:
print(encoded_data['Bread'].sum())
print(encoded_data['Coffee'].sum())

# FP Growth Algorithm

In [None]:
from mlxtend.frequent_patterns import fpgrowth
start_time = time.time()


# Evaluate the frequent item rules for fpgrowth function
frequent_itemsets_fp = fpgrowth(encoded_data, min_support = 0.01, use_colnames = True)

print(frequent_itemsets_fp)

# End timing
end_time = time.time()

# Calculate the time taken
calculation_time = end_time - start_time

print("Frequent Itemsets calculated in {:.2f} seconds.".format(calculation_time))

In [None]:
from mlxtend.frequent_patterns import association_rules

# Display the reles due to fp-growth algorithm
rules_fp = association_rules(frequent_itemsets_fp, metric = "confidence", min_threshold = 0.3, num_itemsets=0)

rules_fp