<h1 style="margin: 0; padding: 0; color: #d9f4e4;">Import Libraries<h1/>

In [1]:
import glob
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import random
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from surprise import SVD, Dataset, Reader, accuracy
from surprise.model_selection import train_test_split, GridSearchCV

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

<h1 style="margin: 0; padding: 0; color: #d9f4e4;">Global Variables<h1/>

In [3]:
PATH = "/kaggle/input/hyb-reco-amazone/Dataset"
FASHION_FILE = "/AMAZON_FASHION.json"
META_FILE = "/meta_AMAZON_FASHION.json"
STOP_WORDS = set(stopwords.words('english'))
LEMM = WordNetLemmatizer()

<h1 style="margin: 0; padding: 0; color: #d9f4e4;">Functions<h1/>

In [4]:
def clean_text(x):
    if isinstance(x, list):
        x = ' '.join(map(str, x))
    if not isinstance(x, str):
        x = str(x)
    return x.lower().strip()

In [5]:
def get_similar_items(item_index, matrix, top_n=10):
    cosine_scores = cosine_similarity(matrix[item_index], matrix)
    
    similar_indices = cosine_scores.argsort()[0][::-1]
    
    top_indices = similar_indices[1:top_n+1]
    
    return top_indices, cosine_scores[0][top_indices]

In [6]:
def clean(text):
    text = str(text).lower()
    text = re.sub(r'<.*?>', ' ', text)
    text = re.sub(r'http\S+', ' ', text)
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    words = text.split()
    words = [LEMM.lemmatize(w) for w in words if w not in STOP_WORDS]
    return " ".join(words)

In [7]:
def content_based_recommend (rounds, recommends_n, matrix, dataset):
    for i in range(rounds):
        print(dataset.iloc[i]["title"], "\n")
        num = random.randint(1, 1000)
        ids, scores = get_similar_items(num, matrix, recommends_n)
        for ind, s in zip(ids, scores):
            print(dataset.iloc[ind]['title'], "— similarity:", round(s, 3))
        print("\n------------------\n")

In [8]:
def collaborative_filtering_recommend(user_id, ratings_df, meta_df, model, top_n=10):
    items = ratings_df['item_id'].unique()
    rated_items = ratings_df[ratings_df['user_id'] == user_id]['item_id'].values
    items_to_predict = [item for item in items if item not in rated_items]

    predictions = [(item, model.predict(user_id, item).est) for item in items_to_predict]
    predictions_sorted = sorted(predictions, key=lambda x: x[1], reverse=True)[:top_n]

    result = pd.DataFrame(predictions_sorted, columns=['item_id', 'predicted_rating'])
    
    return result.merge(meta_df, on='item_id', how='left')

<h1 style="margin: 0; padding: 0; color: #d9f4e4;">Display Data<h1/>

In [9]:
fashion_path = PATH + FASHION_FILE
meta_path = PATH + META_FILE

In [10]:
print(f"Fashion file: {fashion_path}")
print(f"Meta file: {meta_path}")

Fashion file: /kaggle/input/hyb-reco-amazone/Dataset/AMAZON_FASHION.json
Meta file: /kaggle/input/hyb-reco-amazone/Dataset/meta_AMAZON_FASHION.json


<p style="margin: 0; padding: 0; font-size: 16px; color: #549ef1;">Fashion Reviewd</p>

In [11]:
fashion_df = pd.read_json(fashion_path, lines=True)

In [12]:
fashion_df.head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image
0,5,True,"10 20, 2014",A1D4G1SNUZWQOT,7106116521,Tracy,Exactly what I needed.,perfect replacements!!,1413763200,,,
1,2,True,"09 28, 2014",A3DDWDH9PX2YX2,7106116521,Sonja Lau,"I agree with the other review, the opening is ...","I agree with the other review, the opening is ...",1411862400,3.0,,
2,4,False,"08 25, 2014",A2MWC41EW7XL15,7106116521,Kathleen,Love these... I am going to order another pack...,My New 'Friends' !!,1408924800,,,
3,2,True,"08 24, 2014",A2UH2QQ275NV45,7106116521,Jodi Stoner,too tiny an opening,Two Stars,1408838400,,,
4,3,False,"07 27, 2014",A89F3LQADZBS5,7106116521,Alexander D.,Okay,Three Stars,1406419200,,,


In [13]:
fashion_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 883636 entries, 0 to 883635
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   overall         883636 non-null  int64  
 1   verified        883636 non-null  bool   
 2   reviewTime      883636 non-null  object 
 3   reviewerID      883636 non-null  object 
 4   asin            883636 non-null  object 
 5   reviewerName    883544 non-null  object 
 6   reviewText      882403 non-null  object 
 7   summary         883103 non-null  object 
 8   unixReviewTime  883636 non-null  int64  
 9   vote            79900 non-null   float64
 10  style           304569 non-null  object 
 11  image           28807 non-null   object 
dtypes: bool(1), float64(1), int64(2), object(8)
memory usage: 75.0+ MB


<p style="margin: 0; padding: 0; font-size: 16px; color: #549ef1;">Fashion Meta</p>

In [14]:
meta_df = pd.read_json(meta_path, lines=True)

In [15]:
meta_df.head()

Unnamed: 0,title,brand,feature,rank,date,asin,imageURL,imageURLHighRes,description,price,also_view,also_buy,fit,details,similar_item,tech1
0,Slime Time Fall Fest [With CDROM and Collector...,Group Publishing (CO),[Product Dimensions:\n \n8....,"13,052,976inClothing,Shoesamp;Jewelry(",8.70 inches,764443682,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,,,,,,,,
1,XCC Qi promise new spider snake preparing men'...,,,"11,654,581inClothing,Shoesamp;Jewelry(",5 star,1291691480,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,,,,,,,,
2,Magical Things I Really Do Do Too!,Christopher Manos,[Package Dimensions:\n \n8....,"19,308,073inClothing,ShoesJewelry(",5 star,1940280001,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,[For the professional or amateur magician. Ro...,,,,,,,
3,"Ashes to Ashes, Oranges to Oranges",Flickerlamp Publishing,[Package Dimensions:\n \n8....,"19,734,184inClothing,ShoesJewelry(",5 star,1940735033,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,,,,,,,,
4,Aether & Empire #1 - 2016 First Printing Comic...,,[Package Dimensions:\n \n10...,"10,558,646inClothing,Shoesamp;Jewelry(",5 star,1940967805,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,,$4.50,,,,,,


In [16]:
meta_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 186637 entries, 0 to 186636
Data columns (total 16 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   title            186632 non-null  object
 1   brand            139957 non-null  object
 2   feature          123875 non-null  object
 3   rank             180222 non-null  object
 4   date             185001 non-null  object
 5   asin             186637 non-null  object
 6   imageURL         132017 non-null  object
 7   imageURLHighRes  132017 non-null  object
 8   description      15869 non-null   object
 9   price            17799 non-null   object
 10  also_view        11595 non-null   object
 11  also_buy         21642 non-null   object
 12  fit              4831 non-null    object
 13  details          885 non-null     object
 14  similar_item     317 non-null     object
 15  tech1            97 non-null      object
dtypes: object(16)
memory usage: 22.8+ MB


In [17]:
meta_df.isnull().sum()

title                   5
brand               46680
feature             62762
rank                 6415
date                 1636
asin                    0
imageURL            54620
imageURLHighRes     54620
description        170768
price              168838
also_view          175042
also_buy           164995
fit                181806
details            185752
similar_item       186320
tech1              186540
dtype: int64

<h1 style="margin: 0; padding: 0; color: #d9f4e4;">Pre-processing<h1/>

In [18]:
for col in ['title', 'brand', 'description']:
    meta_df[col] = meta_df[col].fillna(" ")

for col in ['title', 'brand', 'description']:
    meta_df[col] = meta_df[col].fillna('').apply(clean_text)

In [19]:
meta_df['title'] = meta_df['title'].apply(clean)
meta_df['brand'] = meta_df['brand'].apply(clean)
meta_df['description'] = meta_df['description'].apply(clean)

In [20]:
meta_df = meta_df.rename(columns={'asin': 'item_id'})

In [21]:
meta_df.isnull().sum()

title                   0
brand                   0
feature             62762
rank                 6415
date                 1636
item_id                 0
imageURL            54620
imageURLHighRes     54620
description             0
price              168838
also_view          175042
also_buy           164995
fit                181806
details            185752
similar_item       186320
tech1              186540
dtype: int64

<h1 style="margin: 0; padding: 0; color: #f00;">Content - Based - Recommendation<h1/>
<h2 style="margin: 0; padding: 0; color: #d9f4e4;">Vectorization<h1/>

<p style="margin: 0; padding: 0; font-size: 16px; color: #549ef1;">Training</p>

In [22]:
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=50000
)

In [23]:
combines = meta_df[["title", "description", "brand"]]

In [24]:

combines['combined'] = meta_df['title'] + meta_df["description"] + meta_df['brand']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combines['combined'] = meta_df['title'] + meta_df["description"] + meta_df['brand']


In [25]:
combines.head(5)

Unnamed: 0,title,description,brand,combined
0,slime time fall fest cdrom collector card neut...,,group publishing co,slime time fall fest cdrom collector card neut...
1,xcc qi promise new spider snake preparing men ...,,,xcc qi promise new spider snake preparing men ...
2,magical thing really,professional amateur magician routine include ...,christopher manos,magical thing reallyprofessional amateur magic...
3,ash ash orange orange,,flickerlamp publishing,ash ash orange orangeflickerlamp publishing
4,aether empire 1 2016 first printing comic book...,,,aether empire 1 2016 first printing comic book...


In [26]:
combines.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 186637 entries, 0 to 186636
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   title        186637 non-null  object
 1   description  186637 non-null  object
 2   brand        186637 non-null  object
 3   combined     186637 non-null  object
dtypes: object(4)
memory usage: 5.7+ MB


In [27]:
combines.isnull().sum()

title          0
description    0
brand          0
combined       0
dtype: int64

In [28]:
tfidf_matrix = vectorizer.fit_transform(combines["combined"])

In [29]:
content_based_recommend(5, 5, tfidf_matrix, combines)

slime time fall fest cdrom collector card neutron ball incredi ball glow stick necklace paper fram 

lady round collar school uniform blouse long sleeve — similarity: 1.0
military uniform shirt stay — similarity: 0.46
genuine school uniform boy flat front navy uniform pant — similarity: 0.399
genuine school uniform boy flat front navy uniform pant — similarity: 0.399
genuine school uniform boy flat front navy uniform pant — similarity: 0.399

------------------

xcc qi promise new spider snake preparing men accessory alloy fitting magnet buckle bracelet jewelry 

kipling bryce canyon boarding tote blue jean — similarity: 1.0
kipling jerimiah tote cayenne — similarity: 0.496
alfred dunner bryce canyon tribal print sweater beige multi small — similarity: 0.431
kipling sady tote handbag pink orchd — similarity: 0.413
kipling pixi medium wallet cflatwsttm — similarity: 0.329

------------------

magical thing really 

baggallini currency organizer nylon black — similarity: 1.0
baggallini f

<p style="margin: 0; padding: 0; font-size: 16px; color: #549ef1;">Enhancement</p>

In [30]:
combines['combined'] = (
    combines['title'] * 3 + " " +
    combines['brand'] * 2 + " " +
    combines['description']
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combines['combined'] = (


In [31]:
vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=3,
    ngram_range=(1, 2)   # bigrams + unigrams
)

In [32]:
tfidf_matrix = vectorizer.fit_transform(combines['combined'])

In [33]:
content_based_recommend(5, 5, tfidf_matrix, combines)

slime time fall fest cdrom collector card neutron ball incredi ball glow stick necklace paper fram 

sleeptop low neck sleep top bra 32 34 dd f white — similarity: 1.0
sleeptop low neck sleep top bra 32 34 dd f black — similarity: 0.907
sleeptop low neck sleep top bra 32 34 dd f black — similarity: 0.907
sleeptop high neck sleep bra 32 34 dd f black — similarity: 0.828
sleeptop low neck sleep top bra 32 34 c black — similarity: 0.79

------------------

xcc qi promise new spider snake preparing men accessory alloy fitting magnet buckle bracelet jewelry 

pda skin pda skin pro black — similarity: 1.0
rabbit skin juvenile cotton shirt 3301j white 5 6t — similarity: 0.292
boblbee amphib pro 30 — similarity: 0.262
breo b ti sk10l skin red large watch — similarity: 0.252
vapor untouchable pro chmp — similarity: 0.251

------------------

magical thing really 

china silk balaclava black — similarity: 1.0
puresilk balaclava black one size woman — similarity: 0.497
regular men guide gear silk

<h1 style="margin: 0; padding: 0; color: #f00;">Collaborative - Filtering - Recommendation<h1/>

<p style="margin: 0; padding: 0; font-size: 16px; color: #549ef1;">Training</p>

In [34]:
ratings_df = fashion_df[['reviewerID', 'asin', 'overall']]
ratings_df = ratings_df.dropna(subset=['reviewerID', 'asin'])

avg_rating = ratings_df['overall'].mean()
ratings_df['overall'] = ratings_df['overall'].fillna(avg_rating)

ratings_df.rename(columns={'reviewerID': 'user_id', 'asin': 'item_id', 'overall': 'rating'}, inplace=True)

In [35]:
ratings_df.head(5)

Unnamed: 0,user_id,item_id,rating
0,A1D4G1SNUZWQOT,7106116521,5
1,A3DDWDH9PX2YX2,7106116521,2
2,A2MWC41EW7XL15,7106116521,4
3,A2UH2QQ275NV45,7106116521,2
4,A89F3LQADZBS5,7106116521,3


In [36]:
ratings_df.isnull().sum()

user_id    0
item_id    0
rating     0
dtype: int64

In [40]:
product_df = meta_df[['item_id', 'title', 'brand', "description"]].dropna()

In [41]:
product_df.head()

Unnamed: 0,item_id,title,brand,description
0,764443682,slime time fall fest cdrom collector card neut...,group publishing co,
1,1291691480,xcc qi promise new spider snake preparing men ...,,
2,1940280001,magical thing really,christopher manos,professional amateur magician routine include ...
3,1940735033,ash ash orange orange,flickerlamp publishing,
4,1940967805,aether empire 1 2016 first printing comic book...,,


In [42]:
product_df.isnull().sum()

item_id        0
title          0
brand          0
description    0
dtype: int64

In [43]:
user_counts = ratings_df['user_id'].value_counts()
ratings_df = ratings_df[ratings_df['user_id'].isin(user_counts[user_counts >= 5].index)]

In [44]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_df[['user_id', 'item_id', 'rating']], reader)

In [45]:
trainset, testset = train_test_split(data, test_size=0.2)

In [46]:
model = SVD()
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x782c24217150>

In [47]:
predictions = model.test(testset)

In [48]:
accuracy.rmse(predictions)

RMSE: 1.0999


1.0999220342916443

In [49]:
for i in range(5):
    idx = random.randint(0, len(ratings_df) - 1)
    user_id = ratings_df.iloc[idx]['user_id']
    
    recommendations = collaborative_filtering_recommend(user_id, ratings_df, meta_df, model, top_n=5)

    print(f"\n user {user_id} has this recommends:\n")
    print(recommendations['title'])


 user A2BDM8QP5HBWAG has this recommends:

0    qiyun z geometric oil drop silver chain tribal...
1    diamond plate lady rock design genuine buffalo...
2     lnlclothing junior distressed skinny jean blue 9
3    mj metal jewelry 2mm 10mm white tungsten carbi...
4    allegra k woman crossover v neck self tie wais...
Name: title, dtype: object

 user A2KTDPJK3FMGGW has this recommends:

0    mj metal jewelry 2mm 10mm white tungsten carbi...
1    qiyun z geometric oil drop silver chain tribal...
2    allegra k woman shoulder side shirred slim fit...
3                 havaianas woman top flip flop sandal
4    goson cowhid leather handbag purse cell phone ...
Name: title, dtype: object

 user AU17AY07NB5XC has this recommends:

0    chunky oval turquoise tibet silver pendant tor...
1    mj metal jewelry 2mm 10mm white tungsten carbi...
2    persun woman plunge neck pullover sweater top ...
3     ingrid isabel woman maternity everyday bellaband
4    cocobla woman boat neck loose dolman sle

In [50]:
param_grid = {
    'n_factors': [50, 100],
    'reg_all': [0.02, 0.1],
    'n_epochs': [1, 5]
}

gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
gs.fit(data)

print("Best RMSE:", gs.best_score['rmse'])
print("Best params:", gs.best_params['rmse'])

model_svd_best = gs.best_estimator['rmse']
model_svd_best.fit(trainset)

Best RMSE: 1.1955313698268941
Best params: {'n_factors': 100, 'reg_all': 0.02, 'n_epochs': 5}


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x782c22dddbd0>

In [51]:
predictions = model_svd_best.test(testset)

In [52]:
accuracy.rmse(predictions)

RMSE: 1.1846


1.1846460791519517

In [53]:
for i in range(5):
    idx = random.randint(0, len(ratings_df) - 1)
    user_id = ratings_df.iloc[idx]['user_id']
    
    recommendations = collaborative_filtering_recommend(user_id, ratings_df, meta_df, model_svd_best, top_n=5)

    print(f"\n user {user_id} has this recommends:\n")
    print(recommendations['title'])


 user AZ7LUSY20ZFSC has this recommends:

0    fashion womens boyfriend pocket cardigan shrug...
1    mj metal jewelry 2mm 10mm white tungsten carbi...
2                 havaianas woman top flip flop sandal
3    mj metal jewelry 2mm 10mm white tungsten carbi...
4    chunky oval turquoise tibet silver pendant tor...
Name: title, dtype: object

 user A3GO78M9PUFGHS has this recommends:

0           nike woman flex supreme tr 4 cross trainer
1                      woman cotton hoodie wine medium
2                                     jaja 8 0 b black
3    finejo womens vintage crop cut cold shoulder b...
4    yazilind punk gothic style skull design silver...
Name: title, dtype: object

 user A22FYN2DKXSWS0 has this recommends:

0    winter white ivory thick slouchy knit oversize...
1    kingfansion womens unisex fingerless warm glov...
2    lady colour woman necklace sweet heart bermuda...
3    unique style 3 pack classic lace bralettes cag...
4    mj metal jewelry 2mm 10mm white tungsten

<h1 style="margin: 0; padding: 0; color: #f00;">Hybrid - Recommendation<h1/>

In [54]:
def hybrid_recommend(user_id, tfidf_matrix, ratings_df, meta_df, model, top_n=10, alpha=0.7):
    items = ratings_df['item_id'].unique()
    rated_items = ratings_df[ratings_df['user_id'] == user_id]['item_id'].values
    items_to_predict = [i for i in items if i not in rated_items]

    cf_scores_dict = {item: model.predict(user_id, item).est for item in items_to_predict}

    cb_similarity = cosine_similarity(tfidf_matrix, tfidf_matrix)
    
    cb_scores_dict = {item_id: cb_similarity[idx].mean() for idx, item_id in enumerate(items)}

    hybrid_scores = []
    for item_id in items_to_predict:
        cf_score = cf_scores_dict[item_id]
        cb_score = cb_scores_dict[item_id]
        score = alpha * cf_score + (1 - alpha) * cb_score
        hybrid_scores.append((item_id, score))

    hybrid_scores_sorted = sorted(hybrid_scores, key=lambda x: x[1], reverse=True)[:top_n]
    result = pd.DataFrame(hybrid_scores_sorted, columns=['item_id', 'hybrid_score'])

    return result.merge(meta_df, on='item_id', how='left')


In [None]:
idx1 = random.randint(0, len(ratings_df) - 1)
idx2 = random.randint(0, len(ratings_df) - 1)

item_idx = ratings_df.iloc[idx1]['item_id']
user_idx = ratings_df.iloc[idx2]['user_id']

recommendations = hybrid_recommend(
    user_id=user_idx,
    tfidf_matrix=tfidf_matrix,
    ratings_df=ratings_df,
    meta_df=meta_df,
    model=model_svd_best,
    top_n=5,
    alpha=0.7
)

print(recommendations[['title', 'brand', 'hybrid_score']])
