<h1 style="margin: 0; padding: 0; color: #d9f4e4;">Import Libraries<h1/>

In [None]:
import glob
import pandas as pd
from IPython.display import display, Image, HTML
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import random
import re
import nltk
from math import floor
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from surprise import SVD, Dataset, Reader, accuracy, SVDpp
from surprise.model_selection import train_test_split, GridSearchCV
import numpy as np
from collections import defaultdict
import matplotlib.pyplot as plt

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

<h1 style="margin: 0; padding: 0; color: #d9f4e4;">Global Variables<h1/>

In [3]:
PATH = "/kaggle/input/hyb-reco-amazone/Dataset"
FASHION_FILE = "/AMAZON_FASHION.json"
META_FILE = "/meta_AMAZON_FASHION.json"
STOP_WORDS = set(stopwords.words('english'))
LEMM = WordNetLemmatizer()
BERT_MODEL_NAME = "bert-base-uncased"
EPOCHS = 30

<h1 style="margin: 0; padding: 0; color: #d9f4e4;">Functions<h1/>

In [4]:
def clean_text(x):
    if isinstance(x, list):
        x = ' '.join(map(str, x))
    if not isinstance(x, str):
        x = str(x)
    return x.lower().strip()

In [5]:
def get_similar_items_tdidf(item_index, matrix, top_n=10):
    cosine_scores = cosine_similarity(matrix[item_index], matrix)
    
    similar_indices = cosine_scores.argsort()[0][::-1]
    
    top_indices = similar_indices[1:top_n+1]
    
    return top_indices, cosine_scores[0][top_indices]

In [6]:
def clean(text):
    text = str(text).lower()
    text = re.sub(r'<.*?>', ' ', text)
    text = re.sub(r'http\S+', ' ', text)
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    words = text.split()
    words = [LEMM.lemmatize(w) for w in words if w not in STOP_WORDS]
    return " ".join(words)

In [None]:
def content_based_recommend_tdidf(rounds, recommends_n, matrix, dataset):
    for i in range(rounds):

        num = random.randint(0, len(dataset)-1)
        display(Image(url=dataset.iloc[num]["imageURL"][0], width=150))
        print(dataset.iloc[num]["title"], "\n")
        ids, scores = get_similar_items_tdidf(num, matrix, recommends_n)

        for ind, s in zip(ids, scores):
            display(Image(url=dataset.iloc[ind]["imageURL"], width=100))
            print(dataset.iloc[ind]['title'], "— similarity:", round(s, 3))
        print("\n------------------\n")

In [8]:
def collaborative_filtering_recommend(user_id, ratings_df, meta_df, model, top_n=10):
    items = ratings_df['item_id'].unique()
    rated_items = ratings_df[ratings_df['user_id'] == user_id]['item_id'].values
    items_to_predict = [item for item in items if item not in rated_items]

    predictions = [(item, model.predict(user_id, item).est) for item in items_to_predict]
    predictions_sorted = sorted(predictions, key=lambda x: x[1], reverse=True)[:top_n]

    result = pd.DataFrame(predictions_sorted, columns=['item_id', 'predicted_rating'])
    
    return result.merge(meta_df, on='item_id', how='left')

In [9]:
def preprocess_images(dataset):
    def first_image(img_list):
        if isinstance(img_list, list) and len(img_list) > 0:
            return img_list[0]
        else:
            return ""
    dataset['imageURL'] = dataset['imageURL'].apply(first_image)
    return dataset

In [10]:
def build_tfidf_index_map(items_ordered):
    return {item: idx for idx, item in enumerate(items_ordered)}

In [11]:
def build_user_content_vector(user_id, ratings_df, item_to_idx, tfidf_matrix, rating_weighting='global_center'):
    user_ratings = ratings_df[ratings_df['user_id'] == user_id]
    if user_ratings.empty:
        return None

    user_ratings = user_ratings[user_ratings['item_id'].isin(item_to_idx)]
    if user_ratings.empty:
        return None

    if rating_weighting == 'global_center':
        global_mean = ratings_df['rating'].mean()
        weights = (user_ratings['rating'] - global_mean).values
    elif rating_weighting == 'user_center':
        user_mean = user_ratings['rating'].mean()
        weights = (user_ratings['rating'] - user_mean).values
    else:
        weights = user_ratings['rating'].values

    if np.allclose(weights, 0):
        weights = np.ones_like(weights)

    vecs = []
    for item, w in zip(user_ratings['item_id'], weights):
        idx = item_to_idx[item]
        vec = tfidf_matrix[idx]
        vecs.append((vec, w))

    sum_w = np.sum(np.abs(weights))
    if sum_w == 0:
        sum_w = len(weights)

    first_vec = vecs[0][0]
    if hasattr(first_vec, "toarray"):
        feature_dim = first_vec.shape[1]
        user_profile = np.zeros((feature_dim,), dtype=np.float32)
        for vec, w in vecs:
            user_profile += (vec.toarray().ravel() * w)
    else:
        feature_dim = first_vec.shape[0]
        user_profile = np.zeros((feature_dim,), dtype=np.float32)
        for vec, w in vecs:
            user_profile += (vec.ravel() * w)

    user_profile = user_profile / sum_w
    return user_profile.reshape(1, -1)

<h1 style="margin: 0; padding: 0; color: #d9f4e4;">Display Data<h1/>

In [14]:
fashion_path = PATH + FASHION_FILE
meta_path = PATH + META_FILE

In [15]:
print(f"Fashion file: {fashion_path}")
print(f"Meta file: {meta_path}")

Fashion file: /kaggle/input/hyb-reco-amazone/Dataset/AMAZON_FASHION.json
Meta file: /kaggle/input/hyb-reco-amazone/Dataset/meta_AMAZON_FASHION.json


<p style="margin: 0; padding: 0; font-size: 16px; color: #549ef1;">Fashion Reviewd</p>

In [16]:
fashion_df = pd.read_json(fashion_path, lines=True)

In [17]:
fashion_df.head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image
0,5,True,"10 20, 2014",A1D4G1SNUZWQOT,7106116521,Tracy,Exactly what I needed.,perfect replacements!!,1413763200,,,
1,2,True,"09 28, 2014",A3DDWDH9PX2YX2,7106116521,Sonja Lau,"I agree with the other review, the opening is ...","I agree with the other review, the opening is ...",1411862400,3.0,,
2,4,False,"08 25, 2014",A2MWC41EW7XL15,7106116521,Kathleen,Love these... I am going to order another pack...,My New 'Friends' !!,1408924800,,,
3,2,True,"08 24, 2014",A2UH2QQ275NV45,7106116521,Jodi Stoner,too tiny an opening,Two Stars,1408838400,,,
4,3,False,"07 27, 2014",A89F3LQADZBS5,7106116521,Alexander D.,Okay,Three Stars,1406419200,,,


In [18]:
fashion_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 883636 entries, 0 to 883635
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   overall         883636 non-null  int64  
 1   verified        883636 non-null  bool   
 2   reviewTime      883636 non-null  object 
 3   reviewerID      883636 non-null  object 
 4   asin            883636 non-null  object 
 5   reviewerName    883544 non-null  object 
 6   reviewText      882403 non-null  object 
 7   summary         883103 non-null  object 
 8   unixReviewTime  883636 non-null  int64  
 9   vote            79900 non-null   float64
 10  style           304569 non-null  object 
 11  image           28807 non-null   object 
dtypes: bool(1), float64(1), int64(2), object(8)
memory usage: 75.0+ MB


<p style="margin: 0; padding: 0; font-size: 16px; color: #549ef1;">Fashion Meta</p>

In [19]:
meta_df = pd.read_json(meta_path, lines=True)

In [20]:
meta_df.head()

Unnamed: 0,title,brand,feature,rank,date,asin,imageURL,imageURLHighRes,description,price,also_view,also_buy,fit,details,similar_item,tech1
0,Slime Time Fall Fest [With CDROM and Collector...,Group Publishing (CO),[Product Dimensions:\n \n8....,"13,052,976inClothing,Shoesamp;Jewelry(",8.70 inches,764443682,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,,,,,,,,
1,XCC Qi promise new spider snake preparing men'...,,,"11,654,581inClothing,Shoesamp;Jewelry(",5 star,1291691480,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,,,,,,,,
2,Magical Things I Really Do Do Too!,Christopher Manos,[Package Dimensions:\n \n8....,"19,308,073inClothing,ShoesJewelry(",5 star,1940280001,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,[For the professional or amateur magician. Ro...,,,,,,,
3,"Ashes to Ashes, Oranges to Oranges",Flickerlamp Publishing,[Package Dimensions:\n \n8....,"19,734,184inClothing,ShoesJewelry(",5 star,1940735033,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,,,,,,,,
4,Aether & Empire #1 - 2016 First Printing Comic...,,[Package Dimensions:\n \n10...,"10,558,646inClothing,Shoesamp;Jewelry(",5 star,1940967805,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,,$4.50,,,,,,


In [21]:
meta_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 186637 entries, 0 to 186636
Data columns (total 16 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   title            186632 non-null  object
 1   brand            139957 non-null  object
 2   feature          123875 non-null  object
 3   rank             180222 non-null  object
 4   date             185001 non-null  object
 5   asin             186637 non-null  object
 6   imageURL         132017 non-null  object
 7   imageURLHighRes  132017 non-null  object
 8   description      15869 non-null   object
 9   price            17799 non-null   object
 10  also_view        11595 non-null   object
 11  also_buy         21642 non-null   object
 12  fit              4831 non-null    object
 13  details          885 non-null     object
 14  similar_item     317 non-null     object
 15  tech1            97 non-null      object
dtypes: object(16)
memory usage: 22.8+ MB


In [22]:
meta_df.isnull().sum()

title                   5
brand               46680
feature             62762
rank                 6415
date                 1636
asin                    0
imageURL            54620
imageURLHighRes     54620
description        170768
price              168838
also_view          175042
also_buy           164995
fit                181806
details            185752
similar_item       186320
tech1              186540
dtype: int64

In [23]:
random_descs = meta_df['feature'].sample(50, random_state=42)

for i, desc in enumerate(random_descs):
    print(f"\n{i+1}: --- {desc}")


1: --- nan

2: --- ['Package Dimensions:\n                    \n9.8 x 5.7 x 0.8 inches', 'Shipping Weight:\n                    \n1 pounds']

3: --- ['Package Dimensions:\n                    \n5.5 x 4 x 0.6 inches', 'Shipping Weight:\n                    \n0.32 ounces']

4: --- ['Shipping Weight:\n                    \n1.2 pounds']

5: --- ['Package Dimensions:\n                    \n9.8 x 6.9 x 1.6 inches', 'Shipping Weight:\n                    \n7.2 ounces']

6: --- ['Package Dimensions:\n                    \n15.3 x 12.1 x 3.6 inches', 'Shipping Weight:\n                    \n1.3 pounds']

7: --- ['Package Dimensions:\n                    \n12.2 x 9.6 x 0.8 inches', 'Shipping Weight:\n                    \n7.2 ounces']

8: --- ['Package Dimensions:\n                    \n8.9 x 7.5 x 0.8 inches', 'Shipping Weight:\n                    \n7.2 ounces']

9: --- ['Package Dimensions:\n                    \n15.8 x 10.9 x 1.5 inches', 'Shipping Weight:\n                    \n12.8 ounces'

In [24]:
random_descs = meta_df['imageURL'].sample(50, random_state=42)

for i, desc in enumerate(random_descs):
    print(f"\n{i+1}: --- {desc}")


1: --- nan

2: --- ['https://images-na.ssl-images-amazon.com/images/I/41oXbLw29FL._SR38,50_.jpg']

3: --- ['https://images-na.ssl-images-amazon.com/images/I/41GEF61UGWL._US40_.jpg', 'https://images-na.ssl-images-amazon.com/images/I/41%2BEd7Uel8L._US40_.jpg']

4: --- ['https://images-na.ssl-images-amazon.com/images/I/41R6WvDa5zL._SR38,50_.jpg', 'https://images-na.ssl-images-amazon.com/images/I/51q03LjJ3SL._SR38,50_.jpg', 'https://images-na.ssl-images-amazon.com/images/I/41JTTDK0YAL._SR38,50_.jpg']

5: --- nan

6: --- ['https://images-na.ssl-images-amazon.com/images/I/51DnrdhcH4L._SR38,50_.jpg', 'https://images-na.ssl-images-amazon.com/images/I/51IxnsEHMqL._SR38,50_.jpg', 'https://images-na.ssl-images-amazon.com/images/I/61uaUSSf%2B6L._SR38,50_.jpg']

7: --- ['https://images-na.ssl-images-amazon.com/images/I/41dB2RPT8tL._SR38,50_.jpg', 'https://images-na.ssl-images-amazon.com/images/I/41YnTnmY3yL._SR38,50_.jpg', 'https://images-na.ssl-images-amazon.com/images/I/41yTvx75Y3L._SR38,50_.jpg

<h1 style="margin: 0; padding: 0; color: #d9f4e4;">Pre-processing<h1/>

In [25]:
for col in ['title', 'brand', 'description']:
    meta_df[col] = meta_df[col].fillna(" ")

for col in ['title', 'brand', 'description']:
    meta_df[col] = meta_df[col].fillna('').apply(clean_text)

In [26]:
meta_df['title'] = meta_df['title'].apply(clean)
meta_df['brand'] = meta_df['brand'].apply(clean)
meta_df['description'] = meta_df['description'].apply(clean)

In [27]:
meta_df = meta_df.rename(columns={'asin': 'item_id'})

In [28]:
meta_df = preprocess_images(meta_df)

In [29]:
meta_df.isnull().sum()

title                   0
brand                   0
feature             62762
rank                 6415
date                 1636
item_id                 0
imageURL                0
imageURLHighRes     54620
description             0
price              168838
also_view          175042
also_buy           164995
fit                181806
details            185752
similar_item       186320
tech1              186540
dtype: int64

<h1 style="margin: 0; padding: 0; color: #d9f4e4;">Display Pre-processed Data<h1/>

In [30]:
random_descs = meta_df['description'].sample(50, random_state=42)

for i, desc in enumerate(random_descs):
    print(f"\n{i+1}: --- {desc}")


1: --- 

2: --- 

3: --- lovely earring showcase created ruby stone gem surrounded 14 sparkling cz stone earring crafted sterling silver secured leverbacks total gem weight 2ct total cz weight 38ct

4: --- 

5: --- 

6: --- 

7: --- 

8: --- 

9: --- 

10: --- 

11: --- 

12: --- 

13: --- 

14: --- 

15: --- 

16: --- 

17: --- 

18: --- 

19: --- 

20: --- 

21: --- 

22: --- 

23: --- 

24: --- 

25: --- 

26: --- cozy unicorn feature zipper front shooting star fleece dress furry rainbow cuff attached authentic unicorn fur tail mystical horn hood glorious mane

27: --- 

28: --- beautiful rom leather concealment purse made premium leather reinforced stitching quality zipper color black width 12 height 9 depth 4 strap drop 15 gun compartment side access measure 10 x 8 7 lockable zipper shown 2 key included velcro lined precise gun positioning double zipper top open large center compartment 1 2 deep zippered compartment one wall open pocket along side elastic closed open pocket wall 

In [31]:
random_descs = meta_df['imageURL'].sample(50, random_state=42)

for i, desc in enumerate(random_descs):
    print(f"\n{i+1}: --- {desc}")


1: --- 

2: --- https://images-na.ssl-images-amazon.com/images/I/41oXbLw29FL._SR38,50_.jpg

3: --- https://images-na.ssl-images-amazon.com/images/I/41GEF61UGWL._US40_.jpg

4: --- https://images-na.ssl-images-amazon.com/images/I/41R6WvDa5zL._SR38,50_.jpg

5: --- 

6: --- https://images-na.ssl-images-amazon.com/images/I/51DnrdhcH4L._SR38,50_.jpg

7: --- https://images-na.ssl-images-amazon.com/images/I/41dB2RPT8tL._SR38,50_.jpg

8: --- https://images-na.ssl-images-amazon.com/images/I/51haAT1vAiL._SR38,50_.jpg

9: --- https://images-na.ssl-images-amazon.com/images/I/41qGmCnt%2BtL._SR38,50_.jpg

10: --- https://images-na.ssl-images-amazon.com/images/I/41uFvYlmGyL._SR38,50_.jpg

11: --- https://images-na.ssl-images-amazon.com/images/I/41GNvspiqDL._SR38,50_.jpg

12: --- https://images-na.ssl-images-amazon.com/images/I/41MavSzbzmL._US40_.jpg

13: --- https://images-na.ssl-images-amazon.com/images/I/41M1Z7gJurL._US40_.jpg

14: --- https://images-na.ssl-images-amazon.com/images/I/41u7su-oKPL._S

<h1 style="margin: 0; padding: 0; color: #f00;">Content - Based - Recommendation<h1/>
<h2 style="margin: 0; padding: 0; color: #d9f4e4;">TFIDF<h2/>

<p style="margin: 0; padding: 0; font-size: 16px; color: #549ef1;">Training</p>

In [32]:
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=50000
)

In [33]:
combines = meta_df[["title", "description", "brand", "imageURL"]]

In [34]:

combines['combined'] = meta_df['title'] + meta_df["description"] + meta_df['brand']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combines['combined'] = meta_df['title'] + meta_df["description"] + meta_df['brand']


In [35]:
combines.head(5)

Unnamed: 0,title,description,brand,imageURL,combined
0,slime time fall fest cdrom collector card neut...,,group publishing co,https://images-na.ssl-images-amazon.com/images...,slime time fall fest cdrom collector card neut...
1,xcc qi promise new spider snake preparing men ...,,,https://images-na.ssl-images-amazon.com/images...,xcc qi promise new spider snake preparing men ...
2,magical thing really,professional amateur magician routine include ...,christopher manos,https://images-na.ssl-images-amazon.com/images...,magical thing reallyprofessional amateur magic...
3,ash ash orange orange,,flickerlamp publishing,https://images-na.ssl-images-amazon.com/images...,ash ash orange orangeflickerlamp publishing
4,aether empire 1 2016 first printing comic book...,,,https://images-na.ssl-images-amazon.com/images...,aether empire 1 2016 first printing comic book...


In [36]:
combines.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 186637 entries, 0 to 186636
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   title        186637 non-null  object
 1   description  186637 non-null  object
 2   brand        186637 non-null  object
 3   imageURL     186637 non-null  object
 4   combined     186637 non-null  object
dtypes: object(5)
memory usage: 7.1+ MB


In [37]:
combines.isnull().sum()

title          0
description    0
brand          0
imageURL       0
combined       0
dtype: int64

In [38]:
tfidf_matrix_TDB = vectorizer.fit_transform(combines["combined"])

In [39]:
content_based_recommend_tdidf(5, 5, tfidf_matrix_TDB, combines)

slime time fall fest cdrom collector card neutron ball incredi ball glow stick necklace paper fram 



coromose womens casual chiffon short tulip sleeve blouse shirt top hot pink — similarity: 1.0


coromose womens casual chiffon short tulip sleeve blouse shirt top xl hot pink — similarity: 0.976


coromose womens casual chiffon short tulip sleeve blouse shirt top l blue — similarity: 0.662


coromose womens casual chiffon short tulip sleeve blouse shirt top blue — similarity: 0.662


coromose womens casual chiffon short tulip sleeve blouse shirt top green — similarity: 0.646

------------------



xcc qi promise new spider snake preparing men accessory alloy fitting magnet buckle bracelet jewelry 



sterling silver 925 pendant genuine swarovski crystal baroque 22mm amethyst — similarity: 0.713


sterling silver 925 pendant genuine swarovski crystal heart 10mm crystal ab — similarity: 0.61


sterling silver 925 pendant genuine swarovski crystal heart 14mm aquamarine — similarity: 0.592


sterling silver polished tear drop shape earring — similarity: 0.523


oyang tear drop shape 925 sterling silver plated earring — similarity: 0.493

------------------



magical thing really 



tobeinstyle woman long tank top w adjustable spaghetti strap 2 pk banana red large — similarity: 1.0


tobeinstyle woman long tank top w adjustable spaghetti strap banana large — similarity: 0.906


tobeinstyle woman long tank top w adjustable spaghetti strap 2 pk white black large — similarity: 0.879


tobeinstyle woman long tank top w adjustable spaghetti strap 2 pk white coral large — similarity: 0.845


tobeinstyle woman long tank top w adjustable spaghetti strap 2 pk white teal large — similarity: 0.841

------------------



ash ash orange orange 



cross sword light sabre power man sword omen thundercats lion tee — similarity: 1.0


cross sword light sabre power man sword omen thundercats lion tee — similarity: 1.0


cross sword light sabre power man sword omen thundercats lion shirt — similarity: 0.891


cross sword light sabre power man sword omen thundercats lion shirt — similarity: 0.891


bandai gokaiger sentai gokai saber sabre sword key power ranger kaizoku — similarity: 0.35

------------------



aether empire 1 2016 first printing comic book special edition rare blue juice comic 



men classic zipper tie — similarity: 0.664


solid color poly woven clip tie olive — similarity: 0.454


alizeal adjustable men multi solid color self bow tie necktie tie purple — similarity: 0.416


100 silk extra long tie dot big tall men 63 xl 70 xxl — similarity: 0.402


100 silk extra long tie big tall men men x long necktie 63 inch 70 inch — similarity: 0.397

------------------



<h2 style="margin: 0; padding: 0; color: #d9f4e4;">TFIDF Enhanced with weights<h2/>

In [40]:
combines['combined'] = (
    combines['title'] * 3 + " " +
    combines['brand'] * 2 + " " +
    combines['description']
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combines['combined'] = (


In [41]:
vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=3,
    ngram_range=(1, 2)   # bigrams + unigrams
)

In [42]:
tfidf_matrix_TDB = vectorizer.fit_transform(combines['combined'])

In [43]:
content_based_recommend_tdidf(5, 5, tfidf_matrix_TDB, combines)

slime time fall fest cdrom collector card neutron ball incredi ball glow stick necklace paper fram 



funoc new unisex men lady winter ski slouch knitted baggy hip hop hat cap beanie black — similarity: 0.961


funocnew unisex men lady winter ski slouch knitted baggy hip hop hat cap beanie red — similarity: 0.946


funoc new unisex men lady winter ski slouch knitted baggy hip hop hat cap beanie light gray — similarity: 0.933


norbi winter warm knit ski crochet slouch cap beanie hip hop hat white — similarity: 0.378


funoc unisex men woman winter warm ski knit hip hop cool hiphop cap beanie hat — similarity: 0.289

------------------



xcc qi promise new spider snake preparing men accessory alloy fitting magnet buckle bracelet jewelry 



ilovesia tm woman tights capri ankle legging black grey u size — similarity: 1.0


ilovesia tm woman tights capri ankle legging black pink u size — similarity: 0.877


ilovesia tm woman tights capri ankle legging black pink u size — similarity: 0.877


ilovesia tm woman tights capri ankle legging black grey u size l — similarity: 0.82


ilovesia tm woman tights capri ankle legging black grey u size xl — similarity: 0.805

------------------



magical thing really 



beancase tm fashion blue dragonfly necklace 1 pc 1 — similarity: 0.814


beancase tm fashion green dragonfly necklace 1 pc 1 — similarity: 0.74


beancase tm black crystal dragonfly necklace 1 pc 1 — similarity: 0.638


beancase tm fashion love daughter necklace 1 pc 1 — similarity: 0.603


beancase tm fashion love mum necklace 1 pc 1 — similarity: 0.595

------------------



ash ash orange orange 



xiami woman black office flared pant black u 4 — similarity: 1.0


xiami woman black office flared pant black x u 2 — similarity: 1.0


xiami woman black office flared pant black xl u 12 — similarity: 0.939


hurley mpt0000140 men corman 3 pant black 42 — similarity: 0.306


ashlynne pant black l — similarity: 0.293

------------------



aether empire 1 2016 first printing comic book special edition rare blue juice comic 



2 pack zenana woman plus size lace trim cami tank top 2x black h gray — similarity: 0.883


2 pack zenana woman plus size lace trim cami tank top 2x h gray white — similarity: 0.798


2 pack zenana woman plus size lace trim cami tank top 2x black brown — similarity: 0.778


2 pack zenana woman plus size lace trim cami tank top 2x black navy — similarity: 0.769


2 pack zenana woman plus size lace trim cami tank top 2x black white — similarity: 0.768

------------------



<h2 style="margin: 0; padding: 0; color: #d9f4e4;">TFIDF (With no description)<h2/>

In [44]:
combines['combined'] = (
    combines['title'] * 3 + " " +
    combines['brand'] * 2 + " " 
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combines['combined'] = (


In [45]:
tfidf_matrix_TB = vectorizer.fit_transform(combines['combined'])

In [46]:
content_based_recommend_tdidf(5, 5, tfidf_matrix_TB, combines)

slime time fall fest cdrom collector card neutron ball incredi ball glow stick necklace paper fram 



azbro woman vintage floral mini shift dress — similarity: 0.398


azbro woman vintage floral mini shift dress — similarity: 0.398


azbro woman white contrast print flare sleeveless vintage floral dress — similarity: 0.319


cotton zipper back vintage floral sleeveless casual banquet dress small 2 — similarity: 0.302


azbro woman loose plaid print cotton linen short sleeve plus size dress — similarity: 0.286

------------------



xcc qi promise new spider snake preparing men accessory alloy fitting magnet buckle bracelet jewelry 



neoprene half face mask biomech osfm — similarity: 0.862


neoprene half face mask hannibal osfm — similarity: 0.862


neoprene half face mask glow venetian osfm — similarity: 0.765


neoprene half face mask purple venetian osfm — similarity: 0.709


neoprene half face mask watercolor flower osfm — similarity: 0.699

------------------



magical thing really 



cross1946 woman mesh one piece sexy bikini set push swimsuit white l — similarity: 0.436


cross1946 woman mesh one piece sexy bikini set push swimsuit white xl — similarity: 0.434


ebuddy bikini top bottom set swimsuit swimwear — similarity: 0.41


girl two piece bikini set swimsuit 7 8 green grey — similarity: 0.374


sexyarn woman hollow bandage bodycon sexy bikini set swimsuit beachwear — similarity: 0.368

------------------



ash ash orange orange 



womens sexy gold body belly waist chain bikini beach harness necklace — similarity: 0.233


kingfansion woman bikini cross harness waist belly body turquoise chain jewelry sliver — similarity: 0.232


kingfansion woman bikini cross harness waist belly body turquoise chain jewelry gold — similarity: 0.232


pooqdo lady bikini cross harness waist belly body chain necklace body jewelry — similarity: 0.229


meiysh retro fashion golden bra body necklace chain bikini belly chain tassel waist jewelry style01 — similarity: 0.229

------------------



aether empire 1 2016 first printing comic book special edition rare blue juice comic 



lizhoumil womens big hand printed cross back sleeveless shirt vest tank top — similarity: 1.0


lizhoumil womens big hand printed cross back sleeveless shirt vest tank top — similarity: 1.0


lizhoumil womens big hand printed cross back sleeveless shirt vest tank top — similarity: 1.0


lizhoumil womens big hand printed cross back sleeveless shirt vest tank top — similarity: 1.0


ninimour woman digital printed sleeveless shirt vest tank top cat — similarity: 0.332

------------------



<h3 style="margin: 0; padding: 0; color: #d9f4e4;">Delete unneeded data<h3/>

In [47]:
del tfidf_matrix_TB
del vectorizer
# del tfidf_matrix_TDB
del random_descs

<h1 style="margin: 0; padding: 0; color: #f00;">Collaborative - Filtering - Recommendation<h1/>

<p style="margin: 0; padding: 0; font-size: 16px; color: #549ef1;">Training</p>

In [48]:
ratings_df = fashion_df[['reviewerID', 'asin', 'overall']]
ratings_df = ratings_df.dropna(subset=['reviewerID', 'asin'])

avg_rating = ratings_df['overall'].mean()
ratings_df['overall'] = ratings_df['overall'].fillna(avg_rating)

ratings_df.rename(columns={'reviewerID': 'user_id', 'asin': 'item_id', 'overall': 'rating'}, inplace=True)

In [49]:
ratings_df.head(5)

Unnamed: 0,user_id,item_id,rating
0,A1D4G1SNUZWQOT,7106116521,5
1,A3DDWDH9PX2YX2,7106116521,2
2,A2MWC41EW7XL15,7106116521,4
3,A2UH2QQ275NV45,7106116521,2
4,A89F3LQADZBS5,7106116521,3


In [50]:
ratings_df.isnull().sum()

user_id    0
item_id    0
rating     0
dtype: int64

In [51]:
product_df = meta_df[['item_id', 'title', 'brand', "description"]].dropna()

In [52]:
product_df.head()

Unnamed: 0,item_id,title,brand,description
0,764443682,slime time fall fest cdrom collector card neut...,group publishing co,
1,1291691480,xcc qi promise new spider snake preparing men ...,,
2,1940280001,magical thing really,christopher manos,professional amateur magician routine include ...
3,1940735033,ash ash orange orange,flickerlamp publishing,
4,1940967805,aether empire 1 2016 first printing comic book...,,


In [53]:
product_df.isnull().sum()

item_id        0
title          0
brand          0
description    0
dtype: int64

In [54]:
user_counts = ratings_df['user_id'].value_counts()
ratings_df = ratings_df[ratings_df['user_id'].isin(user_counts[user_counts >= 5].index)]

In [55]:
user_counts

user_id
A3G5KDMFNRUXHB    40
A3JBQHQZEZPQK4    36
A1RRX286ZRI830    32
A2PBHVTPTIIGKR    31
A2GP4EJIAA2OE0    29
                  ..
A3F8ZWBOAZUZE2     1
A2HI8EO6V5PST8     1
A3GKQ3VSTGLAHW     1
A3HEDQXYV9OCVR     1
A2UZ5SVYOG3748     1
Name: count, Length: 749233, dtype: int64

In [56]:
ratings_df.head(5)

Unnamed: 0,user_id,item_id,rating
114,AAQO19HKS86MQ,B00008JOQI,4
1345,A3FOL5CECUQJKV,B0006HB4XE,5
1519,AQG16QCMT344N,B0007MV6PO,5
1962,A2R0KB6P9AWB3N,B0008F6WMM,5
2218,A1KPFFU7NOVNCY,B0009A1EA6,3


In [57]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_df[['user_id', 'item_id', 'rating']], reader)

In [58]:
trainset, testset = train_test_split(data, test_size=0.2)

<h2 style="margin: 0; padding: 0; color: #d9f4e4;">Normal SVD<h2/>

In [98]:
model = SVD()

In [99]:
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7e2b90456050>

In [100]:
predictions = model.test(testset)

In [101]:
print("Final Test Results:")
accuracy.rmse(predictions)
accuracy.mae(predictions)
print("Recall@10:", recall_at_k(predictions))

Final Test Results:
RMSE: 1.1087
MAE:  0.8382
Recall@10: 0.9999201150343506


In [102]:
for i in range(5):
    idx = random.randint(0, len(ratings_df) - 1)
    user_id = ratings_df.iloc[idx]['user_id']
    
    recommendations = collaborative_filtering_recommend(user_id, ratings_df, meta_df, model, top_n=5)

    print(f"\n🔹 User {user_id} has these recommendations:\n")
    for title, image, rating in zip(recommendations['title'], recommendations['imageURL'], recommendations['predicted_rating']):
        if isinstance(image, str):
            display(Image(url=image, width=100))
        print(f"{title[:50]} rating:{rating}")


🔹 User A1REAVAC2GA7VN has these recommendations:



mj metal jewelry 2mm 10mm white tungsten carbide m rating:4.894469770959647


artificial amber pattern multicolor baltic necklac rating:4.781826319246983


qiyun z geometric oil drop silver chain tribal luc rating:4.724094333739911


multicolor evil eye bracelet rating:4.675541205081084


lnlclothing junior distressed skinny jean blue 9 rating:4.6282694416100005

🔹 User A2JTH05N68E06F has these recommendations:



nike woman flex supreme tr 4 cross trainer rating:4.811711438887628


nike woman flex supreme tr 4 cross trainer rating:4.79858032913316


mj metal jewelry 2mm 10mm white tungsten carbide m rating:4.7869602600916314


var apagestart new date gettime var ue t0 ue t0 ne rating:4.779938558200642


nike woman flex supreme tr 4 cross trainer rating:4.759449622609461

🔹 User AASAT3YLO3WTB has these recommendations:



mj metal jewelry 2mm 10mm white tungsten carbide m rating:5.0


black cat necklace peeking black cat pendant cute  rating:4.866034409065626


mj metal jewelry 2mm 10mm white tungsten carbide m rating:4.82356976378092


lnlclothing junior distressed skinny jean blue 9 rating:4.809491620170282


persun woman plunge neck pullover sweater top lace rating:4.802007083477863

🔹 User A1ZEIDBY2FSFNT has these recommendations:



lnlclothing junior distressed skinny jean blue 9 rating:4.947578724163791


1pc sky blue stylish artificial gem love heart sha rating:4.889392804259612


black cat necklace peeking black cat pendant cute  rating:4.85541349441453


qiyun z tribal jewelry flat round dangle coin tass rating:4.828261042275179


nike woman flex supreme tr 4 cross trainer rating:4.804053908602732

🔹 User A1IGQZIHKOQGE7 has these recommendations:



artificial amber pattern multicolor baltic necklac rating:4.745297178475182


mj metal jewelry 2mm 10mm white tungsten carbide m rating:4.7225784452660085


nike woman flex supreme tr 4 cross trainer rating:4.711559041775459


qiyun z tribal jewelry flat round dangle coin tass rating:4.709593815354325


nike woman flex supreme tr 4 cross trainer rating:4.687798139691016


<h2 style="margin: 0; padding: 0; color: #d9f4e4;">Grided SVD<h2/>

In [103]:
param_grid = {
    'n_factors': [25, 50, 100, 150, 200, 250, 300],
    'reg_all': [0.02, 0.05, 0.08],
    'n_epochs': [20, 35, 50]
}

gs = GridSearchCV(SVD, param_grid, measures=['rmse', "mae"], cv=3)
gs.fit(data)

In [104]:
print("Best RMSE:", gs.best_score['rmse'])
print("Best params RMSE:", gs.best_params['rmse'])

model_svd_best_RMSE = gs.best_estimator['rmse']
model_svd_best_RMSE.fit(trainset)

Best RMSE: 1.0900353591699414
Best params RMSE: {'n_factors': 50, 'reg_all': 0.02, 'n_epochs': 50}


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7e2c30eac310>

In [105]:
predictions_RMSE = model_svd_best_RMSE.test(testset)

In [106]:
accuracy.rmse(predictions_RMSE)

RMSE: 1.0887


1.088706267434125

In [113]:
for i in range(5):
    idx = random.randint(0, len(ratings_df) - 1)
    user_id = ratings_df.iloc[idx]['user_id']
    
    recommendations = collaborative_filtering_recommend(user_id, ratings_df, meta_df, model_svd_best_RMSE, top_n=5)

    print(f"\n🔹 User {user_id} has these recommendations:\n")
    for title, image, rating in zip(recommendations['title'], recommendations['imageURL'], recommendations['predicted_rating']):
        if isinstance(image, str):
            display(Image(url=image, width=100))
        print(f"{title[:50]} rating:{rating}")

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.

🔹 User AG89YTABQXUAE has these recommendations:



lnlclothing junior distressed skinny jean blue 9 rating:4.213145530032499


bamoer white gold plated cubic zirconia stud earri rating:4.192351118913301


artificial amber pattern multicolor baltic necklac rating:4.0905376027865366


ninimour woman maxiskit elegant shirt dress party  rating:4.062494124762381


sunnow woman grils bohemian jewelry alloy pendant  rating:4.061982219473447

🔹 User A26THUHP9J0UQP has these recommendations:



bamoer white gold plated cubic zirconia stud earri rating:5.0


1pc sky blue stylish artificial gem love heart sha rating:5.0


lnlclothing junior distressed skinny jean blue 9 rating:5.0


girl dog pajama kid cotton 2 piece sleepwear shirt rating:5.0


havaianas woman top flip flop sandal rating:4.997663952141478

🔹 User AWBGHDHH7E51F has these recommendations:



california costume fashion flapper adult costume rating:5


havaianas woman top flip flop sandal rating:5


play baby boy flap sun protection hat aqua wavy bo rating:5


havaianas woman top flip flop sandal rating:5


var apagestart new date gettime var ue t0 ue t0 ne rating:5

🔹 User A1I8KIQI1WV83O has these recommendations:



california costume fashion flapper adult costume rating:5


play baby boy flap sun protection hat aqua wavy bo rating:5


incharacter costume men ninja warrior rating:5


var apagestart new date gettime var ue t0 ue t0 ne rating:5


bearington baby soft plush stuffed animal sock top rating:5

🔹 User A16TZRHTLDUBU4 has these recommendations:



mj metal jewelry 2mm 10mm white tungsten carbide m rating:4.488504579844264


90 degree reflex power flex yoga capri cationic he rating:4.460257260344428


poshsquare woman sleeveless lace floral sweetheart rating:4.4549352539082925


winter white ivory thick slouchy knit oversized be rating:4.355766430476663


e sterling silver ring cute pink cat opening finge rating:4.333867587403869


In [114]:
print("Best MAE:", gs.best_score['mae'])
print("Best params MAR:", gs.best_params['mae'])

model_svd_best_MAE = gs.best_estimator['mae']
model_svd_best_MAE.fit(trainset)

Best MAE: 0.7964301029297115
Best params MAR: {'n_factors': 25, 'reg_all': 0.02, 'n_epochs': 50}


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7e2c0725e250>

In [115]:
predictions_MAE = model_svd_best_MAE.test(testset)

In [137]:
accuracy.mae(predictions_MAE)

MAE:  0.7837


0.7837323371037224

In [116]:
for i in range(5):
    idx = random.randint(0, len(ratings_df) - 1)
    user_id = ratings_df.iloc[idx]['user_id']
    
    recommendations = collaborative_filtering_recommend(user_id, ratings_df, meta_df, model_svd_best_MAE, top_n=5)

    print(f"\n🔹 User {user_id} has these recommendations:\n")
    for title, image, rating in zip(recommendations['title'], recommendations['imageURL'], recommendations['predicted_rating']):
        if isinstance(image, str):
            display(Image(url=image, width=100))
        print(f"{title[:50]} rating:{rating}")


🔹 User A243ZDA6MVJAEP has these recommendations:



poshsquare woman sleeveless lace floral sweetheart rating:4.90211162565678


ear mitt bandless ear muff men woman soft fleece e rating:4.862471409116986


lnlclothing junior distressed skinny jean blue 9 rating:4.861180791925766


e sterling silver ring cute pink cat opening finge rating:4.847049815945921


womens drape front aztec long cardigan mtc 4 6 uk  rating:4.845649377033354

🔹 User A2AM9TXVCZW720 has these recommendations:



play baby boy flap sun protection hat aqua wavy bo rating:5.0


90 degree reflex power flex yoga capri cationic he rating:4.9153915226269564


lnlclothing junior distressed skinny jean blue 9 rating:4.881007231828066


artificial amber pattern multicolor baltic necklac rating:4.8557296030231845


1pc sky blue stylish artificial gem love heart sha rating:4.8417264244536655

🔹 User A2G9JO5SZ3GUXH has these recommendations:



djt woman scoop neck long sleeve stretchy irregula rating:5.0


lnlclothing junior distressed skinny jean blue 9 rating:5.0


iyun tm hollow butterfly antique festoon bib choke rating:4.9988781077063305


ear mitt bandless ear muff men woman soft fleece e rating:4.99307183027176


hot hollywood woman plus size denim top rating:4.989305015744082

🔹 User A1LCV2UNXGHB0W has these recommendations:



lnlclothing junior distressed skinny jean blue 9 rating:4.990036538487889


90 degree reflex power flex yoga capri cationic he rating:4.926044875789234


djt woman scoop neck long sleeve stretchy irregula rating:4.926026797533627


persun woman plunge neck pullover sweater top lace rating:4.925716331896299


poshsquare woman sleeveless lace floral sweetheart rating:4.912424901740636

🔹 User AM0SOYJG1WMF4 has these recommendations:



poshsquare woman sleeveless lace floral sweetheart rating:4.636505995873269


allegra k woman elegant peter pan collar petal sle rating:4.617576609426069


mj metal jewelry 2mm 10mm white tungsten carbide m rating:4.605817830424707


ear mitt bandless ear muff men woman soft fleece e rating:4.590557583436643


var apagestart new date gettime var ue t0 ue t0 ne rating:4.5536644375294095


<h2 style="margin: 0; padding: 0; color: #d9f4e4;">Grided SVDPP<h2/>

In [126]:
param_grid = {
    'n_factors': [25, 50, 100, 150, 200, 250, 300],
    'reg_all': [0.02, 0.05, 0.08],
    'n_epochs': [20, 35, 50]
}

gspp = GridSearchCV(
    SVDpp,
    param_grid,
    measures=['rmse', 'mae'],
    cv=3,
    n_jobs=-1
)

gspp.fit(data)

In [127]:
print("Best RMSE:", gspp.best_score['rmse'])
print("Best params RMSE:", gspp.best_params['rmse'])

model_svd_pp_best_RMSE = gspp.best_estimator['rmse']
model_svd_pp_best_RMSE.fit(trainset)

Best RMSE: 1.0880641104371525
Best params RMSE: {'n_factors': 25, 'reg_all': 0.02, 'n_epochs': 35}


<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x7e2b910e4ed0>

In [128]:
predictions_RMSE = model_svd_pp_best_RMSE.test(testset)

In [129]:
accuracy.rmse(predictions_RMSE)

RMSE: 1.0876


1.0876220739288296

In [130]:
for i in range(5):
    idx = random.randint(0, len(ratings_df) - 1)
    user_id = ratings_df.iloc[idx]['user_id']
    
    recommendations = collaborative_filtering_recommend(user_id, ratings_df, meta_df, model_svd_pp_best_RMSE, top_n=5)

    print(f"\n🔹 User {user_id} has these recommendations:\n")
    for title, image, rating in zip(recommendations['title'], recommendations['imageURL'], recommendations['predicted_rating']):
        if isinstance(image, str):
            display(Image(url=image, width=100))
        print(f"{title[:50]} rating:{rating}")


🔹 User A25SOIZ0IZE1YY has these recommendations:



leg avenue woman athletic thigh high sock rating:5


artificial amber pattern multicolor baltic necklac rating:5


lnlclothing junior distressed skinny jean blue 9 rating:5


sunnow woman grils bohemian jewelry alloy pendant  rating:5


poshsquare woman sleeveless lace floral sweetheart rating:5

🔹 User A8IAUH3ZCTNXU has these recommendations:



california costume fashion flapper adult costume rating:5


van adult classic slip checkerboard black pewter rating:5


leg avenue woman athletic thigh high sock rating:5


sterling silver green amethyst ring rating:5


kid collection baby girl ruffle tiered dress rating:5

🔹 User AENH50GW3OKDA has these recommendations:



california costume fashion flapper adult costume rating:5


van adult classic slip checkerboard black pewter rating:5


calvin klein boy assorted boxer brief pack 2 rating:5


havaianas woman top flip flop sandal rating:5


play baby boy flap sun protection hat aqua wavy bo rating:5

🔹 User A83UL8JVYX0E0 has these recommendations:



qiyun z woman girl lady girl turquoise oval shape  rating:4.256902492557104


sunnow woman grils bohemian jewelry alloy pendant  rating:4.246021319529476


retro polarized sunglass rating:4.157820310293795


bamoer white gold plated cubic zirconia stud earri rating:4.155396281936725


90 degree reflex power flex yoga capri cationic he rating:4.122411578608969

🔹 User A3L5TUJ127SR51 has these recommendations:



90 degree reflex power flex yoga capri cationic he rating:4.397089345775976


tommy john cool cotton boxer brief black xx large rating:4.36672891024162


van adult classic slip checkerboard black pewter rating:4.317975550034383


autumnfall sexy womens basic short sleeve jersey c rating:4.316005998047276


doinshop lady black batwing sleeve chiffon shirt b rating:4.288383829995884


In [131]:
print("Best MAE:", gspp.best_score['mae'])
print("Best params MAE:", gspp.best_params['mae'])

model_svd_pp_best_MAE = gspp.best_estimator['mae']
model_svd_pp_best_MAE.fit(trainset)

Best MAE: 0.7850644955685385
Best params MAE: {'n_factors': 25, 'reg_all': 0.02, 'n_epochs': 50}


<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x7e2b910e71d0>

In [132]:
predictions_MAE = model_svd_pp_best_MAE.test(testset)

In [133]:
accuracy.mae(predictions_MAE)

MAE:  0.7837


0.7837323371037224

In [134]:
for i in range(5):
    idx = random.randint(0, len(ratings_df) - 1)
    user_id = ratings_df.iloc[idx]['user_id']
    
    recommendations = collaborative_filtering_recommend(user_id, ratings_df, meta_df, model_svd_pp_best_MAE, top_n=5)

    print(f"\n🔹 User {user_id} has these recommendations:\n")
    for title, image, rating in zip(recommendations['title'], recommendations['imageURL'], recommendations['predicted_rating']):
        if isinstance(image, str):
            display(Image(url=image, width=100))
        print(f"{title[:50]} rating:{rating}")


🔹 User A83UL8JVYX0E0 has these recommendations:



qiyun z colorful wide resin new fashion style woma rating:4.710404046408412


calvin klein boy assorted boxer brief pack 2 rating:4.542184026149622


womens drape front aztec long cardigan mtc 4 6 uk  rating:4.248339816775581


925 sterling silver petite heart cut toe baby ring rating:4.24339853177173


modern kiwi spot leopard chiffon print scarf cafta rating:4.229666107089674

🔹 User AKY3EYDEZVW6C has these recommendations:



havaianas woman top flip flop sandal rating:5


bundle monster womens fancy vintage clear crystal  rating:5


lanyjewelry designer style 316 stainless steel pla rating:5


womens drape front aztec long cardigan mtc 4 6 uk  rating:5


qandsweet baby hairband girl elastic hair accessor rating:5

🔹 User A3B2PJS73SWRBV has these recommendations:



havaianas woman top flip flop sandal rating:5


bearington baby soft plush stuffed animal sock top rating:5


ear mitt bandless ear muff men woman soft fleece e rating:5


sterling silver green amethyst ring rating:5


he09463bl12 blue 10us ever pretty summer dress wom rating:5

🔹 User A3LBE3J28WGC2U has these recommendations:



box1mm nickel free italian sterling silver 1mm box rating:5


henschel breezer hat coolmax band khaki medium kha rating:5


california costume fashion flapper adult costume rating:5


van adult classic slip checkerboard black pewter rating:5


wigwam unisex cool lite pro quarter length sock rating:5

🔹 User A1MEZNNYI5MQHG has these recommendations:



california costume fashion flapper adult costume rating:5


play baby boy flap sun protection hat aqua wavy bo rating:5


sterling silver green amethyst ring rating:5


michael michael kor woman damita wedge rating:5


bundle monster womens fancy vintage clear crystal  rating:5


<h3 style="margin: 0; padding: 0; color: #d9f4e4;">Delete unneeded data<h3/>

In [81]:
del model

<h1 style="margin: 0; padding: 0; color: #f00;">Hybrid - Recommendation<h1/>

In [138]:

def hybrid_recommend(user_id, tfidf_matrix, items_ordered, ratings_df, meta_df, cf_model, top_n=10, alpha=0.7, candidate_size=1000, rating_scale=(1.0, 5.0), rating_weighting='global_center'):
    item_to_idx = build_tfidf_index_map(items_ordered)
    all_items = np.array(items_ordered)

    rated_items = ratings_df[ratings_df['user_id'] == user_id]['item_id'].unique()
    candidate_mask = ~np.isin(all_items, rated_items)
    candidate_items = all_items[candidate_mask]

    user_profile = build_user_content_vector(user_id, ratings_df, item_to_idx, tfidf_matrix, rating_weighting)
    if user_profile is None:
        fallback = (ratings_df.groupby('asin')['rating']
                    .mean()
                    .reset_index()
                    .rename(columns={'rating': 'avg_rating'}))
        top = fallback.merge(meta_df, on='item_id', how='left').sort_values('avg_rating', ascending=False).head(top_n)
        top['hybrid_score'] = top['avg_rating']
        return top[['asub','title','brand','hybrid_score']]
        
    candidate_indices = [item_to_idx[it] for it in candidate_items if it in item_to_idx]
    if len(candidate_indices) == 0:
        return pd.DataFrame(columns=['asin','title','brand','hybrid_score'])

    tfidf_candidates = tfidf_matrix[candidate_indices]
    cb_scores = cosine_similarity(user_profile, tfidf_candidates).flatten()

    top_k = min(candidate_size, len(candidate_indices))
    top_k_idx_local = np.argpartition(-cb_scores, top_k-1)[:top_k] 
    top_candidate_indices = [candidate_indices[i] for i in top_k_idx_local]
    top_candidate_items = [all_items[idx] for idx in top_candidate_indices]
    top_candidate_cb_scores = cb_scores[top_k_idx_local]

    cf_predictions = []
    for item_id in top_candidate_items:
        pred = cf_model.predict(user_id, item_id)
        cf_pred = pred.est
        cf_predictions.append(cf_pred)
    cf_predictions = np.array(cf_predictions)

    rmin, rmax = rating_scale
    cb_norm = np.clip(top_candidate_cb_scores, 0.0, 1.0)
    cb_scaled = cb_norm * (rmax - rmin) + rmin

    hybrid_scores = alpha * cf_predictions + (1.0 - alpha) * cb_scaled
    top_order = np.argsort(-hybrid_scores)[:top_n]

    selected_items = [top_candidate_items[i] for i in top_order]
    selected_scores = hybrid_scores[top_order]

    result = pd.DataFrame({
        'asin': selected_items,
        'hybrid_score': selected_scores
    })

    display(meta_df.head())
    display(result.head())

    result = result.merge(meta_df, left_on='asin', right_on='item_id', how='left')
    cols = ['item_id', 'title'] + ([c for c in ['brand','imageURL'] if c in result.columns]) + ['hybrid_score']
    return result[cols]

In [139]:
idx = random.randint(0, len(ratings_df) - 1)
user_id = ratings_df.iloc[idx]['user_id']
items_ordered = meta_df['item_id'].tolist()
# print(ratings_df)
recommendations = hybrid_recommend(
    user_id=user_id,
    tfidf_matrix=tfidf_matrix_TDB,
    items_ordered=items_ordered,
    ratings_df=ratings_df,
    meta_df=meta_df,
    cf_model=model_svd_pp_best_RMSE,
    top_n=10,
    alpha=0.7,
    candidate_size=2000,      # مرشحين قبل التصفية
    rating_scale=(1.0, 5.0),  # نفس نطاق التقييم
    rating_weighting='global_center'
)
    
print(recommendations[['title', 'brand', 'hybrid_score']])

Unnamed: 0,title,brand,feature,rank,date,item_id,imageURL,imageURLHighRes,description,price,also_view,also_buy,fit,details,similar_item,tech1
0,slime time fall fest cdrom collector card neut...,group publishing co,[Product Dimensions:\n \n8....,"13,052,976inClothing,Shoesamp;Jewelry(",8.70 inches,764443682,https://images-na.ssl-images-amazon.com/images...,[https://images-na.ssl-images-amazon.com/image...,,,,,,,,
1,xcc qi promise new spider snake preparing men ...,,,"11,654,581inClothing,Shoesamp;Jewelry(",5 star,1291691480,https://images-na.ssl-images-amazon.com/images...,[https://images-na.ssl-images-amazon.com/image...,,,,,,,,
2,magical thing really,christopher manos,[Package Dimensions:\n \n8....,"19,308,073inClothing,ShoesJewelry(",5 star,1940280001,https://images-na.ssl-images-amazon.com/images...,[https://images-na.ssl-images-amazon.com/image...,professional amateur magician routine include ...,,,,,,,
3,ash ash orange orange,flickerlamp publishing,[Package Dimensions:\n \n8....,"19,734,184inClothing,ShoesJewelry(",5 star,1940735033,https://images-na.ssl-images-amazon.com/images...,[https://images-na.ssl-images-amazon.com/image...,,,,,,,,
4,aether empire 1 2016 first printing comic book...,,[Package Dimensions:\n \n10...,"10,558,646inClothing,Shoesamp;Jewelry(",5 star,1940967805,https://images-na.ssl-images-amazon.com/images...,[https://images-na.ssl-images-amazon.com/image...,,$4.50,,,,,,


Unnamed: 0,asin,hybrid_score
0,B00V2RSCTG,3.512765
1,B00Q43AV38,3.462967
2,B01B7MW520,3.421723
3,B00Y5645U6,3.402451
4,B015PAW9ZI,3.396368


                                               title       brand  hybrid_score
0  woman strapless bandage dress celebrity midi e...                  3.512765
1  cool punk rock sexy new woman v neck bodycon s...     qiyun z      3.462967
2  autumnfall sexy womens basic short sleeve jers...  autumnfall      3.421723
3  sexyarn woman bandage criss cross one piece mo...     sexyarn      3.402451
4  winson sexy womens long sleeve top deep v lace...      winson      3.396368
5  mooncolour woman newest design color block sti...  mooncolour      3.387939
6  queenmore woman bodycon midi bandage clubwear ...   queenmore      3.361420
7  viishow sexy woman summer boho sleeveless loos...     viishow      3.352182
8  funoc lady womens vintage boho bohemia sleevel...   goodgoods      3.351772
9  pakula woman fashion contrast bodycon pencil e...      pakula      3.316118
