<h1 style="margin: 0; padding: 0; color: #d9f4e4;">Import Libraries<h1/>

In [71]:
import glob
import pandas as pd
from IPython.display import display, Image, HTML
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import random
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from surprise import SVD, Dataset, Reader, accuracy
from surprise.model_selection import train_test_split, GridSearchCV
import numpy as np

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

<h1 style="margin: 0; padding: 0; color: #d9f4e4;">Global Variables<h1/>

In [3]:
PATH = "/kaggle/input/hyb-reco-amazone/Dataset"
FASHION_FILE = "/AMAZON_FASHION.json"
META_FILE = "/meta_AMAZON_FASHION.json"
STOP_WORDS = set(stopwords.words('english'))
LEMM = WordNetLemmatizer()

<h1 style="margin: 0; padding: 0; color: #d9f4e4;">Functions<h1/>

In [4]:
def clean_text(x):
    if isinstance(x, list):
        x = ' '.join(map(str, x))
    if not isinstance(x, str):
        x = str(x)
    return x.lower().strip()

In [5]:
def get_similar_items_tdidf(item_index, matrix, top_n=10):
    cosine_scores = cosine_similarity(matrix[item_index], matrix)
    
    similar_indices = cosine_scores.argsort()[0][::-1]
    
    top_indices = similar_indices[1:top_n+1]
    
    return top_indices, cosine_scores[0][top_indices]

In [6]:
def clean(text):
    text = str(text).lower()
    text = re.sub(r'<.*?>', ' ', text)
    text = re.sub(r'http\S+', ' ', text)
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    words = text.split()
    words = [LEMM.lemmatize(w) for w in words if w not in STOP_WORDS]
    return " ".join(words)

In [7]:
def content_based_recommend_tdidf(rounds, recommends_n, matrix, dataset):
    for i in range(rounds):
        display(Image(url=dataset.iloc[i]["imageURL"][0], width=150))
        print(dataset.iloc[i]["title"], "\n")

        num = random.randint(0, len(dataset)-1)
        ids, scores = get_similar_items_tdidf(num, matrix, recommends_n)

        for ind, s in zip(ids, scores):
            display(Image(url=dataset.iloc[ind]["imageURL"], width=100))
            print(dataset.iloc[ind]['title'], "— similarity:", round(s, 3))
        print("\n------------------\n")

In [8]:
def collaborative_filtering_recommend(user_id, ratings_df, meta_df, model, top_n=10):
    items = ratings_df['item_id'].unique()
    rated_items = ratings_df[ratings_df['user_id'] == user_id]['item_id'].values
    items_to_predict = [item for item in items if item not in rated_items]

    predictions = [(item, model.predict(user_id, item).est) for item in items_to_predict]
    predictions_sorted = sorted(predictions, key=lambda x: x[1], reverse=True)[:top_n]

    result = pd.DataFrame(predictions_sorted, columns=['item_id', 'predicted_rating'])
    
    return result.merge(meta_df, on='item_id', how='left')

In [9]:
def preprocess_images(dataset):
    def first_image(img_list):
        if isinstance(img_list, list) and len(img_list) > 0:
            return img_list[0]
        else:
            return ""
    dataset['imageURL'] = dataset['imageURL'].apply(first_image)
    return dataset

In [64]:
def build_tfidf_index_map(items_ordered):
    return {item: idx for idx, item in enumerate(items_ordered)}

In [65]:
def build_user_content_vector(user_id, ratings_df, item_to_idx, tfidf_matrix, rating_weighting='global_center'):
    user_ratings = ratings_df[ratings_df['user_id'] == user_id]
    if user_ratings.empty:
        return None

    user_ratings = user_ratings[user_ratings['item_id'].isin(item_to_idx)]
    if user_ratings.empty:
        return None

    if rating_weighting == 'global_center':
        global_mean = ratings_df['rating'].mean()
        weights = (user_ratings['rating'] - global_mean).values
    elif rating_weighting == 'user_center':
        user_mean = user_ratings['rating'].mean()
        weights = (user_ratings['rating'] - user_mean).values
    else:
        weights = user_ratings['rating'].values

    if np.allclose(weights, 0):
        weights = np.ones_like(weights)

    vecs = []
    for item, w in zip(user_ratings['item_id'], weights):
        idx = item_to_idx[item]
        vec = tfidf_matrix[idx]
        vecs.append((vec, w))

    sum_w = np.sum(np.abs(weights))
    if sum_w == 0:
        sum_w = len(weights)

    first_vec = vecs[0][0]
    if hasattr(first_vec, "toarray"):
        feature_dim = first_vec.shape[1]
        user_profile = np.zeros((feature_dim,), dtype=np.float32)
        for vec, w in vecs:
            user_profile += (vec.toarray().ravel() * w)
    else:
        feature_dim = first_vec.shape[0]
        user_profile = np.zeros((feature_dim,), dtype=np.float32)
        for vec, w in vecs:
            user_profile += (vec.ravel() * w)

    user_profile = user_profile / sum_w
    return user_profile.reshape(1, -1)

<h1 style="margin: 0; padding: 0; color: #d9f4e4;">Display Data<h1/>

In [10]:
fashion_path = PATH + FASHION_FILE
meta_path = PATH + META_FILE

In [11]:
print(f"Fashion file: {fashion_path}")
print(f"Meta file: {meta_path}")

Fashion file: /kaggle/input/hyb-reco-amazone/Dataset/AMAZON_FASHION.json
Meta file: /kaggle/input/hyb-reco-amazone/Dataset/meta_AMAZON_FASHION.json


<p style="margin: 0; padding: 0; font-size: 16px; color: #549ef1;">Fashion Reviewd</p>

In [12]:
fashion_df = pd.read_json(fashion_path, lines=True)

In [13]:
fashion_df.head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image
0,5,True,"10 20, 2014",A1D4G1SNUZWQOT,7106116521,Tracy,Exactly what I needed.,perfect replacements!!,1413763200,,,
1,2,True,"09 28, 2014",A3DDWDH9PX2YX2,7106116521,Sonja Lau,"I agree with the other review, the opening is ...","I agree with the other review, the opening is ...",1411862400,3.0,,
2,4,False,"08 25, 2014",A2MWC41EW7XL15,7106116521,Kathleen,Love these... I am going to order another pack...,My New 'Friends' !!,1408924800,,,
3,2,True,"08 24, 2014",A2UH2QQ275NV45,7106116521,Jodi Stoner,too tiny an opening,Two Stars,1408838400,,,
4,3,False,"07 27, 2014",A89F3LQADZBS5,7106116521,Alexander D.,Okay,Three Stars,1406419200,,,


In [14]:
fashion_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 883636 entries, 0 to 883635
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   overall         883636 non-null  int64  
 1   verified        883636 non-null  bool   
 2   reviewTime      883636 non-null  object 
 3   reviewerID      883636 non-null  object 
 4   asin            883636 non-null  object 
 5   reviewerName    883544 non-null  object 
 6   reviewText      882403 non-null  object 
 7   summary         883103 non-null  object 
 8   unixReviewTime  883636 non-null  int64  
 9   vote            79900 non-null   float64
 10  style           304569 non-null  object 
 11  image           28807 non-null   object 
dtypes: bool(1), float64(1), int64(2), object(8)
memory usage: 75.0+ MB


<p style="margin: 0; padding: 0; font-size: 16px; color: #549ef1;">Fashion Meta</p>

In [15]:
meta_df = pd.read_json(meta_path, lines=True)

In [16]:
meta_df.head()

Unnamed: 0,title,brand,feature,rank,date,asin,imageURL,imageURLHighRes,description,price,also_view,also_buy,fit,details,similar_item,tech1
0,Slime Time Fall Fest [With CDROM and Collector...,Group Publishing (CO),[Product Dimensions:\n \n8....,"13,052,976inClothing,Shoesamp;Jewelry(",8.70 inches,764443682,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,,,,,,,,
1,XCC Qi promise new spider snake preparing men'...,,,"11,654,581inClothing,Shoesamp;Jewelry(",5 star,1291691480,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,,,,,,,,
2,Magical Things I Really Do Do Too!,Christopher Manos,[Package Dimensions:\n \n8....,"19,308,073inClothing,ShoesJewelry(",5 star,1940280001,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,[For the professional or amateur magician. Ro...,,,,,,,
3,"Ashes to Ashes, Oranges to Oranges",Flickerlamp Publishing,[Package Dimensions:\n \n8....,"19,734,184inClothing,ShoesJewelry(",5 star,1940735033,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,,,,,,,,
4,Aether & Empire #1 - 2016 First Printing Comic...,,[Package Dimensions:\n \n10...,"10,558,646inClothing,Shoesamp;Jewelry(",5 star,1940967805,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,,$4.50,,,,,,


In [17]:
meta_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 186637 entries, 0 to 186636
Data columns (total 16 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   title            186632 non-null  object
 1   brand            139957 non-null  object
 2   feature          123875 non-null  object
 3   rank             180222 non-null  object
 4   date             185001 non-null  object
 5   asin             186637 non-null  object
 6   imageURL         132017 non-null  object
 7   imageURLHighRes  132017 non-null  object
 8   description      15869 non-null   object
 9   price            17799 non-null   object
 10  also_view        11595 non-null   object
 11  also_buy         21642 non-null   object
 12  fit              4831 non-null    object
 13  details          885 non-null     object
 14  similar_item     317 non-null     object
 15  tech1            97 non-null      object
dtypes: object(16)
memory usage: 22.8+ MB


In [18]:
meta_df.isnull().sum()

title                   5
brand               46680
feature             62762
rank                 6415
date                 1636
asin                    0
imageURL            54620
imageURLHighRes     54620
description        170768
price              168838
also_view          175042
also_buy           164995
fit                181806
details            185752
similar_item       186320
tech1              186540
dtype: int64

In [19]:
random_descs = meta_df['feature'].sample(50, random_state=42)

for i, desc in enumerate(random_descs):
    print(f"\n{i+1}: --- {desc}")


1: --- nan

2: --- ['Package Dimensions:\n                    \n9.8 x 5.7 x 0.8 inches', 'Shipping Weight:\n                    \n1 pounds']

3: --- ['Package Dimensions:\n                    \n5.5 x 4 x 0.6 inches', 'Shipping Weight:\n                    \n0.32 ounces']

4: --- ['Shipping Weight:\n                    \n1.2 pounds']

5: --- ['Package Dimensions:\n                    \n9.8 x 6.9 x 1.6 inches', 'Shipping Weight:\n                    \n7.2 ounces']

6: --- ['Package Dimensions:\n                    \n15.3 x 12.1 x 3.6 inches', 'Shipping Weight:\n                    \n1.3 pounds']

7: --- ['Package Dimensions:\n                    \n12.2 x 9.6 x 0.8 inches', 'Shipping Weight:\n                    \n7.2 ounces']

8: --- ['Package Dimensions:\n                    \n8.9 x 7.5 x 0.8 inches', 'Shipping Weight:\n                    \n7.2 ounces']

9: --- ['Package Dimensions:\n                    \n15.8 x 10.9 x 1.5 inches', 'Shipping Weight:\n                    \n12.8 ounces'

In [20]:
random_descs = meta_df['imageURL'].sample(50, random_state=42)

for i, desc in enumerate(random_descs):
    print(f"\n{i+1}: --- {desc}")


1: --- nan

2: --- ['https://images-na.ssl-images-amazon.com/images/I/41oXbLw29FL._SR38,50_.jpg']

3: --- ['https://images-na.ssl-images-amazon.com/images/I/41GEF61UGWL._US40_.jpg', 'https://images-na.ssl-images-amazon.com/images/I/41%2BEd7Uel8L._US40_.jpg']

4: --- ['https://images-na.ssl-images-amazon.com/images/I/41R6WvDa5zL._SR38,50_.jpg', 'https://images-na.ssl-images-amazon.com/images/I/51q03LjJ3SL._SR38,50_.jpg', 'https://images-na.ssl-images-amazon.com/images/I/41JTTDK0YAL._SR38,50_.jpg']

5: --- nan

6: --- ['https://images-na.ssl-images-amazon.com/images/I/51DnrdhcH4L._SR38,50_.jpg', 'https://images-na.ssl-images-amazon.com/images/I/51IxnsEHMqL._SR38,50_.jpg', 'https://images-na.ssl-images-amazon.com/images/I/61uaUSSf%2B6L._SR38,50_.jpg']

7: --- ['https://images-na.ssl-images-amazon.com/images/I/41dB2RPT8tL._SR38,50_.jpg', 'https://images-na.ssl-images-amazon.com/images/I/41YnTnmY3yL._SR38,50_.jpg', 'https://images-na.ssl-images-amazon.com/images/I/41yTvx75Y3L._SR38,50_.jpg

<h1 style="margin: 0; padding: 0; color: #d9f4e4;">Pre-processing<h1/>

In [21]:
for col in ['title', 'brand', 'description']:
    meta_df[col] = meta_df[col].fillna(" ")

for col in ['title', 'brand', 'description']:
    meta_df[col] = meta_df[col].fillna('').apply(clean_text)

In [22]:
meta_df['title'] = meta_df['title'].apply(clean)
meta_df['brand'] = meta_df['brand'].apply(clean)
meta_df['description'] = meta_df['description'].apply(clean)

In [23]:
meta_df = meta_df.rename(columns={'asin': 'item_id'})

In [24]:
meta_df = preprocess_images(meta_df)

In [25]:
meta_df.isnull().sum()

title                   0
brand                   0
feature             62762
rank                 6415
date                 1636
item_id                 0
imageURL                0
imageURLHighRes     54620
description             0
price              168838
also_view          175042
also_buy           164995
fit                181806
details            185752
similar_item       186320
tech1              186540
dtype: int64

<h1 style="margin: 0; padding: 0; color: #d9f4e4;">Display Pre-processed Data<h1/>

In [26]:
random_descs = meta_df['description'].sample(50, random_state=42)

for i, desc in enumerate(random_descs):
    print(f"\n{i+1}: --- {desc}")


1: --- 

2: --- 

3: --- lovely earring showcase created ruby stone gem surrounded 14 sparkling cz stone earring crafted sterling silver secured leverbacks total gem weight 2ct total cz weight 38ct

4: --- 

5: --- 

6: --- 

7: --- 

8: --- 

9: --- 

10: --- 

11: --- 

12: --- 

13: --- 

14: --- 

15: --- 

16: --- 

17: --- 

18: --- 

19: --- 

20: --- 

21: --- 

22: --- 

23: --- 

24: --- 

25: --- 

26: --- cozy unicorn feature zipper front shooting star fleece dress furry rainbow cuff attached authentic unicorn fur tail mystical horn hood glorious mane

27: --- 

28: --- beautiful rom leather concealment purse made premium leather reinforced stitching quality zipper color black width 12 height 9 depth 4 strap drop 15 gun compartment side access measure 10 x 8 7 lockable zipper shown 2 key included velcro lined precise gun positioning double zipper top open large center compartment 1 2 deep zippered compartment one wall open pocket along side elastic closed open pocket wall 

In [27]:
random_descs = meta_df['imageURL'].sample(50, random_state=42)

for i, desc in enumerate(random_descs):
    print(f"\n{i+1}: --- {desc}")


1: --- 

2: --- https://images-na.ssl-images-amazon.com/images/I/41oXbLw29FL._SR38,50_.jpg

3: --- https://images-na.ssl-images-amazon.com/images/I/41GEF61UGWL._US40_.jpg

4: --- https://images-na.ssl-images-amazon.com/images/I/41R6WvDa5zL._SR38,50_.jpg

5: --- 

6: --- https://images-na.ssl-images-amazon.com/images/I/51DnrdhcH4L._SR38,50_.jpg

7: --- https://images-na.ssl-images-amazon.com/images/I/41dB2RPT8tL._SR38,50_.jpg

8: --- https://images-na.ssl-images-amazon.com/images/I/51haAT1vAiL._SR38,50_.jpg

9: --- https://images-na.ssl-images-amazon.com/images/I/41qGmCnt%2BtL._SR38,50_.jpg

10: --- https://images-na.ssl-images-amazon.com/images/I/41uFvYlmGyL._SR38,50_.jpg

11: --- https://images-na.ssl-images-amazon.com/images/I/41GNvspiqDL._SR38,50_.jpg

12: --- https://images-na.ssl-images-amazon.com/images/I/41MavSzbzmL._US40_.jpg

13: --- https://images-na.ssl-images-amazon.com/images/I/41M1Z7gJurL._US40_.jpg

14: --- https://images-na.ssl-images-amazon.com/images/I/41u7su-oKPL._S

<h1 style="margin: 0; padding: 0; color: #f00;">Content - Based - Recommendation<h1/>
<h2 style="margin: 0; padding: 0; color: #d9f4e4;">TFIDF<h2/>

<p style="margin: 0; padding: 0; font-size: 16px; color: #549ef1;">Training</p>

In [28]:
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=50000
)

In [29]:
combines = meta_df[["title", "description", "brand", "imageURL"]]

In [30]:

combines['combined'] = meta_df['title'] + meta_df["description"] + meta_df['brand']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combines['combined'] = meta_df['title'] + meta_df["description"] + meta_df['brand']


In [31]:
combines.head(5)

Unnamed: 0,title,description,brand,imageURL,combined
0,slime time fall fest cdrom collector card neut...,,group publishing co,https://images-na.ssl-images-amazon.com/images...,slime time fall fest cdrom collector card neut...
1,xcc qi promise new spider snake preparing men ...,,,https://images-na.ssl-images-amazon.com/images...,xcc qi promise new spider snake preparing men ...
2,magical thing really,professional amateur magician routine include ...,christopher manos,https://images-na.ssl-images-amazon.com/images...,magical thing reallyprofessional amateur magic...
3,ash ash orange orange,,flickerlamp publishing,https://images-na.ssl-images-amazon.com/images...,ash ash orange orangeflickerlamp publishing
4,aether empire 1 2016 first printing comic book...,,,https://images-na.ssl-images-amazon.com/images...,aether empire 1 2016 first printing comic book...


In [32]:
combines.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 186637 entries, 0 to 186636
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   title        186637 non-null  object
 1   description  186637 non-null  object
 2   brand        186637 non-null  object
 3   imageURL     186637 non-null  object
 4   combined     186637 non-null  object
dtypes: object(5)
memory usage: 7.1+ MB


In [33]:
combines.isnull().sum()

title          0
description    0
brand          0
imageURL       0
combined       0
dtype: int64

In [34]:
tfidf_matrix_TDB = vectorizer.fit_transform(combines["combined"])

In [35]:
content_based_recommend_tdidf(5, 5, tfidf_matrix_TDB, combines)

slime time fall fest cdrom collector card neutron ball incredi ball glow stick necklace paper fram 



bentibo woman floral tribal aztec pattern printed legging stretch pocket jogger pant blue — similarity: 1.0


bentibo woman floral tribal aztec pattern printed legging stretch pocket jogger pant blue — similarity: 1.0


bentibo woman floral tribal aztec pattern printed legging stretch pocket jogger pant hot pink — similarity: 0.965


woman floral tribal aztec pattern printed legging stretch pocket jogger pant pink — similarity: 0.871


bentibo woman floral tribal aztec pattern printed legging stretch pocket jogger pant blue l — similarity: 0.856

------------------



xcc qi promise new spider snake preparing men accessory alloy fitting magnet buckle bracelet jewelry 



michael kor ring size 7 mkj2659791 — similarity: 1.0


michael kor bracelet mkj2928710 — similarity: 0.721


michael kor earring mkj1966040 — similarity: 0.718


michael kor earring mkj2855791 — similarity: 0.718


michael kor earring mkj1951931 — similarity: 0.718

------------------



magical thing really 



singlelady men leather g string thong underwear harness corset vest purple — similarity: 0.865


singlelady womens swim brief adjustable tie mini short swimwear black — similarity: 0.487


singlelady woman letter print baggy shirt swimwear bikini cover beach dress black — similarity: 0.475


singlelady woman tribal printed tankini boyshort bikini set b black — similarity: 0.472


wenmei men sexy harness jockstrap thong healthcare corset vest underwear black — similarity: 0.435

------------------



ash ash orange orange 



tommy hilfiger men classic fit jean 30x32 black — similarity: 0.721


tommy hilfiger men black tie — similarity: 0.661


tommy hilfiger womens handbag — similarity: 0.625


tommy hilfiger womens overdye pleated jean jacket blue — similarity: 0.6


tommy hilfiger men custom fit plaid long sleeve shirt xl blue white — similarity: 0.599

------------------



aether empire 1 2016 first printing comic book special edition rare blue juice comic 



shlax wing men tie white stripe paisley silk necktie 63 57 5 skinny — similarity: 0.77


shlax wing stripe men tie blue green silk silk necktie formal — similarity: 0.683


shlax wing men necktie geometric blue yellow silk tie skinny extra long — similarity: 0.664


shlax wing men necktie geometric blue yellow silk tie skinny extra long — similarity: 0.664


shlax wing necktie floral men tie navy dark blue geometric fashion dress — similarity: 0.648

------------------



<h2 style="margin: 0; padding: 0; color: #d9f4e4;">TFIDF Enhanced with weights<h2/>

In [36]:
combines['combined'] = (
    combines['title'] * 3 + " " +
    combines['brand'] * 2 + " " +
    combines['description']
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combines['combined'] = (


In [37]:
vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=3,
    ngram_range=(1, 2)   # bigrams + unigrams
)

In [38]:
tfidf_matrix_TDB = vectorizer.fit_transform(combines['combined'])

In [39]:
content_based_recommend_tdidf(5, 5, tfidf_matrix_TDB, combines)

slime time fall fest cdrom collector card neutron ball incredi ball glow stick necklace paper fram 



polo golf ralph lauren men stripe vintage pima lisle shirt black white large — similarity: 0.491


polo ralph lauren men stripe mesh pique big pony polo shirt large blue — similarity: 0.431


ralph lauren golf rlx men cypress golf short rp 34w — similarity: 0.423


albatross golf men midnight fade polo l — similarity: 0.387


polo ralph lauren men polo shirt classic fit lime green — similarity: 0.374

------------------



xcc qi promise new spider snake preparing men accessory alloy fitting magnet buckle bracelet jewelry 



unisex kid child hooded cloak role play costume halloween party cape red — similarity: 0.711


adult velvet hooded cloak — similarity: 0.429


hisionlee hot sexy halloween christmas party costume woman 1 pc dress — similarity: 0.29


pirate deluxe role play set — similarity: 0.251


pirate deluxe role play set — similarity: 0.251

------------------



magical thing really 



twisted straw hat assorted — similarity: 0.517


stetson graydon black straw hat 7 1 2 — similarity: 0.516


baja straw hat — similarity: 0.443


hand braided packable striped straw hat multicolor 22 — similarity: 0.376


safari straw hat natural band osfm — similarity: 0.375

------------------



ash ash orange orange 



gamt retro cat eye sunglass mirrored lens transparent frame desinger woman grey — similarity: 0.441


pandada woman metal frame retro square eyewear flower — similarity: 0.411


pandada woman metal frame retro square eyewear yellow gold — similarity: 0.402


clear lens woman fashion cat eye eyeglass frame retro style black red — similarity: 0.382


pandada woman metal frame retro square eyewear black silver — similarity: 0.382

------------------



aether empire 1 2016 first printing comic book special edition rare blue juice comic 



bluetime womens lapel campus casual plaid shirt slim fit flannel charming top blouse — similarity: 1.0


bluetime womens lapel campus casual plaid shirt slim fit flannel charming top blouse — similarity: 1.0


bluetime womens lapel campus casual plaid shirt slim fit flannel charming top blouse — similarity: 1.0


bluetime womens lapel campus casual plaid shirt slim fit flannel charming top blouse — similarity: 1.0


bluetime womens lapel campus casual plaid shirt slim fit flannel charming top blouse — similarity: 1.0

------------------



<h2 style="margin: 0; padding: 0; color: #d9f4e4;">TFIDF (With no description)<h2/>

In [40]:
combines['combined'] = (
    combines['title'] * 3 + " " +
    combines['brand'] * 2 + " " 
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combines['combined'] = (


In [41]:
tfidf_matrix_TB = vectorizer.fit_transform(combines['combined'])

In [42]:
content_based_recommend_tdidf(5, 5, tfidf_matrix_TB, combines)

slime time fall fest cdrom collector card neutron ball incredi ball glow stick necklace paper fram 



lowpricenice woman vintage lace ruffle frilly ankle sock purple — similarity: 0.641


lowpricenice woman vintage lace ruffle frilly ankle sock khaki — similarity: 0.616


lowpricenice woman vintage lace ruffle frilly ankle sock black — similarity: 0.603


lowpricenice woman vintage lace ruffle frilly ankle sock white — similarity: 0.602


soft frilly ankle princess girl sock lace ruffle lady — similarity: 0.598

------------------



xcc qi promise new spider snake preparing men accessory alloy fitting magnet buckle bracelet jewelry 



nike air max penny kid university red black white 6 u — similarity: 1.0


nike air max penny kid university red black white 4 5 u — similarity: 1.0


air max penny 685153 600size 12 — similarity: 0.546


nike air max motion — similarity: 0.38


nike air max wright — similarity: 0.369

------------------



magical thing really 



dressystar vintage multi floral print party cocktail dress casual swing dress size 6 v neckl — similarity: 0.991


dressystar vintage multi floral print party cocktail dress casual swing dress size 4 one l — similarity: 0.928


dressystar vintage multi floral print party cocktail dress casual swing dress size 2 v neck — similarity: 0.924


dressystar vintage multi floral print party cocktail dress casual swing dress size 8 v neck — similarity: 0.924


dressystar vintage multi floral print party cocktail dress casual swing dress size 2 heartl — similarity: 0.921

------------------



ash ash orange orange 



phoenix woman cotton sleeveless shut squat letter print tank top shirt — similarity: 1.0


koineco woman shut squat racerback red tank top medium — similarity: 0.564


phoenix woman cotton black hate 2 letter print shirt top shirt — similarity: 0.479


shop4ever shut squat woman racerback tank top gym workout tank top small charcoal 0 — similarity: 0.361


phoenix woman princess letter print round neck short sleeve top shirt — similarity: 0.35

------------------



aether empire 1 2016 first printing comic book special edition rare blue juice comic 



jovivi 7 chakras natural quartz gemstone bead pendant healing point chakra reiki flower style — similarity: 0.955


jovivi 7 chakras natural quartz gemstone bead pendant healing point chakra reiki godness moon style — similarity: 0.952


jovivi 7 chakras natural quartz gemstone bead pendant healing point chakra reiki raging fire disk style — similarity: 0.934


jovivi 7 chakras natural quartz gemstone bead pendant healing point chakra reiki sun flower style — similarity: 0.918


jovivi 7 chakras natural quartz gemstone bead pendant healing point chakra reiki om symbol human body style — similarity: 0.881

------------------



<h3 style="margin: 0; padding: 0; color: #d9f4e4;">Delete unneeded data<h3/>

In [43]:
del tfidf_matrix_TB
del vectorizer
# del tfidf_matrix_TDB
del random_descs

<h1 style="margin: 0; padding: 0; color: #f00;">Collaborative - Filtering - Recommendation<h1/>

<p style="margin: 0; padding: 0; font-size: 16px; color: #549ef1;">Training</p>

In [44]:
ratings_df = fashion_df[['reviewerID', 'asin', 'overall']]
ratings_df = ratings_df.dropna(subset=['reviewerID', 'asin'])

avg_rating = ratings_df['overall'].mean()
ratings_df['overall'] = ratings_df['overall'].fillna(avg_rating)

ratings_df.rename(columns={'reviewerID': 'user_id', 'asin': 'item_id', 'overall': 'rating'}, inplace=True)

In [45]:
ratings_df.head(5)

Unnamed: 0,user_id,item_id,rating
0,A1D4G1SNUZWQOT,7106116521,5
1,A3DDWDH9PX2YX2,7106116521,2
2,A2MWC41EW7XL15,7106116521,4
3,A2UH2QQ275NV45,7106116521,2
4,A89F3LQADZBS5,7106116521,3


In [46]:
ratings_df.isnull().sum()

user_id    0
item_id    0
rating     0
dtype: int64

In [47]:
product_df = meta_df[['item_id', 'title', 'brand', "description"]].dropna()

In [48]:
product_df.head()

Unnamed: 0,item_id,title,brand,description
0,764443682,slime time fall fest cdrom collector card neut...,group publishing co,
1,1291691480,xcc qi promise new spider snake preparing men ...,,
2,1940280001,magical thing really,christopher manos,professional amateur magician routine include ...
3,1940735033,ash ash orange orange,flickerlamp publishing,
4,1940967805,aether empire 1 2016 first printing comic book...,,


In [49]:
product_df.isnull().sum()

item_id        0
title          0
brand          0
description    0
dtype: int64

In [50]:
user_counts = ratings_df['user_id'].value_counts()
ratings_df = ratings_df[ratings_df['user_id'].isin(user_counts[user_counts >= 5].index)]

In [51]:
user_counts

user_id
A3G5KDMFNRUXHB    40
A3JBQHQZEZPQK4    36
A1RRX286ZRI830    32
A2PBHVTPTIIGKR    31
A2GP4EJIAA2OE0    29
                  ..
A3F8ZWBOAZUZE2     1
A2HI8EO6V5PST8     1
A3GKQ3VSTGLAHW     1
A3HEDQXYV9OCVR     1
A2UZ5SVYOG3748     1
Name: count, Length: 749233, dtype: int64

In [52]:
ratings_df.head(5)

Unnamed: 0,user_id,item_id,rating
114,AAQO19HKS86MQ,B00008JOQI,4
1345,A3FOL5CECUQJKV,B0006HB4XE,5
1519,AQG16QCMT344N,B0007MV6PO,5
1962,A2R0KB6P9AWB3N,B0008F6WMM,5
2218,A1KPFFU7NOVNCY,B0009A1EA6,3


In [53]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_df[['user_id', 'item_id', 'rating']], reader)

In [54]:
trainset, testset = train_test_split(data, test_size=0.2)

In [55]:
model = SVD()
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x795c0ab245d0>

In [56]:
predictions = model.test(testset)

In [57]:
accuracy.rmse(predictions)

RMSE: 1.1225


1.1224545358823024

In [58]:
for i in range(5):
    idx = random.randint(0, len(ratings_df) - 1)
    user_id = ratings_df.iloc[idx]['user_id']
    
    recommendations = collaborative_filtering_recommend(user_id, ratings_df, meta_df, model, top_n=5)

    print(f"\n🔹 User {user_id} has these recommendations:\n")
    for title, image in zip(recommendations['title'], recommendations['imageURL']):
        if isinstance(image, str):
            display(Image(url=image, width=100))
        print(f"{title[:50]}")


🔹 User A1688VED1CELM6 has these recommendations:



yoursfs red halo ring woman big zircon cz zirconia


mixmax woman flowy sheer crop sleeve loose chiffon


mj metal jewelry 2mm 10mm white tungsten carbide m


lnlclothing junior distressed skinny jean blue 9


allegra k woman round neck semi sheer batwing slee

🔹 User A1OYVYWR7L5RA2 has these recommendations:



var apagestart new date gettime var ue t0 ue t0 ne


nike woman flex supreme tr 4 cross trainer


nike woman flex supreme tr 4 cross trainer


allegra k fall winter woman stripe top patchwork b


nike woman flex supreme tr 4 cross trainer

🔹 User AZWP047VIDYQH has these recommendations:



mj metal jewelry 2mm 10mm white tungsten carbide m


nike woman flex supreme tr 4 cross trainer


nike woman flex supreme tr 4 cross trainer


nike woman flex supreme tr 4 cross trainer


nike woman flex supreme tr 4 cross trainer

🔹 User AXVYVUK058AM3 has these recommendations:



mj metal jewelry 2mm 10mm white tungsten carbide m


lnlclothing junior distressed skinny jean blue 9


mj metal jewelry 2mm 10mm white tungsten carbide m


iyun tm hollow butterfly antique festoon bib choke


yazilind punk gothic style skull design silver cha

🔹 User AS1ZVSX5FA84W has these recommendations:



allegra k lady long sleeve letter pattern pullover


fashion golden brass beaded id stretch bracelet


nike woman flex supreme tr 4 cross trainer


chunky twist tribal antique gold bib statement uni


havaianas woman top flip flop sandal


In [59]:
param_grid = {
    'n_factors': [50, 100],
    'reg_all': [0.02, 0.1],
    'n_epochs': [1, 5]
}

gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
gs.fit(data)

print("Best RMSE:", gs.best_score['rmse'])
print("Best params:", gs.best_params['rmse'])

model_svd_best = gs.best_estimator['rmse']
model_svd_best.fit(trainset)

Best RMSE: 1.1956624309217607
Best params: {'n_factors': 100, 'reg_all': 0.02, 'n_epochs': 5}


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x795c0ca1a750>

In [60]:
predictions = model_svd_best.test(testset)

In [61]:
accuracy.rmse(predictions)

RMSE: 1.2006


1.2005813410067188

In [62]:
for i in range(5):
    idx = random.randint(0, len(ratings_df) - 1)
    user_id = ratings_df.iloc[idx]['user_id']
    
    recommendations = collaborative_filtering_recommend(user_id, ratings_df, meta_df, model_svd_best, top_n=5)

    print(f"\n🔹 User {user_id} has these recommendations:\n")
    for title, image in zip(recommendations['title'], recommendations['imageURL']):
        if isinstance(image, str):
            display(Image(url=image, width=100))
        print(f"{title}")


🔹 User AL1WTXQJCKT8X has these recommendations:



alpha industry men tall size 65 thigh length zip field coat pocket


allegra k woman round neck semi sheer batwing sleeve blouse xl green white


qiyun z tribal jewelry flat round dangle coin tassel pendant chain necklace woman


aberry white black cutout design two piece bikini set woman white xxl


tiffany tf4068b woman butterfly black sunglass 58mm 80553c

🔹 User A3NMTCFU9VWTXB has these recommendations:



var apagestart new date gettime var ue t0 ue t0 new date window ue ihb window ue ihb window ueinit 0 1 window ue ihb 1 var ue csm window ue hob new date function var e ue ue f date function return new date e function b return f b 0 ue t0 e stub function b b var c b function c push c slice call argument e ue id b replay function b var c shift b 0 1 2 b isstub 1 e exec function b return function 1 window ueinit try return b apply argument catch c uelogerror c attribution undefined loglevel warn ue csm var ue err chan jserr rw function e function h f b ec mxe f ter push f b b var c f loglevel b loglevel c c k c c n c p ec c c k ecf b pageurl e location e location href b loglevel c b attribution f attribution b attribution erl push ex f info b function l b c e g uelogerror f b l c c e err g fromonerror 1 args argument g attribution g attribution loglevel g loglevel void 0 return 1 var k fatal error n warn p downgraded ec 0 ecf 0 pec 0 t 0 erl ter mxe 50 starttimer function t setinterval fu

woman fashion trendy lace tea floral flower pleated midi chiffon skirt usa wht


persun woman plunge neck pullover sweater top lace long sleeve knit mini dress


genuine mother pearl stone chip shell crescent stretch bracelet one size fit


susenstone fashion pierced heart pearl love friendship leather charm bracelet

🔹 User A3KWGJH9EBRLE8 has these recommendations:



yazilind woman jewelry multi strand chain collar bib temperament necklace


havaianas woman top flip flop sandal


funoc new lady womens pocket slim fit double breasted trench coat jacket outwear


yayun yayu woman sleeveless loose solid flare tunic top r flesh xl


yazilind jewelry christmas gift rose gold plated glaring multicolor crystal cute starfish brooch pin woman girl

🔹 User A1HXVAMH5RZR96 has these recommendations:



var apagestart new date gettime var ue t0 ue t0 new date window ue ihb window ue ihb window ueinit 0 1 window ue ihb 1 var ue csm window ue hob new date function var e ue ue f date function return new date e function b return f b 0 ue t0 e stub function b b var c b function c push c slice call argument e ue id b replay function b var c shift b 0 1 2 b isstub 1 e exec function b return function 1 window ueinit try return b apply argument catch c uelogerror c attribution undefined loglevel warn ue csm var ue err chan jserr rw function e function h f b ec mxe f ter push f b b var c f loglevel b loglevel c c k c c n c p ec c c k ecf b pageurl e location e location href b loglevel c b attribution f attribution b attribution erl push ex f info b function l b c e g uelogerror f b l c c e err g fromonerror 1 args argument g attribution g attribution loglevel g loglevel void 0 return 1 var k fatal error n warn p downgraded ec 0 ecf 0 pec 0 t 0 erl ter mxe 50 starttimer function t setinterval fu

powerstep pinnacle orthotic shoe insole


queenmore woman bodycon midi bandage clubwear party evening tank dress slit back large white


roewell baby headband girl cute hair bow hair band newborn headband 9 pack


jovivi black obsidian 7 row irregular chip bracelet

🔹 User A2QHGKRYPCI5NZ has these recommendations:



miusol woman lapel plaid pattern vintage slimming pencil dress


powerstep pinnacle orthotic shoe insole


nike woman flex supreme tr 4 cross trainer


qiyun z tribal jewelry flat round dangle coin tassel pendant chain necklace woman


cutecc woman sexy deep v neck sleeveless fashion casual summer tank top shirt grey


<h3 style="margin: 0; padding: 0; color: #d9f4e4;">Delete unneeded data<h3/>

In [63]:
del model

<h1 style="margin: 0; padding: 0; color: #f00;">Hybrid - Recommendation<h1/>

In [109]:

def hybrid_recommend(user_id, tfidf_matrix, items_ordered, ratings_df, meta_df, cf_model, top_n=10, alpha=0.7, candidate_size=1000, rating_scale=(1.0, 5.0), rating_weighting='global_center'):
    item_to_idx = build_tfidf_index_map(items_ordered)
    all_items = np.array(items_ordered)

    rated_items = ratings_df[ratings_df['user_id'] == user_id]['item_id'].unique()
    candidate_mask = ~np.isin(all_items, rated_items)
    candidate_items = all_items[candidate_mask]

    user_profile = build_user_content_vector(user_id, ratings_df, item_to_idx, tfidf_matrix, rating_weighting)
    if user_profile is None:
        fallback = (ratings_df.groupby('asin')['rating']
                    .mean()
                    .reset_index()
                    .rename(columns={'rating': 'avg_rating'}))
        top = fallback.merge(meta_df, on='item_id', how='left').sort_values('avg_rating', ascending=False).head(top_n)
        top['hybrid_score'] = top['avg_rating']
        return top[['asub','title','brand','hybrid_score']]
        
    candidate_indices = [item_to_idx[it] for it in candidate_items if it in item_to_idx]
    if len(candidate_indices) == 0:
        return pd.DataFrame(columns=['asin','title','brand','hybrid_score'])

    tfidf_candidates = tfidf_matrix[candidate_indices]
    cb_scores = cosine_similarity(user_profile, tfidf_candidates).flatten()

    top_k = min(candidate_size, len(candidate_indices))
    top_k_idx_local = np.argpartition(-cb_scores, top_k-1)[:top_k] 
    top_candidate_indices = [candidate_indices[i] for i in top_k_idx_local]
    top_candidate_items = [all_items[idx] for idx in top_candidate_indices]
    top_candidate_cb_scores = cb_scores[top_k_idx_local]

    cf_predictions = []
    for item_id in top_candidate_items:
        pred = cf_model.predict(user_id, item_id)
        cf_pred = pred.est
        cf_predictions.append(cf_pred)
    cf_predictions = np.array(cf_predictions)

    rmin, rmax = rating_scale
    cb_norm = np.clip(top_candidate_cb_scores, 0.0, 1.0)
    cb_scaled = cb_norm * (rmax - rmin) + rmin

    hybrid_scores = alpha * cf_predictions + (1.0 - alpha) * cb_scaled
    top_order = np.argsort(-hybrid_scores)[:top_n]

    selected_items = [top_candidate_items[i] for i in top_order]
    selected_scores = hybrid_scores[top_order]

    result = pd.DataFrame({
        'asin': selected_items,
        'hybrid_score': selected_scores
    })

    display(meta_df.head())
    display(result.head())

    result = result.merge(meta_df, left_on='asin', right_on='item_id', how='left')
    cols = ['item_id', 'title'] + ([c for c in ['brand','imageURL'] if c in result.columns]) + ['hybrid_score']
    return result[cols]

In [110]:
idx = random.randint(0, len(ratings_df) - 1)
user_id = ratings_df.iloc[idx]['user_id']
items_ordered = meta_df['item_id'].tolist()
# print(ratings_df)
recommendations = hybrid_recommend(
    user_id=user_id,
    tfidf_matrix=tfidf_matrix_TDB,
    items_ordered=items_ordered,
    ratings_df=ratings_df,
    meta_df=meta_df,
    cf_model=model_svd_best,
    top_n=10,
    alpha=0.7,
    candidate_size=2000,      # مرشحين قبل التصفية
    rating_scale=(1.0, 5.0),  # نفس نطاق التقييم
    rating_weighting='global_center'
)
    
print(recommendations[['title', 'brand', 'hybrid_score']])

Unnamed: 0,title,brand,feature,rank,date,item_id,imageURL,imageURLHighRes,description,price,also_view,also_buy,fit,details,similar_item,tech1
0,slime time fall fest cdrom collector card neut...,group publishing co,[Product Dimensions:\n \n8....,"13,052,976inClothing,Shoesamp;Jewelry(",8.70 inches,764443682,https://images-na.ssl-images-amazon.com/images...,[https://images-na.ssl-images-amazon.com/image...,,,,,,,,
1,xcc qi promise new spider snake preparing men ...,,,"11,654,581inClothing,Shoesamp;Jewelry(",5 star,1291691480,https://images-na.ssl-images-amazon.com/images...,[https://images-na.ssl-images-amazon.com/image...,,,,,,,,
2,magical thing really,christopher manos,[Package Dimensions:\n \n8....,"19,308,073inClothing,ShoesJewelry(",5 star,1940280001,https://images-na.ssl-images-amazon.com/images...,[https://images-na.ssl-images-amazon.com/image...,professional amateur magician routine include ...,,,,,,,
3,ash ash orange orange,flickerlamp publishing,[Package Dimensions:\n \n8....,"19,734,184inClothing,ShoesJewelry(",5 star,1940735033,https://images-na.ssl-images-amazon.com/images...,[https://images-na.ssl-images-amazon.com/image...,,,,,,,,
4,aether empire 1 2016 first printing comic book...,,[Package Dimensions:\n \n10...,"10,558,646inClothing,Shoesamp;Jewelry(",5 star,1940967805,https://images-na.ssl-images-amazon.com/images...,[https://images-na.ssl-images-amazon.com/image...,,$4.50,,,,,,


Unnamed: 0,asin,hybrid_score
0,B00ZFXA6TW,3.467459
1,B00ZFXAGH4,3.452628
2,B00ZFXA9RG,3.448993
3,B00ZFXA976,3.444788
4,B01HGQRK80,3.440759


                                               title           brand  \
0  zmart woman ruffle sleeve chiffon blouse shirt...           zmart   
1  zmart woman ruffle sleeve chiffon blouse shirt...           zmart   
2  zmart woman ruffle sleeve chiffon blouse shirt...           zmart   
3  zmart woman ruffle sleeve chiffon blouse shirt...           zmart   
4  lisingtool woman letter printing short sleeve ...      lisingtool   
5  baomabao woman round neck letter printing shor...  baomabao shirt   
6  baomabao woman round neck letter printing shor...  baomabao shirt   
7  lilbetter woman loose casual short sleeve chif...       lilbetter   
8  pattyboutik woman v neck cut shoulder short sl...                   
9  zmart woman ruffle sleeve chiffon blouse shirt...           zmart   

   hybrid_score  
0      3.467459  
1      3.452628  
2      3.448993  
3      3.444788  
4      3.440759  
5      3.410087  
6      3.407774  
7      3.407252  
8      3.400645  
9      3.399613  
