In [1]:
pip install pandas scikit-learn surprise

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split

In [19]:
products = pd.read_csv("walmart__product_review_data.tsv",sep ="\t")  # Change this to your actual file path

In [21]:
updated_products = products[['Uniq Id', 'Crawl Timestamp','Product Id', 'Product Name', 'Product Rating', 'Product Reviews Count', 'Product Category', 'Product Brand', 'Product Image Url',  'Product Description', 'Product Price']]

In [23]:
updated_products.head()

Unnamed: 0,Uniq Id,Crawl Timestamp,Product Id,Product Name,Product Rating,Product Reviews Count,Product Category,Product Brand,Product Image Url,Product Description,Product Price
0,1705736792d82aa2f2d3caf1c07c53f4,2020-09-24 03:21:12 +0000,2e17bf4acecdece67fc00f07ad62c910,"OPI Infinite Shine, Nail Lacquer Nail Polish, ...",,,Premium Beauty > Premium Makeup > Premium Nail...,OPI,https://i5.walmartimages.com/asr/0e1f4c51-c1a4...,,8.95
1,95a9fe6f4810fcfc7ff244fd06784f11,2020-10-30 14:04:08 +0000,076e5854a62dd283c253d6bae415af1f,"Nice n Easy Permanent Color, 111 Natural Mediu...",,,Beauty > Hair Care > Hair Color > Auburn Hair ...,Nice'n Easy,https://i5.walmartimages.com/asr/9c8e42e4-13a5...,Pack of 3 Pack of 3 for the UPC: 381519000201 ...,29.86
2,8d4d0330178d3ed181b15a4102b287f2,2020-08-06 05:51:47 +0000,8a4fe5d9c7a6ed26cc44d785a454b124,Clairol Nice N Easy Permanent Color 7/106A Nat...,4.5,29221.0,Beauty > Hair Care > Hair Color > Permanent Ha...,Clairol,https://i5.walmartimages.com/asr/e3a601c2-6a2b...,This Clairol Nice N Easy Permanent Color gives...,7.99
3,fddc4df45b35efd886794b261f730c51,2020-07-15 11:22:04 +0000,03b5fb878a33eadff8b033419eab9669,"Kokie Professional Matte Lipstick, Hot Berry, ...",,,Beauty > Makeup > Lip,Kokie Cosmetics,https://i5.walmartimages.com/asr/25b4b467-bc61...,Calling all matte lip lovers! Indulge in our r...,5.16
4,0990cf89a59ca6a0460349a3e4f51d42,2020-11-26T12:27:20+00:00,ce3d761e57d6ccad80619297b5b1bcbc,"Gillette TRAC II Plus Razor Blade Refills, Fit...",,131.0,Seasonal > Stock Up Essentials > Personal Care...,Gillette,https://i5.walmartimages.com/asr/1a2ebb06-cd01...,"In 1971, Gillette introduced the Trac II razor...",19.97


In [25]:
#null values check
updated_products.isnull().sum()

Uniq Id                     0
Crawl Timestamp             0
Product Id                  0
Product Name                0
Product Rating           2806
Product Reviews Count    1654
Product Category           10
Product Brand              13
Product Image Url           0
Product Description      1127
Product Price              42
dtype: int64

In [27]:
#to fill the missing values

updated_products['Product Category'].fillna('', inplace=True)
updated_products['Product Brand'].fillna('', inplace=True)

#filled witin an emply string
updated_products['Product Description'].fillna('', inplace=True)

# Assuming df is your DataFrame
updated_products['Product Rating'].fillna(updated_products['Product Rating'].mean(), inplace=True)
updated_products['Product Reviews Count'].fillna(updated_products['Product Reviews Count'].median(), inplace=True)
updated_products['Product Price'].fillna(updated_products['Product Price'].mean(), inplace=True)


#null values check
updated_products.isnull().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  updated_products['Product Category'].fillna('', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  updated_products['Product Category'].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df

Uniq Id                  0
Crawl Timestamp          0
Product Id               0
Product Name             0
Product Rating           0
Product Reviews Count    0
Product Category         0
Product Brand            0
Product Image Url        0
Product Description      0
Product Price            0
dtype: int64

In [29]:
product_tags = (updated_products['Product Category'] + updated_products['Product Brand'] + updated_products['Product Description'] )
product_tags
updated_products['Product Tags'] = product_tags
updated_products.head(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  updated_products['Product Tags'] = product_tags


Unnamed: 0,Uniq Id,Crawl Timestamp,Product Id,Product Name,Product Rating,Product Reviews Count,Product Category,Product Brand,Product Image Url,Product Description,Product Price,Product Tags
0,1705736792d82aa2f2d3caf1c07c53f4,2020-09-24 03:21:12 +0000,2e17bf4acecdece67fc00f07ad62c910,"OPI Infinite Shine, Nail Lacquer Nail Polish, ...",4.294622,20.0,Premium Beauty > Premium Makeup > Premium Nail...,OPI,https://i5.walmartimages.com/asr/0e1f4c51-c1a4...,,8.95,Premium Beauty > Premium Makeup > Premium Nail...


In [31]:
#TO MAKE THE COLUMNS NAMES SHORTER
colummn_name_mapping={
    'Uniq Id':'ID',
    'Crawl Timestamp' : 'Timestamp', 
    'Product Id':'ProdID',
    'Product Rating':'Rating',
'Product Reviews Count':'ReviewCount',
       'Product Category':'Category',
    'Product Brand':'Brand',
    'Product Name':'Name',
       'Product Image Url':'ImageURL',
    'Product Description':'Description', 
    'Product Price' : 'Price',
    'Product Tags':'Tags',
   
}
#rename the cols
updated_products.rename(columns =colummn_name_mapping,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  updated_products.rename(columns =colummn_name_mapping,inplace=True)


In [33]:
num_unique_ids = updated_products['ID'].nunique()
total_ids = updated_products['ID'].shape[0]

if num_unique_ids == total_ids:
    print("No duplicates found in the ID column.")
else:
    print("Duplicates exist in the ID column.")


No duplicates found in the ID column.


In [35]:
import re
import nltk
from nltk.corpus import stopwords

# Download NLTK stopwords (only needs to be run once)
nltk.download('stopwords')

# Get English stop words from NLTK
stop_words = set(stopwords.words('english'))

# Function to clean and extract tags
def clean_and_extract_tags(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Use regular expressions to find words only (letters a-z)
    words = re.findall(r'\b[a-z]+\b', text)
    
    # Remove stop words
    tags = [word for word in words if word not in stop_words]
    
    return ','.join(tags)

# Apply the function to the required columns
columns_to_extract_tags_from = ['Category', 'Brand', 'Description', 'Tags']
for column in columns_to_extract_tags_from:
    updated_products[column] = updated_products[column].apply(clean_and_extract_tags)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  updated_products[column] = updated_products[column].apply(clean_and_extract_tags)


In [37]:

updated_products['Name'] = updated_products['Name'].str.split(',').str[0]

# Display the updated DataFrame to check the changes
print(updated_products['Name'])


0                                      OPI Infinite Shine
1                             Nice n Easy Permanent Color
2       Clairol Nice N Easy Permanent Color 7/106A Nat...
3                       Kokie Professional Matte Lipstick
4               Gillette TRAC II Plus Razor Blade Refills
                              ...                        
4995             Garden Mint Room Spray (Double Strength)
4996    Garnier Nutrisse Nourishing Hair Color Creme (...
4997                             Nail File Electric Drill
4998    Creed Love In Black Hair And Body Wash 6.8oz/2...
4999                                           Foundation
Name: Name, Length: 5000, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  updated_products['Name'] = updated_products['Name'].str.split(',').str[0]


In [39]:
# Define a mapping of categories to broader categories
category_mapping = {
    'beauty': 'Beauty & Personal Care',
    'makeup': 'Beauty & Personal Care',
    'nail': 'Beauty & Personal Care',
    'hair': 'Hair Care',
    'household': 'Household Essentials',
    'premium': 'Premium Products',
    'seasonal': 'Seasonal Products',
    'personal': 'Personal Care',
    'health': 'Health & Wellness',
    'home': 'Home & Garden',
    'patio': 'Outdoor Living',
    'clothing': 'Apparel & Accessories',
    'sports': 'Sports & Outdoors',
    'baby': 'Baby Products',
    'food': 'Food & Beverage',
    'auto': 'Automotive Products',
    'shop': 'Shopping Essentials',
    'gifts': 'Gifts & Specialty Items',
    'industrial': 'Industrial & Commercial Supplies',
    'pets': 'Pet Supplies',
    'character': 'Character Merchandise',
    'toys': 'Toys & Games',
    'arts': 'Arts & Crafts',
    'feature': 'Featured Products',
    'cell': 'Cell Phones & Accessories',
    'electronics': 'Electronics & Gadgets',
    'party': 'Party Supplies',
    'jewelry': 'Jewelry & Accessories',
    'books': 'Books & Stationery',
    'office': 'Office Supplies'
}

# Function to map only the first word of the category
def map_first_category(categories):
    first_word = categories.split(',')[0].strip()  # Get the first word
    return category_mapping.get(first_word, first_word)  # Map it using the mapping dictionary

# Apply the mapping function to the Category column
updated_products['Category'] = updated_products['Category'].apply(map_first_category)

# Optional: Display the updated DataFrame to check the changes
print(updated_products['Category'])


0             Premium Products
1       Beauty & Personal Care
2       Beauty & Personal Care
3       Beauty & Personal Care
4            Seasonal Products
                 ...          
4995      Household Essentials
4996    Beauty & Personal Care
4997    Beauty & Personal Care
4998          Premium Products
4999    Beauty & Personal Care
Name: Category, Length: 5000, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  updated_products['Category'] = updated_products['Category'].apply(map_first_category)


In [41]:
# Function to keep only unique words in the Description column
def keep_unique_words(description):
    # Split the description into words and convert to a set for uniqueness
    unique_words = set(description.split(','))
    # Join the unique words back into a string
    return ', '.join(unique_words)

# Apply the function to the Description column
updated_products['Description'] = updated_products['Description'].apply(keep_unique_words)

# Optional: Display the updated DataFrame to check the changes
print(updated_products['Description'])


0                                                        
1       gloss, copolymer, aminophenol, easy, beautiful...
2       apply, make, give, easy, colorings, hair, dark...
3       lip, punch, matte, rich, quit, highly, color, ...
4       always, well, perfect, handles, today, blades,...
                              ...                        
4995    usa, mint, contains, dye, home, spray, blend, ...
4996    shea, oils, nourishing, root, rich, oil, color...
4997    metal, bits, make, rough, chuck, spots, able, ...
4998                                                     
4999    usa, beige, medium, contains, com, dry, wont, ...
Name: Description, Length: 5000, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  updated_products['Description'] = updated_products['Description'].apply(keep_unique_words)


In [45]:
# Assuming 'all_products' is your DataFrame
updated_products['ImageURL'] = updated_products['ImageURL'].str.split('|').str[0]

# Display the updated DataFrame to check the changes
print(updated_products['ImageURL'])


0       https://i5.walmartimages.com/asr/0e1f4c51-c1a4...
1       https://i5.walmartimages.com/asr/9c8e42e4-13a5...
2       https://i5.walmartimages.com/asr/e3a601c2-6a2b...
3       https://i5.walmartimages.com/asr/25b4b467-bc61...
4       https://i5.walmartimages.com/asr/1a2ebb06-cd01...
                              ...                        
4995    https://i5.walmartimages.com/asr/0e0416ae-6b70...
4996    https://i5.walmartimages.com/asr/24d7a837-51f8...
4997    https://i5.walmartimages.com/asr/d6202179-2c93...
4998    https://i5.walmartimages.com/asr/3dc99239-66d2...
4999    https://i5.walmartimages.com/asr/a76122fd-322a...
Name: ImageURL, Length: 5000, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  updated_products['ImageURL'] = updated_products['ImageURL'].str.split('|').str[0]


In [49]:
#content
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(updated_products['Tags'])


In [51]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [65]:
def get_recommendations(title, cosine_sim=cosine_sim):
    idx = updated_products[updated_products['Name'] == title].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]  # Get top 10 recommendations
    product_indices = [i[0] for i in sim_scores]
    return df.iloc[product_indices]

In [67]:
#collab
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(updated_products[['ID', 'ProdID', 'Rating']], reader)

In [57]:
trainset, testset = train_test_split(data, test_size=0.2)

In [59]:
model = SVD()
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2563c25f4d0>

In [71]:
def get_collaborative_recommendations(user_id, num_recommendations=10):
    product_ids = updated_products['ProdID'].unique()
    predictions = [model.predict(user_id, prod_id) for prod_id in product_ids]
    predictions.sort(key=lambda x: x.est, reverse=True)
    recommended_product_ids = [pred[1] for pred in predictions[:num_recommendations]]
    return updated_products[updated_products['ProdID'].isin(recommended_product_ids)]


In [73]:
def hybrid_recommendation(title, user_id):
    content_recs = get_recommendations(title)
    collaborative_recs = get_collaborative_recommendations(user_id)
    
    # Merge and rank recommendations here
    # This can be done based on the scores you have from each method
    # For simplicity, let's just return the two sets combined
    return pd.concat([content_recs, collaborative_recs]).drop_duplicates().head(10)


In [83]:
print(get_recommendations("Product Name"))  # For content-based recommendations
print(get_collaborative_recommendations(user_id=1705736792d82aa2f2d3caf1c07c53f4))  # For collaborative filtering
print(hybrid_recommendation("Product Name", user_id=1705736792d82aa2f2d3caf1c07c53f4))  # For hybrid recommendations

SyntaxError: invalid decimal literal (4289237995.py, line 2)

In [81]:
updated_products['ID']

0       1705736792d82aa2f2d3caf1c07c53f4
1       95a9fe6f4810fcfc7ff244fd06784f11
2       8d4d0330178d3ed181b15a4102b287f2
3       fddc4df45b35efd886794b261f730c51
4       0990cf89a59ca6a0460349a3e4f51d42
                      ...               
4995    2771f0606e9638de508741f52029d51c
4996    0f218eb3ac736975ccfdde987baa4b83
4997    34d1aa70845416c3df059a088aaf18dc
4998    ff9cfa22550bf036e2487a9100d927f1
4999    0de862f8c1c4f23b6c4cfe59fd574b59
Name: ID, Length: 5000, dtype: object

In [87]:
# Create a mapping from complex IDs to sequential integers
updated_products['UserID'] = updated_products['ID'].astype('category').cat.codes


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  updated_products['UserID'] = updated_products['ID'].astype('category').cat.codes


In [89]:
# Create a mapping DataFrame
user_mapping = updated_products[['ID', 'UserID']].drop_duplicates().sort_values('UserID').reset_index(drop=True)
print(user_mapping)


                                    ID  UserID
0     0016a804b91dc7940315e093673d0c76       0
1     001bb3973ace78025455cd209ad24814       1
2     00261b76af7c600795d6f44dac3444dc       2
3     002d1e5fa94fb8adacbd90f0832a3c35       3
4     00424261c1918e579ab295ea2c35a2b3       4
...                                ...     ...
4995  ffcf16834a20b68a875f65b06fe93b13    4995
4996  ffd6745a7adeb08cef9fcdcfb52be162    4996
4997  ffdfcf9ab74213a313268f1ae65d558c    4997
4998  ffe30812974331718690097eb03c8239    4998
4999  fff427fe76ee4a875cdab7f65cf9c8c4    4999

[5000 rows x 2 columns]
