In [69]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors  
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD

In [39]:
path =  './Myntra_Fashion_Clothing.csv'
clothing_df= pd.read_csv(path)
clothing_df


Unnamed: 0,URL,Product_id,BrandName,Category,Individual_category,category_by_Gender,Description,DiscountPrice (in Rs),OriginalPrice (in Rs),DiscountOffer,SizeOption,Ratings,Reviews
0,https://www.myntra.com/jeans/roadster/roadster...,2296012,Roadster,Bottom Wear,jeans,Men,roadster men navy blue slim fit mid rise clean...,824.0,1499.0,45% OFF,"28, 30, 32, 34, 36",3.9,999.0
1,https://www.myntra.com/track-pants/locomotive/...,13780156,LOCOMOTIVE,Bottom Wear,track-pants,Men,locomotive men black white solid slim fit tra...,517.0,1149.0,55% OFF,"S, M, L, XL",4.0,999.0
2,https://www.myntra.com/shirts/roadster/roadste...,11895958,Roadster,Topwear,shirts,Men,roadster men navy white black geometric print...,629.0,1399.0,55% OFF,"38, 40, 42, 44, 46, 48",4.3,999.0
3,https://www.myntra.com/shapewear/zivame/zivame...,4335679,Zivame,Lingerie & Sleep Wear,shapewear,Women,zivame women black saree shapewear zi3023core0...,893.0,1295.0,31% OFF,"S, M, L, XL, XXL",4.2,999.0
4,https://www.myntra.com/trousers/highlander/hig...,6744434,HIGHLANDER,Bottom Wear,trousers,Men,highlander men olive green slim fit solid regu...,599.0,1499.0,60% OFF,"30, 32, 34, 36",3.9,998.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
117536,https://www.myntra.com/tshirts/hrx-by-hrithik-...,8379269,HRX by Hrithik Roshan,Sports Wear,tshirts,Women,hrx by hrithik roshan women navy blue nautical...,404.0,899.0,55% OFF,"XS, S, M, L, XL",4.4,0.0
117537,https://www.myntra.com/track-pants/stylestone/...,12767048,StyleStone,Sports Wear,track-pants,Women,stylestone women black solid track pants,467.0,899.0,48% OFF,"S, M, L, XL",4.2,0.0
117538,https://www.myntra.com/tshirts/hrx-by-hrithik-...,10106141,HRX by Hrithik Roshan,Sports Wear,tshirts,Women,hrx by hrithik roshan women black green print...,404.0,899.0,55% OFF,"S/M, L/XL",4.4,0.0
117539,https://www.myntra.com/tshirts/hrx-by-hrithik-...,11640324,HRX by Hrithik Roshan,Sports Wear,tshirts,Women,hrx by hrithik roshan women north sea printed ...,494.0,899.0,45% OFF,"XS, S, M, L, XL",4.4,0.0


In [41]:
# Split the data
train_data, test_data = train_test_split(clothing_df, test_size=0.2, random_state=42)


In [42]:
# Create a pivot table for product-product interactions using both Ratings and Reviews for training data
train_features = train_data[['Product_id', 'Ratings', 'Reviews']].set_index('Product_id').fillna(0)

# Convert to sparse matrix representation
train_features_sparse = csr_matrix(train_features.values)


In [44]:
# Use NearestNeighbors for approximate nearest neighbors
nn = NearestNeighbors(metric='cosine', algorithm='brute')
nn.fit(train_features_sparse)

In [45]:
# Apply Truncated SVD to reduce dimensions for the training set
#svd = TruncatedSVD(n_components=100, random_state=42)
# train_features_reduced = svd.fit_transform(train_features_sparse)

In [47]:
# Get distances and indices of nearest neighbors for a sample item
distances, indices = nn.kneighbors(train_features_sparse[0], n_neighbors=10)

# Print the results for the first item
print("Distances: ", distances)
print("Indices: ", indices)

Distances:  [[1.11022302e-16 1.11022302e-16 1.11022302e-16 1.11022302e-16
  1.11022302e-16 1.11022302e-16 1.11022302e-16 1.11022302e-16
  1.11022302e-16 1.11022302e-16]]
Indices:  [[51097 56577  8484 58428 32833 90994 40662  2501 28007 41054]]


In [48]:
def get_recommendations(item_index, model, data, n_neighbors=10):
    distances, indices = model.kneighbors(data[item_index], n_neighbors=n_neighbors)
    return indices.flatten(), distances.flatten() 


In [49]:
# Example usage for the first item
item_index = 0
recommended_indices, recommended_distances = get_recommendations(item_index, nn, train_features_sparse)
print("Recommended indices:", recommended_indices)
print("Recommended distances:", recommended_distances) 

Recommended indices: [51097 56577  8484 58428 32833 90994 40662  2501 28007 41054]
Recommended distances: [1.11022302e-16 1.11022302e-16 1.11022302e-16 1.11022302e-16
 1.11022302e-16 1.11022302e-16 1.11022302e-16 1.11022302e-16
 1.11022302e-16 1.11022302e-16]


In [51]:
recommendations = {}

for idx in range(train_features_sparse.shape[0]):
    recommended_indices, _ = get_recommendations(idx, nn, train_features_sparse)
    recommendations[idx] = recommended_indices[1:]  


In [55]:
# Function to get similar products with descriptions
def get_similar_products_with_descriptions(product_id, n_neighbors=10):
    product_index = train_features.index.get_loc(product_id)
    recommended_indices, _ = get_recommendations(product_index, nn, train_features_sparse, n_neighbors=n_neighbors)
    recommended_products = [train_features.index[idx] for idx in recommended_indices[1:]]
    
    # Get descriptions for the recommended products
    similar_products_with_descriptions = []
    for product in recommended_products:
        description = clothing_df.loc[clothing_df['Product_id'] == product, 'Description'].values[0]
        similar_products_with_descriptions.append((product, description))
    
    return similar_products_with_descriptions

In [70]:
# Example: Get top 5 similar products for a given product_id
product_id_example = train_data['Product_id'].iloc[790]
similar_products = get_similar_products_with_descriptions(product_id_example, n_neighbors=5)
for product in similar_products:
    print(f"Product ID: {product[0]}, Description: {product[1]}")

Product ID: 14121958, Description: anouk men black printed regular pure cotton kurta with pyjamas
Product ID: 9823649, Description: sztori plus size women navy blue skinny fit mid rise clean look stretchable jeans
Product ID: 10931658, Description: dressberry beige self design non wired lightly padded everyday bra
Product ID: 12142384, Description: levis men pack of 2 black solid basic briefs 017 brief


In [60]:
import numpy as np

In [68]:
# Evaluate using RMSE
def calculate_rmse(test_data, train_features_sparse, nn, k=5):
    errors = []
    for product_id in test_data['Product_id']:
        if product_id not in train_features.index:
            continue
        similar_products = get_similar_products_with_descriptions(product_id, n_neighbors=k)
        if not similar_products:
            continue
        predicted_rating = np.mean([train_data.loc[train_data['Product_id'] == sp[0], 'Ratings'].values[0] for sp in similar_products])
        actual_rating = test_data[test_data['Product_id'] == product_id]['Ratings'].values[0]
        errors.append((predicted_rating - actual_rating) ** 2)
    return np.sqrt(np.mean(errors))

RMSE: nan


In [62]:
# Example: Calculate RMSE for the test set
rmse = calculate_rmse(test_data, train_features_sparse, nn, k=5)
print(f'RMSE: {rmse}')

RMSE: nan


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [53]:
import pickle

In [54]:
# Save the recommendations dictionary to a file
with open('recommendations_cf_with_outliers.pkl', 'wb') as f:
    pickle.dump(recommendations, f)

print("Recommendations saved successfully.")


Recommendations saved successfully.


In [52]:
# Calculate the cosine similarity matrix based on reduced matrix for the training set
#train_similarity_sparse = cosine_similarity(train_features_sparse)

In [None]:
# Convert to DataFrame for easier manipulation
product_similarity_df = pd.DataFrame(product_similarity_sparse, index=product_features.index, columns=product_features.index)

In [8]:

# Convert to DataFrame for easier manipulation
product_similarity_df = pd.DataFrame(product_similarity, index=product_features.index, columns=product_features.index)



NameError: name 'product_similarity' is not defined

In [9]:
# Example: Get similar products for a given product_id
def get_similar_products(product_id, num_similar=5):
    similar_products = product_similarity_df[product_id].sort_values(ascending=False)[1:num_similar+1]
    return similar_products


In [10]:

# Get top 5 similar products for a given product_id
similar_products = get_similar_products(2296012, num_similar=5)
print(similar_products)

NameError: name 'product_similarity_df' is not defined