In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
data = pd.read_csv("shoes.csv")
data

Unnamed: 0,Brand_Name,How_Many_Sold,Current_Price,Product_details,RATING
0,ASIAN,2242,"₹1,098","Oxygen-01 Sports Running,Walking & Gym Shoes w...",3.8
1,ASIAN,240,₹674,"Men's Express-08 Sports Running,Walking,Gym,Tr...",4.0
2,ASIAN,16662,₹588,"Men's Cosko Sports Running,Walking,Gym,Trainin...",3.8
3,ASIAN,135,₹599,"Wind-03 Sports Running,Walking & Gym Shoes wit...",4.0
4,Reebok,240,₹982,Men's Velocity Runner Lp Running Shoe,4.0
...,...,...,...,...,...
23935,Campus,198,₹623,Men's HARVEL PRO Running Shoes,4.0
23936,Wakefield,49,₹399,Stylish Running Sports Shoes for MEN-2010,3.4
23937,URJO,557,₹839,Mens Running Shoes,4.2
23938,FEETEES,63,₹349,Nexon Men's Casual Eva Socks Knitted Running S...,3.6


In [3]:
selected_col = ['Brand_Name', 'Product_details']

df = data[selected_col]
df

Unnamed: 0,Brand_Name,Product_details
0,ASIAN,"Oxygen-01 Sports Running,Walking & Gym Shoes w..."
1,ASIAN,"Men's Express-08 Sports Running,Walking,Gym,Tr..."
2,ASIAN,"Men's Cosko Sports Running,Walking,Gym,Trainin..."
3,ASIAN,"Wind-03 Sports Running,Walking & Gym Shoes wit..."
4,Reebok,Men's Velocity Runner Lp Running Shoe
...,...,...
23935,Campus,Men's HARVEL PRO Running Shoes
23936,Wakefield,Stylish Running Sports Shoes for MEN-2010
23937,URJO,Mens Running Shoes
23938,FEETEES,Nexon Men's Casual Eva Socks Knitted Running S...


In [4]:
df.loc[:,'combined'] =df[selected_col].apply(lambda x: ' '.join(x.astype(str)), axis=1)
df["id"] = df.index

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:,'combined'] =df[selected_col].apply(lambda x: ' '.join(x.astype(str)), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["id"] = df.index


In [5]:
df = df.iloc[:2000]
df.drop_duplicates
df

Unnamed: 0,Brand_Name,Product_details,combined,id
0,ASIAN,"Oxygen-01 Sports Running,Walking & Gym Shoes w...","ASIAN Oxygen-01 Sports Running,Walking & Gym S...",0
1,ASIAN,"Men's Express-08 Sports Running,Walking,Gym,Tr...","ASIAN Men's Express-08 Sports Running,Walking,...",1
2,ASIAN,"Men's Cosko Sports Running,Walking,Gym,Trainin...","ASIAN Men's Cosko Sports Running,Walking,Gym,T...",2
3,ASIAN,"Wind-03 Sports Running,Walking & Gym Shoes wit...","ASIAN Wind-03 Sports Running,Walking & Gym Sho...",3
4,Reebok,Men's Velocity Runner Lp Running Shoe,Reebok Men's Velocity Runner Lp Running Shoe,4
...,...,...,...,...
1995,Sparx,Men SM-661 Sports Shoes,Sparx Men SM-661 Sports Shoes,1995
1996,Sparx,Men's Sm-680 Running Shoe,Sparx Men's Sm-680 Running Shoe,1996
1997,Sparx,Men's Shoes Running,Sparx Men's Shoes Running,1997
1998,D Shoes,Sports Running Shoes for Men's,D Shoes Sports Running Shoes for Men's,1998


In [6]:
encoder = OneHotEncoder()
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['combined'])
cosine_similarities = cosine_similarity(tfidf_matrix)

In [7]:
contents = df['combined'].tolist()
contents_encoded = encoder.fit_transform([[content] for content in contents])

# Print one-hot encoded matrix
print("One-Hot Encoded Categories:")
print(contents_encoded.toarray())

One-Hot Encoded Categories:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [8]:
def find_similar_items(id, df, cosine_similarities):
    idx = df.index.get_loc(df[df['id'] == id].index[0])
    sim_scores = list(enumerate(cosine_similarities[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:5]  # Get top 5 similar items (excluding the item itself)

    print("=== Item: ")
    print("{}: {}".format(idx, df['Brand_Name'].iloc[idx]))

    print("=== Similar Items")
    for i, score in sim_scores:
        print("{}: {} {:.2f}".format(i, df['Brand_Name'].iloc[i], score))

# Find similar items for a given id
find_similar_items(1, df, cosine_similarities)

=== Item: 
1: ASIAN
=== Similar Items
32: ASIAN 1.00
61: ASIAN 1.00
92: ASIAN 1.00
121: ASIAN 1.00


In [9]:
# def search_items(search_word, df):
#     results = []
#     for index, row in df.iterrows():
#         content = row['combined']
#         if search_word.lower() in content.lower():
#             results.append(index)
#     return results
# search_items("run", df)


In [10]:
def recommend_products(query, cosine_similarities, df):
    
    search_query = []
    
    for index, row in df.iterrows():
        content = row['combined']
        if query.lower() in content.lower():
            search_query.append(index)
    
   
    results = []
    if search_query:
        for index in search_query:
            similar_items = list(enumerate(cosine_similarities[df.index.get_loc(index)]))
            sorted_items = sorted(similar_items, key=lambda x: x[1], reverse=True)
            results.extend(sorted_items)
        result = list(set(results))
        return sorted(result, key=lambda x: x[1], reverse=True)
    
    
    else:
        for index, row in df.iterrows():
            content = row['combined']
            if any(word in content for word in query.split() ):
                similar_items = list(enumerate(cosine_similarities[df.index.get_loc(index)]))
                sorted_items = sorted(similar_items, key=lambda x: x[1], reverse=True)
                results.extend(sorted_items)

        result = list(set(results))
        return sorted(result, key=lambda x: x[1], reverse=True)

In [15]:
# Test the recommend_products function
query = 'walking shoes'
recommended_products = recommend_products(query, cosine_similarities, df)

# Print the results
print("Recommended Products:")
for i, score in recommended_products :
        print("{}: {} {:.2f}".format(i, df['Brand_Name'].iloc[i], score))

Recommended Products:
1706: Campus 1.00
1766: Campus 1.00
926: Campus 1.00
1526: Campus 1.00
86: Campus 1.00
1586: Campus 1.00
146: Campus 1.00
746: Campus 1.00
1346: Campus 1.00
1406: Campus 1.00
566: Campus 1.00
1466: Campus 1.00
626: Campus 1.00
1286: Campus 1.00
1886: Campus 1.00
446: Campus 1.00
1946: Campus 1.00
1106: Campus 1.00
266: Campus 1.00
326: Campus 1.00
26: Campus 1.00
686: Campus 1.00
986: Campus 1.00
506: Campus 1.00
1646: Campus 1.00
806: Campus 1.00
1166: Campus 1.00
1226: Campus 1.00
1826: Campus 1.00
386: Campus 1.00
1046: Campus 1.00
206: Campus 1.00
866: Campus 1.00
1094: Campus 0.41
791: Campus 0.41
1634: Campus 0.41
1451: Campus 0.41
11: Campus 0.41
611: Campus 0.41
854: Campus 0.41
1934: Campus 0.41
1454: Campus 0.41
1271: Campus 0.41
14: Campus 0.41
431: Campus 0.41
674: Campus 0.41
494: Campus 0.41
794: Campus 0.41
1931: Campus 0.41
491: Campus 0.41
1991: Campus 0.41
551: Campus 0.41
1334: Campus 0.41
1151: Campus 0.41
1751: Campus 0.41
311: Campus 0.41
199