In [1]:
import pandas as pd
import numpy as np 

In [2]:
new_data = pd.read_csv('Ajio_Fasion.csv', encoding='latin1')
new_data = new_data.head(5000)

In [3]:
new_data.size

45000

In [4]:
new_data['Name'] = new_data['Description']
new_data = new_data[['Id_Product', 'Category_by_gender', 'Name', 'Brand', 'Color','Description', 'URL_image', 'Product_URL']]
new_data.head(1)

Unnamed: 0,Id_Product,Category_by_gender,Name,Brand,Color,Description,URL_image,Product_URL
0,441137362002,Men,Checked Polo T-shirt,netplay,white,Checked Polo T-shirt,https://assets.ajio.com/medias/sys_master/root...,https://www.ajio.com/netplay-checked-polo-t-sh...


In [5]:
column_name_mapping = {
    'Id_Product': 'ID',
    'Category_by_gender': 'Category',
}
new_data.rename(columns= column_name_mapping, inplace= True)

In [6]:
new_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ID           5000 non-null   int64 
 1   Category     5000 non-null   object
 2   Name         5000 non-null   object
 3   Brand        5000 non-null   object
 4   Color        5000 non-null   object
 5   Description  5000 non-null   object
 6   URL_image    5000 non-null   object
 7   Product_URL  5000 non-null   object
dtypes: int64(1), object(7)
memory usage: 312.6+ KB


In [7]:
new_data.dropna(inplace= True)

In [8]:
new_data.isnull().sum()

ID             0
Category       0
Name           0
Brand          0
Color          0
Description    0
URL_image      0
Product_URL    0
dtype: int64

In [9]:
new_data.drop_duplicates(inplace= True)

In [10]:
new_data.duplicated().sum()

np.int64(0)

In [11]:
new_data = new_data.drop_duplicates(subset='Name', keep='first')

In [12]:
new_data.head(1)

Unnamed: 0,ID,Category,Name,Brand,Color,Description,URL_image,Product_URL
0,441137362002,Men,Checked Polo T-shirt,netplay,white,Checked Polo T-shirt,https://assets.ajio.com/medias/sys_master/root...,https://www.ajio.com/netplay-checked-polo-t-sh...


In [13]:
new_data['Description'] = new_data['Description'].apply(lambda x:x.split())
new_data['Category'] = new_data['Category'].apply(lambda x:x.split())
new_data['Brand'] = new_data['Brand'].apply(lambda x:x.split())
new_data['Color'] = new_data['Color'].apply(lambda x:x.split())

# new_data.iloc[123]["Category"]
# new_data.iloc[123]["Category"]

In [14]:
new_data.head(1)

Unnamed: 0,ID,Category,Name,Brand,Color,Description,URL_image,Product_URL
0,441137362002,[Men],Checked Polo T-shirt,[netplay],[white],"[Checked, Polo, T-shirt]",https://assets.ajio.com/medias/sys_master/root...,https://www.ajio.com/netplay-checked-polo-t-sh...


In [15]:
# 'arun singh' convert into 'arunsingh'
new_data['Category'] = new_data['Category'].apply(lambda x:[i.replace(" ","") for i in x])
new_data['Brand'] = new_data['Brand'].apply(lambda x:[i.replace(" ","") for i in x])
new_data['Color'] = new_data['Color'].apply(lambda x:[i.replace(" ","") for i in x])
new_data['Description'] = new_data['Description'].apply(lambda x:[i.replace(" ","") for i in x])

In [16]:
new_data['Tags'] = new_data['Description'] + new_data['Category'] + new_data['Brand'] + new_data['Color']

In [17]:
new_data.head(2)

Unnamed: 0,ID,Category,Name,Brand,Color,Description,URL_image,Product_URL,Tags
0,441137362002,[Men],Checked Polo T-shirt,[netplay],[white],"[Checked, Polo, T-shirt]",https://assets.ajio.com/medias/sys_master/root...,https://www.ajio.com/netplay-checked-polo-t-sh...,"[Checked, Polo, T-shirt, Men, netplay, white]"
1,441124497006,[Men],Tapered Fit Flat-Front Trousers,[netplay],[navy],"[Tapered, Fit, Flat-Front, Trousers]",https://assets.ajio.com/medias/sys_master/root...,https://www.ajio.com/netplay-tapered-fit-flat-...,"[Tapered, Fit, Flat-Front, Trousers, Men, netp..."


In [18]:
new_df = new_data[['ID', 'Name', 'Category', 'Brand', 'Tags', 'URL_image', 'Product_URL']]

In [19]:
new_df['Tags'] = new_df['Tags'].apply(lambda x:" ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['Tags'] = new_df['Tags'].apply(lambda x:" ".join(x))


In [20]:
new_df['Tags'] = new_df['Tags'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['Tags'] = new_df['Tags'].apply(lambda x:x.lower())


In [21]:
new_df.head(1)

Unnamed: 0,ID,Name,Category,Brand,Tags,URL_image,Product_URL
0,441137362002,Checked Polo T-shirt,[Men],[netplay],checked polo t-shirt men netplay white,https://assets.ajio.com/medias/sys_master/root...,https://www.ajio.com/netplay-checked-polo-t-sh...


# Text Vectorization
# Bag of Words

In [22]:
import nltk

In [23]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [24]:
def stem(text):
    y = []

    for i in text.split():  # Added the colon here
        y.append(ps.stem(i))

    return " ".join(y)

In [25]:
new_df['Tags'] = new_df['Tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['Tags'] = new_df['Tags'].apply(stem)


In [26]:
new_df.shape

(2368, 7)

In [27]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2368, stop_words='english')

In [28]:
vectors = cv.fit_transform(new_df['Tags']).toarray()

In [29]:
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [30]:
new_df

Unnamed: 0,ID,Name,Category,Brand,Tags,URL_image,Product_URL
0,441137362002,Checked Polo T-shirt,[Men],[netplay],check polo t-shirt men netplay white,https://assets.ajio.com/medias/sys_master/root...,https://www.ajio.com/netplay-checked-polo-t-sh...
1,441124497006,Tapered Fit Flat-Front Trousers,[Men],[netplay],taper fit flat-front trouser men netplay navi,https://assets.ajio.com/medias/sys_master/root...,https://www.ajio.com/netplay-tapered-fit-flat-...
2,460453612003,Striped Slim Fit Shirt with Patch Pocket,[Men],[the-indian-garage-co],stripe slim fit shirt with patch pocket men th...,https://assets.ajio.com/medias/sys_master/root...,https://www.ajio.com/the-indian-garage-co-stri...
3,441036730003,Heathered Crew-Neck T-shirt,[Men],[performax],heather crew-neck t-shirt men performax charcoal,https://assets.ajio.com/medias/sys_master/root...,https://www.ajio.com/performax-heathered-crew-...
4,441128531011,Washed Skinny Fit Jeans with Whiskers,[Men],[john-players-jeans],wash skinni fit jean with whisker men john-pla...,https://assets.ajio.com/medias/sys_master/root...,https://www.ajio.com/john-players-jeans-washed...
...,...,...,...,...,...,...,...
4990,410294621035,New Ottoman Arctic Hooded Windcheater,[Men],[superdry],new ottoman arctic hood windcheat men superdri...,https://assets.ajio.com/medias/sys_master/root...,https://www.ajio.com/superdry-new-ottoman-arct...
4992,469082975002,Logo Print Crew-Neck Batwing T-shirt,[Men],[levis],logo print crew-neck batw t-shirt men levi blue,https://assets.ajio.com/medias/sys_master/root...,https://www.ajio.com/levis-logo-print-crew-nec...
4995,462002525004,Striped Short Sleeves Polo T-shirt,[Men],[campus-sutra],stripe short sleev polo t-shirt men campus-sut...,https://assets.ajio.com/medias/sys_master/root...,https://www.ajio.com/campus-sutra-striped-shor...
4996,441128174003,Lightly Washed Mid-Rise Skinny Jeans,[Men],[john-players-jeans],lightli wash mid-ris skinni jean men john-play...,https://assets.ajio.com/medias/sys_master/root...,https://www.ajio.com/john-players-jeans-lightl...


In [31]:
from sklearn.metrics.pairwise import cosine_similarity

In [32]:
similarity = cosine_similarity(vectors)

In [33]:
sorted(list(enumerate(similarity[0])), reverse= True, key= lambda x:x[1])[1:6]

[(184, np.float64(0.7715167498104596)),
 (1672, np.float64(0.7715167498104596)),
 (165, np.float64(0.7216878364870323)),
 (550, np.float64(0.7216878364870323)),
 (612, np.float64(0.7216878364870323))]

In [34]:
def recommend(items):
    product_index = new_df[new_df['Name'] == items].index[0]
    distances = similarity[product_index]  # Corrected the typo here
    product_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]

    for i in product_list:
        print(new_df.iloc[i[0]].Name)


In [35]:
new_df['Name'][0]

'Checked Polo T-shirt'

In [36]:
recommend('Checked Polo T-shirt')

Printed Cotton Polo T-shirt
Polo T-shirt with Checked Collar
Floral Print Cotton Polo T-shirt
Checked Slim Fit Polo T-shirt
Printed Slim Fit Polo T-shirt


In [37]:
new_df

Unnamed: 0,ID,Name,Category,Brand,Tags,URL_image,Product_URL
0,441137362002,Checked Polo T-shirt,[Men],[netplay],check polo t-shirt men netplay white,https://assets.ajio.com/medias/sys_master/root...,https://www.ajio.com/netplay-checked-polo-t-sh...
1,441124497006,Tapered Fit Flat-Front Trousers,[Men],[netplay],taper fit flat-front trouser men netplay navi,https://assets.ajio.com/medias/sys_master/root...,https://www.ajio.com/netplay-tapered-fit-flat-...
2,460453612003,Striped Slim Fit Shirt with Patch Pocket,[Men],[the-indian-garage-co],stripe slim fit shirt with patch pocket men th...,https://assets.ajio.com/medias/sys_master/root...,https://www.ajio.com/the-indian-garage-co-stri...
3,441036730003,Heathered Crew-Neck T-shirt,[Men],[performax],heather crew-neck t-shirt men performax charcoal,https://assets.ajio.com/medias/sys_master/root...,https://www.ajio.com/performax-heathered-crew-...
4,441128531011,Washed Skinny Fit Jeans with Whiskers,[Men],[john-players-jeans],wash skinni fit jean with whisker men john-pla...,https://assets.ajio.com/medias/sys_master/root...,https://www.ajio.com/john-players-jeans-washed...
...,...,...,...,...,...,...,...
4990,410294621035,New Ottoman Arctic Hooded Windcheater,[Men],[superdry],new ottoman arctic hood windcheat men superdri...,https://assets.ajio.com/medias/sys_master/root...,https://www.ajio.com/superdry-new-ottoman-arct...
4992,469082975002,Logo Print Crew-Neck Batwing T-shirt,[Men],[levis],logo print crew-neck batw t-shirt men levi blue,https://assets.ajio.com/medias/sys_master/root...,https://www.ajio.com/levis-logo-print-crew-nec...
4995,462002525004,Striped Short Sleeves Polo T-shirt,[Men],[campus-sutra],stripe short sleev polo t-shirt men campus-sut...,https://assets.ajio.com/medias/sys_master/root...,https://www.ajio.com/campus-sutra-striped-shor...
4996,441128174003,Lightly Washed Mid-Rise Skinny Jeans,[Men],[john-players-jeans],lightli wash mid-ris skinni jean men john-play...,https://assets.ajio.com/medias/sys_master/root...,https://www.ajio.com/john-players-jeans-lightl...


In [38]:
new_df['Name'][0]

'Checked Polo T-shirt'

In [39]:
import pickle

In [40]:
pickle.dump(new_df, open('Ajio_Recommender.pkl', 'wb'))

In [41]:
pickle.dump(similarity, open('Ajio_Similarity.pkl', 'wb'))