### 2.1 Import Data and Required Packages


**Importing the Pandas, NumPy, scikit-learn, Matplotlib, Seaborn and Warnings libraries.**

In [78]:
# Importing Libraries
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


**Import the CSV Data as Pandas DataFrame**

In [79]:
# Load the cleaned Flipkart product catalogue into a DataFrame.
df = pd.read_csv(filepath_or_buffer='data/flipkart_clean_data.csv')

**Show Top 5 Records**

In [80]:
# Preview the first rows to sanity-check that the CSV loaded as expected.
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,name,price,mrp,variant,image_url,tags
0,0,0,SMPEPGKYW9GZWYVT,Clinic Plus Strong & Thick Health Shampoo,179.0,189.0,340 ml,https://rukminim1.flixcart.com/image/280/280/...,clinic plu strong & thick health shampoo 179.0...
1,1,1,BWSFH4YQYC9RSNZU,DOVE Deeply Nourishing Body Wash,360.0,360.0,800 ml,https://rukminim1.flixcart.com/image/280/280/...,dove deepli nourish bodi wash 360.0 360.0 800 ml
2,2,2,EDOEVT5UGZXJYAHU,EMAMI Healthy & Tasty Refined Rice Bran Oil P...,167.0,180.0,1 L,https://rukminim1.flixcart.com/image/280/280/...,emami healthi & tasti refin rice bran oil pouc...
3,3,3,FLRETEFHF5EK5ECT,Tata Sampann Fine Besan,19.0,64.0,500 g,https://rukminim1.flixcart.com/image/280/280/...,tata sampann fine besan 19.0 64.0 500 g
4,4,4,BFRFVEHYQQMAZXKY,Colgate Vedshakti Mouth Protect Spray,99.0,99.0,10 g,https://rukminim1.flixcart.com/image/280/280/...,colgat vedshakti mouth protect spray 99.0 99.0...


**Shape of the dataset**

In [81]:
# (number of rows, number of columns) of the loaded DataFrame.
df.shape

(798, 9)

**Summary of the dataset**

In [82]:
# Display summary statistics for the DataFrame.
# NOTE: `price` and `mrp` are stored as object dtype (see the df.info() output),
# so only the numeric "Unnamed" columns are summarised here.
df.describe()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0
count,798.0,798.0
mean,484.794486,484.794486
std,296.074794,296.074794
min,0.0,0.0
25%,221.25,221.25
50%,477.5,477.5
75%,740.5,740.5
max,1007.0,1007.0


**Check Datatypes in the dataset**

In [83]:
# Show the non-null count and dtype of every column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 798 entries, 0 to 797
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0.1  798 non-null    int64 
 1   Unnamed: 0    798 non-null    int64 
 2   id            798 non-null    object
 3   name          798 non-null    object
 4   price         798 non-null    object
 5   mrp           798 non-null    object
 6   variant       798 non-null    object
 7   image_url     798 non-null    object
 8   tags          798 non-null    object
dtypes: int64(2), object(7)
memory usage: 56.2+ KB


### Feature Information
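* **id:** Unique product id.
* **name:** Name of the product.
* **price:** Selling price of the product.
* **mrp:** Maximum retail price (MRP) of the product.
* **variant:** Pack size or quantity of the product (e.g. `340 ml`, `500 g`).
* **image_url:** URL of the product image.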


In [84]:
# Discard the pre-computed `tags` column; it is rebuilt from `name` further down.
# Rebinding with errors='ignore' (instead of inplace=True) keeps this cell
# idempotent: re-running it after `tags` is already gone no longer raises KeyError.
df = df.drop(columns=['tags'], errors='ignore')

In [85]:
# Keep only the first record for each product id.
df = df.drop_duplicates(subset='id', keep='first')

In [86]:
# Number of rows whose id repeats an earlier row (expected to be 0 after de-duplication).
df.duplicated(subset=['id']).sum()

0

In [87]:
# Drop records that share an image_url with an earlier record, then renumber
# the rows 0..n-1. The TF-IDF / similarity matrices built later are indexed by
# row position, so the DataFrame index labels must equal row positions;
# otherwise label-based lookups (e.g. `.index[0]`) select the wrong
# similarity row once de-duplication has left gaps in the index.
df = df.drop_duplicates(subset=['image_url']).reset_index(drop=True)

In [88]:
# Seed the `tags` column with the raw product name; it is stemmed in a later cell.
df = df.assign(tags=df['name'])

In [89]:
! pip install nltk




In [90]:
import nltk
from nltk.stem.porter import PorterStemmer
# Single shared Porter stemmer instance, used by the `stem` helper.
ps = PorterStemmer()

In [91]:
def stem(text):
    """Return ``text`` with each whitespace-separated word replaced by its stem.

    Words are stemmed with the notebook-level ``ps`` stemmer and re-joined
    with single spaces.
    """
    return " ".join(ps.stem(word) for word in text.split())

In [92]:
# Replace every tag string with its stemmed form.
df['tags'] = df['tags'].map(stem)

In [93]:
# Preview two rows to check the rebuilt, stemmed `tags` column.
df.head(2)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,name,price,mrp,variant,image_url,tags
0,0,0,SMPEPGKYW9GZWYVT,Clinic Plus Strong & Thick Health Shampoo,179.0,189.0,340 ml,https://rukminim1.flixcart.com/image/280/280/...,clinic plu strong & thick health shampoo
1,1,1,BWSFH4YQYC9RSNZU,DOVE Deeply Nourishing Body Wash,360.0,360.0,800 ml,https://rukminim1.flixcart.com/image/280/280/...,dove deepli nourish bodi wash


## Model training

In [94]:
# Import the TF-IDF vectorizer and create an instance with its default settings.
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()

In [95]:
# Fit the vectorizer on the stemmed tags, then densify the result so that
# each product is one row of a NumPy array.
tfidf_matrix = tfidf.fit_transform(df['tags'])
vectors = tfidf_matrix.toarray()

In [96]:
# Inspect the dense TF-IDF matrix (one row per product).
vectors

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [97]:
from sklearn.metrics.pairwise import cosine_similarity

In [98]:
# Cosine similarity between every pair of rows in `vectors`:
# row i of the result holds the score of product i against every product.
similarity = cosine_similarity(vectors)

In [99]:
# Inspect the similarity scores of the product at row position 1 against all products.
similarity[1]

array([0.        , 1.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.18830137, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.11718329, 0.        , 0.        , 0.19519524,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.11148315, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.09621619, 0.        , 0.        ,
       0.20263291, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.18453303,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [100]:
# Rank (position, score) pairs for row 1 from most to least similar and keep
# ranks 2-6, i.e. skip the top-ranked entry.
row_scores = enumerate(similarity[1])
sorted(row_scores, key=lambda pair: pair[1], reverse=True)[1:6]

[(739, 0.42262403156289063),
 (694, 0.27623746702341717),
 (796, 0.2669400170876862),
 (571, 0.242054173129299),
 (570, 0.22857072315808571)]

In [101]:
def recommended(product, top_n=5, data=None, scores=None):
    """Print the products most similar to the first one whose name contains ``product``.

    Parameters
    ----------
    product : str
        Text to look for in the ``name`` column. It is matched as a literal,
        case-sensitive substring (not as a regular expression), so names
        containing characters such as ``(``, ``+`` or ``.`` are handled safely.
    top_n : int, default 5
        Maximum number of recommendations to print.
    data : pandas.DataFrame, optional
        Catalogue with ``name``, ``variant``, ``price`` and ``image_url``
        columns. Defaults to the notebook-level ``df``.
    scores : 2-D array-like, optional
        Similarity matrix whose row/column ``i`` corresponds to the ``i``-th
        row of ``data`` by position. Defaults to the notebook-level ``similarity``.

    Raises
    ------
    IndexError
        If no product name contains ``product``.
    """
    catalogue = df if data is None else data
    sim_matrix = similarity if scores is None else scores

    # Work with row POSITIONS, not index labels: the similarity matrix is
    # positional, and the DataFrame index may have gaps after rows were dropped.
    mask = catalogue['name'].str.contains(product, regex=False, na=False)
    match_positions = [pos for pos, hit in enumerate(mask) if hit]
    if not match_positions:
        raise IndexError(f"No product name contains {product!r}")
    query_pos = match_positions[0]

    ranked = sorted(enumerate(sim_matrix[query_pos]), key=lambda pair: pair[1], reverse=True)
    # Exclude the query product explicitly instead of assuming it is ranked
    # first; with tied scores another product can occupy the top slot.
    neighbours = [pos for pos, _ in ranked if pos != query_pos][:top_n]

    for pos in neighbours:
        print(catalogue.iloc[pos][['name','variant','price','image_url']])
        print("----------------------------------------------------\n")
        

In [102]:
# Example: print recommendations for one product name from the catalogue.
recommended('Tata Sampann Fine Besan')

name                             Tata Sampann Coriander Powder
variant                                                  100 g
price                                                     29.0
image_url     https://rukminim1.flixcart.com/image/280/280/...
Name: 415, dtype: object
----------------------------------------------------

name                                 Tata Sampann Garam Masala
variant                                                  100 g
price                                                     59.0
image_url     https://rukminim1.flixcart.com/image/280/280/...
Name: 291, dtype: object
----------------------------------------------------

name                                  Tata Sampann Meat Masala
variant                                                  100 g
price                                                     58.0
image_url     https://rukminim1.flixcart.com/image/280/280/...
Name: 674, dtype: object
----------------------------------------------------

name   

In [103]:
import pickle

In [104]:
# Persist the product DataFrame and the similarity matrix.
# `with` guarantees each file is flushed and closed even if pickling fails;
# the bare open(...) calls left the handles open until garbage collection.
# NOTE: despite its name, 'product_dict.pkl' stores the DataFrame itself, not a dict.
with open('product_dict.pkl', 'wb') as product_file:
    pickle.dump(df, product_file)
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)