### 2.1 Import Data and Required Packages


**Importing pandas, NumPy, Matplotlib, Seaborn and the warnings library.**

In [59]:
# Importing Libraries
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


**Import the CSV Data as Pandas DataFrame**

In [60]:
# Load the cleaned Flipkart product catalogue into a DataFrame.
DATA_PATH = 'data/flipkart_clean_data.csv'
df = pd.read_csv(DATA_PATH)

**Show Top 5 Records**

In [61]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,name,price,mrp,variant,image_url,tags
0,0,0,SMPEPGKYW9GZWYVT,Clinic Plus Strong & Thick Health Shampoo,179.0,189.0,340 ml,https://rukminim1.flixcart.com/image/280/280/...,clinic plu strong & thick health shampoo 179.0...
1,1,1,BWSFH4YQYC9RSNZU,DOVE Deeply Nourishing Body Wash,360.0,360.0,800 ml,https://rukminim1.flixcart.com/image/280/280/...,dove deepli nourish bodi wash 360.0 360.0 800 ml
2,2,2,EDOEVT5UGZXJYAHU,EMAMI Healthy & Tasty Refined Rice Bran Oil P...,167.0,180.0,1 L,https://rukminim1.flixcart.com/image/280/280/...,emami healthi & tasti refin rice bran oil pouc...
3,3,3,FLRETEFHF5EK5ECT,Tata Sampann Fine Besan,19.0,64.0,500 g,https://rukminim1.flixcart.com/image/280/280/...,tata sampann fine besan 19.0 64.0 500 g
4,4,4,BFRFVEHYQQMAZXKY,Colgate Vedshakti Mouth Protect Spray,99.0,99.0,10 g,https://rukminim1.flixcart.com/image/280/280/...,colgat vedshakti mouth protect spray 99.0 99.0...


**Shape of the dataset**

In [62]:
# (number of rows, number of columns) of the loaded frame
df.shape

(798, 9)

**Summary of the dataset**

In [63]:
# Summary statistics (count, mean, std, min, quartiles, max).
# By default describe() reports numeric columns only, so columns stored
# with object dtype do not appear in the result.
df.describe()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0
count,798.0,798.0
mean,484.794486,484.794486
std,296.074794,296.074794
min,0.0,0.0
25%,221.25,221.25
50%,477.5,477.5
75%,740.5,740.5
max,1007.0,1007.0


**Check Datatypes in the dataset**

In [64]:
# Per-column non-null counts and dtypes, plus the frame's memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 798 entries, 0 to 797
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0.1  798 non-null    int64 
 1   Unnamed: 0    798 non-null    int64 
 2   id            798 non-null    object
 3   name          798 non-null    object
 4   price         798 non-null    object
 5   mrp           798 non-null    object
 6   variant       798 non-null    object
 7   image_url     798 non-null    object
 8   tags          798 non-null    object
dtypes: int64(2), object(7)
memory usage: 56.2+ KB


### Feature Information
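* **id:** Unique product id.
* **name:** Name of the product.
* **price:** Selling price of the product.
* **mrp:** Maximum retail price (MRP) of the product.
* **variant:** Pack size or variant of the product (e.g. `340 ml`, `500 g`).
* **image_url:** URL of the product image.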


In [65]:
# Discard the pre-built 'tags' column; it is recreated from 'name' further down.
df.drop(columns=['tags'], inplace=True)

In [66]:
# Keep only the first row seen for each product id.
df.drop_duplicates(subset=['id'], keep='first', inplace=True)

In [67]:
# Sanity check: how many rows still repeat an earlier id.
df.duplicated(subset=['id']).sum()

0

In [68]:
# Drop rows that share an image_url (keeping the first occurrence) and
# renumber the index 0..n-1. Without the renumbering the index keeps gaps
# where rows were removed, so index labels stop matching row positions,
# while the similarity matrix built later is addressed by row position.
df = df.drop_duplicates(subset=['image_url'], ignore_index=True)

In [69]:
# Seed the text feature with the raw product name (it is stemmed in a later cell).
df = df.assign(tags=df['name'])

In [70]:
! pip install nltk




In [71]:
import nltk
from nltk.stem.porter import PorterStemmer
# Single Porter stemmer instance, used by stem() in the next cell
ps = PorterStemmer()

In [72]:
def stem(text):
    """Return `text` with every whitespace-separated word replaced by its Porter stem.

    The stemmed words are joined back together with single spaces.
    """
    return " ".join(ps.stem(word) for word in text.split())

In [73]:
# Replace each tag string with its stemmed form.
df['tags'] = df['tags'].map(stem)

In [74]:
# Spot-check the stemmed tags on the first two rows.
df.head(n=2)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,name,price,mrp,variant,image_url,tags
0,0,0,SMPEPGKYW9GZWYVT,Clinic Plus Strong & Thick Health Shampoo,179.0,189.0,340 ml,https://rukminim1.flixcart.com/image/280/280/...,clinic plu strong & thick health shampoo
1,1,1,BWSFH4YQYC9RSNZU,DOVE Deeply Nourishing Body Wash,360.0,360.0,800 ml,https://rukminim1.flixcart.com/image/280/280/...,dove deepli nourish bodi wash


## Model training

In [75]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer: vocabulary capped at 2000 terms, with
# scikit-learn's built-in English stop-word list filtered out.
cv = CountVectorizer( max_features=2000,stop_words="english" )

In [76]:
# Learn the vocabulary from the tags, then materialise the sparse
# document-term counts as a dense array (one row per product).
tag_counts = cv.fit_transform(df['tags'])
vectors = tag_counts.toarray()

In [77]:
from sklearn.metrics.pairwise import cosine_similarity

In [78]:
# Pairwise cosine similarity between all rows of `vectors`:
# similarity[i][j] scores row i against row j.
similarity = cosine_similarity(vectors)

In [79]:
# Rank every row against the product in row 1 as (row position, score)
# pairs, best first, and show slots 1-5 of that ranking.
ranked = sorted(enumerate(similarity[1]), key=lambda pair: pair[1], reverse=True)
ranked[1:6]

[(739, 0.5163977794943223),
 (694, 0.36514837167011077),
 (796, 0.36514837167011077),
 (570, 0.25819888974716115),
 (185, 0.2480694691784169)]

In [80]:
def recommended(product, top_n=5, data=None, sim=None):
    """Print the products most similar to the first one whose name contains `product`.

    Parameters
    ----------
    product : str
        Text to look for in the ``name`` column. It is matched as a literal,
        case-sensitive substring, not as a regular expression.
    top_n : int, default 5
        Number of recommendations to print.
    data : pandas.DataFrame, optional
        Catalogue with ``name``, ``variant``, ``price`` and ``image_url``
        columns. Defaults to the notebook-level ``df``.
    sim : 2-D indexable, optional
        Similarity scores where row ``i`` belongs to the ``i``-th row
        (by position) of ``data``. Defaults to the notebook-level
        ``similarity``.

    Returns
    -------
    None
        Results are printed. When no name matches, a short message is
        printed instead.
    """
    catalogue = df if data is None else data
    scores = similarity if sim is None else sim

    # regex=False: a product name may contain regex metacharacters, which
    # would otherwise change what is matched or raise a regex error.
    matches = catalogue['name'].str.contains(product, regex=False, na=False)

    # Work with the row *position* of the first match. The similarity rows and
    # the .iloc lookups below are positional, while index labels can differ
    # from positions once rows have been dropped from the frame.
    query_pos = next((pos for pos, hit in enumerate(matches) if hit), None)
    if query_pos is None:
        print(f"No product found with a name containing {product!r}")
        return

    ranked = sorted(enumerate(scores[query_pos]), key=lambda pair: pair[1], reverse=True)

    # Exclude the query row explicitly instead of assuming it is ranked first:
    # another row with an equal top score may sort ahead of it.
    neighbours = [pos for pos, _ in ranked if pos != query_pos][:top_n]

    for pos in neighbours:
        print(catalogue.iloc[pos][['name','variant','price','image_url']])
        print("----------------------------------------------------\n")
        

In [81]:
# Example query: print the closest matches for one product from the catalogue
recommended('DOVE Deeply Nourishing Body Wash')

name                NIVEA Body Wash
variant                       239.0
price         Fresh Pure Shower Gel
image_url                    500 ml
Name: 739, dtype: object
----------------------------------------------------

name                      DETTOL Cool Body Wash and Shower Gel
variant                                                 250 ml
price                                                    160.0
image_url     https://rukminim1.flixcart.com/image/280/280/...
Name: 694, dtype: object
----------------------------------------------------

name                   DETTOL Refresh Body Wash and Shower Gel
variant                                                 250 ml
price                                                    160.0
image_url     https://rukminim1.flixcart.com/image/280/280/...
Name: 796, dtype: object
----------------------------------------------------

name          NIVEA Body Lotion
variant         for Men & Women
price            Aloe Hydration
image_url          

In [82]:
import pickle

In [None]:
# Persist the catalogue and the similarity matrix for reuse outside the notebook.
# The context managers guarantee each file is flushed and closed as soon as
# its dump finishes.
# NOTE(review): 'movie_list.pkl' holds product data, not movies; the name is
# kept because code outside this notebook may load the file by that path.
with open('movie_list.pkl', 'wb') as catalogue_file:
    pickle.dump(df, catalogue_file)
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)