In [1]:
#Importing all the necessary packages.

from PIL import Image
import requests
from io import BytesIO
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import math
import time
import re
import os
import seaborn as sns
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity  
from sklearn.metrics import pairwise_distances
from matplotlib import gridspec
from scipy.sparse import hstack
import plotly
import plotly.figure_factory as ff
from plotly.graph_objs import Scatter, Layout

plotly.offline.init_notebook_mode(connected=True)
warnings.filterwarnings("ignore")

In [2]:
# Loading the data using pandas' read_json function...
data = pd.read_json('tops_fashion.json')

# Getting the number of rows and columns in the dataframe...
print("No. of data points :", data.shape[0], "No. of features/variables :", data.shape[1])

No. of data points : 183138 No. of features/variables : 19


In [3]:
# Printing the features in the dataframe...
data.columns

Index(['asin', 'author', 'availability', 'availability_type', 'brand', 'color',
       'editorial_reivew', 'editorial_review', 'formatted_price',
       'large_image_url', 'manufacturer', 'medium_image_url', 'model',
       'product_type_name', 'publisher', 'reviews', 'sku', 'small_image_url',
       'title'],
      dtype='object')

In [4]:
# Using only 7 features and discarding remaining 12 from the dataframe...
data = data[['asin', 'brand', 'color', 'medium_image_url', 'product_type_name', 'title', 'formatted_price']]

# Again getting the number of rows and columns in the dataframe...
print("No. of data points :", data.shape[0], "No. of features/variables :", data.shape[1])

No. of data points : 183138 No. of features/variables : 7


In [5]:
#Let's have a look at the data...
data.head()

Unnamed: 0,asin,brand,color,medium_image_url,product_type_name,title,formatted_price
0,B016I2TS4W,FNC7C,,https://images-na.ssl-images-amazon.com/images...,SHIRT,Minions Como Superheroes Ironman Long Sleeve R...,
1,B01N49AI08,FIG Clothing,,https://images-na.ssl-images-amazon.com/images...,SHIRT,FIG Clothing Womens Izo Tunic,
2,B01JDPCOHO,FIG Clothing,,https://images-na.ssl-images-amazon.com/images...,SHIRT,FIG Clothing Womens Won Top,
3,B01N19U5H5,Focal18,,https://images-na.ssl-images-amazon.com/images...,SHIRT,Focal18 Sailor Collar Bubble Sleeve Blouse Shi...,
4,B004GSI2OS,FeatherLite,Onyx Black/ Stone,https://images-na.ssl-images-amazon.com/images...,SHIRT,Featherlite Ladies' Long Sleeve Stain Resistan...,$26.26


In [6]:
#Let's look at some statistics of data...
print(data.describe())

              asin   brand  color  \
count       183138  182987  64956   
unique      183138   10577   7380   
top     B01N9MM0J8    Zago  Black   
freq             1     223  13207   

                                         medium_image_url product_type_name  \
count                                              183138            183138   
unique                                             170782                72   
top     https://images-na.ssl-images-amazon.com/images...             SHIRT   
freq                                                   23            167794   

                                                    title formatted_price  
count                                              183138           28395  
unique                                             175985            3135  
top     Nakoda Cotton Self Print Straight Kurti For Women          $19.99  
freq                                                   77             945  


In [7]:
#You can skip the below four lines to run the model on the entire dataset...

#Use this to remove items with no price given...
data = data.loc[~data['formatted_price'].isnull()]
print('Number of data points After eliminating price=NULL :', data.shape[0])

#Use this to remove items with no color given...
data =data.loc[~data['color'].isnull()]
print('Number of data points After eliminating color=NULL :', data.shape[0])

Number of data points After eliminating price=NULL : 28395
Number of data points After eliminating color=NULL : 28385


In [8]:
#We can download all the images if necessary using the following code...

'''
from PIL import Image
import requests
from io import BytesIO

for index, row in images.iterrows():
        url = row['large_image_url']
        response = requests.get(url)
        img = Image.open(BytesIO(response.content))
        img.save('images/183k_images/'+row['asin']+'.jpeg')
'''

"\nfrom PIL import Image\nimport requests\nfrom io import BytesIO\n\nfor index, row in images.iterrows():\n        url = row['large_image_url']\n        response = requests.get(url)\n        img = Image.open(BytesIO(response.content))\n        img.save('images/183k_images/'+row['asin']+'.jpeg')\n"

In [9]:
#Finding the number of products which have exactly the same title...
print(sum(data.duplicated('title')))

2325


In [10]:
#If title has ver few words, its of little use, so let's remove them...
data_sorted = data[data['title'].apply(lambda x: len(x.split()) > 4)]
print("After removal of short titles: ", data_sorted.shape[0])

After removal of short titles:  27949


In [11]:
#Soting data according to title in alphbetical order...
data_sorted.  sort_values('title', inplace = True)
data_sorted.head()

Unnamed: 0,asin,brand,color,medium_image_url,product_type_name,title,formatted_price
118987,B008D30AGK,Out+of+Print+Clothing,Multicolored,https://images-na.ssl-images-amazon.com/images...,SHIRT,"""1984"" Retro Book Cover Women's SLim Fit T-Shi...",$7.51
78827,B003IDE8XQ,Maggie's Organics,Grey,https://images-na.ssl-images-amazon.com/images...,HOME,"""Camisoles Grey - Medium Fair Labor, 1 pc""",$18.99
109599,B00KI3VDXM,Crazy4Bling,Purple,https://images-na.ssl-images-amazon.com/images...,SHIRT,"""I Wanna Be Adored"" Long Sleeve Top with Shred...",$39.99
40451,B073SKNQHD,The Workout Princess,Premium Heather Gray,https://images-na.ssl-images-amazon.com/images...,SHIRT,"""I Workout To Burn off The Crazy"", Tri Blend T...",$24.99
12050,B06WRW8RQ1,AJ,Black,https://images-na.ssl-images-amazon.com/images...,SHIRT,"""Life is a Journey"" Self-Help DIY T-Shirt (Wom...",$9.38


In [12]:
indices = []
for i,row in data_sorted.iterrows():
    indices.append(i)

In [14]:
import itertools
stage1_dedupe_asins = []
i = 0
j = 0
num_data_points = data_sorted.shape[0]
while i < num_data_points and j < num_data_points:
    previous_i = i
    #Store the list of words of ith string in a...
    a = data['title'].loc[indices[i]].split()
    j = i+1
    while j < num_data_points:
        #Store the list of words of jth string in b...
        b = data['title'].loc[indices[j]].split()
        #Store the maximum of lengths of a and b...
        max_length = max(len(a), len(b))
        #Count is used to store number of words matched...
        count = 0
        #itertools.zip_longest(a,b): will map the corresponding words in both strings, it will appened None in case of unequal strings...
        for k in itertools.zip_longest(a,b): 
            if (k[0] == k[1]):
                count += 1
        if (max_length - count) > 2:
            stage1_dedupe_asins.append(data_sorted['asin'].loc[indices[i]])
            if j == num_data_points-1:
                stage1_dedupe_asins.append(data_sorted['asin'].loc[indices[j]])
            i = j
            break
        else:
            j += 1
    if previous_i == i:
        break
                    

In [15]:
#Here, we actually remove duplicates which differ only at the end...
data = data.loc[data['asin'].isin(stage1_dedupe_asins)]
#Printing number of products remaining after removal of duplicates which differ only at the end...
print('Number of data points : ', data.shape[0])

Number of data points :  17587


In [16]:
#This is bruteforcing method and takes significant amount of time...

indices = []
for i,row in data.iterrows():
    indices.append(i)

stage2_dedupe_asins = []
while len(indices)!=0:
    i = indices.pop()
    stage2_dedupe_asins.append(data['asin'].loc[i])
    #Consider the first apperal's title
    a = data['title'].loc[i].split()
    #Store the list of words of ith string in a...
    for j in indices:
        
        b = data['title'].loc[j].split()
        #Store the list of words of jth string in b...
        
        length = max(len(a),len(b))
        
        #Count signifies number of words matched in both the strings...
        count  = 0

        #itertools.zip_longest(a,b): will map the corresponding words in both strings, it will appened None in case of unequal strings
        #Example: a =['a', 'b', 'c', 'd']
        #b = ['a', 'b', 'd']
        #itertools.zip_longest(a,b): will give [('a','a'), ('b','b'), ('c','d'), ('d', None)]
        for k in itertools.zip_longest(a,b): 
            if (k[0]==k[1]):
                count += 1

        #If the number of words in which both strings differ are < 3 , we are consider those two apperals as same and hence ignore them...
        if (length - count) < 3:
            indices.remove(j)

In [17]:
#We'll subset our data by result of above loop... 
data = data.loc[data['asin'].isin(stage2_dedupe_asins)]
print('Number of data points after stage two of dedupe: ',data.shape[0])

Number of data points after stage two of dedupe:  16437


In [18]:
data.to_pickle('clean_data')
print("Pickle complete... You may stop the job")

Pickle complete... You may stop the job
