In [2]:
#Importing all the necessary packages.

from PIL import Image
import requests
from io import BytesIO
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import math
import time
import re
import os
import seaborn as sns
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity  
from sklearn.metrics import pairwise_distances
from matplotlib import gridspec
from scipy.sparse import hstack
import plotly
import plotly.figure_factory as ff
from plotly.graph_objs import Scatter, Layout

# Initialise Plotly's offline mode for use inside this notebook.
plotly.offline.init_notebook_mode(connected=True)
# Install a blanket "ignore" filter: no Python warnings are shown in any later cell.
# NOTE(review): this also hides pandas warnings (e.g. SettingWithCopyWarning) that
# can point at real problems — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [3]:
# Read the product catalogue from its JSON dump into a DataFrame.
data = pd.read_json('tops_fashion.json')

# Report the size of the freshly loaded frame: one row per data point,
# one column per feature.
print("No. of data points :", len(data), "No. of features/variables :", len(data.columns))

No. of data points : 183138 No. of features/variables : 19


In [4]:
# Display the column labels of `data` to see which features are available
# before deciding which ones to keep.
data.columns

Index(['asin', 'author', 'availability', 'availability_type', 'brand', 'color',
       'editorial_reivew', 'editorial_review', 'formatted_price',
       'large_image_url', 'manufacturer', 'medium_image_url', 'model',
       'product_type_name', 'publisher', 'reviews', 'sku', 'small_image_url',
       'title'],
      dtype='object')

In [5]:
# Keep only the seven features used from here on; the remaining twelve
# columns are discarded. Note that `data` is rebound to the narrower frame.
data = data.loc[:, [
    'asin',
    'brand',
    'color',
    'medium_image_url',
    'product_type_name',
    'title',
    'formatted_price',
]]

# Row count is unchanged by the projection; only the column count shrinks.
print("No. of data points :", len(data), "No. of features/variables :", len(data.columns))

No. of data points : 183138 No. of features/variables : 7


In [6]:
# Preview the first five rows of the trimmed frame.
data.head(n=5)

Unnamed: 0,asin,brand,color,medium_image_url,product_type_name,title,formatted_price
0,B016I2TS4W,FNC7C,,https://images-na.ssl-images-amazon.com/images...,SHIRT,Minions Como Superheroes Ironman Long Sleeve R...,
1,B01N49AI08,FIG Clothing,,https://images-na.ssl-images-amazon.com/images...,SHIRT,FIG Clothing Womens Izo Tunic,
2,B01JDPCOHO,FIG Clothing,,https://images-na.ssl-images-amazon.com/images...,SHIRT,FIG Clothing Womens Won Top,
3,B01N19U5H5,Focal18,,https://images-na.ssl-images-amazon.com/images...,SHIRT,Focal18 Sailor Collar Bubble Sleeve Blouse Shi...,
4,B004GSI2OS,FeatherLite,Onyx Black/ Stone,https://images-na.ssl-images-amazon.com/images...,SHIRT,Featherlite Ladies' Long Sleeve Stain Resistan...,$26.26


In [7]:
# Per-column summary statistics (count, unique, top, freq), printed as plain text.
column_summary = data.describe()
print(column_summary)

              asin   brand  color  \
count       183138  182987  64956   
unique      183138   10577   7380   
top     B01IXU06VG    Zago  Black   
freq             1     223  13207   

                                         medium_image_url product_type_name  \
count                                              183138            183138   
unique                                             170782                72   
top     https://images-na.ssl-images-amazon.com/images...             SHIRT   
freq                                                   23            167794   

                                                    title formatted_price  
count                                              183138           28395  
unique                                             175985            3135  
top     Nakoda Cotton Self Print Straight Kurti For Women          $19.99  
freq                                                   77             945  


In [8]:
# Persist the current state of `data` so later sessions can reload it quickly
# instead of re-parsing the JSON file.
# `to_pickle` does not create missing parent directories, so make sure the target
# folder exists first; otherwise this cell fails on a fresh checkout.
# The folder name is spelled 'pickels' on purpose: the path is kept unchanged so
# anything that loads this file from the same location keeps working.
os.makedirs('pickels', exist_ok=True)
data.to_pickle('pickels/180k_apparel_data')

In [9]:
# Optional filtering, disabled on purpose because the model is run on the entire dataset.
# The snippet below lives inside a bare string literal, so none of it executes; the
# notebook merely echoes the string as this cell's output.

'''
#Use this to remove items with no price given...
data = data.loc[~data['formatted_price'].isnull()]
print('Number of data points After eliminating price=NULL :', data.shape[0])

#Use this to remove items with no color given...
data =data.loc[~data['color'].isnull()]
print('Number of data points After eliminating color=NULL :', data.shape[0])
'''

"\n#Use this to remove items with no price given...\ndata = data.loc[~data['formatted_price'].isnull()]\nprint('Number of data points After eliminating price=NULL :', data.shape[0])\n\n#Use this to remove items with no color given...\ndata =data.loc[~data['color'].isnull()]\nprint('Number of data points After eliminating color=NULL :', data.shape[0])\n"

In [10]:
# Optional helper for downloading every product image, kept for reference only.
# The snippet below lives inside a bare string literal, so none of it executes; the
# notebook merely echoes the string as this cell's output.
# NOTE(review): the snippet iterates over `images`, which is not defined in the visible
# cells, and reads 'large_image_url', a column dropped from `data` by the earlier
# column-selection cell — it would need a frame that still has that column. Confirm before enabling.

'''
from PIL import Image
import requests
from io import BytesIO

for index, row in images.iterrows():
        url = row['large_image_url']
        response = requests.get(url)
        img = Image.open(BytesIO(response.content))
        img.save('images/183k_images/'+row['asin']+'.jpeg')
'''

"\nfrom PIL import Image\nimport requests\nfrom io import BytesIO\n\nfor index, row in images.iterrows():\n        url = row['large_image_url']\n        response = requests.get(url)\n        img = Image.open(BytesIO(response.content))\n        img.save('images/183k_images/'+row['asin']+'.jpeg')\n"

In [11]:
# Count the rows whose title repeats an earlier row's title exactly
# (`duplicated` marks every occurrence after the first one).
# Use the vectorised Series.sum() rather than the builtin sum(), which would
# iterate over the whole boolean Series element by element in Python.
print(data.duplicated('title').sum())

7153


In [14]:
# A title with very few words is of little use, so keep only the rows whose
# title has more than four whitespace-separated words.
has_enough_words = data['title'].map(lambda title: len(title.split()) > 4)
data_sorted = data[has_enough_words]
print("After removal of short titles: ", data_sorted.shape[0])

After removal of short titles:  178026


In [16]:
# Sort the data alphabetically by title.
# `data_sorted` was produced by boolean-filtering `data`, so sorting it with
# inplace=True is the pattern pandas flags with SettingWithCopyWarning (hidden here
# by the blanket warnings filter). Rebinding the sorted result gives the same frame
# without mutating a derived object in place.
data_sorted = data_sorted.sort_values('title')
data_sorted.head()

Unnamed: 0,asin,brand,color,medium_image_url,product_type_name,title,formatted_price
118987,B008D30AGK,Out+of+Print+Clothing,Multicolored,https://images-na.ssl-images-amazon.com/images...,SHIRT,"""1984"" Retro Book Cover Women's SLim Fit T-Shi...",$7.51
47516,B071WD44WX,Sweet Claire,,https://images-na.ssl-images-amazon.com/images...,SHIRT,"""ASAP"" As Southern As Possible Tank Top, Heath...",
51434,B072M88RQ6,Sweet Claire,,https://images-na.ssl-images-amazon.com/images...,SHIRT,"""ASAP"" As Southern As Possible Tank Top, Off W...",
39185,B0739MNWM2,Natural Black Girl T-Shirt,,https://images-na.ssl-images-amazon.com/images...,ORCA_SHIRT,"""Afro Bae"" Beautiful Black Women's Shirt",
149224,B01E0XLYHA,GreaterGood,Blue,https://images-na.ssl-images-amazon.com/images...,SHIRT,"""Ask Me About My Granddog"" T-Shirt",
