In [2]:
import boto3
import sagemaker
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import Counter
import matplotlib.pyplot as plt
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.session import s3_input,Session

## Creating an S3 Librabry

In [3]:
bucket_name= 'recmdsysbucket'
my_region= boto3.session.Session().region_name
print(my_region)

us-east-1


In [None]:
s3= boto3.resource('s3')
try:
    if my_region== 'us-east-1':
        s3.create_bucket(Bucket= bucket_name)
    print('S3 bucket created successfully')
except Exception as e:
    print('S3 error:',e)

In [4]:
prefix= 'preprocessing_eda'
output_path= 's3://{}/{}/output'.format(bucket_name,prefix)
print(output_path)

s3://recmdsysbucket/preprocessing_eda/output


## Loading the Dataset

In [5]:
data= pd.read_json('s3://recmdsysbucket/tops_fashion.json')

In [5]:
data.head()

Unnamed: 0,sku,asin,product_type_name,formatted_price,author,color,brand,publisher,availability,reviews,large_image_url,availability_type,small_image_url,editorial_review,title,model,medium_image_url,manufacturer,editorial_reivew
0,,B016I2TS4W,SHIRT,,,,FNC7C,,,"[False, https://www.amazon.com/reviews/iframe?...",https://images-na.ssl-images-amazon.com/images...,,https://images-na.ssl-images-amazon.com/images...,Minions Como Superheroes Ironman Women's O Nec...,Minions Como Superheroes Ironman Long Sleeve R...,,https://images-na.ssl-images-amazon.com/images...,,
1,,B01N49AI08,SHIRT,,,,FIG Clothing,,,"[False, https://www.amazon.com/reviews/iframe?...",https://images-na.ssl-images-amazon.com/images...,,https://images-na.ssl-images-amazon.com/images...,Sizing runs on the small side. FIG® recommends...,FIG Clothing Womens Izo Tunic,,https://images-na.ssl-images-amazon.com/images...,,
2,,B01JDPCOHO,SHIRT,,,,FIG Clothing,,,"[False, https://www.amazon.com/reviews/iframe?...",https://images-na.ssl-images-amazon.com/images...,,https://images-na.ssl-images-amazon.com/images...,Sizing runs on the small side. FIG® recommends...,FIG Clothing Womens Won Top,,https://images-na.ssl-images-amazon.com/images...,,
3,,B01N19U5H5,SHIRT,,,,Focal18,,,"[True, https://www.amazon.com/reviews/iframe?a...",https://images-na.ssl-images-amazon.com/images...,,https://images-na.ssl-images-amazon.com/images...,100% Brand New & Fashion<br> Quantity: 1 Piece...,Focal18 Sailor Collar Bubble Sleeve Blouse Shi...,,https://images-na.ssl-images-amazon.com/images...,,
4,,B004GSI2OS,SHIRT,$26.26,,Onyx Black/ Stone,FeatherLite,,Usually ships in 6-10 business days,"[False, https://www.amazon.com/reviews/iframe?...",https://images-na.ssl-images-amazon.com/images...,now,https://images-na.ssl-images-amazon.com/images...,,Featherlite Ladies' Long Sleeve Stain Resistan...,,https://images-na.ssl-images-amazon.com/images...,,


## Shape of the Data and Feature Names

In [6]:
print("The number of Observations in the dataset: ",data.shape[0])
print("The number of features in the dataset: ",data.shape[1])

The number of Observations in the dataset:  183138
The number of features in the dataset:  19


In [7]:
print("The names of the features in the Dataset are as below:")
print('\n')
print(data.columns)

The names of the features in the Dataset are as below:


Index(['sku', 'asin', 'product_type_name', 'formatted_price', 'author',
       'color', 'brand', 'publisher', 'availability', 'reviews',
       'large_image_url', 'availability_type', 'small_image_url',
       'editorial_review', 'title', 'model', 'medium_image_url',
       'manufacturer', 'editorial_reivew'],
      dtype='object')


In [8]:
print("The missing value percenatge in each of the columns of the dataset is as below:")
print("\n")
print(np.round((data.isna().sum()/data.shape[0])*100,2))

The missing value percenatge in each of the columns of the dataset is as below:


sku                   99.80
asin                   0.00
product_type_name      0.00
formatted_price       84.50
author               100.00
color                 64.53
brand                  0.08
publisher             76.58
availability          86.60
reviews                0.00
large_image_url        0.00
availability_type     86.59
small_image_url        0.00
editorial_review      98.49
title                  0.00
model                 65.94
medium_image_url       0.00
manufacturer          76.58
editorial_reivew       1.51
dtype: float64


### Dropping the SKU and the Author features as they have more than 99% of missing values in the data

In [6]:
data.drop(['sku','author'],axis=1,inplace=True)

### Dropping the large_image_url and the small_image_url features and I will be just using the medium_image_url feature instead

In [7]:
data.drop(['large_image_url','small_image_url'],axis=1,inplace=True)

In [8]:
print(np.round((data.isna().sum()/data.shape[0])*100,2))

asin                  0.00
product_type_name     0.00
formatted_price      84.50
color                64.53
brand                 0.08
publisher            76.58
availability         86.60
reviews               0.00
availability_type    86.59
editorial_review     98.49
title                 0.00
model                65.94
medium_image_url      0.00
manufacturer         76.58
editorial_reivew      1.51
dtype: float64


In [9]:
er1= pd.DataFrame(data.iloc[:,9])
er2= pd.DataFrame(data.iloc[:,-1])
er1.columns= ['col1']
er2.columns= ['col2']

In [10]:
data['editorial_review_updated']= np.zeros(data.shape[0])

In [11]:
for i in tqdm(range(data.shape[0])):
    if pd.isna(er1.iloc[i,0]):
        er1.iloc[i,0]= er2.iloc[i,0]    

100%|██████████| 183138/183138 [00:41<00:00, 4379.97it/s]


In [12]:
data['editorial_review_updated']= er1
data.head()

Unnamed: 0,asin,product_type_name,formatted_price,color,brand,publisher,availability,reviews,availability_type,editorial_review,title,model,medium_image_url,manufacturer,editorial_reivew,editorial_review_updated
0,B016I2TS4W,SHIRT,,,FNC7C,,,"[False, https://www.amazon.com/reviews/iframe?...",,Minions Como Superheroes Ironman Women's O Nec...,Minions Como Superheroes Ironman Long Sleeve R...,,https://images-na.ssl-images-amazon.com/images...,,,Minions Como Superheroes Ironman Women's O Nec...
1,B01N49AI08,SHIRT,,,FIG Clothing,,,"[False, https://www.amazon.com/reviews/iframe?...",,Sizing runs on the small side. FIG® recommends...,FIG Clothing Womens Izo Tunic,,https://images-na.ssl-images-amazon.com/images...,,,Sizing runs on the small side. FIG® recommends...
2,B01JDPCOHO,SHIRT,,,FIG Clothing,,,"[False, https://www.amazon.com/reviews/iframe?...",,Sizing runs on the small side. FIG® recommends...,FIG Clothing Womens Won Top,,https://images-na.ssl-images-amazon.com/images...,,,Sizing runs on the small side. FIG® recommends...
3,B01N19U5H5,SHIRT,,,Focal18,,,"[True, https://www.amazon.com/reviews/iframe?a...",,100% Brand New & Fashion<br> Quantity: 1 Piece...,Focal18 Sailor Collar Bubble Sleeve Blouse Shi...,,https://images-na.ssl-images-amazon.com/images...,,,100% Brand New & Fashion<br> Quantity: 1 Piece...
4,B004GSI2OS,SHIRT,$26.26,Onyx Black/ Stone,FeatherLite,,Usually ships in 6-10 business days,"[False, https://www.amazon.com/reviews/iframe?...",now,,Featherlite Ladies' Long Sleeve Stain Resistan...,,https://images-na.ssl-images-amazon.com/images...,,,


In [13]:
data['editorial_review_updated'].isna().sum()/data.shape[0]

0.0

In [14]:
data.drop(['editorial_review'], axis=1, inplace=True)

In [15]:
data.drop([data.columns[13]], axis=1, inplace=True)

In [54]:
data.head()

Unnamed: 0,asin,product_type_name,formatted_price,color,brand,publisher,availability,reviews,availability_type,title,model,medium_image_url,manufacturer,editorial_review_updated
0,B016I2TS4W,SHIRT,,,FNC7C,,,"[False, https://www.amazon.com/reviews/iframe?...",,Minions Como Superheroes Ironman Long Sleeve R...,,https://images-na.ssl-images-amazon.com/images...,,Minions Como Superheroes Ironman Women's O Nec...
1,B01N49AI08,SHIRT,,,FIG Clothing,,,"[False, https://www.amazon.com/reviews/iframe?...",,FIG Clothing Womens Izo Tunic,,https://images-na.ssl-images-amazon.com/images...,,Sizing runs on the small side. FIG® recommends...
2,B01JDPCOHO,SHIRT,,,FIG Clothing,,,"[False, https://www.amazon.com/reviews/iframe?...",,FIG Clothing Womens Won Top,,https://images-na.ssl-images-amazon.com/images...,,Sizing runs on the small side. FIG® recommends...
3,B01N19U5H5,SHIRT,,,Focal18,,,"[True, https://www.amazon.com/reviews/iframe?a...",,Focal18 Sailor Collar Bubble Sleeve Blouse Shi...,,https://images-na.ssl-images-amazon.com/images...,,100% Brand New & Fashion<br> Quantity: 1 Piece...
4,B004GSI2OS,SHIRT,$26.26,Onyx Black/ Stone,FeatherLite,,Usually ships in 6-10 business days,"[False, https://www.amazon.com/reviews/iframe?...",now,Featherlite Ladies' Long Sleeve Stain Resistan...,,https://images-na.ssl-images-amazon.com/images...,,


In [55]:
print(data['product_type_name'].describe())

count     183138
unique        72
top        SHIRT
freq      167794
Name: product_type_name, dtype: object


In [61]:
prd_type_counts= Counter(data['product_type_name'])
print(prd_type_counts.most_common(10))

[('SHIRT', 167794), ('APPAREL', 3549), ('BOOKS_1973_AND_LATER', 3336), ('DRESS', 1584), ('SPORTING_GOODS', 1281), ('SWEATER', 837), ('OUTERWEAR', 796), ('OUTDOOR_RECREATION_PRODUCT', 729), ('ACCESSORY', 636), ('UNDERWEAR', 425)]


In [15]:
print(data['formatted_price'].describe())

count      28395
unique      3135
top       $19.99
freq         945
Name: formatted_price, dtype: object


In [22]:
Counter(data['formatted_price']).most_common(10)

[(None, 154743),
 ('$19.99', 945),
 ('$9.99', 749),
 ('$9.50', 601),
 ('$14.99', 472),
 ('$7.50', 463),
 ('$24.99', 414),
 ('$29.99', 370),
 ('$8.99', 343),
 ('$9.01', 336)]

In [23]:
data['formatted_price']= data['formatted_price'].str[1:]
print(Counter(data['formatted_price']).most_common(10))

[(None, 154743), ('19.99', 945), ('9.99', 749), ('9.50', 601), ('14.99', 472), ('7.50', 463), ('24.99', 414), ('29.99', 370), ('8.99', 343), ('9.01', 336)]


In [25]:
data['formatted_price']= pd.to_numeric(data['formatted_price'],errors='coerce')

In [26]:
data['formatted_price'].describe()

count    28373.000000
mean        32.048922
std         43.405505
min          0.010000
25%         10.800000
50%         19.110000
75%         34.990000
max        995.000000
Name: formatted_price, dtype: float64

In [27]:
data['formatted_price'].isna().sum()/data.shape[0]

0.8450731142635608