In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import spacy
from tqdm import tqdm_notebook as tqdm
import joblib

In [59]:
#get list of sellers for each product
#get category. Maybe subcategory
#write evaluation class

In [2]:
data = pd.read_csv('../data/amazon_co-ecommerce_sample.csv')

In [3]:
data.columns

Index(['uniq_id', 'product_name', 'manufacturer', 'price',
       'number_available_in_stock', 'number_of_reviews',
       'number_of_answered_questions', 'average_review_rating',
       'amazon_category_and_sub_category',
       'customers_who_bought_this_item_also_bought', 'description',
       'product_information', 'product_description',
       'items_customers_buy_after_viewing_this_item',
       'customer_questions_and_answers', 'customer_reviews', 'sellers'],
      dtype='object')

In [4]:
data = data[['product_name', 'manufacturer', 'price', 'number_of_reviews', 'average_review_rating',
            'amazon_category_and_sub_category', 'product_information', 'product_description',
            'customer_reviews', 'sellers']]

In [5]:
data.head()

Unnamed: 0,product_name,manufacturer,price,number_of_reviews,average_review_rating,amazon_category_and_sub_category,product_information,product_description,customer_reviews,sellers
0,Hornby 2014 Catalogue,Hornby,£3.42,15,4.9 out of 5 stars,Hobbies > Model Trains & Railway Sets > Rail V...,Technical Details Item Weight640 g Product Dim...,Product Description Hornby 2014 Catalogue Box ...,Worth Buying For The Pictures Alone (As Ever) ...,"{""seller""=>[{""Seller_name_1""=>""Amazon.co.uk"", ..."
1,FunkyBuys® Large Christmas Holiday Express Fes...,FunkyBuys,£16.99,2,4.5 out of 5 stars,Hobbies > Model Trains & Railway Sets > Rail V...,Technical Details Manufacturer recommended age...,Size Name:Large FunkyBuys® Large Christmas Hol...,Four Stars // 4.0 // 18 Dec. 2015 // By\n \...,"{""seller""=>{""Seller_name_1""=>""UHD WHOLESALE"", ..."
2,CLASSIC TOY TRAIN SET TRACK CARRIAGES LIGHT EN...,ccf,£9.99,17,3.9 out of 5 stars,Hobbies > Model Trains & Railway Sets > Rail V...,Technical Details Manufacturer recommended age...,BIG CLASSIC TOY TRAIN SET TRACK CARRIAGE LIGHT...,**Highly Recommended!** // 5.0 // 26 May 2015 ...,"{""seller""=>[{""Seller_name_1""=>""DEAL-BOX"", ""Sel..."
3,HORNBY Coach R4410A BR Hawksworth Corridor 3rd,Hornby,£39.99,1,5.0 out of 5 stars,Hobbies > Model Trains & Railway Sets > Rail V...,Technical Details Item Weight259 g Product Dim...,Hornby 00 Gauge BR Hawksworth 3rd Class W 2107...,I love it // 5.0 // 22 July 2013 // By\n \n...,
4,Hornby 00 Gauge 0-4-0 Gildenlow Salt Co. Steam...,Hornby,£32.19,3,4.7 out of 5 stars,Hobbies > Model Trains & Railway Sets > Rail V...,Technical Details Item Weight159 g Product Dim...,Product Description Hornby RailRoad 0-4-0 Gild...,Birthday present // 5.0 // 14 April 2014 // By...,


In [6]:
def get_rating(rat):
    try:
        return float(rat[0:3])
    except: 
        return np.nan
    
def get_price(p):
    try:
        p = float(p[1:])
        return p
    except:
        return np.nan

def remove_commas(s):
    s = str(s)
    try:
        return int(s.replace(',', ''))
    except:
        return np.nan

def get_first_seller(sel):
    try:
        return sel.split('>')[2].split(',')[0]
    except:
        return 'unk'
    
def get_cat_sub_cat(c):
    try:
        cats = c.split('>')
        return cats[0], cats[1]
    except:
        return 'unk', 'unk'

In [7]:
data['average_review_rating'] = data['average_review_rating'].apply(get_rating)
data['price'] = data['price'].apply(get_price)
data['number_of_reviews'] = data['number_of_reviews'].apply(remove_commas)
data['sellers'] = data['sellers'].apply(get_first_seller)
data['cat_sub_cat']= data['amazon_category_and_sub_category'].apply(get_cat_sub_cat)

In [8]:
data.dropna(subset=['price'], inplace=True)
data = data.reset_index(drop=True)

In [9]:
data.loc[data['manufacturer'].isnull(), 'manufacturer'] = 'unk'

In [10]:
le_man = LabelEncoder()
le_sel = LabelEncoder()
le_man.fit(data['manufacturer'])
data['manufacturer'] = le_man.transform(data['manufacturer'])
le_sel.fit(data['sellers'])
data['sellers'] = le_sel.transform(data['sellers'])

In [11]:
cat_sub = pd.DataFrame(data['cat_sub_cat'].to_list(), columns = ['cat', 'sub_cat'])
data = pd.concat([data, cat_sub], 1)

In [12]:
le_cat = LabelEncoder()
le_sub = LabelEncoder()
data['cat'] = le_cat.fit_transform(data['cat'])
data['sub_cat'] = le_sub.fit_transform(data['sub_cat'])

In [13]:
joblib.dump(le_cat, '../le_cat.sklearn')
joblib.dump(le_sub, '../le_sub.sklearn')
joblib.dump(le_man, '../le_man.sklearn')
joblib.dump(le_sel, '../le_sel.sklearn')

['../le_sel.sklearn']

In [14]:
import en_core_web_sm
nlp = en_core_web_sm.load()

In [15]:
def get_text_feats(text):
    s = nlp(text)
    return s.vector

In [16]:
data['product_name'] = data['product_name'].astype('str')
data['product_information'] = data['product_information'].astype('str')
data['product_description'] = data['product_description'].astype('str')
data['customer_reviews'] = data['customer_reviews'].astype('str')

In [17]:
#could run this in parallel
name_feats = []
info_feats = []
description_feats = []
review_feats = []
for i in tqdm(range(data.shape[0])):
    name_feats.append(get_text_feats(data['product_name'][i]))
    info_feats.append(get_text_feats(data['product_information'][i]))
    description_feats.append(get_text_feats(data['product_description'][i]))
    review_feats.append(get_text_feats(data['customer_reviews'][i]))

HBox(children=(IntProgress(value=0, max=8546), HTML(value='')))




In [18]:
name_feats = np.stack(name_feats)
info_feats = np.stack(info_feats)
description_feats = np.stack(description_feats)
review_feats = np.stack(review_feats)

In [19]:
drop_cols = ['product_name', 'product_information', 'product_description', 'customer_reviews',
            'amazon_category_and_sub_category', 'cat_sub_cat']
data = data.drop(drop_cols, 1)

In [20]:
train, test = train_test_split(data, test_size=.3, random_state=0)

In [21]:
train_idx = list(train.index)
test_idx = list(test.index)

name_train = name_feats[train_idx]
name_test = name_feats[test_idx]

info_train = info_feats[train_idx]
info_test = info_feats[test_idx]

description_train = description_feats[train_idx]
description_test = description_feats[test_idx]

review_train = review_feats[train_idx]
review_test = review_feats[test_idx]

In [22]:
name_pca = PCA(n_components=1)
info_pca = PCA(n_components=1)
desc_pca = PCA(n_components=1)
review_pca = PCA(n_components=1)
train_names = name_pca.fit_transform(name_train)
train_info = info_pca.fit_transform(info_train)
train_descs = desc_pca.fit_transform(description_train)
train_reviews = review_pca.fit_transform(review_train)

test_names = name_pca.transform(name_test)
test_info = info_pca.transform(info_test)
test_descs = desc_pca.transform(description_test)
test_reviews = review_pca.transform(review_test)

In [23]:
train['name'] = train_names
train['info'] = train_info
train['desc'] = train_descs
train['reviews'] = train_reviews

test['name'] = test_names
test['info'] = test_info
test['desc'] = test_descs
test['reviews'] = test_reviews

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .l

In [24]:
train.loc[train['number_of_reviews'].isnull(), 'number_of_reviews'] = 0
test.loc[test['number_of_reviews'].isnull(), 'number_of_reviews'] = 0

train.loc[train['average_review_rating'].isnull(), 'average_review_rating'] = 0
test.loc[test['average_review_rating'].isnull(), 'average_review_rating'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [25]:
test.isnull().sum()

manufacturer             0
price                    0
number_of_reviews        0
average_review_rating    0
sellers                  0
cat                      0
sub_cat                  0
name                     0
info                     0
desc                     0
reviews                  0
dtype: int64

In [26]:
train.to_csv('../data/amazon_train.csv', index=False)
test.to_csv('../data/amazon_test.csv', index=False)