In [None]:
from itertools import chain
from pathlib import Path
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from bs4 import BeautifulSoup
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score, mean_absolute_error, r2_score

# Folder (relative to the notebook's working directory) that holds the datasets
DATA_SETS_FOLDER = Path('.datasets')

%matplotlib inline

In [None]:
# Load the tab-separated Amazon watches review dump. Rows the parser cannot
# tokenize are skipped (with a warning) instead of aborting the whole load.
reviews_path = DATA_SETS_FOLDER / 'amazon_reviews_us_Watches_v1_00.tsv'
try:
    # pandas >= 1.3 spelling; `error_bad_lines` no longer exists in pandas 2.x
    df = pd.read_csv(reviews_path, delimiter='\t', on_bad_lines='warn')
except TypeError:
    # older pandas rejects the unknown `on_bad_lines` keyword: use the legacy flag
    df = pd.read_csv(reviews_path, delimiter='\t', error_bad_lines=False)

In [None]:
# Peek at the first five rows of the loaded reviews
df.head(n=5)

In [None]:
# Summary statistics of the loaded reviews, as computed by pandas' default describe()
df.describe()

In [None]:
# Number of distinct reviewers and distinct products in the dataset
# (dropna=False keeps a missing id counted as one value, like len(unique()) does)
print('Unique Customers: {}'.format(df['customer_id'].nunique(dropna=False)))
print('Unique Products: {}'.format(df['product_id'].nunique(dropna=False)))

In [None]:
# Histogram of the star ratings over all reviews
fig, ax = plt.subplots()
ax.hist(df['star_rating'])
ax.set_xlabel('Star Rating')
ax.set_ylabel('Counts')
plt.show()

In [None]:
# Histogram of helpful votes per review; values above 50 are folded into the 50 bin
fig, ax = plt.subplots()
ax.hist(df['helpful_votes'].clip(0, 50))
ax.set_xlabel('Helpful Votes (clipped at 50)')
ax.set_ylabel('Counts')
plt.show()

In [None]:
# Histogram of total votes per review; values above 50 are folded into the 50 bin
fig, ax = plt.subplots()
ax.hist(df['total_votes'].clip(0, 50))
ax.set_xlabel('Total Votes (clipped at 50)')
ax.set_ylabel('Counts')
plt.show()

In [None]:
# Pairwise scatter plots of rating and vote columns.
# Slow: every review row is drawn in every panel.
scatter_columns = ['star_rating', 'helpful_votes', 'total_votes']
pd.plotting.scatter_matrix(df.loc[:, scatter_columns], figsize=(6, 6))
plt.show()

In [None]:
# Ten products with the highest mean star rating; no minimum review count is applied
per_product_stats = df.groupby('product_id').agg(['mean', 'count'])
per_product_stats.sort_values(by=[('star_rating', 'mean')], ascending=False).head(10)

In [None]:
# Review date range
# Parse the date column once and reuse it for both bounds (it was parsed twice before)
review_dates = pd.to_datetime(df.review_date)
min_review_date = review_dates.min()
max_review_date = review_dates.max()
"Reviews are from {} to {}".format(min_review_date, max_review_date)

In [None]:
# Each tier is described by three saved search-result pages, named
# 'fashion-mens-watches-pg<page number>.html'.

# Top Tier: Products that appear at the very top of rankings
top_tier_search_sources = ['fashion-mens-watches-pg{}.html'.format(page) for page in (1, 2, 3)]

# High Tier: Products that appear between Top Tier and Mid Tier of rankings
high_tier_search_sources = ['fashion-mens-watches-pg{}.html'.format(page) for page in (100, 101, 102)]

# Mid Tier: Products that appear at the middle of rankings
mid_tier_search_sources = ['fashion-mens-watches-pg{}.html'.format(page) for page in (200, 201, 202)]

# Low Tier: Products that appear between Mid Tier and Bottom Tier of rankings
low_tier_search_sources = ['fashion-mens-watches-pg{}.html'.format(page) for page in (300, 301, 302)]

# Bottom Tier: Products that appear at the bottom of rankings
bottom_tier_search_sources = ['fashion-mens-watches-pg{}.html'.format(page) for page in (398, 399, 400)]

In [None]:
def get_product_ids_from_html(filepath):
    """ Reads a saved search-results HTML page and returns the Amazon product_ids found.

    Args:
        filepath: location (str or Path) of the saved search-results HTML file.

    Returns:
        list with the ``data-asin`` value of every ``div[data-asin]`` inside the
        ``s-search-results`` component whose markup does not contain the text
        ``Sponsored``, in document order.
    """
    # Decode explicitly instead of relying on the platform default encoding.
    # Undecodable bytes are replaced rather than raising; this is harmless for
    # the data-asin attributes we extract.
    # NOTE(review): assumes the saved pages are UTF-8 - confirm.
    with open(filepath, 'r', encoding='utf-8', errors='replace') as content_file:
        soup = BeautifulSoup(content_file.read(), 'html.parser')

    item_sections = soup.select('[data-component-type="s-search-results"] div[data-asin]')
    # Skip any result block that is marked as a sponsored placement
    return [item_section.get('data-asin') for item_section in item_sections
            if 'Sponsored' not in str(item_section)]

def get_tier_product_ids(html_files):
    """ Returns product_ids from tier search sources (HTML files).

    Args:
        html_files: iterable of file names located under
            ``RankingData/AmazonSearchWatches``.

    Returns:
        list of the product_ids found in all files, in file order. A concrete
        list (not a one-shot iterator) is returned so the result can be reused,
        e.g. when a cell that consumes it is re-run.
    """
    source_folder = Path('.') / 'RankingData' / 'AmazonSearchWatches'
    return [product_id
            for html_file in html_files
            for product_id in get_product_ids_from_html(source_folder / html_file)]







In [None]:
# Resolve each tier's saved search pages into the product_ids found on them
(
    top_tier_product_ids,
    high_tier_product_ids,
    mid_tier_product_ids,
    low_tier_product_ids,
    bottom_tier_product_ids,
) = (
    get_tier_product_ids(search_sources)
    for search_sources in (
        top_tier_search_sources,
        high_tier_search_sources,
        mid_tier_search_sources,
        low_tier_search_sources,
        bottom_tier_search_sources,
    )
)

In [None]:
# Per-product mean/count review aggregates for the top tier, plus a random sample of them
top_tier_df = df[df.product_id.isin(top_tier_product_ids)].groupby('product_id').agg(['mean', 'count'])
# sample() raises when asked for more rows than exist, so cap the request at the tier size
top_tier_df.sample(min(10, len(top_tier_df)))

In [None]:
# Per-product mean/count review aggregates for the high tier, plus a random sample of them
high_tier_df = df[df.product_id.isin(high_tier_product_ids)].groupby('product_id').agg(['mean', 'count'])
# sample() raises when asked for more rows than exist, so cap the request at the tier size
high_tier_df.sample(min(10, len(high_tier_df)))

In [None]:
# Per-product mean/count review aggregates for the mid tier, plus a random sample of them
mid_tier_df = df[df.product_id.isin(mid_tier_product_ids)].groupby('product_id').agg(['mean', 'count'])
# sample() raises when asked for more rows than exist, so cap the request at the tier size
mid_tier_df.sample(min(10, len(mid_tier_df)))

In [None]:
# Per-product mean/count review aggregates for the low tier, plus a random sample of them
low_tier_df = df[df.product_id.isin(low_tier_product_ids)].groupby('product_id').agg(['mean', 'count'])
# sample() raises when asked for more rows than exist, so cap the request at the tier size
low_tier_df.sample(min(10, len(low_tier_df)))

In [None]:
# Per-product mean/count review aggregates for the bottom tier, plus a random sample of them
bottom_tier_df = df[df.product_id.isin(bottom_tier_product_ids)].groupby('product_id').agg(['mean', 'count'])
# sample() raises when asked for more rows than exist, so cap the request at the tier size
bottom_tier_df.sample(min(10, len(bottom_tier_df)))

In [None]:
def remove_column_hierarchy(tier_df):
    """ Returns a copy of tier_df with its index moved into columns and flat column names.

    Every column label is joined with '_' and stripped of leading/trailing
    underscores, e.g. ('star_rating', 'mean') -> 'star_rating_mean' and
    ('product_id', '') -> 'product_id'. The input frame is left untouched.
    """
    flattened = tier_df.reset_index()
    flattened.columns = ["_".join(levels).strip('_') for levels in flattened.columns]
    return flattened

def add_target(tier_df, target_value, tier):
    """ Returns a copy of tier_df with constant 'target' and 'tier' columns added.

    Every row gets target_value in 'target' and tier in 'tier'. The input
    frame is left untouched.
    """
    return tier_df.assign(target=target_value, tier=tier)

# Flatten each tier's aggregate frame and label it: target runs from 5.0 (top) down to 1.0 (bottom)
top_tier_dataset = add_target(
    remove_column_hierarchy(top_tier_df), target_value=5., tier='Top Tier')
high_tier_dataset = add_target(
    remove_column_hierarchy(high_tier_df), target_value=4., tier='High Tier')
mid_tier_dataset = add_target(
    remove_column_hierarchy(mid_tier_df), target_value=3., tier='Mid Tier')
low_tier_dataset = add_target(
    remove_column_hierarchy(low_tier_df), target_value=2., tier='Low Tier')
bottom_tier_dataset = add_target(
    remove_column_hierarchy(bottom_tier_df), target_value=1., tier='Bottom Tier')

In [None]:
# create a training dataset by stacking the five labelled tiers (top tier first)
tier_training_dataset = pd.concat([top_tier_dataset,
                                   high_tier_dataset,
                                   mid_tier_dataset,
                                   low_tier_dataset,
                                   bottom_tier_dataset])

# sample() raises when asked for more rows than exist, so cap the request at the dataset size
tier_training_dataset.sample(min(10, len(tier_training_dataset)))

In [None]:
# distribution of the per-product mean star rating (rounded to the nearest star), one panel per tier
# NOTE: this adds the helper column to tier_training_dataset in place
tier_training_dataset['star_rating_mean_rounded'] = tier_training_dataset['star_rating_mean'].round()

# `x` is passed by keyword: recent seaborn releases no longer accept it positionally
g = sns.catplot(x="star_rating_mean_rounded", col="tier",
                data=tier_training_dataset,
                kind="count", height=2.5, aspect=.8)

plt.show()

In [None]:
# select feature and target columns
training_features = ['customer_id_count', 'star_rating_mean', 'helpful_votes_mean', 'total_votes_mean']
training_target = ['target']

features = tier_training_dataset[training_features]
y = tier_training_dataset[training_target]

# Split BEFORE scaling: fitting the scaler on all rows would leak the test rows'
# mean/variance into the training features.
features_train, features_test, y_train, y_test = train_test_split(
    features, y, test_size=0.2, random_state=42)

# scale input variables using statistics of the training rows only
scaler = StandardScaler()
scaler.fit(features_train)
X_train = scaler.transform(features_train)
X_test = scaler.transform(features_test)
# full scaled feature matrix, kept under its original name
X = scaler.transform(features)

# train a model to learn from the dataset
lasso = linear_model.Lasso(alpha=0.01)
lasso.fit(X_train, y_train)
train_score = lasso.score(X_train, y_train)
test_score = lasso.score(X_test, y_test)
coeff_used = np.sum(lasso.coef_ != 0)
print("training score:", train_score)
print("test score: ", test_score)
print("number of features used: ", coeff_used)

# print best features: positive coefficients, largest first.
# Names are paired with coefficients up front, so tied coefficient values can
# no longer be attributed to the wrong feature (an index() lookup by value
# always returns the first match).
print("\ntop features:")
ranked_features = sorted(zip(training_features, lasso.coef_),
                         key=lambda pair: pair[1], reverse=True)
for feature_name, coef in ranked_features:
    if coef > 0:
        print("{} ({})".format(feature_name, coef))

In [None]:
# Sample Predictions
# Predictions are clipped to the valid tier range [1, 5]
predicted_values = np.clip(lasso.predict(X_test), 1., 5.)
# Two columns: predicted value, actual target. np.asarray replaces
# DataFrame.as_matrix(), which no longer exists in pandas >= 1.0.
predictions = np.column_stack([predicted_values, np.asarray(y_test).reshape(-1,)])
predictions[:10]

In [None]:
# Predicted vs. actual target on the training split; the diagonal marks a perfect prediction
pred_train = lasso.predict(X_train)
fig, ax = plt.subplots()
ax.scatter(pred_train, y_train)
ax.plot([.5, 5.5], [.5, 5.5], alpha=.5)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_xlim([.5, 5.5])
ax.set_ylim([.5, 5.5])
plt.show()

In [None]:
# Predicted (column 0) vs. actual (column 1) values from `predictions`;
# the diagonal marks a perfect prediction
fig, ax = plt.subplots()
ax.scatter(predictions[:, 0], predictions[:, 1])
ax.plot([.5, 5.5], [.5, 5.5], alpha=.5)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_xlim([.5, 5.5])
ax.set_ylim([.5, 5.5])
plt.show()

In [None]:
regression_metrics = explained_variance_score, mean_absolute_error, r2_score

# The predictions do not depend on the metric, so compute them once up front
pred_train = lasso.predict(X_train)
pred_test = lasso.predict(X_test)

for metric in regression_metrics:
    # Print the metric's name (not the function repr with its memory address)
    # and label which split each score belongs to
    print(metric.__name__)
    print("train:", metric(y_train, pred_train))
    print("test: ", metric(y_test, pred_test))

In [None]:
# Persist the training dataset into the shared datasets folder (row index is not written)
tier_training_dataset.to_csv(DATA_SETS_FOLDER / 'amzn_search_watches_train.csv', index=False)