In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score, mean_absolute_error, r2_score
from sklearn.preprocessing import PolynomialFeatures

# Dated output folder; the aggregated review CSV is read from here.
OUTPUT_FOLDER = Path('.', 'Output', '2019-04-26')
# Folder the raw Amazon review TSV files are read from.
DATA_SETS_FOLDER = Path('.', '.datasets')

%matplotlib inline

In [None]:
# Load the aggregated Amazon review dataset from the dated output folder
# (path is relative to the notebook's working directory, not a user-specific location).
df = pd.read_csv(OUTPUT_FOLDER / 'amzn_reviews_ds.csv')

In [None]:
# keep=False removes *every* row whose (title, category) pair occurs more than once,
# instead of keeping one representative per pair.
duplicate_key = ['product_title', 'product_category']
df = df.drop_duplicates(subset=duplicate_key, keep=False)

In [None]:
# Exclude Video_Games rows whose percent_rank is exactly 0; all other rows are kept.
is_video_game = df['product_category'] == 'Video_Games'
has_zero_rank = df['percent_rank'] == 0
df = df[~(is_video_game & has_zero_rank)]

In [None]:
# Number of rows remaining after de-duplication and filtering.
df.shape[0]

In [None]:
# Add the natural log of `cnt` as an extra column.
df = df.assign(cnt_log=np.log(df['cnt']))

In [None]:
# Candidate feature columns whose missing values we want to inspect before modelling.
null_check_columns = [
    'product_category',
    'cnt',
    'helpful_votes_cnt',
    'total_votes_cnt',
    'verified_purchase_cnt',
    'vine_cnt',
    'star_rating_mean',
    'helpful_votes_mean',
    'total_votes_mean',
    'star_rating_with_verified_purchase_mean',
    'star_rating_with_vine_mean',
    'star_rating_with_votes_mean',
    'star_rating_helpful_votes_weighted_mean',
    'helpful_votes_ratio',
    'verified_purchase_ratio',
    'vine_ratio',
    'cnt_log',
]

# describe() summary (count / unique / top / freq) of the boolean null mask.
# Kept under its original name in case other cells still read it.
df1 = df[null_check_columns].isnull().describe()

# Number of missing values per column, computed directly from the data.
# The previous version derived this from `count + freq - 11026.0`; that constant
# only makes sense for one specific row count (presumably twice the number of
# rows at the time), so the result silently became wrong whenever the data changed.
df[null_check_columns].isnull().sum()

In [None]:
# Target column for the regressions below. Kept as a one-element list, so
# `frame[training_target]` selects a single-column DataFrame rather than a Series.
training_target = ['percent_rank']

# Feature Selection with ExtraTreesRegressor

In [None]:
from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.preprocessing import LabelEncoder

# Build a forest and compute the feature importances.
forest = ExtraTreesRegressor(n_estimators=250, random_state=0)

# Rows with cnt >= 20, re-indexed from 0 and with missing values replaced by 0.
# Built by chaining (no inplace mutation of a filtered slice).
training_ds = df[df.cnt >= 20.].reset_index(drop=True).fillna(0)

training_features = [
    'product_category',
    'cnt',
    'helpful_votes_cnt',
    'total_votes_cnt',
    'verified_purchase_cnt',
    'vine_cnt',
    'star_rating_mean',
    'helpful_votes_mean',
    'total_votes_mean',
    'star_rating_with_verified_purchase_mean',
    'star_rating_with_vine_mean',
    'star_rating_with_votes_mean',
    'star_rating_helpful_votes_weighted_mean',
    'helpful_votes_ratio',
    'verified_purchase_ratio',
    'vine_ratio',
    'cnt_log',
]

# Explicit copy so the encoded column is written to X itself, not to a view of training_ds.
X = training_ds[training_features].copy()
le = LabelEncoder()
X['product_category'] = le.fit_transform(X['product_category'])
y = training_ds[training_target]

# Hand the estimator a 1-D target; a single-column frame makes scikit-learn
# emit a DataConversionWarning before flattening it anyway.
forest.fit(X, y.values.ravel())

importances = forest.feature_importances_
# Spread of each feature's importance across the individual trees (used as error bars).
std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]
ranked_features = [training_features[i] for i in indices]

# Print the feature ranking (1-based, most important first).
print("Feature ranking:")
for rank, (feature_name, importance) in enumerate(zip(ranked_features, importances[indices]), start=1):
    print("%d. feature %s (%f)" % (rank, feature_name, importance))

# Plot the feature importances of the forest, labelling bars with feature names
# rather than their numeric positions so the figure is readable on its own.
bar_positions = np.arange(X.shape[1])
fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(bar_positions, importances[indices], color="r", yerr=std[indices], align="center")
ax.set_title("Feature importances")
ax.set_ylabel("Importance")
ax.set_xticks(bar_positions)
ax.set_xticklabels(ranked_features, rotation=90)
ax.set_xlim([-1, X.shape[1]])
plt.show()

In [None]:
# Reduced feature set used by the cells below.
training_features = [
    'cnt_log',
    'star_rating_mean',
    'verified_purchase_ratio',
    'total_votes_mean',
    'product_category',
    'helpful_votes_ratio',
]


# Find significant count cut off

In [None]:
training_scores = []
test_scores = []
features_used = []
exclude_cnts = range(1, 51)

# Sweep the minimum `cnt` cut-off and record the Lasso train/test score for each value.
for exclude_cnt in exclude_cnts:
    # Build the filtered frame by chaining instead of mutating a filtered slice in place.
    training_ds = df[df.cnt >= exclude_cnt].reset_index(drop=True)

    if 'product_category' in training_ds:
        le = LabelEncoder()
        training_ds['product_category'] = le.fit_transform(training_ds['product_category'])

    training_ds = training_ds.fillna(0)

    y = training_ds[training_target]

    # Split row positions first so the scaler can be fitted on the training rows only.
    # Same test_size / random_state as before, so the partition itself is unchanged.
    train_idx, test_idx, y_train, y_test = train_test_split(
        np.arange(len(training_ds)), y, test_size=0.2, random_state=42)

    # scale input variables; fitting on all rows would leak test-set statistics
    # (mean / variance) into the training data.
    scaler = StandardScaler()
    scaler.fit(training_ds[training_features].iloc[train_idx])
    X = scaler.transform(training_ds[training_features])
    X_train, X_test = X[train_idx], X[test_idx]

    # train a model to learn from the dataset
    clf = linear_model.Lasso(alpha=0.01)
    clf.fit(X_train, y_train)
    train_score = clf.score(X_train, y_train)
    test_score = clf.score(X_test, y_test)
    # Number of coefficients the L1 penalty left non-zero.
    coeff_used = np.sum(clf.coef_ != 0)

    training_scores.append(train_score)
    test_scores.append(test_score)
    features_used.append(coeff_used)

# Scores are clipped to [0, 1] so strongly negative R^2 values do not squash the plot.
plt.plot(exclude_cnts, np.clip(training_scores, 0, 1.), label='train')
plt.plot(exclude_cnts, np.clip(test_scores, 0, 1.), label='test')
plt.xlabel('Minimum cnt (exclude_cnt)')
plt.ylabel('R^2 (clipped to [0, 1])')
plt.title('Lasso score vs. count cut-off')
plt.legend()
plt.show()

In [None]:
# Minimum `cnt` a row needs to be kept for the final model (applied as
# `df.cnt >= significant_cut_off`); presumably read off the cut-off sweep plot above.
significant_cut_off = 20

# Train using polynomial features and Lasso

In [None]:
# select feature and target columns
training_target = ['percent_rank']
training_ds = df[df.cnt >= significant_cut_off]

# Down-sample Video_DVD to at most 300 rows and keep every row of the other categories.
# The sample is seeded so re-running the notebook trains on the same rows, and capped
# at the available row count so a small Video_DVD subset no longer raises.
is_dvd = training_ds['product_category'] == 'Video_DVD'
dvd_rows = training_ds[is_dvd]
training_ds = pd.concat([
    training_ds[~is_dvd],
    dvd_rows.sample(n=min(300, len(dvd_rows)), random_state=42),
]).reset_index(drop=True)

if 'product_category' in training_ds:
    le = LabelEncoder()
    training_ds['product_category'] = le.fit_transform(training_ds['product_category'])

# Replace missing feature values with 0, as the other training cells do; NaNs would
# otherwise be rejected by PolynomialFeatures. Only the feature columns are touched.
training_ds[training_features] = training_ds[training_features].fillna(0)

y = training_ds[training_target]

# Split row positions first so the scaler is fitted on the training rows only.
# Same test_size / random_state as a direct split of (X, y), so the partition is unchanged.
train_idx, test_idx, y_train, y_test = train_test_split(
    np.arange(len(training_ds)), y, test_size=0.2, random_state=42)

# scale input variables; fitting on all rows would leak test-set statistics into training.
scaler = StandardScaler()
scaler.fit(training_ds[training_features].iloc[train_idx])

# Degree-2 polynomial expansion of the scaled features. X keeps every row
# (in training_ds order) because later cells predict on the full matrix.
poly = PolynomialFeatures(2)
X = poly.fit_transform(scaler.transform(training_ds[training_features]))
X_train, X_test = X[train_idx], X[test_idx]

# train a model to learn from the dataset
clf = linear_model.Lasso(alpha=0.01)
clf.fit(X_train, y_train)
train_score = clf.score(X_train, y_train)
test_score = clf.score(X_test, y_test)
# Number of coefficients the L1 penalty left non-zero.
coeff_used = np.sum(clf.coef_ != 0)
print("training score:", train_score)
print("test score: ", test_score)
print("number of features used: ", coeff_used)

In [None]:
# Sample Predictions
# Each array has one row per sample: column 0 is the prediction clipped to [0, 1],
# column 1 is the actual target value.
# `.values` replaces DataFrame.as_matrix(), which no longer exists in pandas >= 1.0.
training_pred_values = np.column_stack([
    np.clip(clf.predict(X_train), 0., 1.).ravel(),
    y_train.values.reshape(-1),
])
print('training_pred')
print(training_pred_values[:10])

test_pred_values = np.column_stack([
    np.clip(clf.predict(X_test), 0., 1.).ravel(),
    y_test.values.reshape(-1),
])
print('test_pred')
print(test_pred_values[:10])

In [None]:
def _plot_predicted_vs_actual(label, pred_values):
    """Scatter predicted against actual values on a [0, 1] x [0, 1] canvas.

    Parameters
    ----------
    label : str
        Name of the split; printed before the figure and used as its title.
    pred_values : array of shape (n_samples, 2)
        Column 0 holds the predictions, column 1 the actual values.
    """
    print(label)
    fig, ax = plt.subplots()
    ax.scatter(pred_values[:, 0], pred_values[:, 1])
    # Diagonal reference line: points on it are perfect predictions.
    ax.plot([0., 1.], [0., 1.], alpha=.5)
    ax.set_title(label)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_xlim([0., 1.])
    ax.set_ylim([0., 1.])
    plt.show()


_plot_predicted_vs_actual('training', training_pred_values)
_plot_predicted_vs_actual('test', test_pred_values)

In [None]:
regression_metrics = explained_variance_score, mean_absolute_error, r2_score

# Evaluate each metric on the clipped predictions (column 0 of the *_pred_values arrays).
for metric in regression_metrics:
    # Print the metric's name; printing the function object itself shows a repr
    # with a memory address that changes on every run.
    print(metric.__name__)
    print('train', metric(y_train, training_pred_values[:, 0]))
    print('test', metric(y_test, test_pred_values[:, 0]))

# Try different regression models

In [None]:
from sklearn import svm
from sklearn import tree
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor

# Seed shared by the stochastic estimators so their test scores are reproducible.
MODEL_SEED = 42

# Estimators keep their original variable names so they stay inspectable after the loop.
lin_reg = linear_model.LinearRegression()
ridge = linear_model.Ridge()
lasso = linear_model.Lasso(alpha=0.1)
elastic_net = linear_model.ElasticNet(alpha=0.1)
lasso_lars = linear_model.LassoLars()
bayesian_ridge = linear_model.BayesianRidge()
svc = svm.SVR(kernel='linear')
dtr = tree.DecisionTreeRegressor(random_state=MODEL_SEED)
gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=MODEL_SEED)
mlp = MLPRegressor(random_state=MODEL_SEED)

candidate_models = [
    ('LinearRegression', lin_reg),
    ('Ridge', ridge),
    ('Lasso', lasso),
    ('ElasticNet', elastic_net),
    ('LassoLars', lasso_lars),
    ('BayesianRidge', bayesian_ridge),
    ('SVR', svc),
    ('DTR', dtr),
    ('GBR', gbr),
    ('MLP', mlp),
]

# Flatten the single-column targets once: several estimators warn
# (DataConversionWarning) when handed a column vector instead of a 1-D array.
y_train_1d = y_train.values.ravel()
y_test_1d = y_test.values.ravel()

# Fit every candidate on the same split and report its score on the test set.
for label, model in candidate_models:
    model.fit(X_train, y_train_1d)
    print(label + ':', model.score(X_test, y_test_1d))

# Compare separating categories

In [None]:
# Predict on every row of the final training frame, clipped to the valid [0, 1] range.
X_preds = np.clip(clf.predict(X), 0, 1.)
# Column 0: actual target, column 1: prediction.
X_pred_values = np.column_stack([y.values.reshape(-1), X_preds])
# Absolute prediction error per row, and the row order from smallest to largest error.
norms = np.abs(X_pred_values[:, 0] - X_pred_values[:, 1])
norms_args = norms.argsort()

# Attach the predictions to a copy of the training frame and show the 20 rows
# with the largest absolute error.
training_ds_pred = training_ds.assign(pred=X_preds)
training_ds_pred.iloc[norms_args[-20:]]

In [None]:
# Source Data TODOs:
# Remove Critic_Score == 0 from product_category Video_Games
# Add helpful_votes_count and total_votes_count
# Add helpful_votes to total_votes ratio 
# Add helpful_votes to total_votes ratio multiplied by star_rating mean

# Verified purchased ratio
# Verified purchased star_rating non-zero mean

# Vine ratio
# Vine star_rating non-zero mean

In [None]:
# pd.read_csv('/private/tmp/amazon_reviews_us_Music_v1_00.tsv', sep='/t', )

In [None]:
# Raw Mobile Electronics review dump (tab-separated); malformed lines are skipped.
# NOTE(review): this rebinds `df`, so the aggregated dataset loaded at the top of the
# notebook is no longer reachable under that name after this cell runs.
mobile_electronics_path = DATA_SETS_FOLDER / 'amazon_reviews_us_Mobile_Electronics_v1_00.tsv'
df = pd.read_csv(mobile_electronics_path, sep='\t', error_bad_lines=False)

In [None]:
# Inspect the frame currently bound to `df` (the Mobile Electronics TSV when the
# cells are run in order).
df

In [None]:
# `product_category` in training_ds holds LabelEncoder integer codes (it was encoded in
# the training cell), so comparing it with a string selects no rows. Decode the codes
# with the same encoder before filtering.
training_ds[le.inverse_transform(training_ds.product_category) == 'Video_Games'].describe()

In [None]:
# `product_category` in training_ds holds LabelEncoder integer codes (it was encoded in
# the training cell), so comparing it with a string selects no rows. Decode the codes
# with the same encoder before filtering.
training_ds[le.inverse_transform(training_ds.product_category) == 'Books'].describe()

In [None]:
# `product_category` in training_ds holds LabelEncoder integer codes (it was encoded in
# the training cell), so comparing it with a string selects no rows. Decode the codes
# with the same encoder before filtering.
training_ds[le.inverse_transform(training_ds.product_category) == 'Video_DVD'].describe()