<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Load-Data" data-toc-modified-id="Load-Data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Load Data</a></span></li><li><span><a href="#Try-different-model" data-toc-modified-id="Try-different-model-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Try different model</a></span><ul class="toc-item"><li><span><a href="#BaselineOnly" data-toc-modified-id="BaselineOnly-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>BaselineOnly</a></span></li><li><span><a href="#Biased-SGD-model" data-toc-modified-id="Biased-SGD-model-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Biased SGD model</a></span></li><li><span><a href="#KNNBasic" data-toc-modified-id="KNNBasic-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>KNNBasic</a></span></li></ul></li></ul></div>

# Load Data

In [1]:
import os
import pickle
import pandas as pd

import matplotlib.pyplot as plt

from config import RAW_DIR, PRE_DIR, RES_DIR
from utils.data_porter import read_from_csv, save_to_csv

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
# Later cells rely on this to show several `.head()` / `.shape` results from a single cell.
InteractiveShell.ast_node_interactivity = "all" 

In [2]:
# Load the preprocessed train / test frames stored under PRE_DIR.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so these
# pickles must be trusted artefacts produced by this project's own pipeline.

train_pkl_path = os.path.join(PRE_DIR, 'train_data.pkl')
with open(train_pkl_path, 'rb') as train_file:
    train_data = pickle.load(train_file)
train_data.head()

test_pkl_path = os.path.join(PRE_DIR, 'test_data.pkl')
with open(test_pkl_path, 'rb') as test_file:
    test_data = pickle.load(test_file)
test_data.head()

Unnamed: 0,overall,asin,reviewTime,reviewerID,reviewerName,reviewText,summary,num_review_ps,num_review_gm
0,5.0,B0000296O5,1999-10-14,A2AXQTB83VMK4L,Amazon Customer,I'm having the most fun I've ever had on PlayS...,Best RPG Ever!,6,268
1,4.0,B00002NDRY,1999-11-05,A2T04VAIXSKJH2,Stefan,I'm usually not crazy about real-time strategy...,Good real time strategy game,6,67
2,5.0,B000021Y5F,1999-11-10,A1QA8K3LD9K892,Chris Adamson,Williams made games for hard-core arcade gamer...,A cool 80's artifact,23,7
3,4.0,B00000JL6V,1999-11-10,AMGJMFJ63DWWH,Ed Matuskey,"Once again you put your hand to the book, and ...","Beautiful game, with excellent (and hard!) puz...",5,37
4,5.0,B00000K4AX,1999-11-10,A3VWWQT4XDSBGQ,Joshua W. Fenton,"If you loved Half-Life, this is a must buy. I ...",AWESOME!,5,26


Unnamed: 0,overall,asin,reviewTime,reviewerID,reviewerName,reviewText,summary,num_review_ps,num_review_gm
424371,3.0,B00L59D9HG,2017-01-16,A1JO2IPCY1J4PS,Honest Frizz,No problem at all with the charger itself. It...,You have to buy it Separately,5,390
424372,5.0,B00005ATSN,2017-01-16,ANVJU2ROVJC8A,Jemi Linked,Great game!,LOZ forever!,9,38
424373,5.0,B015OYM10I,2017-01-16,A19K4H7U79QKE0,Rafael Quintero,Awesome!,Five Stars,11,45
424374,5.0,B00Y4S5KPY,2017-01-16,A11JVJHDF59HJC,Hammerton,Son loves it and works with other razer color ...,Five Stars,7,119
424375,1.0,B00O4FOB3O,2017-01-16,A1JXGU255ZN7JJ,MasterBlaster,I'm updating my review after having used these...,great to start but...,8,31


In [3]:
# Positional 90 / 10 split: the last 10% of rows become the validation set.
# WARNING: this cell rebinds `train_data` to its first 90%, so it is not
# idempotent -- running it twice shrinks the training frame again. Re-run the
# loading cell first if this cell has to be executed a second time.
train_data.shape
split_at = int(train_data.shape[0] * 0.9)  # index of the first validation row
val_data = train_data.iloc[split_at:]
val_data.shape
train_data = train_data.iloc[:split_at]
train_data.shape

(424371, 9)

(42438, 9)

(381933, 9)

In [4]:
# Keep only the three columns needed to build Surprise datasets.
# NOTE(review): Dataset.load_from_df interprets the columns positionally as
# (user id, item id, rating). With this order `asin` plays the "user" role and
# `reviewerID` the "item" role -- confirm this is intended, since it decides
# which axis user-based algorithms compute similarities over.
train_rate, val_rate, test_rate = (
    frame[['asin', 'reviewerID', 'overall']]
    for frame in (train_data, val_data, test_data)
)

# Try different model

In [5]:
from surprise import SVD
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import GridSearchCV
from surprise import Dataset
from surprise import BaselineOnly
from surprise import KNNBaseline

from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error as mae

In [6]:
# Surprise clips predictions to the reader's rating scale.
# NOTE(review): the sampled `overall` values shown above lie in 1.0-5.0; confirm
# whether a lower bound of 0.0 (rather than 1.0) is intended.
reader = Reader(rating_scale=(0.0, 5.0))

# Training data is a Trainset (inner ids); this is what the algorithms are fitted on.
train_set = Dataset.load_from_df(train_rate, reader).build_full_trainset()

# Evaluation data: build a full trainset, then turn every known rating into a
# (user, item, rating) test tuple that `algo.test()` accepts.
val_set, test_set = (
    Dataset.load_from_df(rate_df, reader).build_full_trainset().build_testset()
    for rate_df in (val_rate, test_rate)
)

In [7]:
def print_metrics_sp(predictions):
    """Compute MAE and MSE for a list of Surprise predictions.

    Both scores are obtained from `surprise.accuracy` (MAE first, then MSE),
    which also prints each value as it is computed.

    Args:
        predictions: predictions as returned by an algorithm's `test()` method.

    Returns:
        A `(mae, mse)` tuple.
    """
    return accuracy.mae(predictions), accuracy.mse(predictions)

## BaselineOnly

In [8]:
# Final training data: train and validation ratings stacked together, used to
# fit the models that are scored on the test set.
new_train_rate = pd.concat(objs=[train_rate, val_rate])
new_train_dataset = Dataset.load_from_df(new_train_rate, reader)
new_train_set = new_train_dataset.build_full_trainset()

In [9]:
# Quick look at the first rows of the combined (train + validation) ratings.
new_train_rate.head(n=5)

Unnamed: 0,asin,reviewerID,overall
0,B0000296O5,A2AXQTB83VMK4L,5.0
1,B00002NDRY,A2T04VAIXSKJH2,4.0
2,B000021Y5F,A1QA8K3LD9K892,5.0
3,B00000JL6V,AMGJMFJ63DWWH,4.0
4,B00000K4AX,A3VWWQT4XDSBGQ,5.0


In [10]:
# Baseline-only model (global mean + user bias + item bias), fitted on
# train + validation and scored on the test set.
#
# With method 'als' the baseline estimator only reads 'n_epochs', 'reg_u' and
# 'reg_i' from `bsl_options`. The previously supplied 'n_factors', 'lr_all'
# and 'reg_all' are SVD-style arguments that were silently ignored, so they
# are replaced by the real ALS regularisation keys. 'reg_u' / 'reg_i' are set
# to the library defaults (15 / 10), which keeps the fitted model unchanged.
bsl_options = {
    'method': 'als',
    'n_epochs': 30,
    'reg_u': 15,   # regularisation for user biases
    'reg_i': 10,   # regularisation for item biases
}
bias_baseline = BaselineOnly(bsl_options=bsl_options)
bias_baseline.fit(new_train_set)
predictions = bias_baseline.test(test_set)

# Metrics are printed by the helper; the returned tuple is discarded so it is
# not echoed a second time as cell output.
_, _ = print_metrics_sp(predictions)

Estimating biases using als...


<surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x7fd18af0ca10>

MAE:  0.8497
MSE: 1.2221


In [2]:
# # Set up the hyperparameter lists for grid-search tuning
# Recorded grid-search results for BaselineOnly (presumably scored on the
# validation split -- confirm). Every row has the same mae / mse, i.e. the
# searched parameters had no effect on the score; presumably the searched keys
# are not options that the baseline estimator reads -- verify before reuse.


# para	mae	mse
# 0	10,0.1	0.84506	1.193916
# 1	10,0.3	0.84506	1.193916
# 2	10,0.5	0.84506	1.193916
# 3	20,0.1	0.84506	1.193916
# 4	20,0.3	0.84506	1.193916
# 5	20,0.5	0.84506	1.193916
# 6	50,0.1	0.84506	1.193916
# 7	50,0.3	0.84506	1.193916
# 8	50,0.5	0.84506	1.193916

In [64]:
# Test set
#
# Baseline-only model (global mean + user bias + item bias), fitted on
# train + validation and scored on the test set.
#
# With method 'als' the baseline estimator only reads 'n_epochs', 'reg_u' and
# 'reg_i' from `bsl_options`. The previously supplied 'n_factors', 'lr_all'
# and 'reg_all' are SVD-style arguments that were silently ignored, so they
# are replaced by the real ALS regularisation keys. 'reg_u' / 'reg_i' are set
# to the library defaults (15 / 10), which keeps the fitted model unchanged.

bsl_options = {
    'method': 'als',
    'n_epochs': 30,
    'reg_u': 15,   # regularisation for user biases
    'reg_i': 10,   # regularisation for item biases
}
bias_baseline = BaselineOnly(bsl_options=bsl_options)
bias_baseline.fit(new_train_set)
predictions = bias_baseline.test(test_set)

# Metrics are printed by the helper; the returned tuple is discarded so it is
# not echoed a second time as cell output.
_, _ = print_metrics_sp(predictions)

Estimating biases using als...


<surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x7fd18deda690>

MAE:  0.8497
MSE: 1.2221


## Biased SGD model

In [29]:
# Biased matrix factorisation trained with SGD (Surprise's SVD), fitted on
# train + validation and scored on the test set.
# The factors are randomly initialised, so `random_state` is pinned to make the
# reported MAE / MSE reproducible across kernel restarts.
bias_sgd = SVD(n_factors=200, n_epochs=30, lr_all=0.005, reg_all=0.02, random_state=42)
bias_sgd.fit(new_train_set)
predictions = bias_sgd.test(test_set)
print_metrics_sp(predictions)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fd16554b150>

MAE:  0.8419
MSE: 1.2388


(0.8419392576060248, 1.2387782765269668)

In [3]:
# Recorded grid-search results for the biased SGD (SVD) model, sorted by
# ascending mae. `para` is presumably (n_factors, reg_all) and the scores are
# presumably from the validation split -- confirm. Row 0 matches the setting
# used for the test-set run of this model.
# 	para	mae	mse
# 0	10,0.01	0.821631	1.186248
# 1	10,0.02	0.822572	1.186152
# 2	10,0.03	0.822677	1.184347
# 3	10,0.05	0.823190	1.182164
# 4	50,0.03	0.825971	1.188587
# 5	50,0.02	0.826106	1.192792
# 6	50,0.01	0.826516	1.195176
# 7	50,0.05	0.826520	1.186558
# 8	100,0.05	0.829048	1.190026
# 9	100,0.03	0.829188	1.196336
# 10	100,0.02	0.830709	1.200818
# 11	100,0.01	0.830784	1.200338
# 12	200,0.05	0.835201	1.198558
# 13	200,0.02	0.836951	1.208555
# 14	200,0.01	0.837601	1.212817
# 15	200,0.03	0.838725	1.208075

In [67]:
# Test set
#
# Biased SGD (SVD) model with n_factors=10 and reg_all=0.01, fitted on
# train + validation and scored on the test set.
# The factors are randomly initialised, so `random_state` is pinned to make the
# reported MAE / MSE reproducible across kernel restarts.

bias_sgd = SVD(n_factors=10, n_epochs=30, lr_all=0.005, reg_all=0.01, random_state=42)
bias_sgd.fit(new_train_set)
predictions = bias_sgd.test(test_set)
print_metrics_sp(predictions)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fd18fd0df90>

MAE:  0.8290
MSE: 1.2214


(0.8290480470206647, 1.2213790113726917)

## KNNBasic

In [47]:
# Neighbourhood model with baseline correction. Note that the estimator is
# KNNBaseline (not KNNBasic); similarity options are left at their defaults.
knn_params = dict(k=40, min_k=1)
knn_model = KNNBaseline(**knn_params)
knn_model.fit(new_train_set)
predictions = knn_model.test(test_set)
print_metrics_sp(predictions)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x7fd18b084f10>

MAE:  0.8447
MSE: 1.3351


(0.8446775544982594, 1.3351487067036343)

In [4]:
# Recorded grid-search results for KNNBaseline, sorted by ascending mae.
# `para` is presumably (k, min_k) and the scores are presumably from the
# validation split -- confirm. Row 0 matches the setting used for the test-set
# run of this model.
# 	para	mae	mse
# 0	50,2	0.834428	1.198282
# 1	100,2	0.834839	1.198560
# 2	100,10	0.834841	1.198160
# 3	200,2	0.834881	1.198294
# 4	50,10	0.834953	1.198495
# 5	300,1	0.835049	1.197721
# 6	200,1	0.835081	1.197317
# 7	200,3	0.835127	1.197259
# 8	300,10	0.835300	1.199509
# 9	50,1	0.835386	1.199700
# 10	300,2	0.835397	1.198938
# 11	200,10	0.835421	1.198375
# 12	300,3	0.835599	1.198046
# 13	50,3	0.835720	1.200810
# 14	100,1	0.835840	1.199276
# 15	100,3	0.835919	1.198542

In [70]:
# Test set
#
# KNNBaseline with k=50 and min_k=2, fitted on train + validation and scored
# on the test set.

knn_params = dict(k=50, min_k=2)
knn_model = KNNBaseline(**knn_params)
knn_model.fit(new_train_set)
predictions = knn_model.test(test_set)
print_metrics_sp(predictions)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x7fd13c138a10>

MAE:  0.8421
MSE: 1.2509


(0.8421157020089943, 1.2508704827986867)