<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Get-Raw-data" data-toc-modified-id="Get-Raw-data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Get Raw data</a></span></li><li><span><a href="#Statistical-Description" data-toc-modified-id="Statistical-Description-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Statistical Description</a></span></li><li><span><a href="#NLP-data-processing" data-toc-modified-id="NLP-data-processing-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>NLP data processing</a></span></li></ul></div>

In [1]:
import os
import gzip
import json
import pandas as pd
import numpy as np

from config import RAW_DIR, PRE_DIR, RES_DIR
from utils.data_porter import read_from_csv, save_to_csv

# Get Raw data

In [2]:
df_vgame_dir = os.path.join(RAW_DIR, 'Video_Games.csv')
gz_vgame_dir = os.path.join(RAW_DIR, 'Video_Games_5.json.gz')

In [3]:
rating_data = read_from_csv(df_vgame_dir, header=None, names=['ProductID', 'ReviewerID', 'Rating', 'TimeStamp'])
rating_data['TimeStamp'] = pd.to_datetime(rating_data.TimeStamp, unit='s')
print()
rating_data.head()




Unnamed: 0,ProductID,ReviewerID,Rating,TimeStamp
0,439381673,A21ROB4YDOZA5P,1.0,2014-06-09
1,439381673,A3TNZ2Q5E7HTHD,3.0,2014-05-10
2,439381673,A1OKRM3QFEATQO,4.0,2014-02-07
3,439381673,A2XO1JFCNEYV3T,1.0,2014-02-07
4,439381673,A19WLPIRHD15TH,4.0,2014-01-16


In [4]:
def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield json.loads(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')


review_data = getDF(gz_vgame_dir)
review_data['unixReviewTime'] = pd.to_datetime(review_data.unixReviewTime, unit='s')
review_data = review_data.rename(columns={'asin': 'ProductID', 'reviewerID': 'ReviewerID'})
review_data.head()

Unnamed: 0,overall,verified,reviewTime,ReviewerID,ProductID,reviewerName,reviewText,summary,unixReviewTime,vote,style,image
0,5.0,True,"10 17, 2015",A1HP7NVNPFMA4N,700026657,Ambrosia075,"This game is a bit hard to get the hang of, bu...",but when you do it's great.,2015-10-17,,,
1,4.0,False,"07 27, 2015",A1JGAP0185YJI6,700026657,travis,I played it a while but it was alright. The st...,"But in spite of that it was fun, I liked it",2015-07-27,,,
2,3.0,True,"02 23, 2015",A1YJWEXHQBWK2B,700026657,Vincent G. Mezera,ok game.,Three Stars,2015-02-23,,,
3,2.0,True,"02 20, 2015",A2204E1TH211HT,700026657,Grandma KR,"found the game a bit too complicated, not what...",Two Stars,2015-02-20,,,
4,5.0,True,"12 25, 2014",A2RF5B5H74JLPE,700026657,jon,"great game, I love it and have played it since...",love this game,2014-12-25,,,


# Statistical Description 

In [5]:
rating_data = rating_data.sort_values(by='TimeStamp')

In [6]:
rating_data['Rating'].value_counts()

5.0    1487366
4.0     412413
1.0     311891
3.0     212346
2.0     141333
Name: Rating, dtype: int64

In [7]:
rating_data.isna().sum()

ProductID     0
ReviewerID    0
Rating        0
TimeStamp     0
dtype: int64

In [8]:
print(f"Product total: {rating_data['ProductID'].nunique()}")
print(f"Reviewer total: {rating_data['ReviewerID'].nunique()}")

Product total: 71982
Reviewer total: 1540618


In [9]:
review_data['reviewText'].iloc[1]

'I played it a while but it was alright. The steam was a bit of trouble. The more they move these game to steam the more of a hard time I have activating and playing a game. But in spite of that it was fun, I liked it. Now I am looking forward to anno 2205 I really want to play my way to the moon.'

In [10]:
review_data['summary'].iloc[1]

'But in spite of that it was fun, I liked it'

# NLP data processing

In [12]:
import re
import nltk
# import nltk
# nltk.download('stopwords')
# 手动添加
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
english_stemmer=nltk.stem.SnowballStemmer('english')

In [94]:
def data_clean(rev, remove_stopwords=True): 
    
    try:
        new_text = re.sub("[^a-zA-Z]"," ", rev)
    except:
        print(rev)
        new_text = []
   
    words = new_text.lower().split()
    
    if remove_stopwords:
        sts = set(stopwords.words("english"))
        words = [w for w in words if not w in sts]
    ary=[]
    eng_stemmer = english_stemmer 
    for word in words:
        ary.append(eng_stemmer.stem(word))

    new_ary = ' '.join(ary)
    return(new_ary)

In [95]:
review_data.shape

(497577, 12)

In [98]:
part_data = review_data[: 10000].fillna('')
part_data = part_data.drop(columns=['vote', 'style', 'image', 'reviewTime', 'reviewerName', 'verified'])
part_data = part_data.sort_values(by='unixReviewTime')
part_data.head()

Unnamed: 0,overall,ReviewerID,ProductID,reviewText,summary,unixReviewTime
6043,5.0,A2AXQTB83VMK4L,B0000296O5,I'm having the most fun I've ever had on PlayS...,Best RPG Ever!,1999-10-14
6978,4.0,A2T04VAIXSKJH2,B00002NDRY,I'm usually not crazy about real-time strategy...,Good real time strategy game,1999-11-05
4124,5.0,A3VWWQT4XDSBGQ,B00000K4AX,"If you loved Half-Life, this is a must buy. I ...",AWESOME!,1999-11-10
5861,5.0,A1QA8K3LD9K892,B000021Y5F,Williams made games for hard-core arcade gamer...,A cool 80's artifact,1999-11-10
5993,4.0,AMGJMFJ63DWWH,B000021XYY,"This game actually scared me a couple times, a...","A good game, but way too short!",1999-11-10


In [99]:
part_data['reviewText'] = part_data['reviewText'].apply(lambda x: data_clean(x))
part_data['summary'] = part_data['summary'].apply(lambda x: data_clean(x))

part_data['docs'] = part_data.apply(lambda x: x.reviewText + x.summary, axis=1)

In [100]:
part_data.head()

Unnamed: 0,overall,ReviewerID,ProductID,reviewText,summary,unixReviewTime,docs
6043,5.0,A2AXQTB83VMK4L,B0000296O5,fun ever playstat high recommend best rpg ever,best rpg ever,1999-10-14,fun ever playstat high recommend best rpg ever...
6978,4.0,A2T04VAIXSKJH2,B00002NDRY,usual crazi real time strategi game tend becom...,good real time strategi game,1999-11-05,usual crazi real time strategi game tend becom...
4124,5.0,A3VWWQT4XDSBGQ,B00000K4AX,love half life must buy big fpshooter fan ss l...,awesom,1999-11-10,love half life must buy big fpshooter fan ss l...
5861,5.0,A1QA8K3LD9K892,B000021Y5F,william made game hard core arcad gamer collec...,cool artifact,1999-11-10,william made game hard core arcad gamer collec...
5993,4.0,AMGJMFJ63DWWH,B000021XYY,game actual scare coupl time wit supernatur ph...,good game way short,1999-11-10,game actual scare coupl time wit supernatur ph...


In [101]:
part_data['overall'].value_counts()

5.0    6627
4.0    1809
3.0     808
1.0     388
2.0     368
Name: overall, dtype: int64

In [103]:
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

In [104]:
corpus = part_data['reviewText'].tolist()

In [106]:
vectorizer = TfidfVectorizer(min_df=20, max_df=0.5)
X = vectorizer.fit_transform(corpus)
print(X.shape)

(10000, 2863)


In [108]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout, Embedding

In [112]:
train_x = X[: int(X.shape[0] * 0.8)]
val_x = X[int(X.shape[0] * 0.8): int(X.shape[0] * 0.9)]
test_x = X[int(X.shape[0] * 0.9) :]
print(f"train_x: {train_x.shape}")
print(f"val_x: {val_x.shape}")
print(f"test_x: {test_x.shape}")

train_x: (8000, 2863)
val_x: (1000, 2863)
test_x: (1000, 2863)


In [119]:
y_encode = {
    1: [1, 0, 0, 0, 0],
    2: [0, 1, 0, 0, 0],
    3: [0, 0, 1, 0, 0],
    4: [0, 0, 0, 1, 0],
    5: [0, 0, 0, 0, 1]
}
part_data['y'] = part_data['overall'].apply(lambda x: y_encode[x])

train_y = np.array(part_data['y'][: int(X.shape[0] * 0.8)].tolist())
val_y = np.array(part_data['y'][int(X.shape[0] * 0.8): int(X.shape[0] * 0.9)].tolist())
test_y = np.array(part_data['y'][int(X.shape[0] * 0.9) :].tolist())

In [120]:
model = Sequential()
model.add(Dense(256, input_dim=train_x.shape[1]))
model.add(Dropout(rate=0.2))
model.add(Dense(128, input_dim=train_x.shape[1]))
model.add(Dropout(rate=0.2))
model.add(Dense(5, activation='softmax'))

model.compile(loss='categorical_crossentropy',optimizer='rmsprop',metrics=['accuracy'])

In [123]:
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=10,
    mode='auto', restore_best_weights=False
)# verbose

In [124]:
model.fit(train_x, train_y,
          validation_data=(val_x, val_y),
          epochs=100,
          batch_size=32,
          verbose=1, 
          callbacks=[callback])
model.evaluate(test_x, test_y)[1]

Train on 8000 samples, validate on 1000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100


0.726