<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Get-Raw-data" data-toc-modified-id="Get-Raw-data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Get Raw data</a></span></li><li><span><a href="#2-Data-Preprocessing" data-toc-modified-id="2-Data-Preprocessing-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>2 Data Preprocessing</a></span><ul class="toc-item"><li><ul class="toc-item"><li><span><a href="#Only-keep-data-in-'2000-01-01'~'2017-12-09'" data-toc-modified-id="Only-keep-data-in-'2000-01-01'~'2017-12-09'-2.0.1"><span class="toc-item-num">2.0.1&nbsp;&nbsp;</span>Only keep data in '2000-01-01'~'2017-12-09'</a></span></li><li><span><a href="#Join-in-Review-data-&amp;-Rating-data-on-'ReviewID'" data-toc-modified-id="Join-in-Review-data-&amp;-Rating-data-on-'ReviewID'-2.0.2"><span class="toc-item-num">2.0.2&nbsp;&nbsp;</span>Join in Review data &amp; Rating data on 'ReviewID'</a></span></li><li><span><a href="#Filter-reviewer-&amp;-product" data-toc-modified-id="Filter-reviewer-&amp;-product-2.0.3"><span class="toc-item-num">2.0.3&nbsp;&nbsp;</span>Filter reviewer &amp; product</a></span></li><li><span><a href="#Save-the-data-to-RES_DIR" data-toc-modified-id="Save-the-data-to-RES_DIR-2.0.4"><span class="toc-item-num">2.0.4&nbsp;&nbsp;</span>Save the data to RES_DIR</a></span></li></ul></li></ul></li></ul></div>

In [1]:
import os
import gzip
import json
import pandas as pd
import numpy as np
import pickle

from config import RAW_DIR, PRE_DIR, RES_DIR
from utils.data_porter import read_from_csv, save_to_csv

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" 

# Get Raw data

In [2]:
df_rating_dir = os.path.join(RAW_DIR, 'Video_Games.csv')
gz_review_dir = os.path.join(RAW_DIR, 'Video_Games_5.json.gz')

In [3]:
rating_data = read_from_csv(df_rating_dir, header=None, names=['ProductID', 'ReviewerID', 'Rating', 'TimeStamp'], )
rating_data['TimeStamp'] = pd.to_datetime(rating_data.TimeStamp, unit='s')
rating_data = rating_data.sort_values(by='TimeStamp')

In [4]:
def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

review_data = getDF(gz_review_dir)
review_data['reviewTime'] = pd.to_datetime(review_data.unixReviewTime, unit='s')
review_data = review_data.sort_values(by='reviewTime')

# 2 Data Preprocessing

### Only keep data in '2000-01-01'~'2017-12-09'

In [5]:
rating_data = rating_data[(rating_data.TimeStamp>='2000-01-01')&(rating_data.TimeStamp<='2017-12-09')]
review_data = review_data[(review_data.reviewTime>='2000-01-01')&(review_data.reviewTime<='2017-12-09')]
rating_data = rating_data.reset_index(drop=True)
review_data = review_data.reset_index(drop=True)

In [6]:
rating_data.head(3)
review_data.head(3)

Unnamed: 0,ProductID,ReviewerID,Rating,TimeStamp
0,B00002NDEJ,A3M033XETXXQKT,2.0,2000-01-01
1,B00002EIWU,A39WJUYXP17PEQ,4.0,2000-01-01
2,B0000296ZD,A2O88ZPQKSOWF0,1.0,2000-01-01


Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image
0,4.0,False,2000-01-01,A261TLAGXR52NH,B00002CF8V,THOR (Global Gamer Reviewer/Previewer),"Take Resident Evil,jack its graphics up alot,s...",D*MN NEAR PERFECT!,946684800,6.0,,
1,4.0,False,2000-01-01,A261TLAGXR52NH,B00002CF96,THOR (Global Gamer Reviewer/Previewer),Half-Life:Opposing Force takes place as your c...,Better than the first?,946684800,,,
2,5.0,False,2000-01-01,A261TLAGXR52NH,B00002CF8U,THOR (Global Gamer Reviewer/Previewer),GTA2 is set in a futuristic city where you try...,Just read it!,946684800,2.0,{'Format:': ' Video Game'},


review_data
- reviewerID - ID of the reviewer, e.g. A2SUAM1J3GNN3B
- asin - ID of the product, e.g. 0000013714
- reviewerName - name of the reviewer
- vote - helpful votes of the review
- style - a disctionary of the product metadata, e.g., "Format" is "Hardcover"
- reviewText - text of the review
- overall - rating of the product
- summary - summary of the review
- unixReviewTime - time of the review (unix time)
- reviewTime - time of the review (raw)
- image - images that users post after they have received the product
 
rating_data
- These datasets include no metadata or reviews, but only (item,user,rating,timestamp) tuples.

### Join in Review data & Rating data on 'ReviewID'

In [7]:
# review_data = review_data.rename(columns={'asin':'ProductID'})
# data = rating_data.merge(review_data, how='outer', on='ProductID')
# data = data.reset_index(drop=True)
# data.head(5)

### Filter reviewer & product 

In [8]:
def review_filters(data, filtered_var, base_var, threshold):
    '''
    only keep the values of filter_var that meets:
    filter_var[base_var].sum() >= threshold
    '''
    count_df = data[[filtered_var, base_var]].groupby(filtered_var).count()
    valid_id = count_df[count_df[base_var]>=threshold].index.tolist()
    data = data[data[filtered_var].isin(valid_id)]
    return data

In [9]:
before_filtering = len(rating_data)

data = review_filters(rating_data, 'ReviewerID', 'TimeStamp', 5) 
# only keep the reviewer that has more than 5 ratings
data = review_filters(rating_data, 'ProductID', 'TimeStamp', 20) 
# only keep the product that has more than 20 ratings


data = data.reset_index(drop=True)
after_filtering = len(data)
data.head(5)

after_filtering/before_filtering

Unnamed: 0,ProductID,ReviewerID,Rating,TimeStamp
0,B0000296ZD,A2O88ZPQKSOWF0,1.0,2000-01-01
1,B00000K4MC,AMENNPIINM03J,5.0,2000-01-01
2,B00002S6CC,A11D7B5QADZEMT,1.0,2000-01-01
3,B00002S6CC,A1S37U1J8H6TN5,3.0,2000-01-01
4,B00002CF96,A261TLAGXR52NH,4.0,2000-01-01


0.8962778968000917

### Save the data to RES_DIR

In [10]:
with open(os.path.join(PRE_DIR, 'rating_data.pkl'), 'wb') as f:
    pickle.dump(data, f)