In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import string
import sys

default_stdout = sys.stdout
default_stderr = sys.stderr
reload(sys)
sys.stdout = default_stdout
sys.stderr = default_stderr
sys.setdefaultencoding('utf8')

# Clean game data

In [79]:
lines = [line.rstrip('\n') for line in open('./raw/game.csv')]
lines_join = []
s = ""
for line in lines:
    s += line
    s = s.replace('\xe2\x84\xa2', '') # delete '™'
    if line[-1] == "\"" and line[-2] == "]":
        lines_join.append(s + '\n')
        s = ""
        
fatt = open('./raw/game_attribute.txt', 'r')
game_att = fatt.readlines()
fatt.close()
        
fout = open('./clean/game_p.csv', 'w')
fout.writelines(game_att[0] + '\n')
fout.writelines(lines_join)
fout.close()

In [80]:
game = pd.DataFrame.from_csv('./clean/game_p.csv')
print game.shape
print game.dtypes
game.head(15)

(15, 4)
game_name       object
release_date    object
price           object
game_tag        object
dtype: object


Unnamed: 0_level_0,game_name,release_date,price,game_tag
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
730,Counter-Strike: Global Offensive,2012-08-21,$14.99 USD,"['FPS', 'Multiplayer', 'Shooter', 'Action', 'T..."
252950,Rocket League,2015-07-07,$19.99 USD,"['Racing', 'Multiplayer', 'Soccer', 'Sports', ..."
374320,DARK SOULS III,2016-04-11,$59.99 USD,"['Dark Fantasy', 'Difficult', 'Atmospheric', '..."
363680,Battlefleet Gothic: Armada,2016-04-21,$39.99 USD,"['Strategy', 'Warhammer 40K', 'Space', 'RTS', ..."
322330,Don't Starve Together,2016-04-21,$10.04 USD,"['Survival', 'Multiplayer', 'Co-op', 'Adventur..."
433850,H1Z1: King of the Kill,2016-02-17,$19.99 USD,"['Early Access', 'Massively Multiplayer', 'Sur..."
271590,Grand Theft Auto V,2015-04-14,$44.99 USD,"['Open World', 'Action', 'Multiplayer', 'First..."
220200,Kerbal Space Program,2015-04-27,$39.99 USD,"['Space', 'Simulation', 'Sandbox', 'Physics', ..."
230410,Warframe,2013-03-25,Free to Play,"['Free to Play', 'Action', 'Co-op', 'Third-Per..."
427520,Factorio,2016-02-25,$20.00 USD,"['Early Access', 'Base-Building', 'Resource Ma..."


# Clean review data

In [2]:
reviews_df = pd.DataFrame.from_csv('./raw/review.csv')
reviews_df.polarity = reviews_df.polarity.astype(int)
reviews_df.hour_of_gameplay = reviews_df.hour_of_gameplay.str.replace(',', '').astype(float)
reviews_df.head()

Unnamed: 0_level_0,hour_of_gameplay,content_review,helpful_vote,total_vote,funny_vote,number_comment,polarity
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
730,312.4,Every team consists of 5 people:1.You2.Russian...,12407,13529,15281,234,1
730,1117.5,"Grades, ruined.Social life, ruined.Relationshi...",2122,2295,2771,38,1
730,546.7,"CS:GO is like Roulette, funny until it gets ru...",3340,3657,4058,34,1
730,578.5,Here's what you learn and get with Counter-Str...,3184,3521,4235,69,1
730,178.0,Over 15 maps in casual mode...You will only pl...,1406,1553,1051,48,1


### Split dataset into train/validate/test set

In [3]:
def get_split_df(reviews_df, num_train_reviews, num_val_reviews, num_test_reviews):
    num_reviews = num_train_reviews + num_val_reviews + num_test_reviews
    game_ids = reviews_df.index.unique()

    train_reviews_df = pd.DataFrame()
    val_reviews_df = pd.DataFrame()
    test_reviews_df = pd.DataFrame()
    for i in game_ids:
        game_reviews_df = reviews_df[reviews_df.index == i]
        small_reviews_df = game_reviews_df.sample(min(num_reviews, len(game_reviews_df)))
        reviews_frac = len(small_reviews_df) / float(num_reviews)

        
        train_reviews_df = pd.concat([train_reviews_df, small_reviews_df[ : int(num_train_reviews * reviews_frac)]])
        val_reviews_df = pd.concat([val_reviews_df, small_reviews_df[int(num_train_reviews * reviews_frac) : int((num_train_reviews + num_val_reviews) * reviews_frac)]])
        test_reviews_df = pd.concat([test_reviews_df, small_reviews_df[int((num_train_reviews + num_val_reviews) * reviews_frac) : ]])
    return train_reviews_df, val_reviews_df, test_reviews_df
    

In [4]:
num_train_reviews = 1000 # per game
num_val_reviews = 200 # per game
num_test_reviews = 500 # per game
    
train_reviews_df, val_reviews_df, test_reviews_df = get_split_df(reviews_df, num_train_reviews, num_val_reviews, num_test_reviews)
train_reviews_df.to_csv('./clean/train_reviews.csv')
val_reviews_df.to_csv('./clean/val_reviews.csv')
test_reviews_df.to_csv('./clean/test_reviews.csv')

In [9]:
tmp_df = pd.DataFrame.from_csv('./clean/train_reviews.csv')
tmp_df.describe()

Unnamed: 0,hour_of_gameplay,helpful_vote,total_vote,funny_vote,number_comment,polarity
count,59129.0,59129.0,59129.0,59129.0,59129.0,59129.0
mean,584.975822,519.694465,617.25441,245.658087,15.79839,0.624888
std,1212.945605,1149.474933,1364.55254,791.041344,39.24428,0.484156
min,0.1,0.0,1.0,0.0,0.0,0.0
25%,33.3,1.0,2.0,0.0,0.0,0.0
50%,115.4,11.0,21.0,1.0,1.0,1.0
75%,604.3,514.0,602.0,80.0,14.0,1.0
max,14430.4,12407.0,14475.0,15281.0,430.0,1.0


In [6]:
review = pd.DataFrame.from_csv('./clean/review_small_p.csv', encoding='utf-8')
review.polarity = review.polarity.astype(int)
review.hour_of_gameplay = review.hour_of_gameplay.str.replace(',', '').astype(float)
#review.content_review = review.content_review.astype('str')
review.content_review.apply(str)

print review.dtypes
print len(review)
#review.describe()
review.head(20)

hour_of_gameplay    float64
content_review       object
helpful_vote          int64
total_vote            int64
funny_vote            int64
number_comment        int64
polarity              int64
dtype: object
100


Unnamed: 0_level_0,hour_of_gameplay,content_review,helpful_vote,total_vote,funny_vote,number_comment,polarity
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
730,312.4,Every team consists of 5 people:1.You2.Russian...,12407,13529,15281,234,1
730,1117.5,"Grades, ruined.Social life, ruined.Relationshi...",2122,2295,2771,38,1
730,546.7,"CS:GO is like Roulette, funny until it gets ru...",3340,3657,4058,34,1
730,578.5,Here's what you learn and get with Counter-Str...,3184,3521,4235,69,1
730,178.0,Over 15 maps in casual mode...You will only pl...,1406,1553,1051,48,1
730,1595.7,Start playing at 6:00Play for five minutesLook...,707,765,728,16,1
730,3684.9,You can set the water on fire.You can survive ...,1133,1254,1195,54,1
730,1053.7,1.Plays 100 games to get exp to level 32. wait...,687,763,587,33,1
730,1234.6,"Kill someone with a P90 - ""You're a fuc**** no...",555,611,617,80,1
730,326.8,I spent 400 bucks for nothing but bluesI still...,447,500,391,73,1


In [7]:
review.content_review.map(lambda x: len(x)).mean()

514.88

In [8]:
review.content_review

ID
730    Every team consists of 5 people:1.You2.Russian...
730    Grades, ruined.Social life, ruined.Relationshi...
730    CS:GO is like Roulette, funny until it gets ru...
730    Here's what you learn and get with Counter-Str...
730    Over 15 maps in casual mode...You will only pl...
730    Start playing at 6:00Play for five minutesLook...
730    You can set the water on fire.You can survive ...
730    1.Plays 100 games to get exp to level 32. wait...
730    Kill someone with a P90 - "You're a fuc**** no...
730    I spent 400 bucks for nothing but bluesI still...
730    You play this game.You're apparently either a ...
730    -Opens game-Buys cases-400 blues later-Searchs...
730    Counter-Strike is one of the most popular fran...
730    If your looking for a review on this game you'...
730    Beautiful game. First bullet accuracy of rifle...
730    Don't Pay Hundreds Of Bucks To Schools To Teac...
730    Keep Low Move FastKill First Die LastOne ShotO...
730    *10 yr old teammate s

In [9]:
import string
printable = set(string.printable)
b = filter(lambda x: x in printable, a)
c = b.replace(',', ' ').replace('.', ' ').lower()
print a
print b
print c

NameError: name 'a' is not defined

In [None]:
import re
b = re.sub(r'[^\x00-\x7F]+',' ', a)
b