In [46]:
import numpy as np
import pandas as pd
from pandas import DataFrame as DF, Series as Se
from sklearn.externals import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.externals import joblib

In [47]:
# data file path
FILE_TRAIN = 'data/train.csv'
FILE_TEST = 'data/test.csv'

In [48]:
# combine the train and test set
df_train = pd.read_csv(FILE_TRAIN)
df_test = pd.read_csv(FILE_TEST)
TRAIN_SIZE = df_train.shape[0]
df = df_train[df_test.columns].append(df_test)

In [49]:
df.columns

Index(['id', 'is_free', 'price', 'genres', 'categories', 'tags',
       'purchase_date', 'release_date', 'total_positive_reviews',
       'total_negative_reviews'],
      dtype='object')

In [50]:
# get unique words of 'genres', 'category' and 'tag'
genres = []
category = []
tag = []
for x in df['genres']:
    tmp = x.split(',')
    for y in tmp:
        if y not in genres:
            genres.append(y)
for x in df['categories']:
    tmp = x.split(',')
    for y in tmp:
        if y not in category:
            category.append(y)
for x in df['tags']:
    tmp = x.split(',')
    for y in tmp:
        if y not in tag:
            tag.append(y)

In [51]:
# one-hot encoding
def getDummies(colname ,tlist):
    for x in tlist:
        tmp = []
        for y in df[colname]:
            if x in y:
                tmp.append(1)
            else:
                tmp.append(0)
        if sum(tmp) < 5:
            continue
        df[x] = tmp
    df.drop([colname], axis=1, inplace=True)

In [52]:
# one-hot encoding for 'genres' and 'category'
getDummies('genres', genres)
getDummies('categories', category)

In [53]:
import re
import nltk
from gensim.models import word2vec
from sklearn.feature_extraction.text import TfidfVectorizer

In [54]:
wpt = nltk.WordPunctTokenizer()
sw = nltk.corpus.stopwords.words('english')

In [55]:
df['tags'][:10]

0    Indie,Adventure,Story Rich,Casual,Atmospheric,...
1    Mod,Utilities,RPG,Game Development,Singleplaye...
2    Point & Click,Adventure,Story Rich,Comedy,Indi...
3    Medieval,RPG,Open World,Strategy,Sandbox,Actio...
4    Tower Defense,Co-op,Action,Strategy,Online Co-...
5    RPG,Open World,Survival,Co-op,Fantasy,Online C...
6    Great Soundtrack,Action,Violent,Indie,Top-Down...
7                  VR,Simulation,Utilities,Multiplayer
8    RPG,Turn-Based,Adventure,Fantasy,Great Soundtr...
9    Action,Adventure,VR,Shooter,Puzzle,FPS,First-P...
Name: tags, dtype: object

In [56]:
# preset the valid word set
valid_word = ["a","b","c","d","e","f","g","h","i","j","k","l","m","n",
              "o","p","q","r","s","t","u","v","w","x","y","z","A","B",
              "C","D","E","F","G","H","I","J","K","L","M","N","O","P",
              "Q","R","S","T","U","V","W","X","Y","Z","0","1","2","3",
              "4","5","6","7","8","9", '-',' ',',']
single_lower = ["a","b","c","d","e","f","g","h","i","j","k","l","m","n"
                ,"o","p","q","r","s","t","u","v","w","x","y","z"]

In [57]:
# spliting tag from tags
tag_text = []
for x in df['tags']:
    tmp = ''
    for y in x:
        if y in valid_word:
            tmp = tmp + y
    tmp = tmp.lower()
    tmp = tmp.replace(',',' ')
    tag_text.append(tmp)

In [58]:
# use tf-idf to find most valuable tags
tfidf = TfidfVectorizer(min_df=0.25, max_df=0.75, use_idf=True)
tfidf_matrix = tfidf.fit_transform(tag_text)
tfidf_matrix = tfidf_matrix.toarray()
vocab = tfidf.get_feature_names()
vocab = ['tag_' + x for x in vocab]
len(vocab)

In [61]:
vocab

['tag_action',
 'tag_atmospheric',
 'tag_co',
 'tag_fantasy',
 'tag_fi',
 'tag_great',
 'tag_indie',
 'tag_multiplayer',
 'tag_op',
 'tag_open',
 'tag_person',
 'tag_rich',
 'tag_rpg',
 'tag_sci',
 'tag_soundtrack',
 'tag_story',
 'tag_strategy',
 'tag_world']

In [62]:
df_tfidf = DF(np.round(tfidf_matrix, 2), columns=vocab)
df.drop('tags', axis=1, inplace=True)
df[df_tfidf.columns] = df_tfidf

In [65]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 447 entries, 0 to 89
Data columns (total 67 columns):
id                            447 non-null int64
is_free                       447 non-null bool
price                         447 non-null float64
purchase_date                 443 non-null object
release_date                  447 non-null object
total_positive_reviews        443 non-null float64
total_negative_reviews        443 non-null float64
Adventure                     447 non-null int64
Casual                        447 non-null int64
Indie                         447 non-null int64
RPG                           447 non-null int64
Action                        447 non-null int64
Strategy                      447 non-null int64
Simulation                    447 non-null int64
Sports                        447 non-null int64
Massively Multiplayer         447 non-null int64
Violent                       447 non-null int64
Free to Play                  447 non-null int64
Early A

In [66]:
# processing date data
p_date = df['purchase_date']
r_data = df['release_date']

In [67]:
p_ts = [pd.Timestamp(x) for x in p_date]
r_ts = [pd.Timestamp(x) for x in r_data]
df.drop(['release_date','purchase_date'],axis=1,inplace=True)
df['p_timestamp'] = p_ts
df['r_timestamp'] = r_ts
df['pyear'] = df['p_timestamp'].apply(lambda d : d.year)
df['pmonth'] = df['p_timestamp'].apply(lambda d : d.month)
df['pday'] = df['p_timestamp'].apply(lambda d : d.day)
df['ryear'] = df['r_timestamp'].apply(lambda d : d.year)
df['rmonth'] = df['r_timestamp'].apply(lambda d : d.month)
df['rday'] = df['r_timestamp'].apply(lambda d : d.day)
df['pyear'].fillna(df.pyear.mode(), inplace=True)
df['ryear'].fillna(df.ryear.mode(), inplace=True)
df['pmonth'].fillna(df.pmonth.mode(), inplace=True)
df['rmonth'].fillna(df.rmonth.mode(), inplace=True)
df['pday'].fillna(df.pday.mode(), inplace=True)
df['rday'].fillna(df.rday.mode(), inplace=True)

In [83]:
# adding delta time from release date to purchase date
delta_days = []
for x in range(df.shape[0]):
    d = list(df.p_timestamp)[x] - list(df.r_timestamp)[x]
    delta_days.append(d.days)
df['delta_days'] = delta_days
df['delta_days'].fillna(0, inplace=True)

In [93]:
df.drop(['p_timestamp','r_timestamp'],axis=1,inplace=True)

In [89]:
# adding a feature named 'scores' generated by 'total_positive_reviews' and 'total_negative_reviews'
df['scores'] = df['total_positive_reviews'] / (df['total_positive_reviews'] + df['total_negative_reviews']) * 100

In [90]:
# fill in -1
df.fillna(-1, inplace=True)

In [94]:
# spliting dataset to train and test
time = df_train['playtime_forever']
df_train = df[:TRAIN_SIZE]
df_test = df[TRAIN_SIZE:]
df_train['playtime_forever'] = time

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [95]:
# saving data to csv files
df_train.to_csv('data/fix_train.csv')
df_test.to_csv('data/fix_test.csv')