In [61]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [243]:
# Load the Steam games table and print its basic dimensions.
df = pd.read_csv("data/steam.csv")
# NOTE(review): this reads the same CSV as `df` -- confirm whether a separate
# requirements file was intended here.
requirements = pd.read_csv("data/steam.csv")
print(df.shape[1])                  # number of columns
print(df.shape)                     # (rows, columns)
print(df['appid'].unique().size)    # number of distinct app ids

18
(27075, 18)
27075


In [264]:
# Save one figure per column of interest under figures/:
#   * categorical columns -> bar chart of category frequencies
#   * numerical columns   -> histogram using the square-root rule for bin count
# Columns that appear in neither list are skipped.
categorical = ['english','required_age','platforms','owners']
numerical = ['positive_ratings','negative_ratings','price','achievements','average_playtime','median_playtime']

# The bin count only depends on the number of rows, so compute it once.
n_bins = int(np.sqrt(df.shape[0]))

for col in df.columns:
    if col not in categorical and col not in numerical:
        continue
    print(col)
    if col in categorical:
        # Series.value_counts replaces the deprecated top-level pd.value_counts.
        df[col].value_counts().plot(kind='bar')
    else:
        df[col].plot.hist(bins=n_bins)
    plt.ylabel('count')
    plt.xlabel(col)
    plt.savefig('figures/{} histogram.png'.format(col), bbox_inches='tight')
    # Clear the current figure so the next column starts from a blank canvas.
    plt.clf()

english
platforms
required_age
achievements
positive_ratings
negative_ratings
average_playtime
median_playtime
owners
price


<Figure size 432x288 with 0 Axes>

In [265]:
# Overlay one density-normalised histogram of `owners` per distinct
# `required_age` value, then save the figure with its caption.
categories = df['required_age'].unique()

for age in categories:
    owners_for_age = df.loc[df['required_age'] == age, 'owners']
    ax = plt.hist(owners_for_age, alpha=0.5, label=age, density=True)

plt.legend()
plt.xlabel('owners')
plt.ylabel('counts')
plt.xticks(rotation=90)

# Caption rendered below the axes.
txt = '\n'.join([
    'Fig.5 The x axis is categorical variable "owners" and y axis is the count.',
    'Legend is the categorical variable required_age for each game.',
    'From the figure we can see that >18 group takes majority in game that has more owners.',
])
plt.figtext(0.5, -0.5, txt, wrap=True, horizontalalignment='center', fontsize=12)
plt.savefig('figures/owners & required_age.png', bbox_inches='tight')
plt.clf()

<Figure size 432x288 with 0 Axes>

In [266]:
# Replace the raw owner ranges with short labels (modifies `df` in place),
# then save a log-scaled histogram of the relabelled column.

# Everything from 10 million owners upwards is pooled into one bucket.
mask = df.owners.isin(['10000000-20000000', '20000000-50000000', '50000000-100000000', '100000000-200000000'])
df.loc[mask, 'owners'] = '10mil and more'

# The remaining ranges are renamed one-to-one.
short_labels = {
    '5000000-10000000': '5mil-10mil',
    '2000000-5000000': '2mil-5mil',
    '1000000-2000000': '1mil-2mil',
    '500000-1000000': '500k-1mil',
    '200000-500000': '200k-500k',
    '100000-200000': '100k-200k',
    '50000-100000': '50k-100k',
    '20000-50000': '20k-50k',
    '0-20000': '20k or less',
}
for raw_range, label in short_labels.items():
    df.loc[df.owners == raw_range, 'owners'] = label

plt.figure(figsize=(12,8))
ax = df['owners'].hist()
ax.set_yscale('log')
plt.xlabel('owners range')
plt.ylabel('number of data scaled by log')
txt = (
    'Fig.1 X is the target categorical variable "owners" and y is the count of each category scaled by log.\n'
    'From the plot, we can see that the number of games decreases when the number of owners decreases'
)
plt.figtext(0.5, -0.005, txt, wrap=True, horizontalalignment='center', fontsize=12)
plt.savefig("figures/owners.png", bbox_inches='tight')
plt.clf()

<Figure size 864x576 with 0 Axes>

In [267]:
# Box plot of price for each owners category, on a log-scaled y axis.
price_and_owners = df[['price','owners']]
ax = price_and_owners.boxplot(by='owners')
ax.set_yscale('log')
plt.xlabel('owners')
plt.ylabel('price')
plt.xticks(rotation=90)
# Blank out the per-axes title.
plt.title('')
txt = (
    'Fig.2 X is the target categorical variable "owners" and y is the price of the game scaled by log.\n'
    'From the plot, we can see that the price range is larger for game that has more owners'
)
plt.figtext(0.5, -0.3, txt, wrap=True, horizontalalignment='center', fontsize=12)
plt.savefig('figures/price & owners.png', bbox_inches='tight')
plt.clf()

<Figure size 432x288 with 0 Axes>

In [71]:
# Print each column name followed by its number of distinct non-null values.
for column in df.columns:
    n_distinct = df[column].nunique()
    print(column, n_distinct)

appid 27075
name 27033
release_date 2619
english 2
developer 17113
publisher 14354
platforms 7
required_age 6
categories 3333
genres 1552
steamspy_tags 6423
achievements 410
positive_ratings 2800
negative_ratings 1492
average_playtime 1345
median_playtime 1312
owners 10
price 282


In [270]:
# Bar chart of the share of rows held by each genre string that occurs at
# least 100 times; shares are relative to all rows counted, not just those shown.
count = df['genres'].value_counts()
tot = count.sum()
majority = count[count >= 100]
x = list(majority.index)
y = [n / tot for n in majority]

plt.bar(x, y)
plt.title('Majority Genres (>100 data points)')
plt.xlabel('genres')
plt.ylabel('percentage')
plt.xticks(rotation=90)
txt = (
    'Fig.3 X is the target categorical variable "genres" and y is the percentage of each category.\n'
    'From the plot, we can see that "Action" and "Indie" style game is the majority'
)
plt.figtext(0.5, -0.8, txt, wrap=True, horizontalalignment='center', fontsize=12)
plt.savefig('figures/Genres.png', bbox_inches='tight')
plt.clf()

<Figure size 432x288 with 0 Axes>

In [271]:
# Pairwise scatter matrix of five numeric columns.
scatter_columns = ['required_age','average_playtime','positive_ratings','negative_ratings','achievements']
# Points are coloured by the second indicator column of the one-hot encoded
# `owners` values (i.e. membership of one owners category vs. all the others).
point_colours = pd.get_dummies(df['owners']).iloc[:, 1]
pd.plotting.scatter_matrix(
    df[scatter_columns],
    figsize=(11, 11),
    c=point_colours,
    marker='o',
    hist_kwds={'bins': 50},
    s=30,
    alpha=.1,
)
txt = """Fig.4 Scatter Matrix of 'required_age','average_playtime','positive_ratings','negative_ratings','achievements'"""
plt.figtext(0.5, -0.01, txt, wrap=True, horizontalalignment='center', fontsize=12)
plt.savefig("figures/scatter matrix.png", bbox_inches='tight')
plt.clf()

<Figure size 792x792 with 0 Axes>

In [172]:
import time
import datetime


def _date_string_to_timestamp(date_string):
    """Convert a 'YYYY-MM-DD' string to seconds since the epoch.

    time.mktime interprets the parsed date as local time, so the returned
    value depends on the timezone of the machine running the notebook.
    """
    parsed = datetime.datetime.strptime(date_string, "%Y-%m-%d")
    return time.mktime(parsed.timetuple())


# Reload the raw data (discarding earlier in-place edits to `df`) and replace
# the release date strings with numeric timestamps.
df = pd.read_csv("data/steam.csv")
df['release_date'] = [_date_string_to_timestamp(raw) for raw in df['release_date']]
print(df['release_date'].head())

0    9.730548e+08
1    9.229428e+08
2    1.051762e+09
3    9.913680e+08
4    9.414324e+08
Name: release_date, dtype: float64


In [213]:
# Pool every owners range from 5 million upwards into a single '>5000000'
# class, then print the resulting class sizes.
top_ranges = [
    '5000000-10000000',
    '10000000-20000000',
    '20000000-50000000',
    '50000000-100000000',
    '100000000-200000000',
]
in_top_range = df['owners'].isin(top_ranges)
df.loc[in_top_range, 'owners'] = '>5000000'
print(df['owners'].value_counts())

0-20000            18596
20000-50000         3059
50000-100000        1695
100000-200000       1386
200000-500000       1272
500000-1000000       513
1000000-2000000      288
2000000-5000000      193
>5000000              73
Name: owners, dtype: int64


In [209]:
# For each listed column, replace every category that occurs exactly once
# with the shared label 'other', then print the new category counts.
reeval = ['developer', 'publisher', 'platforms', 'steamspy_tags','required_age','categories','genres','english']
for col in reeval:
    vc = df[col].value_counts()
    # Categories that appear on a single row only.
    ones = list(vc.index[vc == 1])
    # A vectorised mask replaces the per-row Python loop; it does not depend on
    # the frame having a default 0..n-1 index.
    singleton_rows = df[col].isin(ones)
    # Only assign when something matched, so columns without singletons keep
    # their original dtype (as they did with the row-by-row version).
    if singleton_rows.any():
        df.loc[singleton_rows, col] = 'other'
    print(df[col].value_counts())

other                         13266
Choice of Games                  94
KOEI TECMO GAMES CO., LTD.       72
Ripknot Systems                  62
Laush Dmitriy Sergeevich         51
                              ...  
Anarkis Gaming                    2
Alex Hanson-White                 2
Anton Shatalov                    2
PlayFusion                        2
Weekend Soft                      2
Name: developer, Length: 3848, dtype: int64
other               11047
Big Fish Games        212
Strategy First        136
Ubisoft               111
THQ Nordic             98
                    ...  
Joni Bäckström          2
Lazy Monday Ltd         2
OCP                     2
H-GAME                  2
Flying Fire, LLC        2
Name: publisher, Length: 3308, dtype: int64
windows              18398
windows;mac;linux     4623
windows;mac           3439
windows;linux          610
mac                      3
other                    2
Name: platforms, dtype: int64
other                               42

In [235]:
# Preprocess: hold out a stratified test set, run stratified 5-fold CV on the
# rest, and in every fold fit the feature/target transformers on the training
# part only before applying them to the validation and test parts.
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import train_test_split 
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler, OrdinalEncoder


def _owners_lower_bound(label):
    """Return the numeric lower bound of an owners label such as '0-20000' or '>5000000'."""
    return int(label.lstrip('>').split('-')[0])


# Features: everything except the target and the two identifier columns.
X = df.drop(columns=['owners', 'appid', 'name'])
# Target kept as a one-column DataFrame so it can go through a ColumnTransformer.
y = df[['owners']]

std_ftrs = ['price','positive_ratings','negative_ratings','average_playtime','median_playtime']
# Order the owners classes by the size of the range they describe. Relying on
# value_counts() order only works while class frequency happens to fall
# monotonically with the range, and ties there have no guaranteed order.
ordinal_owners = [sorted(df['owners'].unique(), key=_owners_lower_bound)]
onehot_ftrs = ['developer', 'publisher', 'platforms', 'steamspy_tags','required_age','categories','genres','english']

# NOTE(review): columns in neither list (e.g. release_date, achievements) are
# dropped, because ColumnTransformer defaults to remainder='drop' -- confirm
# that this is intended.
# NOTE(review): newer scikit-learn releases rename OneHotEncoder's `sparse`
# argument to `sparse_output`; adjust when upgrading.
preprocessor = ColumnTransformer(transformers=[('onehot', OneHotEncoder(sparse=False,handle_unknown='ignore'), onehot_ftrs),
                                               ('std', StandardScaler(), std_ftrs)])
clf = Pipeline(steps=[('preprocessor', preprocessor)])
target_transform = ColumnTransformer(transformers=[('ord', OrdinalEncoder(categories = ordinal_owners), ['owners'])])

# Diagnostics are printed only while print_time <= 1, i.e. for the first seed.
print_time = 1
for i in range(1,2):
    random_state = 42 * i
    print("==================test {} ======================".format(i))
    # Hold out 1/27 of the rows as the test set, stratified on the target.
    X_other, X_test, y_other, y_test = train_test_split(X,y,test_size = 1/27,stratify=y, random_state=random_state)
    if print_time <= 1:
        print('test balance:',y_test['owners'].value_counts(normalize=True))

        print(X_other.shape,y_other.shape)
        print('test set:',X_test.shape,y_test.shape)

    kf = StratifiedKFold(n_splits=5,shuffle=True,random_state=random_state)
    for train_index, val_index in kf.split(X_other,y_other):
        # The split yields positional indices, hence iloc.
        X_train = X_other.iloc[train_index]
        y_train = y_other.iloc[train_index]
        X_val = X_other.iloc[val_index]
        y_val = y_other.iloc[val_index]
        # Fit on the training fold only; validation and test are transformed
        # with the statistics and categories learned from training.
        X_train_prep = clf.fit_transform(X_train)
        X_val_prep = clf.transform(X_val)
        X_test_prep = clf.transform(X_test)
        y_train_prep = target_transform.fit_transform(y_train)
        y_val_prep = target_transform.transform(y_val)
        y_test_prep = target_transform.transform(y_test)
        if print_time <= 1:
            print("validation:", val_index)
            print("training:", train_index)
            print('   training set:',X_train_prep.shape, y_train_prep.shape) 
            print('   validation set:',X_val_prep.shape, y_val_prep.shape)
            print('train balance:')
            print(y_train['owners'].value_counts(normalize=True))
            print('val balance:')
            print(y_val['owners'].value_counts(normalize=True))
    print_time += 1

test balance: 0-20000            0.686939
20000-50000        0.112662
50000-100000       0.062812
100000-200000      0.050847
200000-500000      0.046859
500000-1000000     0.018943
1000000-2000000    0.010967
2000000-5000000    0.006979
>5000000           0.002991
Name: owners, dtype: float64
(26072, 15) (26072, 1)
test set: (1003, 15) (1003, 1)
validation: [    0     1     4 ... 26038 26056 26063]
training: [    2     3     5 ... 26069 26070 26071]
   training set: (20857, 10988) (20857, 1)
   validation set: (5215, 10988) (5215, 1)
train balance:
0-20000            0.686868
20000-50000        0.112960
50000-100000       0.062617
100000-200000      0.051206
200000-500000      0.046987
500000-1000000     0.018938
1000000-2000000    0.010596
2000000-5000000    0.007144
>5000000           0.002685
Name: owners, dtype: float64
val balance:
0-20000            0.686673
20000-50000        0.113135
50000-100000       0.062512
100000-200000      0.051198
200000-500000      0.046980
500000-100