In [1]:
import numpy as np
import pandas as pd
import prepare as prep
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import explore as ex

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
df=prep.explore_df()

In [3]:
df.head()

Unnamed: 0,id,name,price,msrp,year_published,min_players,max_players,min_playtime,max_playtime,min_age,num_user_ratings,average_user_rating,num_user_complexity_votes,average_learning_complexity,average_strategy_complexity,rank,type,num_distributors
0,TAAifFP590,Root,45.0,60.0,2018.0,2.0,4.0,60.0,90.0,10.0,412,4.059395,11,3.818182,3.636364,1,game,6
1,yqR4PtpO8X,Scythe,54.39,90.0,2016.0,1.0,5.0,90.0,120.0,14.0,763,4.213439,17,3.117647,3.235294,2,game,9
2,5H5JS0KLzK,Wingspan,50.0,60.0,2019.0,1.0,5.0,40.0,70.0,10.0,589,4.162919,12,2.5,2.833333,3,game,11
3,RLlDWHh7hR,Gloomhaven,105.97,140.0,2017.0,1.0,4.0,60.0,150.0,12.0,491,4.351614,10,3.9,3.6,4,game,5
4,fDn9rQjH9O,Terraforming Mars,51.99,69.95,2016.0,1.0,5.0,90.0,120.0,12.0,713,4.146214,10,3.0,3.3,5,game,3


In [4]:
df.drop(columns=['id', 'name'], inplace=True)

In [5]:
for i, r in df.iterrows():
    if r['rank'] <= 100:
        df.loc[i, 'rank'] = 1
    else:
        df.loc[i, 'rank'] = 0

In [6]:
for i, t in df.iterrows():
    if t['type'] == 'game':
        df.loc[i, 'type'] = 0
    elif t['type'] == 'expansion':
        df.loc[i, 'type'] = 1
    else:
        df.loc[i, 'type'] = 2

In [7]:
df.head()

Unnamed: 0,price,msrp,year_published,min_players,max_players,min_playtime,max_playtime,min_age,num_user_ratings,average_user_rating,num_user_complexity_votes,average_learning_complexity,average_strategy_complexity,rank,type,num_distributors
0,45.0,60.0,2018.0,2.0,4.0,60.0,90.0,10.0,412,4.059395,11,3.818182,3.636364,1,0,6
1,54.39,90.0,2016.0,1.0,5.0,90.0,120.0,14.0,763,4.213439,17,3.117647,3.235294,1,0,9
2,50.0,60.0,2019.0,1.0,5.0,40.0,70.0,10.0,589,4.162919,12,2.5,2.833333,1,0,11
3,105.97,140.0,2017.0,1.0,4.0,60.0,150.0,12.0,491,4.351614,10,3.9,3.6,1,0,5
4,51.99,69.95,2016.0,1.0,5.0,90.0,120.0,12.0,713,4.146214,10,3.0,3.3,1,0,3


## Not all features will continue to modeling. Some features are only able to be determined after the game has been made and we want to predict if a game will be in the top 10% before it is put into circulation.

In [8]:
df.drop(columns=['num_user_ratings', 'average_user_rating', 'num_user_complexity_votes', 
                 'average_learning_complexity', 'average_strategy_complexity'], inplace=True)

In [9]:
#year published will be dropped, because you can't go back in time and make a game
df.drop(columns=['year_published'], inplace=True)

In [10]:
df.head()

Unnamed: 0,price,msrp,min_players,max_players,min_playtime,max_playtime,min_age,rank,type,num_distributors
0,45.0,60.0,2.0,4.0,60.0,90.0,10.0,1,0,6
1,54.39,90.0,1.0,5.0,90.0,120.0,14.0,1,0,9
2,50.0,60.0,1.0,5.0,40.0,70.0,10.0,1,0,11
3,105.97,140.0,1.0,4.0,60.0,150.0,12.0,1,0,5
4,51.99,69.95,1.0,5.0,90.0,120.0,12.0,1,0,3


In [11]:
train, val, test=ex.tts(df, stratify='rank')

In [12]:
train.shape, val.shape, test.shape

((716, 10), (179, 10), (100, 10))

In [13]:
X_train=train.drop(columns=['rank'])
y_train=train['rank']

X_val=val.drop(columns=['rank'])
y_val=val['rank']

X_test=test.drop(columns=['rank'])
y_test=test['rank']

In [14]:
X_train.head()

Unnamed: 0,price,msrp,min_players,max_players,min_playtime,max_playtime,min_age,type,num_distributors
386,45.456444,41.49,2.0,4.0,60.0,75.0,10.0,0,0
623,48.0,65.0,1.0,4.0,45.0,90.0,10.0,0,0
781,235.4,89.99,3.0,5.0,120.0,120.0,14.0,0,1
105,28.0,19.95,2.0,7.0,45.0,60.0,13.0,0,4
808,43.824,40.0,1.0,4.0,30.0,45.0,8.0,0,0


In [15]:
def modeling_prep():
    df=prep.explore_df()
    for i, r in df.iterrows():
        if r['rank'] <= 100:
            df.loc[i, 'rank'] = 1
        else:
            df.loc[i, 'rank'] = 0
    for i, t in df.iterrows():
        if t['type'] == 'game':
            df.loc[i, 'type'] = 0
        elif t['type'] == 'expansion':
            df.loc[i, 'type'] = 1
        else:
            df.loc[i, 'type'] = 2
    df.drop(columns=['num_user_ratings', 'average_user_rating', 'num_user_complexity_votes', 
                 'average_learning_complexity', 'average_strategy_complexity', 
                    'year_published', 'id', 'name'], inplace=True)
    train, val, test=ex.tts(df, stratify='rank')
    
    return X_train, y_train, X_val, y_val, X_test, y_test

In [16]:
X_train, y_train, X_val, y_val, X_test, y_test=modeling_prep()

In [17]:
X_train.head()

Unnamed: 0,price,msrp,min_players,max_players,min_playtime,max_playtime,min_age,type,num_distributors
386,45.456444,41.49,2.0,4.0,60.0,75.0,10.0,0,0
623,48.0,65.0,1.0,4.0,45.0,90.0,10.0,0,0
781,235.4,89.99,3.0,5.0,120.0,120.0,14.0,0,1
105,28.0,19.95,2.0,7.0,45.0,60.0,13.0,0,4
808,43.824,40.0,1.0,4.0,30.0,45.0,8.0,0,0


## Baseline

In [18]:
y_train.mode()

0    0
Name: rank, dtype: int64

In [19]:
train['baseline']=0
accuracy_score(y_train, train['baseline'])

0.8938547486033519

In [20]:
# baseline is 89.39%

In [21]:
train.drop(columns='baseline', inplace=True)

## Logistic Regression

In [22]:
logit = LogisticRegression(C=.5, random_state=8675309, intercept_scaling=1, solver='lbfgs')
logit.fit(X_train, y_train)
in_sample=logit.score(X_train,y_train)
out_of_sample=logit.score(X_val, y_val)

In [23]:
in_sample, out_of_sample

(0.8910614525139665, 0.9217877094972067)

## KNN

In [24]:
results=[]
for n in range(2, 10):
    knn= KNeighborsClassifier(n_neighbors=n, weights='uniform')
    knn.fit(X_train,y_train)
    in_sample= knn.score(X_train, y_train)
    out_of_sample= knn.score(X_val, y_val)
    output={
        'model': 'KNeighborsClassifier',
        'train_accuracy': in_sample,
        'validate_accuracy': out_of_sample,
        'KNN': n
    }
    results.append(output)
results=pd.DataFrame(data=results)
results['difference']=results['train_accuracy']-results['validate_accuracy']

In [25]:
results.sort_values('difference', ascending=False)

Unnamed: 0,model,train_accuracy,validate_accuracy,KNN,difference
1,KNeighborsClassifier,0.903631,0.882682,3,0.02095
3,KNeighborsClassifier,0.898045,0.899441,5,-0.001397
0,KNeighborsClassifier,0.906425,0.916201,2,-0.009777
2,KNeighborsClassifier,0.899441,0.910615,4,-0.011173
7,KNeighborsClassifier,0.893855,0.916201,9,-0.022346
5,KNeighborsClassifier,0.892458,0.916201,7,-0.023743
4,KNeighborsClassifier,0.893855,0.921788,6,-0.027933
6,KNeighborsClassifier,0.893855,0.921788,8,-0.027933


In [26]:
# knn of 5 looks like the best option for knn

## Decision Tree

In [35]:
results=[]
for i in range(1,16):
    for n in range(2,16):
        dtc=DecisionTreeClassifier(max_depth=n, min_samples_leaf=i, random_state=8675309)
        dtc.fit(X_train, y_train)
        in_sample= dtc.score(X_train, y_train)
        out_of_sample= dtc.score(X_val, y_val)
        output={
            'model': 'DecisionTreeClassifier',
            'train_accuracy': in_sample,
            'validate_accuracy': out_of_sample,
            'depth': n,
            'min_samples': i
    }
        results.append(output)

In [36]:
results=pd.DataFrame(results)
results['difference']=results['train_accuracy']-results['validate_accuracy']

In [38]:
results.sort_values('difference', ascending=True).head()

Unnamed: 0,model,train_accuracy,validate_accuracy,depth,min_samples,difference
0,DecisionTreeClassifier,0.893855,0.921788,2,1,-0.027933
42,DecisionTreeClassifier,0.893855,0.921788,2,4,-0.027933
43,DecisionTreeClassifier,0.893855,0.921788,3,4,-0.027933
44,DecisionTreeClassifier,0.893855,0.921788,4,4,-0.027933
170,DecisionTreeClassifier,0.893855,0.921788,4,13,-0.027933


In [None]:
#use depth 2 min_samples 4

## Random Forest

In [41]:
results=[]
for i in range(1,16):
    for n in range(2,16):
            rm= RandomForestClassifier(max_depth= n, min_samples_leaf= i, random_state=8675309)
            rm.fit(X_train, y_train)
            n_sample= rm.score(X_train, y_train)
            out_of_sample= rm.score(X_val, y_val)
            output={
            'model': 'Random Forest Classifier',
            'train_accuracy': in_sample,
            'validate_accuracy': out_of_sample,
            'depth': n,
            'min_samples': i
    }
            results.append(output)

In [42]:
results=pd.DataFrame(results)
results['difference']=results['train_accuracy']-results['validate_accuracy']

In [43]:
results.sort_values('difference', ascending=True).head()

Unnamed: 0,model,train_accuracy,validate_accuracy,depth,min_samples,difference
0,Random Forest Classifier,0.896648,0.921788,2,1,-0.02514
133,Random Forest Classifier,0.896648,0.921788,9,10,-0.02514
134,Random Forest Classifier,0.896648,0.921788,10,10,-0.02514
135,Random Forest Classifier,0.896648,0.921788,11,10,-0.02514
136,Random Forest Classifier,0.896648,0.921788,12,10,-0.02514


In [None]:
# use depth 9, min_samples 10

In [44]:
def models(train, val):
    '''
    this function prints results for models
    '''
    x_train=train.drop(columns=['rank'])
    y_train=train['rank']

    x_val=val.drop(columns=['rank'])
    y_val=val['rank']
    
    results=[]
    logit = LogisticRegression(C=.5, random_state=8675309, intercept_scaling=1, solver='lbfgs')
    logit.fit(x_train, y_train)
    in_sample=logit.score(x_train,y_train)
    out_of_sample=logit.score(x_val, y_val)
    output={
        'model': 'LogisticRegression (lbfgs)',
        'train_accuracy': in_sample,
        'validate_accuracy': out_of_sample
    }
    results.append(output)
    
    logit = LogisticRegression(C=1, random_state=8675309, solver='liblinear')
    logit.fit(x_train, y_train)
    in_sample=logit.score(x_train,y_train)
    out_of_sample=logit.score(x_val, y_val)
    output={
        'model': 'LogisticRegression (liblinear)',
        'train_accuracy': in_sample,
        'validate_accuracy': out_of_sample
    }
    results.append(output)
    
    knn= KNeighborsClassifier(n_neighbors=5, weights='uniform')
    knn.fit(x_train,y_train)
    in_sample= knn.score(x_train, y_train)
    out_of_sample= knn.score(x_val, y_val)
    output={
        'model': 'KNeighborsClassifier',
        'train_accuracy': in_sample,
        'validate_accuracy': out_of_sample
    }
    results.append(output)
    
    dtc=DecisionTreeClassifier(max_depth=2, min_samples_leaf=4, random_state=8675309)
    dtc.fit(x_train, y_train)
    in_sample= dtc.score(x_train, y_train)
    out_of_sample= dtc.score(x_val, y_val)
    output={
        'model': 'DecisionTreeClassifier',
        'train_accuracy': in_sample,
        'validate_accuracy': out_of_sample
    }
    results.append(output)
    
    rm= RandomForestClassifier(max_depth= 9, min_samples_leaf= 10, random_state=8675309)
    rm.fit(x_train, y_train)
    in_sample= rm.score(x_train, y_train)
    out_of_sample= rm.score(x_val, y_val)
    output={
        'model': 'RandomForestClassifier',
        'train_accuracy': in_sample,
        'validate_accuracy': out_of_sample
    }
    results.append(output)
    
    results=pd.DataFrame(data=results)
    results['difference']=results['train_accuracy']-results['validate_accuracy'] 
    return results

In [46]:
results=models(train, val)

In [47]:
results.sort_values('difference', ascending=False)

Unnamed: 0,model,train_accuracy,validate_accuracy,difference
2,KNeighborsClassifier,0.898045,0.899441,-0.001397
3,DecisionTreeClassifier,0.893855,0.921788,-0.027933
4,RandomForestClassifier,0.893855,0.921788,-0.027933
0,LogisticRegression (lbfgs),0.891061,0.921788,-0.030726
1,LogisticRegression (liblinear),0.891061,0.921788,-0.030726


## Use random forest for test data. KNN may have less of a difference, but historically is much more inconsistant than other models

In [54]:
rm= RandomForestClassifier(max_depth= 9, min_samples_leaf= 10, random_state=8675309)
rm.fit(X_train, y_train)
rm.score(X_test, y_test)

0.87

<div class="alert alert-info">
    <header>
    <h2>Modeling Summary</h2>
    </header>
    <dl>
        <dt>With a baseline accuracy of 89.39%, it was hard to beat. My final model did not beat baseline with an accuracy of 87%. This does not suprise me when considering that almost all features were not shown to be significant indicators of a highly rated game or not during the explore phase.</dt>
    </dl>
</div>