### Imports

In [40]:
from sklearn.metrics import confusion_matrix

In [41]:
from fastai.imports import *
from fastai.structured import *
import numpy as np
import pandas as pd
df = pd.read_csv('data/transform_hot_full.csv', index_col=0)
#df = pd.read_csv('data/transform_hot_mini.csv', index_col=0)

### Helper functions

In [42]:
def split_random_masks(a, n1, n2):
    msk = np.random.rand(len(a)) < (n1 + n2)
    msk1 = ((np.random.rand(len(a)) < n1/(n1+n2)) & msk)
    msk2 = (~msk1 & msk)
    return ~msk, msk1, msk2

def rmse(x,y):
    return math.sqrt(((x-y)**2).mean())

def print_score(m):
    res = ["rmse(trn):",rmse(m.predict(X_train), y_train), " rmse(val):",rmse(m.predict(X_valid), y_valid),
                " scr(trn):",m.score(X_train, y_train), " scr(val):",m.score(X_valid, y_valid)]
    if hasattr(m, 'oob_score_'): res.append(m.oob_score_)
    print(res)

In [43]:
m_names = pd.read_csv("exports/game_map_cats.csv",header=None, index_col=0, squeeze=True).to_dict()
map_names = [n for x, n, in m_names.items()]
map_names_swapped = dict([(value, key) for key, value in m_names.items()])

h_names= pd.read_csv("exports/hero_name_cats.csv",header=None, index_col=0, squeeze=True).to_dict()
hero_names = [n for x, n, in h_names.items()]

In [44]:
mns = [x[1] for x in list(enumerate(map_names))]
yhns = ['yours_'+x[1] for x in list(enumerate(hero_names))]
thns = ['theirs_'+x[1] for x in list(enumerate(hero_names))]
col_names =mns+yhns+thns

def hero_one_full(game_map = '', winners = [], losers = []):
    new_row = pd.Series(index = col_names, dtype = 'boolean')
    new_row[:] = False
    for x in hero_names:
        if x in winners:
            new_row['yours_'+x] = True
        if x in losers:
            new_row['theirs_'+x] = True
    new_row[game_map] = True
    return new_row

def hero_hot_mini(game_map = '', winners = [], losers = []):
    new_row = pd.Series(index = hero_names, dtype = 'int8')
    for x in hero_names:
        if x in winners:
            new_row[x] = 2
        if x in losers:
            new_row[x] = 1
    game_map_index = int(map_names_swapped[game_map])
    new_row = new_row.append(pd.Series(game_map_index, index = ['game_map'], dtype = 'int8'))
    return new_row

def hero_one_hot(game_map = '', winners = [], losers = []):
    return hero_one_full(game_map = game_map, winners = winners, losers = losers)

In [45]:
from IPython.display import display_html
def display_side_by_side(*args):
    html_str=''
    for df in args:
        html_str+=df.to_html()
    display_html(html_str.replace('table','table style="display:inline"'),raw=True)

### Training

#### Prepare X and Y for train and val sets.

In [46]:
SAMPLE_SIZE = 25000
data1 = df.sample(n = SAMPLE_SIZE, random_state = 1, axis = 0)
df_trn, y_trn, nas = proc_df(data1, 'outcome')
val_ratio = 0.2
tst_ratio = 0.1
#y_trn=y_trn.astype('bool')
#n_trn = int(len(df_trn) * train_required_ratio)
tr_mask, val_mask, test_mask = split_random_masks(df_trn, val_ratio, tst_ratio)
X_train = df_trn[tr_mask].copy()
y_train = y_trn[tr_mask].copy()
X_valid = df_trn[tr_mask].copy()
y_valid = y_trn[tr_mask].copy()
X_test = df_trn[tr_mask].copy()
y_test = y_trn[tr_mask].copy()

print("train: ",X_train.shape, "  val:",X_valid.shape, " tst: ", X_test.shape)
apply_cats(X_valid, X_train)

train:  (17447, 188)   val: (17447, 188)  tst:  (17447, 188)


## Training many models for comparison

In [47]:
#from: https://www.kaggle.com/ldfreeman3/a-data-science-framework-to-achieve-99-accuracy
from sklearn import linear_model, ensemble, gaussian_process, naive_bayes, neighbors, svm, tree, discriminant_analysis
from sklearn import model_selection
from xgboost import XGBClassifier

In [48]:
MLA = [
    #Ensemble Methods
    ensemble.AdaBoostClassifier(),
    ensemble.BaggingClassifier(),
    ensemble.ExtraTreesClassifier(),
    ensemble.GradientBoostingClassifier(),
    ensemble.RandomForestClassifier(),

    #Gaussian Processes
    gaussian_process.GaussianProcessClassifier(),
    
    #GLM
    linear_model.LogisticRegressionCV(),
    linear_model.PassiveAggressiveClassifier(),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(),
    linear_model.Perceptron(),
    
    #Navies Bayes
    naive_bayes.BernoulliNB(),
    naive_bayes.GaussianNB(),
    
    #Nearest Neighbor
    neighbors.KNeighborsClassifier(),
    
    #SVM
    svm.SVC(probability=True),
    svm.NuSVC(probability=True),
    svm.LinearSVC(),
    
    #Trees    
    tree.DecisionTreeClassifier(),
    tree.ExtraTreeClassifier(),
    
    #Discriminant Analysis
    discriminant_analysis.LinearDiscriminantAnalysis(),
    discriminant_analysis.QuadraticDiscriminantAnalysis(),

    
    #xgboost: http://xgboost.readthedocs.io/en/latest/model.html
    XGBClassifier()    
    ]

In [49]:
Target = ['outcome']

In [50]:
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm

cv_split = model_selection.ShuffleSplit(n_splits = 10, test_size = .3, train_size = .6, random_state = 0 ) # run model 10x with 60/30 split intentionally leaving out 10%

#create table to compare MLA metrics
MLA_columns = ['MLA Name', 'MLA Parameters','MLA Train Accuracy Mean', 'MLA Test Accuracy Mean', 'MLA Test Accuracy 3*STD' ,'MLA Time']
MLA_compare = pd.DataFrame(columns = MLA_columns)

#create table to compare MLA predictions
MLA_predict = data1[Target]

#index through MLA and save performance to table
p_bar = tqdm(MLA)
row_index = 0
for alg in p_bar:
    #set name and parameters
    MLA_name = alg.__class__.__name__
    p_bar.set_description(f'Working on "{MLA_name}"')
    MLA_compare.loc[row_index, 'MLA Name'] = MLA_name
    MLA_compare.loc[row_index, 'MLA Parameters'] = str(alg.get_params())
    
    #score model with cross validation: http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html#sklearn.model_selection.cross_validate
    cv_results = model_selection.cross_validate(alg, data1[col_names], data1[Target], cv  = cv_split, return_train_score=True)

    MLA_compare.loc[row_index, 'MLA Time'] = cv_results['fit_time'].mean()
    MLA_compare.loc[row_index, 'MLA Train Accuracy Mean'] = cv_results['train_score'].mean()
    MLA_compare.loc[row_index, 'MLA Test Accuracy Mean'] = cv_results['test_score'].mean()   
    #if this is a non-bias random sample, then +/-3 standard deviations (std) from the mean, should statistically capture 99.7% of the subsets
    MLA_compare.loc[row_index, 'MLA Test Accuracy 3*STD'] = cv_results['test_score'].std()*3   #let's know the worst that can happen!
    

    #save MLA predictions - see section 6 for usage
    alg.fit(data1[col_names], data1[Target])
    MLA_predict[MLA_name] = alg.predict(data1[col_names])
    
    row_index+=1

    
#print and sort table: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.sort_values.html
MLA_compare.sort_values(by = ['MLA Test Accuracy Mean'], ascending = False, inplace = True)
MLA_compare
#MLA_predict

Working on "GaussianProcessClassifier":  23%|██▎       | 5/22 [11:08<37:54, 133.80s/it]


KeyboardInterrupt: 

In [None]:
sns.barplot(x='MLA Test Accuracy Mean', y = 'MLA Name', data = MLA_compare, color = 'm')

#prettify using pyplot: https://matplotlib.org/api/pyplot_api.html
plt.title('Machine Learning Algorithm Accuracy Score \n')
plt.xlabel('Accuracy Score (%)')
plt.ylabel('Algorithm')