### INPUTS

In [1]:
sample_size = 30000 #30000

### Imports

In [63]:
from sklearn.metrics import confusion_matrix

In [3]:
from fastai.imports import *
from fastai.structured import *
import numpy as np
import pandas as pd
df = pd.read_csv('data/transform_hot_full.csv')

### Helper functions

In [13]:
def split_random_masks(a, n1, n2):
    msk = np.random.rand(len(a)) < (n1 + n2)
    msk1 = ((np.random.rand(len(a)) < n1/(n1+n2)) & msk)
    msk2 = (~msk1 & msk)
    return ~msk, msk1, msk2

def rmse(x,y):
    return math.sqrt(((x-y)**2).mean())

def print_score(m):
    res = ["rmse(trn):",rmse(m.predict(X_train), y_train), " rmse(val):",rmse(m.predict(X_valid), y_valid),
                " scr(trn):",m.score(X_train, y_train), " scr(val):",m.score(X_valid, y_valid)]
    if hasattr(m, 'oob_score_'): res.append(m.oob_score_)
    print(res)

In [18]:
def hero_one_hot(game_map = '', winners = [], losers = []):
    new_row = pd.Series(index = col_names, dtype = 'boolean')
    new_row[:] = False
    for x in hero_names:
        if x in winners:
            new_row['yours_'+x] = True
        if x in losers:
            new_row['theirs_'+x] = True
    new_row[game_map] = True
    return new_row

In [None]:
from IPython.display import display_html
def display_side_by_side(*args):
    html_str=''
    for df in args:
        html_str+=df.to_html()
    display_html(html_str.replace('table','table style="display:inline"'),raw=True)

### Training

#### Prepare X and Y for train and val sets.

In [23]:
df_trn, y_trn, nas = proc_df(df, 'outcome')
val_ratio = 0.2
tst_ratio = 0.1
#y_trn=y_trn.astype('bool')
#n_trn = int(len(df_trn) * train_required_ratio)
tr_mask, val_mask, test_mask = split_random_masks(df_trn, val_ratio, tst_ratio)
X_train = df_trn[tr_mask].copy()
y_train = y_trn[tr_mask].copy()
X_valid = df_trn[tr_mask].copy()
y_valid = y_trn[tr_mask].copy()
X_test = df_trn[tr_mask].copy()
y_test = y_trn[tr_mask].copy()

print("train: ",X_train.shape, "  val:",X_valid.shape, " tst: ", X_test.shape)
apply_cats(X_valid, X_train)

train:  (20838, 188)   val: (20838, 188)  tst:  (20838, 188)


#### Training binary logistic regression model

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
model = LogisticRegression(random_state=0).fit(X_valid, y_valid)



In [25]:
game_map = 'Garden of Terror'
game_type = 'UnrankedDraft'
winners=['Auriel', 'Muradin']
losers=['Illidian', 'Abathur']
row = [hero_one_hot(game_map = game_map, winners = winners, losers=losers)]
model.predict(row)

array([ True])

In [26]:
model.score(X_valid, y_valid)

0.5364718303100106

### Optimized functions, find best accuracy

#### Define functions

In [31]:
def check_C():
    j = 0
    C_param_range = [0.75,0.9,1,1.1,1.5,2]
    acc_table = pd.DataFrame(columns = ['C_parameter','Accuracy'])
    acc_table['C_parameter'] = C_param_range
    for i in C_param_range:
        # Apply logistic regression model to training data
        lr = LogisticRegression(solver = 'lbfgs', penalty = 'l2', C = i,random_state = 0)
        lr.fit(X_train, y_train)

        # Predict using model
        y_pred = lr.predict(X_test)

        # Saving accuracy score in table
        acc_table.iloc[j,1] = accuracy_score(y_test,y_pred)
        j += 1
    return acc_table

In [32]:
def check_solver():
    j = 0
    S_param_range = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
    acc_table = pd.DataFrame(columns = ['C_parameter','Accuracy', 'Time'])
    acc_table['C_parameter'] = S_param_range
    result = %timeit -n1 -r1 -o
    for i in S_param_range:

        # Apply logistic regression model to training data
        lr = LogisticRegression(solver = i, penalty = 'l2', C = 1,random_state = 0) #pen:
        lr.fit(X_train, y_train)

        # Predict using model
        y_pred = lr.predict(X_test)

        # Saving accuracy score in table
        acc_table.iloc[j,1] = accuracy_score(y_test,y_pred)
        acc_table.iloc[j,2] = result

        j += 1
    return acc_table

In [33]:
def check_penalties():
    j = 0
    P_param_range = ['l1', 'l2', 'elasticnet', 'none']
    acc_table = pd.DataFrame(columns = ['P_parameter','Accuracy'])
    acc_table['P_parameter'] = P_param_range
    for i in P_param_range:
        # Apply logistic regression model to training data
        lr = LogisticRegression(solver = 'saga', penalty = i, C = 1,random_state = 0, l1_ratio=0.5)
        lr.fit(X_train, y_train)

        # Predict using model
        y_pred = lr.predict(X_test)

        # Saving accuracy score in table
        acc_table.iloc[j,1] = accuracy_score(y_test,y_pred)
        j += 1
    return acc_table

#### Run Optimizers and display results

In [34]:
df1 = check_C()
df2 = check_penalties()
df3 = check_solver()

  "(penalty={})".format(self.penalty))
  "(penalty={})".format(self.penalty))
  "(penalty={})".format(self.penalty))


Unnamed: 0,C_parameter,Accuracy,P_parameter,Time
0,0.75,0.536376,,
1,0.9,0.536472,,
2,1,0.536424,,
3,1.1,0.536472,,
4,1.5,0.53652,,
5,2,0.536616,,
0,,0.537144,l1,
1,,0.536472,l2,
2,,0.537048,elasticnet,
3,,0.536856,none,


In [39]:
display_side_by_side(df1, df2, df3)

Unnamed: 0,C_parameter,Accuracy
0,0.75,0.536376
1,0.9,0.536472
2,1.0,0.536424
3,1.1,0.536472
4,1.5,0.53652
5,2.0,0.536616

Unnamed: 0,P_parameter,Accuracy
0,l1,0.537144
1,l2,0.536472
2,elasticnet,0.537048
3,none,0.536856

Unnamed: 0,C_parameter,Accuracy,Time
0,newton-cg,0.536472,
1,lbfgs,0.536424,
2,liblinear,0.536472,
3,sag,0.536424,
4,saga,0.536472,


In [52]:
final_model = LogisticRegression(solver = 'saga', penalty = 'l1', C = 1.5,random_state = 0)
final_model.fit(X_train, y_train)

f_pred = final_model.predict(X_test)
print(f_pred.mean(),  f_pred.sum())

0.5083021403205682 10592


In [51]:
final_model.score(X_test, y_test)

0.5373836260677608

In [67]:
def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    # Only use the labels that appear in the data
    classes = classes[unique_labels(y_true, y_pred)]
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

In [66]:

titles_options = [("Confusion matrix, without normalization", None),
                  ("Normalized confusion matrix", 'true')]
for title, normalize in titles_options:
    disp = confusion_matrix(final_model, X_test, y_test,
                                 display_labels=hero_names,
                                 cmap=plt.cm.Blues,
                                 normalize=normalize)
    disp.ax_.set_title(title)

    print(title)
    print(disp.confusion_matrix)

plt.show()

TypeError: confusion_matrix() got an unexpected keyword argument 'display_labels'

In [72]:
from sklearn.utils.multiclass import unique_labels
plot_confusion_matrix(y_test, y_pred, classes=df.outcome,
                      title='Confusion matrix, without normalization')

AttributeError: 'DataFrame' object has no attribute 'outcome'

In [61]:
print(sklearn.__version__)

0.21.3
