In [1]:
# Importing all the libraries 
import numpy as np
from scipy import stats
from sklearn.preprocessing import StandardScaler
import datetime
from sklearn.model_selection import KFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import numpy as np
import pandas as pd
from joblib import dump, load
import matplotlib.pyplot as plt
from statsmodels.graphics.mosaicplot import mosaic
import seaborn as sns


In [2]:
class CList:
    def __init__(self, start, stop, step):
        self.start = start
        self.stop = stop
        self.step = step
    def __iter__(self):
        self.c = self.start
        return self
    def __next__(self):
        if self.c <= self.stop:
            x = self.c
            self.c += self.step
            return x
        else:
            raise StopIteration

In [3]:
# Reading the Dataset
data = pd.read_csv('features.csv', index_col = 'match_id')

In [4]:
data.head()

Unnamed: 0_level_0,start_time,lobby_type,r1_hero,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,...,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time,duration,radiant_win,tower_status_radiant,tower_status_dire,barracks_status_radiant,barracks_status_dire
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1430198770,7,11,5,2098,1489,20,0,0,7,...,4,2,2,-52.0,2874,1,1796,0,51,0
1,1430220345,0,42,4,1188,1033,9,0,1,12,...,4,3,1,-5.0,2463,1,1974,0,63,1
2,1430227081,7,33,4,1319,1270,22,0,0,12,...,4,3,1,13.0,2130,0,0,1830,0,63
3,1430263531,1,29,4,1779,1056,14,0,0,5,...,4,2,0,27.0,1459,0,1920,2047,50,63
4,1430282290,7,13,4,1431,1090,8,1,0,8,...,3,3,0,-16.0,2449,0,4,1974,3,63


In [5]:
# Removing all the unnecessary columns from the dataset
columns_to_drop = ['tower_status_radiant', 'duration', 'tower_status_dire', 'barracks_status_radiant', 'barracks_status_dire']

data = data.drop(columns_to_drop, axis=1)

In [6]:
values_full = data.count()
rows_quantity = data.shape[0]
columns_with_empty = []
for column, length in values_full.iteritems():
    if rows_quantity - length > 0:
        columns_with_empty.append(column)
columns_with_empty

['first_blood_time',
 'first_blood_team',
 'first_blood_player1',
 'first_blood_player2',
 'radiant_bottle_time',
 'radiant_courier_time',
 'radiant_flying_courier_time',
 'radiant_first_ward_time',
 'dire_bottle_time',
 'dire_courier_time',
 'dire_flying_courier_time',
 'dire_first_ward_time']

In [7]:
# These columns do not always occur, so missing values are filled with 0.
value = 0
data[columns_with_empty] = data[columns_with_empty].fillna(value=value, inplace=False)

In [8]:
y = data[["radiant_win"]].values.ravel()
X = data.drop(["radiant_win"], axis=1).values

In [9]:
x_data = data.drop(["radiant_win"], axis=1)

In [10]:
# Data must be scaled as logistic regression is sensitive to outliers
def scale_data(X_train):
    scaler = StandardScaler()
    scaler.fit(X_train)
    StandardScaler(copy=True, with_mean=True, with_std=True)
    X_train = scaler.transform(X_train)
    return X_train

X_scaled = scale_data(X)
X_scaled

array([[-2.54436416,  1.54068827, -1.24422828, ..., -0.55115386,
         1.84600409, -1.12149424],
       [-2.54045236, -0.92779756, -0.29225805, ...,  0.67817009,
         0.43778816,  0.04394713],
       [-2.53923104,  1.54068827, -0.5686365 , ...,  0.67817009,
         0.43778816,  0.49028637],
       ...,
       [ 1.09874571, -0.57515673,  1.42743012, ...,  0.67817009,
         0.43778816, -0.20401912],
       [ 1.09895204, -0.57515673,  1.48884755, ...,  0.67817009,
         0.43778816, -0.87352799],
       [ 1.1026479 ,  1.54068827, -0.04658831, ..., -0.55115386,
        -0.97042777, -0.79913812]])

In [11]:

def process_lr(kf, X, y, c_list_iterator):
    qualities = []
    for i, c in enumerate(c_list_iterator):
        start_time = datetime.datetime.now()
        clf = LogisticRegression(penalty='l2', C=c, solver="lbfgs", max_iter=200)
        qualities_c = []
        for train_index, test_index in kf.split(X):
            x_train, x_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            clf = clf.fit(x_train, y_train)
            predictions = clf.predict_proba(x_test)[:, 1]
            qualities_c.append(roc_auc_score(y_test, predictions))

        mean_quality = round(np.mean(qualities_c), 5)
        qualities.append(mean_quality)

        # print("C: " + str(c))
        print('Time:', datetime.datetime.now() - start_time)
        # print("Quality AUC-ROC: " + str(mean_quality))
        # print("-")

    return qualities

def get_best_quality_c(qualities, start, step):
    max_q = max(qualities)
    best_c = (qualities.index(max_q) * step) + start
    return max_q, best_c

In [12]:

kf = KFold(n_splits=5, shuffle=True)
start, stop, step = 1, 4, 0.2
c_list_iterator = iter(CList(start, stop, step))

qualities = process_lr(kf, X_scaled, y, c_list_iterator)
max_q, best_c = get_best_quality_c(qualities, start, step)

print("Max quality: ", max_q)
print("Max quality C: ", best_c)


Time: 0:00:03.968966
Time: 0:00:04.111005
Time: 0:00:04.741319
Time: 0:00:04.758272
Time: 0:00:04.284540
Time: 0:00:04.449098
Time: 0:00:04.452091
Time: 0:00:04.307480
Time: 0:00:04.744310
Time: 0:00:04.849600
Time: 0:00:05.015493
Time: 0:00:04.415886
Time: 0:00:04.294513
Time: 0:00:04.271574
Time: 0:00:04.038199
Max quality:  0.71655
Max quality C:  1.2


In [13]:
# Removing columns with categorical features
x_data_cat_dropped = x_data.drop(['lobby_type'], axis=1)
for i in range(1, 6):
    x_data_cat_dropped = x_data_cat_dropped.drop(["r" + str(i) + "_hero"], axis=1)
    x_data_cat_dropped = x_data_cat_dropped.drop(["d" + str(i) + "_hero"], axis=1)

In [14]:
# Scale data, cross-validate with parameter selection
X_cat_dropped_scaled = scale_data(x_data_cat_dropped.values)

qualities = process_lr(kf, X_cat_dropped_scaled, y, c_list_iterator)
max_q, best_c = get_best_quality_c(qualities, start, step)

print("Max quality: ", max_q)
print("Max quality C: ", best_c)

Time: 0:00:03.782884
Time: 0:00:04.268581
Time: 0:00:04.550827
Time: 0:00:04.094049
Time: 0:00:03.907546
Time: 0:00:03.810808
Time: 0:00:03.958412
Time: 0:00:03.738998
Time: 0:00:03.822775
Time: 0:00:04.159874
Time: 0:00:04.030219
Time: 0:00:04.401228
Time: 0:00:04.184805
Time: 0:00:04.077094
Time: 0:00:04.245644
Max quality:  0.71662
Max quality C:  1.4


In [15]:
hero_columns = []
for i in range(1, 6):
    hero_columns.append("r" + str(i) + "_hero")
    hero_columns.append("d" + str(i) + "_hero")
unic_ids = pd.unique(data[hero_columns].values.ravel())
heroes_quantity = len(unic_ids)
print("Unic heroes:", sorted(unic_ids))
print("Unic heroes id quantity:", heroes_quantity)

Unic heroes: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 109, 110, 112]
Unic heroes id quantity: 108


In [16]:
def process_categorial(heroes_quantity, x_data, X):
    X_pick = np.zeros((x_data.shape[0], heroes_quantity + 4))

    for i, match_id in enumerate(x_data.index):
        for p in range(5):
            X_pick[i, x_data.loc[match_id, 'r%d_hero' % (p + 1)] - 1] = 1
            X_pick[i, x_data.loc[match_id, 'd%d_hero' % (p + 1)] - 1] = -1
    return np.hstack((X, X_pick))

X = process_categorial(heroes_quantity, x_data, X_scaled)

In [17]:
qualities = process_lr(kf, X, y, c_list_iterator)
max_q, best_c = get_best_quality_c(qualities, start, step)

print("Max quality: ", max_q)
print("Max quality C: ", best_c)

Time: 0:00:18.475581
Time: 0:00:16.677390
Time: 0:00:16.862894
Time: 0:00:18.692002
Time: 0:00:19.131827
Time: 0:00:19.006265
Time: 0:00:19.625570
Time: 0:00:19.372990
Time: 0:00:19.827351
Time: 0:00:19.911739
Time: 0:00:16.688361
Time: 0:00:17.355577
Time: 0:00:17.243874
Time: 0:00:17.667742
Time: 0:00:18.949313
Max quality:  0.75215
Max quality C:  2.8


In [18]:
lr = LogisticRegression(penalty='l2', C=best_c, solver="lbfgs", max_iter=200)
for train_index, test_index in kf.split(X):
        x_train, x_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        lr = lr.fit(x_train, y_train)
dump(lr, "lr.joblib")

['lr.joblib']

In [19]:
def prepare_test_data_lr():
    # Reading the test dataset
    data = pd.read_csv('./features_test.csv', index_col='match_id')
   # Fill in missing calues
    data[columns_with_empty] = data[columns_with_empty].fillna(value=0, inplace=False)
    # Scale data
    X_scaled = scale_data(data.values)
    X = process_categorial(108, data, X_scaled)
    return X, data

def test_lr(X, data):
    # Load the Logistic regression model which we saved earlier
    clf = load("lr.joblib")
    predictions_df = pd.DataFrame(clf.predict_proba(X))
    predictions_df.columns = ['dire_win', 'radiant_win']
    predictions_df.index = data.index
    print(predictions_df.head())
    print("Mean:\n", predictions_df.mean())
    print("Max prob. radiant win: ", max(predictions_df['radiant_win']))  
    print("Min prob. radiant win: ", min(predictions_df['radiant_win']))
    result = predictions_df.drop(['dire_win'], axis=1)
    result.to_csv('./result.csv')

In [20]:
X, data = prepare_test_data_lr()
test_lr(X, data)

          dire_win  radiant_win
match_id                       
6         0.191766     0.808234
7         0.240165     0.759835
10        0.812790     0.187210
13        0.134908     0.865092
16        0.772465     0.227535
Mean:
 dire_win       0.482334
radiant_win    0.517666
dtype: float64
Max prob. radiant win:  0.9966908904528313
Min prob. radiant win:  0.007824167662299122
