In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
from warnings import simplefilter
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score

# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

In [17]:
#1 
data = pd.read_csv('features.csv', index_col = 'match_id')
data_test = pd.read_csv('features_test.csv', index_col = 'match_id')
y = data.radiant_win

useless_features = ['start_time']
for feature in data:
    if feature not in data_test: 
        useless_features.append(feature)    
print(*useless_features, sep = '\n')

data = data.drop(useless_features, axis = 1)

start_time
duration
radiant_win
tower_status_radiant
tower_status_dire
barracks_status_radiant
barracks_status_dire


In [3]:
#2
lenght = data.shape[0]

for feature in data:
    if lenght - data[feature].count() != 0:
        print(':'.join([feature, str(lenght - data[feature].count())]))

first_blood_time:19553
first_blood_team:19553
first_blood_player1:19553
first_blood_player2:43987
radiant_bottle_time:15691
radiant_courier_time:692
radiant_flying_courier_time:27479
radiant_first_ward_time:1836
dire_bottle_time:16143
dire_courier_time:676
dire_flying_courier_time:26098
dire_first_ward_time:1826


In [4]:
#3
data = data.fillna(0)
data_test = data_test.fillna(0)

In [5]:
#4
print('radiant_win')

radiant_win


In [6]:
#5 GradiaentBoosting

Kfold = KFold(n_splits = 5, shuffle = True, random_state = 42)

estimators = [10, 20, 30]

X = data

for n in estimators:
    start_time = datetime.now()
    
    model = GradientBoostingClassifier(n_estimators = n, random_state = 42 )
    score = cross_val_score(model, X, y, cv = Kfold, scoring = 'roc_auc').mean()
    
    print(n, score, datetime.now() - start_time, sep = ": ")


10: 0.6648506879750012: 0:00:34.280285
20: 0.6824618768044435: 0:01:06.519922
30: 0.6900064710388155: 0:02:04.130266


In [8]:
#1 LogisticRegression

scaler = StandardScaler()

X_train = scaler.fit_transform(X)

Kfold = KFold(n_splits = 5, shuffle = True, random_state = 42)

for i in range(-5, 6):
    C = 10.0 ** i
    
    model = LogisticRegression(C = C, random_state = 42)
    start_time = datetime.now()
    score = cross_val_score(model, X_train, y, cv = Kfold, scoring = 'roc_auc').mean()
    print(C, score, datetime.now() - start_time, sep = ': ')

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


1e-05: 0.6951279316988187: 0:00:03.352676
0.0001: 0.711244122262053: 0:00:05.870301
0.001: 0.7162450543465029: 0:00:09.248632
0.01: 0.7164357706829788: 0:00:11.214087
0.1: 0.7164100547748318: 0:00:11.271984
1.0: 0.7164079384372608: 0:00:12.613465
10.0: 0.7164073770138559: 0:00:16.065362
100.0: 0.7164072520277948: 0:00:13.187008
1000.0: 0.7164072372000616: 0:00:11.348722
10000.0: 0.7164072266072526: 0:00:11.644851
100000.0: 0.7164072266072526: 0:00:11.614683


In [9]:
#2
categorial_features = ['lobby_type', 'r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero',
                      'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero']

X = data.drop(categorial_features, axis = 1)
X_train = scaler.fit_transform(X)

for i in range(-5, 6):
    C = 10.0 ** i
    
    model = LogisticRegression(C = C, random_state = 42)
    start_time = datetime.now()
    score = cross_val_score(model, X_train, y, cv = Kfold, scoring = 'roc_auc').mean()
    
    print(C, score, datetime.now() - start_time, sep = ': ')

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


1e-05: 0.695064532999037: 0:00:02.838029
0.0001: 0.7112201334097993: 0:00:04.711490
0.001: 0.7162550133772639: 0:00:07.665262
0.01: 0.7164424484366461: 0:00:10.395201
0.1: 0.7164166513037841: 0:00:11.125501
1.0: 0.7164140413807418: 0:00:11.184084
10.0: 0.7164137680368765: 0:00:10.683429
100.0: 0.7164137341365804: 0:00:10.753232
1000.0: 0.7164137298947635: 0:00:10.566517
10000.0: 0.7164137362495984: 0:00:11.255086
100000.0: 0.7164137362495984: 0:00:10.855950


In [10]:
#3 
unique_heroes = pd.unique(data[['r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero',
                      'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero']].values.ravel())
N = max(unique_heroes)
print(len(unique_heroes), 'уникальных героев')

108 уникальных героев


In [11]:
#4 
X_pick = np.zeros((data.shape[0], N))

for i, match_id in enumerate(data.index):
    for p in range(5):
        X_pick[i, data.loc[match_id, 'r%d_hero' % (p+1)]-1] = 1
        X_pick[i, data.loc[match_id, 'd%d_hero' % (p+1)]-1] = -1

X_pick = pd.DataFrame(X_pick, index = data.index, columns = [f'hero_{i}' for i in range(N)])
X = pd.concat([X, X_pick], axis = 1)


In [12]:
#5
X_train = scaler.fit_transform(X)

for i in range(-5, 6):
    C = 10.0 ** i
    
    model = LogisticRegression(C = C, random_state = 42)
    start_time = datetime.now()
    score = cross_val_score(model, X, y, cv = Kfold, scoring = 'roc_auc').mean()
    
    print(C, score, datetime.now() - start_time, sep = ': ')

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


1e-05: 0.7177854597962655: 0:00:57.009688
0.0001: 0.7283867631881902: 0:01:46.187295
0.001: 0.7459061442504172: 0:03:01.618360
0.01: 0.7515231350137639: 0:04:23.855758
0.1: 0.751791646013773: 0:04:28.558336
1.0: 0.7517460096439752: 0:03:59.938228
10.0: 0.7517160423022433: 0:04:34.982120
100.0: 0.7517749279713775: 0:03:54.751677
1000.0: 0.7517151961669672: 0:04:24.191843
10000.0: 0.7517603492213729: 0:04:16.597228
100000.0: 0.751750060653966: 0:03:59.722225


In [13]:
#Preprocessing test data
X_test = data_test
X_test = X_test.drop('start_time', axis = 1)
X_test = X_test.drop(categorial_features, axis = 1)

unique_heroes = pd.unique(data_test[['r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero',
                      'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero']].values.ravel())
N = max(unique_heroes)

X_pick = np.zeros((data_test.shape[0], N))

for i, match_id in enumerate(data_test.index):
    for p in range(5):
        X_pick[i, data_test.loc[match_id, 'r%d_hero' % (p+1)]-1] = 1
        X_pick[i, data_test.loc[match_id, 'd%d_hero' % (p+1)]-1] = -1

X_pick = pd.DataFrame(X_pick, index = data_test.index, columns = [f'hero_{i}' for i in range(N)])
X_test = pd.concat([X_test, X_pick], axis = 1)

X_test = scaler.fit_transform(X_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [20]:
#6
model = LogisticRegression(C = 0.1, random_state = 42)
model.fit(X, y)

preds = pd.Series(model.predict_proba(X_test)[:, 1])
preds.describe()


count    17177.000000
mean         0.500382
std          0.303883
min          0.000046
25%          0.229049
50%          0.499540
75%          0.770704
max          0.999923
dtype: float64

In [None]:
#Some extra possible preprocessing

data = pd.read_csv('features.csv', index_col = 'match_id')
data_test = pd.read_csv('features_test.csv', index_col = 'match_id')

#Possible Preprocessing test

data_test.d1_kills = data_test.d1_kills + data_test.d2_kills + data_test.d3_kills + data_test.d4_kills + data_test.d5_kills
data_test.r1_kills = data_test.r1_kills + data_test.r2_kills + data_test.r3_kills + data_test.r4_kills + data_test.r5_kills

data_test.r1_lh = data_test.r1_lh + data_test.r2_lh + data_test.r3_lh + data_test.r4_lh + data_test.r5_lh
data_test.d2_lh = data_test.d1_lh + data_test.d2_lh + data_test.d3_lh + data_test.d4_lh + data_test.d5_lh


data_test.radiant_boots_count = data_test.radiant_boots_count + data_test.radiant_tpscroll_count + data_test.radiant_ward_observer_count + data_test.radiant_ward_sentry_count
data_test.dire_boots_count = data_test.dire_boots_count + data_test.dire_tpscroll_count + data_test.dire_ward_observer_count + data_test.dire_ward_sentry_count

useless_features = ['start_time', 'lobby_type', 'first_blood_player1', 'first_blood_player2', 'd2_kills', 'd3_kills', 'd4_kills', 'd5_kills', 'd2_lh', 'd3_lh', 'd4_lh', 'd5_lh', 'r2_kills', 'r3_kills', 'r4_kills', 'r5_kills', 'r2_lh', 'r3_lh', 'r4_lh', 'r5_lh',
                   'radiant_tpscroll_count', 'radiant_ward_observer_count', 'radiant_ward_sentry_count', 'dire_tpscroll_count', 'dire_ward_observer_count', 'dire_ward_sentry_count']

#train


data.d1_kills = data.d1_kills + data.d2_kills + data.d3_kills + data.d4_kills + data.d5_kills
data.r1_kills = data.r1_kills + data.r2_kills + data.r3_kills + data.r4_kills + data.r5_kills

data.r1_lh = data.r1_lh + data.r2_lh + data.r3_lh + data.r4_lh + data.r5_lh
data.d2_lh = data.d1_lh + data.d2_lh + data.d3_lh + data.d4_lh + data.d5_lh


data.radiant_boots_count = data.radiant_boots_count + data.radiant_tpscroll_count + data.radiant_ward_observer_count + data.radiant_ward_sentry_count
data.dire_boots_count = data.dire_boots_count + data.dire_tpscroll_count + data.dire_ward_observer_count + data.dire_ward_sentry_count

useless_features = ['start_time', 'lobby_type', 'first_blood_player1', 'first_blood_player2', 'd2_kills', 'd3_kills', 'd4_kills', 'd5_kills', 'd2_lh', 'd3_lh', 'd4_lh', 'd5_lh', 'r2_kills', 'r3_kills', 'r4_kills', 'r5_kills', 'r2_lh', 'r3_lh', 'r4_lh', 'r5_lh',
                   'radiant_tpscroll_count', 'radiant_ward_observer_count', 'radiant_ward_sentry_count', 'dire_tpscroll_count', 'dire_ward_observer_count', 'dire_ward_sentry_count']
for feature in data:
    if feature not in data_test:
        useless_features.append(feature)

y = data.radiant_win
X = data.drop(useless_features, axis = 1)
X_test = data_test.drop(useless_features, axis = 1)
