# <Center> Dota2 Competition </Center>
<img src='https://habrastorage.org/webt/ua/vn/pq/uavnpqfoih4zwwznvxubu33ispy.jpeg'>

1. [Data Preprocessing](#Data-Preprocessing)
2. [Create New Features](#Create-New-Features)

In [63]:
# import libraries
import pandas as pd
import numpy as np
import json
import ujson
%matplotlib inline
from matplotlib import pyplot as plt
import os
import collections
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score

In [33]:
pd.set_option('max_columns', 100)

In [18]:
import os
import pandas as pd

PATH_TO_DATA = 'Data/'

df_train_features = pd.read_csv(os.path.join(PATH_TO_DATA, 
                                             'train_features.csv'), 
                                    index_col='match_id_hash')
df_train_targets = pd.read_csv(os.path.join(PATH_TO_DATA, 
                                            'train_targets.csv'), 
                                   index_col='match_id_hash')

In [22]:
df_test_features = pd.read_csv(os.path.join(PATH_TO_DATA, 'test_features.csv'), 
                                   index_col='match_id_hash')

In [66]:
y = df_train_targets['radiant_win'].values

### Data Preprocessing

In [20]:
def add_new_features(df_features, matches_file):
    
    # Process raw data and add new features
    for match in read_matches(matches_file):
        match_id_hash = match['match_id_hash']

        # Counting ruined towers for both teams
        radiant_tower_kills = 0
        dire_tower_kills = 0
        for objective in match['objectives']:
            if objective['type'] == 'CHAT_MESSAGE_TOWER_KILL':
                if objective['team'] == 2:
                    radiant_tower_kills += 1
                if objective['team'] == 3:
                    dire_tower_kills += 1

        # Write new features
        df_features.loc[match_id_hash, 'radiant_tower_kills'] = radiant_tower_kills
        df_features.loc[match_id_hash, 'dire_tower_kills'] = dire_tower_kills
        df_features.loc[match_id_hash, 'diff_tower_kills'] = radiant_tower_kills - dire_tower_kills
        
        # ... here you can add more features ...
        

In [26]:
# copy the dataframe with features
df_train_features_extended = df_train_features.copy()

# add new features
add_new_features(df_train_features_extended, 
                 os.path.join(PATH_TO_DATA, 
                              'train_matches.jsonl'))

HBox(children=(IntProgress(value=0, max=39675), HTML(value='')))




In [23]:
%%time
# Build the same features for the test set
df_test_features_extended = df_test_features.copy()
add_new_features(df_test_features_extended, 
                 os.path.join(PATH_TO_DATA, 'test_matches.jsonl'))

HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))


Wall time: 29.4 s


### Create New Features

In [99]:
# Concat train and test for preprocessing
split = df_new_features.shape[0]
df_train_test = pd.concat([df_train_features_extended, df_test_features_extended])

In [100]:
df_train_features.head()

Unnamed: 0_level_0,game_time,game_mode,lobby_type,objectives_len,chat_len,r1_hero_id,r1_kills,r1_deaths,r1_assists,r1_denies,r1_gold,r1_lh,r1_xp,r1_health,r1_max_health,r1_max_mana,r1_level,r1_x,r1_y,r1_stuns,r1_creeps_stacked,r1_camps_stacked,r1_rune_pickups,r1_firstblood_claimed,r1_teamfight_participation,r1_towers_killed,r1_roshans_killed,r1_obs_placed,r1_sen_placed,r2_hero_id,r2_kills,r2_deaths,r2_assists,r2_denies,r2_gold,r2_lh,r2_xp,r2_health,r2_max_health,r2_max_mana,r2_level,r2_x,r2_y,r2_stuns,r2_creeps_stacked,r2_camps_stacked,r2_rune_pickups,r2_firstblood_claimed,r2_teamfight_participation,r2_towers_killed,...,d3_obs_placed,d3_sen_placed,d4_hero_id,d4_kills,d4_deaths,d4_assists,d4_denies,d4_gold,d4_lh,d4_xp,d4_health,d4_max_health,d4_max_mana,d4_level,d4_x,d4_y,d4_stuns,d4_creeps_stacked,d4_camps_stacked,d4_rune_pickups,d4_firstblood_claimed,d4_teamfight_participation,d4_towers_killed,d4_roshans_killed,d4_obs_placed,d4_sen_placed,d5_hero_id,d5_kills,d5_deaths,d5_assists,d5_denies,d5_gold,d5_lh,d5_xp,d5_health,d5_max_health,d5_max_mana,d5_level,d5_x,d5_y,d5_stuns,d5_creeps_stacked,d5_camps_stacked,d5_rune_pickups,d5_firstblood_claimed,d5_teamfight_participation,d5_towers_killed,d5_roshans_killed,d5_obs_placed,d5_sen_placed
match_id_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1
a400b8f29dece5f4d266f49f1ae2e98a,155,22,7,1,11,11,0,0,0,0,543,7,533,358,600,350.93784,2,116,122,0.0,0,0,1,0,0.0,0,0,0,0,78,0,0,0,3,399,4,478,636,720,254.93774,2,124,126,0.0,0,0,0,0,0.0,0,...,0,0,84,1,0,0,0,796,0,421,760,760,326.9378,2,90,150,0.0,0,0,2,1,1.0,0,0,1,0,34,0,0,0,0,851,11,870,593,680,566.93805,3,128,128,0.0,0,0,0,0,0.0,0,0,0,0
b9c57c450ce74a2af79c9ce96fac144d,658,4,0,3,10,15,7,2,0,7,5257,52,3937,1160,1160,566.93805,8,76,78,0.0,0,0,0,0,0.4375,0,0,0,0,96,3,1,2,3,3394,19,3897,1352,1380,386.93787,8,78,166,8.397949,0,0,4,0,0.3125,0,...,0,0,56,0,3,2,3,2808,18,2730,567,1160,410.9379,6,124,142,0.0,0,0,6,0,0.5,0,0,0,0,92,0,2,0,1,1423,8,1136,800,800,446.93793,4,180,176,0.0,0,0,0,0,0.0,0,0,0,0
6db558535151ea18ca70a6892197db41,21,23,0,0,0,101,0,0,0,0,176,0,0,680,680,506.938,1,118,118,0.0,0,0,0,0,0.0,0,0,0,0,51,0,0,0,0,176,0,0,720,720,278.93777,1,156,104,0.0,0,0,0,0,0.0,0,...,0,0,40,0,0,0,0,96,0,0,600,600,302.93777,1,176,110,0.0,0,0,0,0,0.0,0,0,0,0,17,0,0,0,0,96,0,0,640,640,446.93793,1,162,162,0.0,0,0,0,0,0.0,0,0,0,0
46a0ddce8f7ed2a8d9bd5edcbb925682,576,22,7,1,4,14,1,0,3,1,1613,0,1471,900,900,290.93777,4,170,96,2.366089,0,0,5,0,0.571429,0,0,0,0,99,1,0,1,2,2816,30,3602,878,1100,494.93796,8,82,154,0.0,0,0,1,0,0.285714,0,...,0,0,69,0,2,0,0,2004,16,1644,1160,1160,386.93787,4,176,100,4.998863,0,0,2,0,0.0,0,0,0,0,86,0,1,0,1,1333,2,1878,630,740,518.938,5,82,160,8.664527,3,1,3,0,0.0,0,0,2,0
b1b35ff97723d9b7ade1c9c3cf48f770,453,22,7,1,3,42,0,1,1,0,1404,9,1351,1000,1000,338.93784,4,80,164,9.930903,0,0,4,0,0.5,0,0,0,0,69,1,0,0,0,1840,14,1693,868,1000,350.93784,5,78,166,1.832892,0,0,0,1,0.5,0,...,0,0,72,2,1,0,0,1697,12,1651,680,680,374.93787,4,176,108,13.596678,0,0,2,0,0.5,0,0,0,0,1,0,1,1,8,2199,32,1919,692,740,302.93777,5,104,162,0.0,2,1,2,0,0.25,0,0,0,0


In [101]:
# mean level
df_train_test['r_mean_level'] = (df_train_test['r1_level'] + df_train_test['r2_level'] + df_train_test['r3_level'] + \
            df_train_test['r4_level'] + df_train_test['r5_level'])
df_train_test['d_mean_level'] = (df_train_test['d1_level'] + df_train_test['d2_level'] + df_train_test['d3_level'] + \
            df_train_test['d4_level'] + df_train_test['d5_level'])

df_train_test['level_diff'] = df_train_test['r_mean_level'] - df_train_test['d_mean_level']


# max level
df_train_test['r_max_level'] = np.max(list(zip(df_train_test['r1_level'], df_train_test['r2_level'], df_train_test['r3_level'],
            df_train_test['r4_level'], df_train_test['r5_level'])), axis=1)
df_train_test['d_max_level'] = np.max(list(zip(df_train_test['d1_level'], df_train_test['d2_level'], df_train_test['d3_level'],
            df_train_test['d4_level'], df_train_test['d5_level'])), axis=1)

df_train_test['max_level_diff'] = df_train_test['r_max_level'] - df_train_test['d_max_level']

In [102]:
# kills sum
df_train_test['r_max_kills'] = np.sum(list(zip(df_train_test['r1_kills'], df_train_test['r2_kills'], df_train_test['r3_kills'],
            df_train_test['r4_kills'], df_train_test['r5_kills'])), axis=1)
df_train_test['d_max_kills'] = np.sum(list(zip(df_train_test['d1_kills'], df_train_test['d2_kills'], df_train_test['d3_kills'],
            df_train_test['d4_kills'], df_train_test['d5_kills'])), axis=1)

df_train_test['max_kills_diff'] = df_train_test['r_max_kills'] - df_train_test['d_max_kills']

#### OHE for hero id

In [103]:
ohe_cols = []
for col in df_train_test.columns:
    if 'hero_id' in col:
#         df_train_test[col] = df_train_test[col].astype('str')
        ohe_cols.append(col)
df_train_test_ohe = pd.get_dummies(df_train_test, columns=ohe_cols)

In [104]:
df_train_test_ohe.shape

(49675, 1397)

#### Split for train and test

In [105]:
X_train = df_train_test_ohe.iloc[:split]
X_test = df_train_test_ohe.iloc[split:]

In [106]:
X_train_eli = df_train_test[:split]
X_test_eli = df_train[split:]

## Training models

### LightGBM

In [107]:
lgbm_model = LGBMClassifier(learning_rate=0.02,
                            n_estimators=700,
                            max_depth=6,
                            random_state=17,
                            n_jobs=4)

In [108]:
%%time
np.mean(cross_val_score(lgbm_model, X_train_eli, y, scoring='roc_auc', cv=4))

Wall time: 46.9 s


0.8137787594943483

In [118]:
from catboost import CatBoostClassifier

In [None]:
model = CatBoostClassifier(n_estimators=500,
                   learning_rate=0.05,
                   max_depth=8,
                   task_type='GPU',
                   cat_features=[1, 245,256],
                   verbose=0)

In [None]:
%%time
model.fit(X_train_eli, y, plot=True)

In [122]:
list(X_train_eli.columns)

['game_time',
 'game_mode',
 'lobby_type',
 'objectives_len',
 'chat_len',
 'r1_hero_id',
 'r1_kills',
 'r1_deaths',
 'r1_assists',
 'r1_denies',
 'r1_gold',
 'r1_lh',
 'r1_xp',
 'r1_health',
 'r1_max_health',
 'r1_max_mana',
 'r1_level',
 'r1_x',
 'r1_y',
 'r1_stuns',
 'r1_creeps_stacked',
 'r1_camps_stacked',
 'r1_rune_pickups',
 'r1_firstblood_claimed',
 'r1_teamfight_participation',
 'r1_towers_killed',
 'r1_roshans_killed',
 'r1_obs_placed',
 'r1_sen_placed',
 'r2_hero_id',
 'r2_kills',
 'r2_deaths',
 'r2_assists',
 'r2_denies',
 'r2_gold',
 'r2_lh',
 'r2_xp',
 'r2_health',
 'r2_max_health',
 'r2_max_mana',
 'r2_level',
 'r2_x',
 'r2_y',
 'r2_stuns',
 'r2_creeps_stacked',
 'r2_camps_stacked',
 'r2_rune_pickups',
 'r2_firstblood_claimed',
 'r2_teamfight_participation',
 'r2_towers_killed',
 'r2_roshans_killed',
 'r2_obs_placed',
 'r2_sen_placed',
 'r3_hero_id',
 'r3_kills',
 'r3_deaths',
 'r3_assists',
 'r3_denies',
 'r3_gold',
 'r3_lh',
 'r3_xp',
 'r3_health',
 'r3_max_health',
 '

In [124]:
list(X_train_eli.columns).index('max_kills_diff')

256

### Eli5 feature importances

In [92]:
import eli5
from eli5.sklearn import PermutationImportance

In [109]:
%%time
model = lgbm_model.fit(X_train_eli, y)

Wall time: 18.9 s


In [110]:
%%time
perm = PermutationImportance(model, random_state=1).fit(X_train_eli, y)
eli5.show_weights(perm, top=50)

Weight,Feature
0.0869  ± 0.0036,x250
0.0338  ± 0.0022,x247
0.0178  ± 0.0023,x256
0.0053  ± 0.0009,x109
0.0052  ± 0.0004,x125
0.0050  ± 0.0008,x253
0.0040  ± 0.0009,x149
0.0039  ± 0.0007,x46
0.0038  ± 0.0005,x110
0.0036  ± 0.0016,x181


## Preparing a submission

In [98]:
%%time
model = lgbm_model.fit(X_train_eli, y)

ValueError: DataFrame.dtypes for data must be int, float or bool.
Did not expect the data types in fields r1_hero_id, r2_hero_id, r3_hero_id, r4_hero_id, r5_hero_id, d1_hero_id, d2_hero_id, d3_hero_id, d4_hero_id, d5_hero_id

In [70]:
y_test_pred = model.predict_proba(X_test)[:, 1]

df_submission = pd.DataFrame({'radiant_win_prob': y_test_pred}, 
                                 index=df_test_features.index)

In [71]:
import datetime
submission_filename = 'Submits/submission_{}.csv'.format(
    datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
df_submission.to_csv(submission_filename)
print('Submission saved to {}'.format(submission_filename))

Submission saved to Submits/submission_2019-03-28_22-55-00.csv
