In [1]:
import pandas as pd
import numpy as np

In [2]:
data_path = './data/'

def read(filename):
    current_df = pd.read_csv(data_path + filename + '.csv', index_col=None)
    if 'mid' in current_df:
        current_df = current_df.rename(columns={'mid' : 'match_id'})
    return current_df

events = read('events')
gold = read('gold')
heroes = read('heroes')
items = read('items')
lh = read('lh')
xp = read('xp')

train_union_val = read('train')
test = read('test')

## Создаем фичи

In [3]:
def series_sum(df, columns):
    return sum([df[column] for column in columns])

radiant_players = ['player_'+str(i) for i in range(0, 5)]
dire_players = ['player_'+str(i) for i in range(5, 10)]

# суммарное золото
gold['radiant_gold'] = series_sum(gold, radiant_players)
gold['dire_gold'] = series_sum(gold, dire_players)

# суммарный опыт
xp['radiant_xp'] = series_sum(xp, radiant_players)
xp['dire_xp'] = series_sum(xp, dire_players)

# ластхиты
lh['radiant_lh'] = series_sum(lh, radiant_players)
lh['dire_lh'] = series_sum(lh, dire_players)

In [4]:
gold_540 = gold[gold.times == 540].reset_index(drop=True)
xp_540 = xp[xp.times == 540].reset_index(drop=True)
lh_540 = lh[lh.times == 540].reset_index(drop=True)

gold_600 = gold[gold.times == 600].reset_index(drop=True)
xp_600 = xp[xp.times == 600].reset_index(drop=True)
lh_600 = lh[lh.times == 600].reset_index(drop=True)

default_match_id = gold_600.match_id.tolist()

In [5]:
gold_times = {}
xp_times = {}
lh_times = {}

dict_list = [gold_times, xp_times, lh_times]
df_list = [gold, xp, lh]

for df, dict in zip(df_list, dict_list):
    df_time_groupbied = df.groupby('times')
    
    for current_time, group_data in df_time_groupbied:
        dict[current_time] = group_data[radiant_players + dire_players].reset_index(drop=True)


In [6]:
player_gold = gold[gold.times == 600][radiant_players + dire_players].reset_index().drop(['index'], axis=1)

player_xp = xp[xp.times == 600][radiant_players + dire_players].reset_index().drop(['index'], axis=1)

player_lh = lh[lh.times == 600][radiant_players + dire_players].reset_index().drop(['index'], axis=1)

In [7]:
pref_gold = gold.groupby(['match_id']).aggregate('sum')[radiant_players + dire_players]
pref_xp = xp.groupby(['match_id']).aggregate('sum')[radiant_players + dire_players]
pref_lh = lh.groupby(['match_id']).aggregate('sum')[radiant_players + dire_players]

### Tower

In [8]:
tower = events[events['event_type'] == 6].copy()

tower['radiant_tower'] = (tower['from_team'] == 'radiant').astype(int)
tower['dire_tower'] = (tower['from_team'] == 'dire').astype(int)

tower

tower_sum = tower.groupby(['match_id']).aggregate({
    'radiant_tower' : 'sum',
    'dire_tower' : 'sum',
    'time' : 'sum'
}).reset_index()

tower_sum[['radiant_tower', 'dire_tower']] = tower_sum[['radiant_tower', 'dire_tower']].div(tower_sum['time'], axis=0)

tower_sum = tower_sum.drop(['time'], axis=1)

tower_sum

# ###

exist_ids = set(sorted(tower_sum['match_id'].tolist()))

tower_zero_list = []
for id in range(len(gold_600)):
    if id not in exist_ids:
        tower_zero_list.append([id, 0, 0])

tower_zero = pd.DataFrame(tower_zero_list, columns=tower_sum.columns)

tower_sum = pd.concat([tower_sum, tower_zero], axis=0)

tower_sum = tower_sum.sort_values(by='match_id').reset_index()

tower_sum.columns

assert tower_sum.match_id.tolist() == gold_600.match_id.tolist()

### Tower Deny

In [9]:
tower_deny = events[events['event_type'] == 5].copy()

tower_deny['radiant_tower_deny'] = (tower_deny['from_team'] == 'radiant').astype(int)
tower_deny['dire_tower_deny'] = (tower_deny['from_team'] == 'dire').astype(int)

tower_deny_sum = tower_deny.groupby(['match_id']).aggregate({
    'radiant_tower_deny' : 'sum',
    'dire_tower_deny' : 'sum'
}).reset_index()

# ###

exist_ids = set(sorted(tower_deny_sum['match_id'].tolist()))

tower_zero_list = []
for id in range(len(gold_600)):
    if id not in exist_ids:
        tower_zero_list.append([id, 0, 0])

tower_zero = pd.DataFrame(tower_zero_list, columns=tower_deny_sum.columns)

tower_deny_sum = pd.concat([tower_deny_sum, tower_zero], axis=0)

tower_deny_sum = tower_deny_sum.sort_values(by='match_id').reset_index()

tower_deny_sum

assert tower_deny_sum.match_id.tolist() == gold_600.match_id.tolist()

In [10]:
tower_deny_sum

Unnamed: 0,index,match_id,radiant_tower_deny,dire_tower_deny
0,0,0,0,0
1,1,1,0,0
2,2,2,0,0
3,3,3,0,0
4,4,4,0,0
...,...,...,...,...
49943,47744,49943,0,0
49944,47745,49944,0,0
49945,47746,49945,0,0
49946,47747,49946,0,0


###  Aegis

In [11]:
default_aegis_time = 0

aegis = pd.DataFrame({
    'match_id' : default_match_id,
    'radiant_aegis' : [default_aegis_time for i in range(len(default_match_id))],
    'dire_aegis' : [default_aegis_time for i in range(len(default_match_id))],
})

for index, row in events[events.event_type == 4].iterrows():
   match_id = row['match_id']
   from_team = row['from_team'] + '_aegis'
   time = row['time']

   aegis.loc[aegis.match_id == match_id, from_team] = 1

aegis

Unnamed: 0,match_id,radiant_aegis,dire_aegis
0,0,0,0
1,1,0,0
2,2,0,0
3,3,0,0
4,4,0,0
...,...,...,...
49943,49943,0,0
49944,49944,0,0
49945,49945,0,0
49946,49946,0,0


### First Blood

In [12]:
default_first_blood_time = 0

first_blood = pd.DataFrame({
    'match_id' : default_match_id,
    'radiant_first_blood' : [default_first_blood_time for i in range(len(default_match_id))],
    'dire_first_blood' : [default_first_blood_time for i in range(len(default_match_id))],
})

for index, row in events[events.event_type == 3].iterrows():
   match_id = row['match_id']
   from_team = row['from_team'] + '_first_blood'
   time = row['time']

   first_blood.loc[aegis.match_id == match_id, from_team] = 1

first_blood

Unnamed: 0,match_id,radiant_first_blood,dire_first_blood
0,0,1,0
1,1,1,0
2,2,0,1
3,3,1,0
4,4,0,1
...,...,...,...
49943,49943,1,0
49944,49944,0,1
49945,49945,0,1
49946,49946,0,1


### Heroes

In [13]:
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder()
heroes_one_hot = enc.fit_transform(heroes.drop(['match_id'], axis=1)).toarray()

heroes_one_hot

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

### Items

In [14]:
items

Unnamed: 0,match_id,player,item_0,item_1,item_2,item_3,item_4,item_5,item_6,item_7,...,item_111,item_112,item_113,item_114,item_115,item_116,item_117,item_118,item_119,item_120
0,0,0,,,,,,,,,...,,,,,,,,,,
1,0,1,,,,,,,,,...,,,,,,,,,,
2,0,2,,,,,,,,,...,,,,,,,,,,
3,0,3,,,,1.0,,,,,...,,,,,,,,,,
4,0,4,,,,,,,,,...,,,,,1.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499407,49947,5,,2.0,,,,,,,...,,,,,,,,,,
499408,49947,6,,,,,,,,,...,,,,,,3.0,,,,
499409,49947,7,,2.0,,,,,,,...,,,,,,,,,,
499410,49947,8,,,,,,,,,...,,,,,,,,,,


In [15]:
items = items.fillna(0)
items.player = items.player.apply(lambda x: 'radiant' if x <= 4 else 'dire')

item_dict = {}
for i in range(0, 121):
    item_dict['item_'+str(i)] = 'sum'

items_radiant_sum = items[items.player == 'radiant'].groupby(['match_id']).aggregate(item_dict)
items_dire_sum = items[items.player == 'dire'].groupby(['match_id']).aggregate(item_dict)

In [16]:
events.event_type.value_counts()

event_type
3    48655
6    16264
5     2251
4      968
0      953
2       17
1       12
Name: count, dtype: int64

In [17]:
gold_times[600]

Unnamed: 0,player_0,player_1,player_2,player_3,player_4,player_5,player_6,player_7,player_8,player_9
0,3454,5206,2613,4426,5755,4072,3997,5917,1725,6384
1,2477,5760,3816,4353,5759,7659,5066,2748,4440,4623
2,3604,1948,8581,4390,2869,3096,2301,5130,2530,2491
3,3457,5464,4432,2961,4314,3345,4791,1906,5328,2247
4,3675,4103,5154,3030,2076,3920,3494,3392,4458,2220
...,...,...,...,...,...,...,...,...,...,...
49943,3754,3556,5388,3391,5634,2853,4447,3636,8245,5598
49944,4895,2580,4109,5610,2732,3564,5763,5538,4005,3027
49945,2325,1813,2530,4807,5542,2242,2359,5997,5523,2044
49946,4134,3141,4086,4405,3517,2887,7450,3634,5431,5451


In [18]:
alpha = 0.0001
exp_alpha = lambda x: np.exp(alpha * x)
log = lambda x: np.log(x)
square = lambda x: x * x

X = pd.concat([

    gold_600.match_id,  

    gold_times[600],
    gold_times[600].apply(square),

    xp_times[600],
    xp_times[600].apply(log),
    
    lh_times[600],
    lh_times[600].apply(square)

], axis=1)

# for current_dict in dict_list:
#     df_list = list(current_dict.values())
#     concat_df = pd.concat(df_list, axis=1)

#     # exp_df = concat_df.apply(exp_alpha)
#     square_df = concat_df.apply(square)
#     # log_df = concat_df.apply(log)

#     X = pd.concat([
#         X, concat_df, 
#         # exp_df, 
#         square_df, 
#         # log_df
#     ], axis=1)

scale_first_columns = len(X.columns) - 1

X_not_scale = pd.concat([
    tower_sum.radiant_tower + tower_deny_sum.dire_tower_deny,
    tower_sum.dire_tower + tower_deny_sum.radiant_tower_deny,

    items_radiant_sum,
    items_dire_sum,

    aegis.radiant_aegis,
    aegis.dire_aegis,

    pd.DataFrame(heroes_one_hot),

    pd.DataFrame(first_blood.radiant_first_blood, columns=['radiant_first_blood']),
    pd.DataFrame(first_blood.dire_first_blood, columns=['dire_first_blood']),
], axis=1)

X = pd.concat([
    X,
    X_not_scale
], axis=1)

In [19]:
X.columns = X.columns.map(str)

X

Unnamed: 0,match_id,player_0,player_1,player_2,player_3,player_4,player_5,player_6,player_7,player_8,...,1102,1103,1104,1105,1106,1107,1108,1109,radiant_first_blood,dire_first_blood
0,0,3454,5206,2613,4426,5755,4072,3997,5917,1725,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0
1,1,2477,5760,3816,4353,5759,7659,5066,2748,4440,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0
2,2,3604,1948,8581,4390,2869,3096,2301,5130,2530,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1
3,3,3457,5464,4432,2961,4314,3345,4791,1906,5328,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0
4,4,3675,4103,5154,3030,2076,3920,3494,3392,4458,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49943,49943,3754,3556,5388,3391,5634,2853,4447,3636,8245,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0
49944,49944,4895,2580,4109,5610,2732,3564,5763,5538,4005,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1
49945,49945,2325,1813,2530,4807,5542,2242,2359,5997,5523,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1
49946,49946,4134,3141,4086,4405,3517,2887,7450,3634,5431,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1


## Перестаем создавать фичи

In [20]:
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

from sklearn.metrics import roc_auc_score

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

from typing import List, Optional
from sklearn.base import TransformerMixin

In [21]:
continous_columns = [
    'radiant_gold',
    'dire_gold',    
    'radiant_xp',
    'dire_xp',   
    'radiant_lh',       
    'dire_lh',
]

class BaseDataPreprocessor(TransformerMixin):
    def __init__(self, scale_first_columns):
        """
        :param needed_columns: if not None select these columns from the dataframe
        """
        self.scaler = StandardScaler()
        self.scale_first_columns = scale_first_columns

    def fit(self, data, *args):
        """
        Prepares the class for future transformations
        :param data: pd.DataFrame with all available columns
        :return: self
        """
        self.scaler.fit(data[:, 0:self.scale_first_columns])
        return self

    def transform(self, data) -> np.array:
        """
        Transforms features so that they can be fed into the regressors
        :param data: pd.DataFrame with all available columns
        :return: np.array with preprocessed features
        """
        data[:, 0:self.scale_first_columns] = self.scaler.transform(data[:, 0:self.scale_first_columns])
        return data

In [53]:
def pipeline_score(model, scale_first_columns):
    X_id = train_union_val.match_id.tolist()
    X_cv = X.iloc[X_id].drop(['match_id'], axis=1).to_numpy()
    y_cv = train_union_val.radiant_won.tolist()

    sss = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=667)
    #kf = KFold(n_splits=4, random_state=667, shuffle=True)

    pipeline = Pipeline([
        ('scaler', BaseDataPreprocessor(scale_first_columns)),
        ('model', model)
    ])

    cv_scores = cross_validate(
        pipeline,
        X_cv, y_cv,
        cv=sss,
        scoring='roc_auc',
        return_train_score=True
    )
    
    round_numbers = 5

    print("min roc_auc score on train: ", round(cv_scores['train_score'].min(), round_numbers))
    print("mean roc_auc score on train: ", round(cv_scores['train_score'].mean(), round_numbers))
    print("max roc_auc score on train: ", round(cv_scores['train_score'].max(), round_numbers))
    print()

    print("min roc_auc score on validation: ", round(cv_scores['test_score'].min(), round_numbers))
    print("mean roc_auc score on validation: ", round(cv_scores['test_score'].mean(), round_numbers))
    print("max roc_auc score on validation: ", round(cv_scores['test_score'].max(), round_numbers))

    scaler = StandardScaler()

    X_cv = scaler.fit_transform(X_cv)
    pipeline.fit(X_cv, y_cv)

    X_test_id = test.match_id.tolist()
    X_test = X.iloc[X_test_id].drop(['match_id'], axis=1).to_numpy()
    
    X_test = scaler.transform(X_test)
    test_probas = pipeline.predict_proba(X_test)[:, 1].tolist()

    submission = pd.DataFrame({
        'mid' : test.match_id,
        'radiant_won' : test_probas
    })

    submission.to_csv('submission.csv', index=False)

In [54]:
print(scale_first_columns)

60


In [55]:
pipeline_score(
    LogisticRegression(
        max_iter=3000
    ),
    scale_first_columns=scale_first_columns
)

min roc_auc score on train:  0.7223
mean roc_auc score on train:  0.73439
max roc_auc score on train:  0.74557

min roc_auc score on validation:  0.73331
mean roc_auc score on validation:  0.7385
max roc_auc score on validation:  0.74752


In [56]:
# pipeline_score(
#     KNeighborsClassifier(),
#     scale_first_columns=scale_first_columns
# )