Продолжение первого ноутбука

In [1]:
import os
import json
import pandas as pd
import datetime
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

import ujson
import tqdm

from sklearn.model_selection import train_test_split, ShuffleSplit, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score

from sklearn.model_selection import GridSearchCV

%matplotlib inline

In [2]:
import seaborn as sns
import catboost
import xgboost
import lightgbm
from sklearn.ensemble import (VotingClassifier, ExtraTreesClassifier)
from sklearn.preprocessing import StandardScaler

from sklearn.tree import (DecisionTreeRegressor,
                          DecisionTreeClassifier)
from sklearn.neighbors import KNeighborsClassifier

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix

  from pandas import MultiIndex, Int64Index


In [3]:
SEED = 10801
sns.set_style(style="whitegrid")
plt.rcParams["figure.figsize"] = 12, 8
warnings.filterwarnings("ignore")

# Загрузка всех данных

In [4]:
PATH_TO_DATA = "../data/"

df_train_features = pd.read_csv(os.path.join(PATH_TO_DATA, 
                                             "train_data.csv"), 
                                    index_col="match_id_hash")
df_train_targets = pd.read_csv(os.path.join(PATH_TO_DATA, 
                                            "train_targets.csv"), 
                                   index_col="match_id_hash")

df_test_features = pd.read_csv(os.path.join(PATH_TO_DATA, "test_data.csv"), 
                                            index_col="match_id_hash")

In [16]:
try:
    import ujson as json
except ModuleNotFoundError:
    import json
    print ("Подумайте об установке ujson, чтобы работать с JSON объектами быстрее")
    
try:
    from tqdm.notebook import tqdm
except ModuleNotFoundError:
    tqdm_notebook = lambda x: x
    print ("Подумайте об установке tqdm, чтобы следить за прогрессом")

    
def read_matches(matches_file, total_matches=31698, n_matches_to_read=None):
    """
    Аргуент
    -------
    matches_file: JSON файл с сырыми данными
    
    Результат
    ---------
    Возвращает записи о каждом матче
    """
    
    if n_matches_to_read is None:
        n_matches_to_read = total_matches
        
    c = 0
    with open(matches_file) as fin:
        for line in tqdm(fin, total=total_matches):
            if c >= n_matches_to_read:
                break
            else:
                c += 1
                yield json.loads(line)

In [20]:
def add_new_features(df_features, matches_file, total_matches=31698):
    """
    Аргуенты
    -------
    df_features: таблица с данными
    matches_file: JSON файл с сырыми данными
    
    Результат
    ---------
    Добавляет новые признаки в таблицу
    """
    
    for match in read_matches(matches_file, total_matches=total_matches):
        match_id_hash = match['match_id_hash']

        # Посчитаем количество разрушенных вышек обеими командами
        radiant_tower_kills = 0
        dire_tower_kills = 0
        # и другие данные
        first_blood = 0
        aegis_stolen_radiant, aegis_stolen_dire = 0, 0
        aegis_denied_radiant, aegis_denied_dire = 0, 0
        radiant_roshan_kills, dire_roshan_kills = 0, 0
        barracks_kill = 0
        dire_ability_uses = 0
        rad_ability_uses = 0
        
        dire_observers_placed, rad_observers_placed = 0, 0
        dire_purchase_log, rad_purchase_log = 0, 0
        dire_inventory, rad_inventory = 0, 0
        
        for objective in match["objectives"]:
            if objective["type"] == "CHAT_MESSAGE_TOWER_KILL":
                if objective["team"] == 2:
                    radiant_tower_kills += 1
                if objective["team"] == 3:
                    dire_tower_kills += 1
            if objective["type"] == "CHAT_MESSAGE_ROSHAN_KILL":
                if objective["team"] == 2:
                    radiant_roshan_kills += 1
                if objective["team"] == 3:
                    dire_roshan_kills += 1
                    
            if objective["type"] == "CHAT_MESSAGE_BARRACKS_KILL":
                barracks_kill += 1
                    
            if objective["type"] == "CHAT_MESSAGE_FIRSTBLOOD":
                if objective["player_slot"] < 6:
                    first_blood = 1
                    
            if objective["type"] == "CHAT_MESSAGE_AEGIS_STOLEN":
                if objective["player_slot"] < 6:
                    aegis_stolen_radiant += 1
                if objective["player_slot"] > 50:
                    aegis_stolen_dire += 1
            
            if objective["type"] == "CHAT_MESSAGE_DENIED_AEGIS":
                if objective["player_slot"] < 6:
                    aegis_denied_radiant += 1
                if objective["player_slot"] > 50:
                    aegis_denied_dire += 1
                    
                
        df_features.loc[match_id_hash, "radiant_tower_kills"] = radiant_tower_kills
        df_features.loc[match_id_hash, "dire_tower_kills"] = dire_tower_kills
        df_features.loc[match_id_hash, "diff_tower_kills"] = radiant_tower_kills - dire_tower_kills
        
        df_features.loc[match_id_hash, "radiant_roshan_kills"] = radiant_roshan_kills
        df_features.loc[match_id_hash, "dire_roshan_kills"] = dire_roshan_kills
        df_features.loc[match_id_hash, "diff_roshan_kills"] = radiant_roshan_kills - dire_roshan_kills
        
        df_features.loc[match_id_hash, "first_blood"] = first_blood
        df_features.loc[match_id_hash, "barracks_kill"] = barracks_kill       
        
        df_features.loc[match_id_hash, "aegis_stolen_diff"] = aegis_stolen_radiant - aegis_stolen_dire
        df_features.loc[match_id_hash, "aegis_stolen_radiant"] = aegis_stolen_radiant
        df_features.loc[match_id_hash, "aegis_stolen_dire"] = aegis_stolen_dire
            
        df_features.loc[match_id_hash, "aegis_denied_diff"] = aegis_denied_radiant - aegis_denied_dire
        df_features.loc[match_id_hash, "aegis_denied_radiant"] = aegis_denied_radiant
        df_features.loc[match_id_hash, "aegis_denied_dire"] = aegis_denied_dire
        
        for player in match["players"]:
            if player['player_slot'] > 90:
                for i in player["ability_uses"].values():
                    dire_ability_uses += i
            else:
                for i in player["ability_uses"].values():
                    rad_ability_uses += i
                    
            if player['player_slot'] > 90:
                dire_observers_placed += player["observers_placed"]
                dire_purchase_log += len(player["purchase_log"])
                dire_inventory += len(player["hero_inventory"])

            else:
                rad_observers_placed += player["observers_placed"]
                rad_purchase_log += len(player["purchase_log"])
                rad_inventory += len(player["hero_inventory"])
                
        df_features.loc[match_id_hash, "dire_purchases"] = dire_purchase_log
        df_features.loc[match_id_hash, "rad_purchases"] = rad_purchase_log 
        
        df_features.loc[match_id_hash, "dire_inventory"] = dire_inventory
        df_features.loc[match_id_hash, "rad_inventory"] = rad_inventory 
        
        df_features.loc[match_id_hash, "rad_ability_uses"] = rad_ability_uses
        df_features.loc[match_id_hash, "dire_ability_uses"] = dire_ability_uses 
        
        for team_fight in match["teamfights"]:
            for i, team_player in enumerate(team_fight['players']):
                if i < 5:
                    df_features.loc[match_id_hash, f"r{i+1}_damage"] = team_player.get("damage", 0)
                    df_features.loc[match_id_hash, f"r{i+1}_healing"] = team_player.get("healing", 0)
                    
                else:
                    df_features.loc[match_id_hash, f"d{i-4}_damage"] = team_player.get("damage", 0)      
                    df_features.loc[match_id_hash, f"d{i-4}_healing"] = team_player.get("healing", 0)  

In [21]:
# Добавим новые
add_new_features(df_test_features, 
                 os.path.join('../data', 
                              "test_raw_data.jsonl"),
                 total_matches = 7977)

  0%|          | 0/7977 [00:00<?, ?it/s]

In [22]:
add_new_features(df_train_features, 
                 os.path.join('../data', 
                              "train_raw_data.jsonl"),
                 total_matches = 31698)

  0%|          | 0/31698 [00:00<?, ?it/s]

In [25]:
train = df_train_features.copy()
test = df_test_features.copy()

In [7]:
df_train_targets = pd.read_csv("../data/train_targets.csv", index_col="match_id_hash")
y = df_train_targets["radiant_win"].values.astype("int8")

In [89]:
train.to_csv('../data/fin_train.csv')  
test.to_csv('../data/fin_test.csv')  

In [53]:
train = pd.read_csv('../data/fin_train.csv', index_col=0)
test = pd.read_csv('../data/fin_test.csv', index_col=0)

# Работа с данными

In [54]:
# id hero to hero win rate

id_hero = set(list(train['r1_hero_id']))

new_train = train.copy()
new_train['win'] = y

win_rate_for_hero = dict()

for index, row in train.iterrows():
    for i in range(1, 6):
        try:
            win_rate_for_hero[row[f'r{i}_hero_id']] += new_train.loc[index, 'win']

        except KeyError:
            win_rate_for_hero[row[f'r{i}_hero_id']] = new_train.loc[index, 'win']
            
        try:
            win_rate_for_hero[row[f'd{i}_hero_id']] += new_train.loc[index, 'win']

        except KeyError:
            win_rate_for_hero[row[f'd{i}_hero_id']] = new_train.loc[index, 'win']
            
for h_id in win_rate_for_hero:
    win_rate_for_hero[h_id] = win_rate_for_hero[h_id] / new_train.shape[0]
    
for index, row in new_train.iterrows():
    for i in range(1, 6):
        train.loc[index, f"r{i}h_win_rate"] = win_rate_for_hero[row[f"r{i}_hero_id"]]
        train.loc[index, f"d{i}h_win_rate"] = win_rate_for_hero[row[f"d{i}_hero_id"]]
    
for i in range(1, 6):
    train = train.drop([f"r{i}_hero_id", f"d{i}_hero_id"], axis=1)

In [55]:
for index, row in test.iterrows():
    for i in range(1, 6):
        test.loc[index, f"r{i}h_win_rate"] = win_rate_for_hero[row[f"r{i}_hero_id"]]
        test.loc[index, f"d{i}h_win_rate"] = win_rate_for_hero[row[f"d{i}_hero_id"]]
    
for i in range(1, 6):
    test = test.drop([f"r{i}_hero_id", f"d{i}_hero_id"], axis=1)

In [56]:
train.fillna(0, inplace=True)
test.fillna(0, inplace=True)

# LogisticRegression: 0.8105232298578847

In [57]:
train['r_max_mana'] = 0
train['d_max_mana'] = 0

for i in range(1, 6):
    train['r_max_mana'] += train[f'r{i}_max_mana']
    train['d_max_mana'] += train[f'd{i}_max_mana']
    
    train = train.drop([f'r{i}_max_mana',
                        f'd{i}_max_mana'],
                      axis=1)
    
train['diff_max_mana'] = (train['r_max_mana'] + 1) / (train['d_max_mana'] + 1)

train = train.drop(['r_max_mana', 'd_max_mana'], axis=1)
# LogisticRegression: 0.8107347130416839

In [62]:
train['r_creeps_stacked'] = 0
train['d_creeps_stacked'] = 0
train['r_camps_stacked'] = 0
train['d_camps_stacked'] = 0


for i in range(1, 6):
    train['r_creeps_stacked'] += train[f'r{i}_creeps_stacked'] 
    train['d_creeps_stacked'] += train[f'd{i}_creeps_stacked'] 
    
    train['r_camps_stacked'] += train[f'r{i}_camps_stacked'] 
    train['d_camps_stacked'] += train[f'd{i}_camps_stacked'] 
    
    train = train.drop([f'r{i}_creeps_stacked', f'd{i}_creeps_stacked',
                        f'r{i}_camps_stacked', f'd{i}_camps_stacked'],
                      axis=1)
    
# LogisticRegression: 0.810941765578241

In [69]:
train['d_rune_pickups'] = 0
train['r_rune_pickups'] = 0

test['d_rune_pickups'] = 0
test['r_rune_pickups'] = 0

for i in range(1, 6):
    train['d_rune_pickups'] += train[f'r{i}_rune_pickups'] 
    train['r_rune_pickups'] += train[f'd{i}_rune_pickups'] 
    
    train = train.drop([f'r{i}_rune_pickups', f'd{i}_rune_pickups',], axis=1)
    
    test['d_rune_pickups'] += test[f'r{i}_rune_pickups'] 
    test['r_rune_pickups'] += test[f'd{i}_rune_pickups'] 
    
    test = test.drop([f'r{i}_rune_pickups', f'd{i}_rune_pickups',], axis=1)
    
# LogisticRegression: 0.8111139580828519

In [76]:
train['d_obs_placed'] = 0
train['r_obs_placed'] = 0

test['d_obs_placed'] = 0
test['r_obs_placed'] = 0

for i in range(1, 6):
    train['d_obs_placed'] += train[f'r{i}_obs_placed'] 
    train['r_obs_placed'] += train[f'd{i}_obs_placed'] 
    
    train = train.drop([f'r{i}_obs_placed', f'd{i}_obs_placed',], axis=1)
    
    test['d_obs_placed'] += test[f'r{i}_obs_placed'] 
    test['r_obs_placed'] += test[f'd{i}_obs_placed'] 
    
    test = test.drop([f'r{i}_obs_placed', f'd{i}_obs_placed',], axis=1)
    
# LogisticRegression: 0.8112648829339387

In [82]:
train['d_healing'] = 0
train['r_healing'] = 0

test['d_healing'] = 0
test['r_healing'] = 0

for i in range(1, 6):
    train['d_healing'] += train[f'r{i}_healing'] 
    train['r_healing'] += train[f'd{i}_healing'] 
    
    all_train = train.drop([f'r{i}_healing', f'd{i}_healing',], axis=1)
    
    test['d_healing'] += train[f'r{i}_healing'] 
    test['r_healing'] += train[f'd{i}_healing'] 
    
    test = test.drop([f'r{i}_healing', f'd{i}_healing',], axis=1)

# LogisticRegression: 0.8112687749635854

In [None]:
train['d_denies'] = 0
train['r_denies'] = 0

test['d_denies'] = 0
test['r_denies'] = 0

for i in range(1, 6):
    train['d_denies'] += train[f'r{i}_denies'] 
    train['r_denies'] += train[f'd{i}_denies'] 
    
    train = train.drop([f'r{i}_denies', f'd{i}_denies',], axis=1)
    
    test['d_denies'] += test[f'r{i}_denies'] 
    test['r_denies'] += test[f'd{i}_denies'] 
    
    test = test.drop([f'r{i}_denies', f'd{i}_denies',], axis=1)
    
# 

In [None]:
train['diff_purchases'] = np.where(train['dire_purchases'] > 0,
                                   train['rad_purchases']  / train['dire_purchases'],
                                   train['rad_purchases'])

test['diff_purchases'] = np.where(test['dire_purchases'] > 0,
                                   test['rad_purchases']  / test['dire_purchases'],
                                   test['rad_purchases'])

train = train.drop(['rad_purchases', 'dire_purchases'], axis=1)
test = test.drop(['rad_purchases', 'dire_purchases'], axis=1)

# 

In [31]:
X_train, X_valid, y_train, y_valid = train_test_split(train, y, 
                                                      test_size=0.35, 
                                                      random_state=SEED)

X_S_train, X_S_valid, y_S_train, y_S_valid = train_test_split(X_valid, y_valid, 
                                                              test_size=0.3, 
                                                              random_state=SEED)

In [None]:
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train)

In [83]:
test_scaled = scaler.transform(test)

ValueError: X has 261 features, but StandardScaler is expecting 244 features as input.

In [14]:
X_S_train = scaler.transform(X_S_train)
X_S_valid = scaler.transform(X_S_valid)

# Тест

In [None]:
lr = LogisticRegression(C=0.1, 
                        class_weight='balanced', 
                        max_iter=1000, 
                        penalty='l1',
                        solver='liblinear', 
                        random_state=SEED,
                        n_jobs=2)

lr.fit(train_scaled, y)

coefficients = np.hstack((lr.intercept_, lr.coef_[0]))
coefficients_df = pd.DataFrame(data={'variable': ['intercept'] + list(train.columns), 'coefficient': coefficients})
coefficients_df = coefficients_df.sort_values(by='coefficient', ascending=False, key=lambda col: col.abs())

In [None]:
scores = cross_val_score(lr, train_scaled, y, cv=3, scoring="roc_auc")
print(f"{lr.__class__.__name__}: {scores.mean()}")
# LogisticRegression: 0.8105232298578847
# LogisticRegression: 0.8107347130416839
# LogisticRegression: 0.810941765578241
# LogisticRegression: 0.8111139580828519
# LogisticRegression: 0.8112648829339387
# LogisticRegression: 0.8112687749635854

In [None]:
coefficients_df.iloc[:15,:]

In [None]:
coefficients_df.iloc[100:,:]

In [None]:
rf = RandomForestClassifier(max_depth=14, 
                            min_samples_leaf=4,
                            n_estimators=400, 
                            n_jobs=2, 
                            random_state=SEED)

rf.fit(train_scaled, y)

scores = cross_val_score(rf, train_scaled, y, cv=3, scoring="roc_auc")
print(f"{rf.__class__.__name__}: {scores.mean()}")

# RandomForestClassifier: 0.792346761151754
# RandomForestClassifier: 0.7926450328906629

In [25]:
import statsmodels.api as sm
from sklearn.datasets import make_blobs

train_scaled = pd.DataFrame(train_scaled)
train_scaled.columns = train.columns

logit_model = sm.Logit(y, train_scaled).fit()
print(logit_model.summary())

Optimization terminated successfully.
         Current function value: 0.509507
         Iterations 28
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                31698
Model:                          Logit   Df Residuals:                    31417
Method:                           MLE   Df Model:                          280
Date:                Sun, 03 Apr 2022   Pseudo R-squ.:                  0.2635
Time:                        16:01:18   Log-Likelihood:                -16150.
converged:                       True   LL-Null:                       -21929.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                 coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------------
game_time                     -0.0395      0.173     -0.228      0.820     

# Посылки

In [21]:
# Score: 0.83629

rf = RandomForestClassifier(max_depth=14, min_samples_leaf=4,
                            n_estimators=400, n_jobs=-1, random_state=SEED)
cat = catboost.CatBoostClassifier(verbose=0, random_seed=SEED)
xgb_rf =  xgboost.XGBRFClassifier(random_state=SEED)
lr = LogisticRegression(C=0.1, 
                        class_weight='balanced', 
                        max_iter=1000, 
                        penalty='l1',
                        solver='liblinear', 
                        random_state=SEED)


base_models = [("RF", rf), ("CAT", cat), ("XGB_RF", xgb_rf), ("LR", lr)]

voting_soft = VotingClassifier(estimators=base_models, voting = 'soft')
voting_soft.fit(train_scaled, y)

y_test_pred = voting_soft.predict_proba(test_scaled)[:, 1]

df_submission = pd.DataFrame({"radiant_win_prob": y_test_pred}, 
                                 index=test.index)

submission_filename = "submission_{}.csv".format(
    datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))

df_submission.to_csv(''.join(['../data/', submission_filename]))

print("Файл посылки сохранен, как: {}".format(submission_filename))

# Score: 0.82076



NameError: name 'df_test' is not defined

In [22]:
df_submission = pd.DataFrame({"radiant_win_prob": y_test_pred}, 
                                 index=test.index)

submission_filename = "submission_{}.csv".format(
    datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))

df_submission.to_csv(''.join(['../data/', submission_filename]))

print("Файл посылки сохранен, как: {}".format(submission_filename))

Файл посылки сохранен, как: submission_2022-04-03_15-57-14.csv


In [None]:
# Score: 0.83124
rf = RandomForestClassifier(max_depth=14, min_samples_leaf=4,
                            n_estimators=400, n_jobs=-1, random_state=SEED)
rf.fit(train_scaled, y)

y_test_pred = rf.predict_proba(test_scaled)[:, 1]

df_submission = pd.DataFrame({"radiant_win_prob": y_test_pred}, 
                                 index=df_test.index)

submission_filename = "submission_{}.csv".format(
    datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))

df_submission.to_csv(''.join(['../data/', submission_filename]))

print("Файл посылки сохранен, как: {}".format(submission_filename))