In [1]:
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import numpy as np
%matplotlib inline

In [2]:
from catboost import CatBoostClassifier

In [3]:
df_train = pd.read_csv('onetwotrip_challenge_train.csv')
df_test = pd.read_csv('onetwotrip_challenge_test.csv')

In [4]:
# features for training
features = list(filter(lambda x: 'field' in x, df_train.columns))

In [5]:
goals = list(filter(lambda x: 'goal' in x and 'indicator' not in x, df_train.columns))[:-1]

In [6]:
kf = StratifiedKFold(n_splits=5, random_state=0)

In [7]:
clfs = [CatBoostClassifier(logging_level='Silent', random_state=0) for goal in goals]

In [8]:
def train_clfs(X, goals, clfs):
    for idx in range(goals.shape[1]):
        y = goals.iloc[:, idx]
        clfs[idx].fit(X, y)
    return clfs

def predict_goals_proba(X, clfs, goals_cols):
    res = pd.DataFrame([], columns=goals_cols)
    for i, clf in enumerate(clfs):
        res[goals_cols[i]] = clf.predict_proba(X)[:, 1]
    return pd.DataFrame(res, columns=goals_cols)

def score_probas(goals_probas, goals_true):
    scores = []
    for goal in goals_probas.columns:
        scores.append(roc_auc_score(goals_true[goal], goals_probas[goal]))
    return np.mean(scores)

In [9]:
Y = (df_train[goals] == 1).any(axis=1)*1

In [10]:
# cross validation
scores = []
for train_idx, test_idx in kf.split(df_train[features], Y):
    tmp_train, tmp_test = df_train.loc[train_idx], df_train.loc[test_idx]
    train_clfs(tmp_train[features], tmp_train[goals], clfs)
    probas = predict_goals_proba(tmp_test[features], clfs, goals)
    scores += [score_probas(probas, tmp_test[goals])]
print(f"Score {np.mean(scores)} ± {np.std(scores)}")

Score 0.8045328812891773 ± 0.0020956252722654956


In [11]:
# fit classifier on the whole dataset
train_clfs(df_train[features], df_train[goals], clfs)

[<catboost.core.CatBoostClassifier at 0x125df1e48>,
 <catboost.core.CatBoostClassifier at 0x125df1e80>,
 <catboost.core.CatBoostClassifier at 0x125df1eb8>,
 <catboost.core.CatBoostClassifier at 0x125df1ef0>,
 <catboost.core.CatBoostClassifier at 0x125df1f28>]

In [12]:
# get probabilities
probas = predict_goals_proba(df_test[features], clfs, goals)

In [13]:
probas.to_csv('sub2.csv')