In [9]:
from datetime import datetime
import glob

import pandas as pd
import numpy as np

from fasttext_worker import pipeline, LABEL_COLUMNS

In [2]:
# Run configuration shared by the cells below.
# For a quick smoke run, point the two paths at "input/train_small.csv" and
# "input/test_small.csv" instead of the full archives.
args = {
    "train_path": "input/train.csv.zip",
    "test_path": "input/test.csv.zip",
    # Number of cross-validation folds.
    "n_splits": 10,
    # Directory prefix that is globbed for pickled probability features.
    "save_proba": "probas/",
}

In [5]:
# Read the raw train/test tables from the locations configured in `args`.
train, test = (pd.read_csv(args[key]) for key in ("train_path", "test_path"))

def get_features(path):
    """Load a pickled DataFrame and return its probability columns as an array.

    Parameters
    ----------
    path : str
        Location of a pickle readable by ``pd.read_pickle``.

    Returns
    -------
    numpy.ndarray
        2-D array of shape (n_rows, n_matching_columns) holding, in their
        original order, every column whose name contains ``"_proba_"``.
    """
    # NOTE(review): unpickling can execute arbitrary code; only point this at
    # files produced by this project.
    data = pd.read_pickle(path)
    columns = [col for col in data.columns if "_proba_" in col]
    # `.values` replaces `DataFrame.as_matrix()`, which was deprecated in
    # pandas 0.23 and removed in pandas 1.0.
    return data[columns].values

# Collect the per-model probability dumps. glob returns paths in arbitrary
# (filesystem) order, so sort them once: the train and test stacks below must
# concatenate the models in the same order or their columns will not line up.
# NOTE(review): assumes the train/test files of one model sort into the same
# relative position (e.g. names differ only by "train"/"test") — confirm.
proba_paths = sorted(glob.glob(args["save_proba"] + "*"))
train_paths = [path for path in proba_paths if "train" in path]
test_paths = [path for path in proba_paths if "test" in path]

train_features = np.hstack([get_features(path) for path in train_paths])
# `.values` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0.
train_labels = train[LABEL_COLUMNS].values

test_features = np.hstack([get_features(path) for path in test_paths])

# A model fitted on train_features must be able to score test_features.
assert train_features.shape[1] == test_features.shape[1], \
    "train/test feature widths differ"

In [30]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold


def fix_pred(probas):
    """Stack column 1 of several 2-D probability arrays side by side.

    Parameters
    ----------
    probas : iterable of numpy.ndarray
        Each array must be 2-D with at least two columns and share the same
        number of rows. Column 1 is presumably the positive-class
        probability from ``predict_proba`` — confirm against callers.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_rows, number_of_input_arrays).
    """
    # np.hstack needs a real sequence: passing a generator is deprecated and
    # raises on recent NumPy releases, so materialise a list first.
    return np.hstack([proba[:, 1][:, np.newaxis] for proba in probas])


# Imported once up front instead of on every fold iteration.
from catboost import CatBoostClassifier

# K-fold training of one binary CatBoost model per label on the stacked
# probability features; test predictions are averaged over the folds.
kfold = KFold(n_splits=args["n_splits"])
roc_aucs = []  # mean validation ROC AUC of each fold
train_pred = np.zeros(train_labels.shape)
test_pred = np.zeros((test_features.shape[0], train_labels.shape[1]))
# `fold` and `j` are kept distinct: reusing one name for both loops clobbered
# the fold number reported at the end of each fold.
for fold, (train_index, val_index) in enumerate(kfold.split(train)):
    print("Fold %s in progress..." % fold)
    train_features_ = train_features[train_index, :]
    train_labels_ = train_labels[train_index, :]
    val_features_ = train_features[val_index, :]
    val_labels_ = train_labels[val_index, :]

    fold_val_aucs = []
    for j in range(len(LABEL_COLUMNS)):
        # `iterations` must be passed by keyword; 80 matches the captured logs.
        # Alternative tried: ExtraTreesClassifier(min_samples_leaf=50, n_jobs=-1)
        model = CatBoostClassifier(loss_function='Logloss', logging_level='Verbose',
                                   iterations=80)
        model.fit(train_features_, train_labels_[:, j])

        # predict_proba returns one column per class; roc_auc_score and the
        # 1-D prediction columns need only the score in column 1.
        train_pred_ = model.predict_proba(train_features_)[:, 1]
        val_pred_ = model.predict_proba(val_features_)[:, 1]
        print("train roc_auc:", LABEL_COLUMNS[j],
              roc_auc_score(train_labels_[:, j], train_pred_))
        fold_val_aucs.append(roc_auc_score(val_labels_[:, j], val_pred_))
        print("val roc_auc:", LABEL_COLUMNS[j], fold_val_aucs[-1])

        # NOTE(review): this accumulates predictions over the *whole* training
        # set, so rows the model was fitted on are included; these are not
        # out-of-fold predictions.
        train_pred[:, j] += model.predict_proba(train_features)[:, 1]
        test_pred[:, j] += model.predict_proba(test_features)[:, 1]
    roc_aucs.append(np.mean(fold_val_aucs))
    print("Fold %s completed" % fold)

# Turn the per-fold sums into averages.
train_pred = train_pred / args["n_splits"]
test_pred = test_pred / args["n_splits"]


Fold 0 in progress...
0:	learn: 0.6062579	total: 121ms	remaining: 9.55s
1:	learn: 0.5485793	total: 244ms	remaining: 9.53s
2:	learn: 0.4975906	total: 384ms	remaining: 9.86s
3:	learn: 0.4540137	total: 511ms	remaining: 9.71s
4:	learn: 0.4093827	total: 655ms	remaining: 9.83s
5:	learn: 0.3674007	total: 848ms	remaining: 10.5s
6:	learn: 0.3307088	total: 1.15s	remaining: 12s
7:	learn: 0.3064244	total: 1.39s	remaining: 12.5s
8:	learn: 0.2854983	total: 1.56s	remaining: 12.3s
9:	learn: 0.2633559	total: 1.7s	remaining: 11.9s
10:	learn: 0.2436962	total: 1.81s	remaining: 11.4s
11:	learn: 0.2245191	total: 1.95s	remaining: 11s
12:	learn: 0.2071420	total: 2.07s	remaining: 10.7s
13:	learn: 0.1930027	total: 2.21s	remaining: 10.4s
14:	learn: 0.1807335	total: 2.34s	remaining: 10.1s
15:	learn: 0.1704213	total: 2.48s	remaining: 9.93s
16:	learn: 0.1605655	total: 2.6s	remaining: 9.65s
17:	learn: 0.1501597	total: 2.76s	remaining: 9.5s
18:	learn: 0.1420498	total: 2.88s	remaining: 9.26s
19:	learn: 0.1352267	total

val roc_auc: severe_toxic 0.989828845699
0:	learn: 0.5872183	total: 251ms	remaining: 19.9s
1:	learn: 0.5089442	total: 368ms	remaining: 14.4s
2:	learn: 0.4346369	total: 499ms	remaining: 12.8s
3:	learn: 0.3738533	total: 617ms	remaining: 11.7s
4:	learn: 0.3232462	total: 757ms	remaining: 11.4s
5:	learn: 0.2918064	total: 898ms	remaining: 11.1s
6:	learn: 0.2514227	total: 1.04s	remaining: 10.9s
7:	learn: 0.2207865	total: 1.18s	remaining: 10.6s
8:	learn: 0.1952259	total: 1.48s	remaining: 11.7s
9:	learn: 0.1737097	total: 1.86s	remaining: 13s
10:	learn: 0.1544914	total: 2.06s	remaining: 12.9s
11:	learn: 0.1389755	total: 2.22s	remaining: 12.6s
12:	learn: 0.1255872	total: 2.36s	remaining: 12.2s
13:	learn: 0.1129335	total: 2.5s	remaining: 11.8s
14:	learn: 0.1043395	total: 2.64s	remaining: 11.4s
15:	learn: 0.0949708	total: 2.78s	remaining: 11.1s
16:	learn: 0.0882185	total: 2.9s	remaining: 10.8s
17:	learn: 0.0830139	total: 3.06s	remaining: 10.5s
18:	learn: 0.0781142	total: 3.19s	remaining: 10.2s
19:	

train roc_auc: threat 0.989415078067
val roc_auc: threat 0.985098880788
0:	learn: 0.6181472	total: 113ms	remaining: 8.96s
1:	learn: 0.5196754	total: 234ms	remaining: 9.14s
2:	learn: 0.4464898	total: 366ms	remaining: 9.4s
3:	learn: 0.3950884	total: 487ms	remaining: 9.26s
4:	learn: 0.3552741	total: 621ms	remaining: 9.32s
5:	learn: 0.3177875	total: 737ms	remaining: 9.09s
6:	learn: 0.2777308	total: 868ms	remaining: 9.05s
7:	learn: 0.2555782	total: 990ms	remaining: 8.91s
8:	learn: 0.2335995	total: 1.12s	remaining: 8.87s
9:	learn: 0.2102104	total: 1.24s	remaining: 8.67s
10:	learn: 0.1884865	total: 1.37s	remaining: 8.57s
11:	learn: 0.1717747	total: 1.49s	remaining: 8.43s
12:	learn: 0.1572594	total: 1.62s	remaining: 8.34s
13:	learn: 0.1457297	total: 1.74s	remaining: 8.22s
14:	learn: 0.1361310	total: 1.87s	remaining: 8.12s
15:	learn: 0.1286228	total: 1.99s	remaining: 7.97s
16:	learn: 0.1232558	total: 2.13s	remaining: 7.91s
17:	learn: 0.1176847	total: 2.27s	remaining: 7.83s
18:	learn: 0.1119253	

train roc_auc: identity_hate 0.988615260018
val roc_auc: identity_hate 0.981231082556
Fold 0 completed, val roc_auc 0.988086056682
Fold 1 in progress...
0:	learn: 0.6333197	total: 115ms	remaining: 9.11s
1:	learn: 0.5767068	total: 230ms	remaining: 8.96s
2:	learn: 0.5206200	total: 361ms	remaining: 9.27s
3:	learn: 0.4744301	total: 483ms	remaining: 9.18s
4:	learn: 0.4247850	total: 626ms	remaining: 9.38s
5:	learn: 0.3898618	total: 740ms	remaining: 9.12s
6:	learn: 0.3473522	total: 873ms	remaining: 9.1s
7:	learn: 0.3145409	total: 995ms	remaining: 8.95s
8:	learn: 0.2897162	total: 1.13s	remaining: 8.91s
9:	learn: 0.2604684	total: 1.25s	remaining: 8.77s
10:	learn: 0.2370862	total: 1.38s	remaining: 8.64s
11:	learn: 0.2181163	total: 1.5s	remaining: 8.49s
12:	learn: 0.2031346	total: 1.62s	remaining: 8.37s
13:	learn: 0.1923082	total: 1.74s	remaining: 8.19s
14:	learn: 0.1734761	total: 1.86s	remaining: 8.07s
15:	learn: 0.1626383	total: 1.98s	remaining: 7.93s
16:	learn: 0.1538070	total: 2.11s	remaining

78:	learn: 0.0194809	total: 16.9s	remaining: 214ms
79:	learn: 0.0193878	total: 17.2s	remaining: 0us
train roc_auc: severe_toxic 0.994333193091
val roc_auc: severe_toxic 0.990177059338
0:	learn: 0.6055586	total: 120ms	remaining: 9.46s
1:	learn: 0.5352940	total: 238ms	remaining: 9.3s
2:	learn: 0.4629950	total: 383ms	remaining: 9.84s
3:	learn: 0.4171953	total: 519ms	remaining: 9.86s
4:	learn: 0.3710437	total: 665ms	remaining: 9.98s
5:	learn: 0.3283865	total: 804ms	remaining: 9.92s
6:	learn: 0.2949102	total: 943ms	remaining: 9.83s
7:	learn: 0.2639317	total: 1.08s	remaining: 9.71s
8:	learn: 0.2329208	total: 1.22s	remaining: 9.63s
9:	learn: 0.2024143	total: 1.37s	remaining: 9.61s
10:	learn: 0.1897246	total: 1.53s	remaining: 9.58s
11:	learn: 0.1710363	total: 1.66s	remaining: 9.4s
12:	learn: 0.1550826	total: 1.8s	remaining: 9.28s
13:	learn: 0.1443546	total: 1.94s	remaining: 9.13s
14:	learn: 0.1316110	total: 2.09s	remaining: 9.05s
15:	learn: 0.1220091	total: 2.23s	remaining: 8.9s
16:	learn: 0.1

78:	learn: 0.0077107	total: 10.7s	remaining: 136ms
79:	learn: 0.0076510	total: 10.9s	remaining: 0us
train roc_auc: threat 0.987150878957
val roc_auc: threat 0.983417669442
0:	learn: 0.5930940	total: 114ms	remaining: 8.99s
1:	learn: 0.5160905	total: 233ms	remaining: 9.09s
2:	learn: 0.4498033	total: 358ms	remaining: 9.18s
3:	learn: 0.3976515	total: 480ms	remaining: 9.12s
4:	learn: 0.3600803	total: 618ms	remaining: 9.27s
5:	learn: 0.3260345	total: 750ms	remaining: 9.25s
6:	learn: 0.2943294	total: 883ms	remaining: 9.21s
7:	learn: 0.2592242	total: 1.01s	remaining: 9.08s
8:	learn: 0.2287103	total: 1.15s	remaining: 9.04s
9:	learn: 0.2133336	total: 1.27s	remaining: 8.87s
10:	learn: 0.1966494	total: 1.39s	remaining: 8.74s
11:	learn: 0.1817683	total: 1.51s	remaining: 8.55s
12:	learn: 0.1656933	total: 1.64s	remaining: 8.46s
13:	learn: 0.1512757	total: 1.75s	remaining: 8.28s
14:	learn: 0.1372825	total: 1.89s	remaining: 8.18s
15:	learn: 0.1289133	total: 2.02s	remaining: 8.09s
16:	learn: 0.1224019	t

77:	learn: 0.0197442	total: 11s	remaining: 282ms
78:	learn: 0.0195640	total: 11.1s	remaining: 141ms
79:	learn: 0.0194498	total: 11.3s	remaining: 0us
train roc_auc: identity_hate 0.992300576144
val roc_auc: identity_hate 0.980806789454
Fold 1 completed, val roc_auc 0.987481194615
Fold 2 in progress...
0:	learn: 0.6169081	total: 113ms	remaining: 8.95s
1:	learn: 0.5589476	total: 234ms	remaining: 9.12s
2:	learn: 0.5078883	total: 370ms	remaining: 9.51s
3:	learn: 0.4551379	total: 481ms	remaining: 9.13s
4:	learn: 0.4125212	total: 612ms	remaining: 9.18s
5:	learn: 0.3732151	total: 726ms	remaining: 8.95s
6:	learn: 0.3395503	total: 858ms	remaining: 8.95s
7:	learn: 0.3133659	total: 979ms	remaining: 8.81s
8:	learn: 0.2908704	total: 1.11s	remaining: 8.79s
9:	learn: 0.2616438	total: 1.24s	remaining: 8.65s
10:	learn: 0.2429972	total: 1.37s	remaining: 8.61s
11:	learn: 0.2255091	total: 1.49s	remaining: 8.46s
12:	learn: 0.2075314	total: 1.62s	remaining: 8.35s
13:	learn: 0.1937872	total: 1.73s	remaining: 

75:	learn: 0.0183903	total: 14.1s	remaining: 744ms
76:	learn: 0.0183281	total: 14.3s	remaining: 556ms
77:	learn: 0.0182988	total: 14.4s	remaining: 370ms
78:	learn: 0.0182227	total: 14.6s	remaining: 184ms
79:	learn: 0.0181332	total: 14.8s	remaining: 0us
train roc_auc: severe_toxic 0.995010485678
val roc_auc: severe_toxic 0.990102114932
0:	learn: 0.6088655	total: 202ms	remaining: 16s
1:	learn: 0.5314188	total: 320ms	remaining: 12.5s
2:	learn: 0.4718106	total: 435ms	remaining: 11.2s
3:	learn: 0.4236589	total: 605ms	remaining: 11.5s
4:	learn: 0.3762010	total: 818ms	remaining: 12.3s
5:	learn: 0.3317843	total: 946ms	remaining: 11.7s
6:	learn: 0.2910063	total: 1.07s	remaining: 11.1s
7:	learn: 0.2649250	total: 1.19s	remaining: 10.7s
8:	learn: 0.2369935	total: 1.31s	remaining: 10.3s
9:	learn: 0.2116891	total: 1.43s	remaining: 10s
10:	learn: 0.1885070	total: 1.54s	remaining: 9.67s
11:	learn: 0.1755818	total: 1.66s	remaining: 9.4s
12:	learn: 0.1633998	total: 1.78s	remaining: 9.18s
13:	learn: 0.15

74:	learn: 0.0077186	total: 9.81s	remaining: 654ms
75:	learn: 0.0076782	total: 9.94s	remaining: 523ms
76:	learn: 0.0076292	total: 10.1s	remaining: 393ms
77:	learn: 0.0075922	total: 10.2s	remaining: 262ms
78:	learn: 0.0075707	total: 10.4s	remaining: 131ms
79:	learn: 0.0075203	total: 10.5s	remaining: 0us
train roc_auc: threat 0.996186595991
val roc_auc: threat 0.979287190804
0:	learn: 0.6063107	total: 115ms	remaining: 9.08s
1:	learn: 0.5447830	total: 229ms	remaining: 8.93s
2:	learn: 0.4795524	total: 351ms	remaining: 9.01s
3:	learn: 0.4139375	total: 483ms	remaining: 9.18s
4:	learn: 0.3614914	total: 606ms	remaining: 9.09s
5:	learn: 0.3256973	total: 720ms	remaining: 8.88s
6:	learn: 0.2966208	total: 843ms	remaining: 8.79s
7:	learn: 0.2701296	total: 957ms	remaining: 8.62s
8:	learn: 0.2398980	total: 1.08s	remaining: 8.52s
9:	learn: 0.2095044	total: 1.2s	remaining: 8.39s
10:	learn: 0.1927840	total: 1.32s	remaining: 8.29s
11:	learn: 0.1754112	total: 1.45s	remaining: 8.19s
12:	learn: 0.1591019	to

74:	learn: 0.0207541	total: 9.9s	remaining: 660ms
75:	learn: 0.0206720	total: 10s	remaining: 529ms
76:	learn: 0.0205169	total: 10.2s	remaining: 396ms
77:	learn: 0.0204682	total: 10.3s	remaining: 264ms
78:	learn: 0.0204199	total: 10.4s	remaining: 132ms
79:	learn: 0.0203607	total: 10.6s	remaining: 0us
train roc_auc: identity_hate 0.989735502435
val roc_auc: identity_hate 0.981418749527
Fold 2 completed, val roc_auc 0.987018238009
Fold 3 in progress...
0:	learn: 0.6325505	total: 113ms	remaining: 8.95s
1:	learn: 0.5752074	total: 234ms	remaining: 9.12s
2:	learn: 0.5271083	total: 357ms	remaining: 9.15s
3:	learn: 0.4819558	total: 477ms	remaining: 9.06s
4:	learn: 0.4287763	total: 603ms	remaining: 9.05s
5:	learn: 0.3846201	total: 718ms	remaining: 8.86s
6:	learn: 0.3485069	total: 835ms	remaining: 8.7s
7:	learn: 0.3175132	total: 957ms	remaining: 8.61s
8:	learn: 0.2865026	total: 1.08s	remaining: 8.51s
9:	learn: 0.2610167	total: 1.2s	remaining: 8.4s
10:	learn: 0.2340752	total: 1.32s	remaining: 8.29

72:	learn: 0.0182986	total: 9.59s	remaining: 920ms
73:	learn: 0.0182618	total: 9.72s	remaining: 788ms
74:	learn: 0.0181431	total: 9.86s	remaining: 657ms
75:	learn: 0.0180965	total: 10s	remaining: 526ms
76:	learn: 0.0180494	total: 10.1s	remaining: 395ms
77:	learn: 0.0180205	total: 10.3s	remaining: 263ms
78:	learn: 0.0179589	total: 10.4s	remaining: 132ms
79:	learn: 0.0179062	total: 10.5s	remaining: 0us
train roc_auc: severe_toxic 0.99313520321
val roc_auc: severe_toxic 0.990252405215
0:	learn: 0.6083584	total: 124ms	remaining: 9.82s
1:	learn: 0.5258741	total: 245ms	remaining: 9.56s
2:	learn: 0.4521968	total: 367ms	remaining: 9.42s
3:	learn: 0.3842601	total: 482ms	remaining: 9.16s
4:	learn: 0.3418840	total: 604ms	remaining: 9.07s
5:	learn: 0.3071887	total: 726ms	remaining: 8.95s
6:	learn: 0.2771152	total: 842ms	remaining: 8.78s
7:	learn: 0.2479287	total: 955ms	remaining: 8.6s
8:	learn: 0.2220628	total: 1.08s	remaining: 8.49s
9:	learn: 0.1981415	total: 1.2s	remaining: 8.4s
10:	learn: 0.177

72:	learn: 0.0088595	total: 9.68s	remaining: 929ms
73:	learn: 0.0088017	total: 9.81s	remaining: 796ms
74:	learn: 0.0087351	total: 9.95s	remaining: 663ms
75:	learn: 0.0087106	total: 10.1s	remaining: 531ms
76:	learn: 0.0086592	total: 10.2s	remaining: 398ms
77:	learn: 0.0085964	total: 10.4s	remaining: 266ms
78:	learn: 0.0085376	total: 10.5s	remaining: 133ms
79:	learn: 0.0084748	total: 10.6s	remaining: 0us
train roc_auc: threat 0.990420390599
val roc_auc: threat 0.985038215535
0:	learn: 0.5993964	total: 114ms	remaining: 9.02s
1:	learn: 0.5207320	total: 228ms	remaining: 8.9s
2:	learn: 0.4522268	total: 351ms	remaining: 9s
3:	learn: 0.3935922	total: 475ms	remaining: 9.03s
4:	learn: 0.3477206	total: 599ms	remaining: 8.98s
5:	learn: 0.3065283	total: 712ms	remaining: 8.78s
6:	learn: 0.2708948	total: 834ms	remaining: 8.7s
7:	learn: 0.2428834	total: 955ms	remaining: 8.59s
8:	learn: 0.2162431	total: 1.08s	remaining: 8.5s
9:	learn: 0.1971802	total: 1.2s	remaining: 8.38s
10:	learn: 0.1767481	total: 1

72:	learn: 0.0193532	total: 9.53s	remaining: 914ms
73:	learn: 0.0192216	total: 9.66s	remaining: 784ms
74:	learn: 0.0190855	total: 9.81s	remaining: 654ms
75:	learn: 0.0189690	total: 9.94s	remaining: 523ms
76:	learn: 0.0188373	total: 10.1s	remaining: 393ms
77:	learn: 0.0187387	total: 10.2s	remaining: 262ms
78:	learn: 0.0186738	total: 10.4s	remaining: 131ms
79:	learn: 0.0185650	total: 10.5s	remaining: 0us
train roc_auc: identity_hate 0.985252764613
val roc_auc: identity_hate 0.982505381496
Fold 3 completed, val roc_auc 0.988355353835
Fold 4 in progress...
0:	learn: 0.6091084	total: 119ms	remaining: 9.44s
1:	learn: 0.5477750	total: 234ms	remaining: 9.12s
2:	learn: 0.4888802	total: 346ms	remaining: 8.89s
3:	learn: 0.4464909	total: 462ms	remaining: 8.77s
4:	learn: 0.4050400	total: 583ms	remaining: 8.75s
5:	learn: 0.3610640	total: 705ms	remaining: 8.69s
6:	learn: 0.3315345	total: 826ms	remaining: 8.62s
7:	learn: 0.3016250	total: 949ms	remaining: 8.54s
8:	learn: 0.2722680	total: 1.06s	remainin

70:	learn: 0.0210960	total: 9.29s	remaining: 1.18s
71:	learn: 0.0210031	total: 9.43s	remaining: 1.05s
72:	learn: 0.0209305	total: 9.56s	remaining: 917ms
73:	learn: 0.0208789	total: 9.7s	remaining: 786ms
74:	learn: 0.0207931	total: 9.83s	remaining: 656ms
75:	learn: 0.0207710	total: 9.96s	remaining: 524ms
76:	learn: 0.0207287	total: 10.1s	remaining: 393ms
77:	learn: 0.0206921	total: 10.2s	remaining: 262ms
78:	learn: 0.0206300	total: 10.4s	remaining: 131ms
79:	learn: 0.0205811	total: 10.5s	remaining: 0us
train roc_auc: severe_toxic 0.993163484213
val roc_auc: severe_toxic 0.990363719905
0:	learn: 0.5997557	total: 121ms	remaining: 9.53s
1:	learn: 0.5171395	total: 234ms	remaining: 9.13s
2:	learn: 0.4588533	total: 352ms	remaining: 9.03s
3:	learn: 0.3940870	total: 474ms	remaining: 9s
4:	learn: 0.3459164	total: 588ms	remaining: 8.82s
5:	learn: 0.3036556	total: 709ms	remaining: 8.75s
6:	learn: 0.2681682	total: 825ms	remaining: 8.6s
7:	learn: 0.2368674	total: 946ms	remaining: 8.51s
8:	learn: 0.2

70:	learn: 0.0072243	total: 9.35s	remaining: 1.18s
71:	learn: 0.0071806	total: 9.48s	remaining: 1.05s
72:	learn: 0.0071502	total: 9.61s	remaining: 922ms
73:	learn: 0.0071087	total: 9.75s	remaining: 791ms
74:	learn: 0.0070806	total: 9.88s	remaining: 659ms
75:	learn: 0.0070534	total: 10s	remaining: 527ms
76:	learn: 0.0070187	total: 10.2s	remaining: 396ms
77:	learn: 0.0069735	total: 10.3s	remaining: 264ms
78:	learn: 0.0069388	total: 10.4s	remaining: 132ms
79:	learn: 0.0069061	total: 10.6s	remaining: 0us
train roc_auc: threat 0.993483236839
val roc_auc: threat 0.983219867078
0:	learn: 0.6014331	total: 113ms	remaining: 8.96s
1:	learn: 0.5280281	total: 236ms	remaining: 9.19s
2:	learn: 0.4654101	total: 361ms	remaining: 9.26s
3:	learn: 0.4147745	total: 481ms	remaining: 9.14s
4:	learn: 0.3720910	total: 603ms	remaining: 9.05s
5:	learn: 0.3335232	total: 727ms	remaining: 8.97s
6:	learn: 0.2998758	total: 849ms	remaining: 8.86s
7:	learn: 0.2688240	total: 963ms	remaining: 8.67s
8:	learn: 0.2383337	to

70:	learn: 0.0194376	total: 9.34s	remaining: 1.18s
71:	learn: 0.0193381	total: 9.47s	remaining: 1.05s
72:	learn: 0.0192197	total: 9.61s	remaining: 922ms
73:	learn: 0.0190789	total: 9.75s	remaining: 790ms
74:	learn: 0.0189763	total: 9.89s	remaining: 659ms
75:	learn: 0.0188799	total: 10s	remaining: 527ms
76:	learn: 0.0187903	total: 10.2s	remaining: 396ms
77:	learn: 0.0187135	total: 10.3s	remaining: 264ms
78:	learn: 0.0185746	total: 10.4s	remaining: 132ms
79:	learn: 0.0185037	total: 10.5s	remaining: 0us
train roc_auc: identity_hate 0.99193199547
val roc_auc: identity_hate 0.982338116033
Fold 4 completed, val roc_auc 0.988076034346
Fold 5 in progress...
0:	learn: 0.6244057	total: 121ms	remaining: 9.52s
1:	learn: 0.5592147	total: 234ms	remaining: 9.13s
2:	learn: 0.5015404	total: 350ms	remaining: 8.97s
3:	learn: 0.4518911	total: 471ms	remaining: 8.96s
4:	learn: 0.4071445	total: 596ms	remaining: 8.93s
5:	learn: 0.3633073	total: 711ms	remaining: 8.77s
6:	learn: 0.3317946	total: 834ms	remaining

68:	learn: 0.0197573	total: 9.43s	remaining: 1.5s
69:	learn: 0.0196559	total: 9.57s	remaining: 1.37s
70:	learn: 0.0196105	total: 9.71s	remaining: 1.23s
71:	learn: 0.0195401	total: 9.85s	remaining: 1.09s
72:	learn: 0.0194922	total: 9.99s	remaining: 958ms
73:	learn: 0.0194410	total: 10.1s	remaining: 820ms
74:	learn: 0.0193695	total: 10.3s	remaining: 684ms
75:	learn: 0.0192571	total: 10.4s	remaining: 547ms
76:	learn: 0.0191803	total: 10.5s	remaining: 410ms
77:	learn: 0.0191478	total: 10.7s	remaining: 273ms
78:	learn: 0.0190500	total: 10.8s	remaining: 137ms
79:	learn: 0.0190050	total: 10.9s	remaining: 0us
train roc_auc: severe_toxic 0.994200303904
val roc_auc: severe_toxic 0.989992997701
0:	learn: 0.6100943	total: 115ms	remaining: 9.06s
1:	learn: 0.5425030	total: 231ms	remaining: 9.01s
2:	learn: 0.4732248	total: 364ms	remaining: 9.35s
3:	learn: 0.4211124	total: 478ms	remaining: 9.09s
4:	learn: 0.3725918	total: 610ms	remaining: 9.15s
5:	learn: 0.3311145	total: 726ms	remaining: 8.95s
6:	lear

68:	learn: 0.0080821	total: 9.44s	remaining: 1.5s
69:	learn: 0.0080114	total: 9.61s	remaining: 1.37s
70:	learn: 0.0079651	total: 9.75s	remaining: 1.24s
71:	learn: 0.0078960	total: 9.88s	remaining: 1.1s
72:	learn: 0.0078402	total: 10s	remaining: 962ms
73:	learn: 0.0078000	total: 10.2s	remaining: 824ms
74:	learn: 0.0077421	total: 10.3s	remaining: 687ms
75:	learn: 0.0077023	total: 10.4s	remaining: 549ms
76:	learn: 0.0076510	total: 10.6s	remaining: 412ms
77:	learn: 0.0075956	total: 10.7s	remaining: 275ms
78:	learn: 0.0075593	total: 10.9s	remaining: 138ms
79:	learn: 0.0075247	total: 11s	remaining: 0us
train roc_auc: threat 0.981158915238
val roc_auc: threat 0.981636018727
0:	learn: 0.6159852	total: 115ms	remaining: 9.09s
1:	learn: 0.5269578	total: 236ms	remaining: 9.21s
2:	learn: 0.4622426	total: 369ms	remaining: 9.48s
3:	learn: 0.4034252	total: 483ms	remaining: 9.19s
4:	learn: 0.3532260	total: 619ms	remaining: 9.28s
5:	learn: 0.3223251	total: 741ms	remaining: 9.14s
6:	learn: 0.2818742	tota

68:	learn: 0.0210526	total: 9.69s	remaining: 1.54s
69:	learn: 0.0209678	total: 9.83s	remaining: 1.4s
70:	learn: 0.0208554	total: 9.98s	remaining: 1.26s
71:	learn: 0.0207060	total: 10.1s	remaining: 1.13s
72:	learn: 0.0205938	total: 10.3s	remaining: 986ms
73:	learn: 0.0205001	total: 10.4s	remaining: 844ms
74:	learn: 0.0204269	total: 10.6s	remaining: 704ms
75:	learn: 0.0203114	total: 10.7s	remaining: 562ms
76:	learn: 0.0202377	total: 10.8s	remaining: 422ms
77:	learn: 0.0201774	total: 11s	remaining: 281ms
78:	learn: 0.0200852	total: 11.1s	remaining: 141ms
79:	learn: 0.0200325	total: 11.3s	remaining: 0us
train roc_auc: identity_hate 0.97735971012
val roc_auc: identity_hate 0.981716716447
Fold 5 completed, val roc_auc 0.987413954685
Fold 6 in progress...
0:	learn: 0.6203162	total: 121ms	remaining: 9.56s
1:	learn: 0.5561327	total: 234ms	remaining: 9.14s
2:	learn: 0.5013165	total: 361ms	remaining: 9.26s
3:	learn: 0.4586005	total: 488ms	remaining: 9.27s
4:	learn: 0.4162038	total: 624ms	remainin

66:	learn: 0.0184343	total: 9.13s	remaining: 1.77s
67:	learn: 0.0183528	total: 9.26s	remaining: 1.63s
68:	learn: 0.0183109	total: 9.41s	remaining: 1.5s
69:	learn: 0.0182621	total: 9.54s	remaining: 1.36s
70:	learn: 0.0182351	total: 9.68s	remaining: 1.23s
71:	learn: 0.0181642	total: 9.81s	remaining: 1.09s
72:	learn: 0.0180985	total: 9.96s	remaining: 955ms
73:	learn: 0.0180648	total: 10.1s	remaining: 818ms
74:	learn: 0.0179836	total: 10.2s	remaining: 682ms
75:	learn: 0.0179298	total: 10.4s	remaining: 546ms
76:	learn: 0.0178710	total: 10.5s	remaining: 410ms
77:	learn: 0.0177917	total: 10.6s	remaining: 273ms
78:	learn: 0.0177387	total: 10.8s	remaining: 137ms
79:	learn: 0.0177005	total: 10.9s	remaining: 0us
train roc_auc: severe_toxic 0.994505443833
val roc_auc: severe_toxic 0.989852532532
0:	learn: 0.6104688	total: 122ms	remaining: 9.61s
1:	learn: 0.5273039	total: 235ms	remaining: 9.16s
2:	learn: 0.4733090	total: 364ms	remaining: 9.34s
3:	learn: 0.4021207	total: 480ms	remaining: 9.13s
4:	le

66:	learn: 0.0059127	total: 9.12s	remaining: 1.77s
67:	learn: 0.0058101	total: 9.26s	remaining: 1.63s
68:	learn: 0.0057490	total: 9.41s	remaining: 1.5s
69:	learn: 0.0056525	total: 9.54s	remaining: 1.36s
70:	learn: 0.0056100	total: 9.68s	remaining: 1.23s
71:	learn: 0.0055339	total: 9.81s	remaining: 1.09s
72:	learn: 0.0055040	total: 9.95s	remaining: 955ms
73:	learn: 0.0054193	total: 10.1s	remaining: 818ms
74:	learn: 0.0053763	total: 10.2s	remaining: 683ms
75:	learn: 0.0053116	total: 10.4s	remaining: 546ms
76:	learn: 0.0052349	total: 10.5s	remaining: 410ms
77:	learn: 0.0051988	total: 10.6s	remaining: 273ms
78:	learn: 0.0051256	total: 10.8s	remaining: 137ms
79:	learn: 0.0050823	total: 10.9s	remaining: 0us
train roc_auc: threat 0.997502157303
val roc_auc: threat 0.984000991483
0:	learn: 0.6176082	total: 237ms	remaining: 18.8s
1:	learn: 0.5448999	total: 474ms	remaining: 18.5s
2:	learn: 0.4746790	total: 655ms	remaining: 16.8s
3:	learn: 0.4112785	total: 879ms	remaining: 16.7s
4:	learn: 0.35629

66:	learn: 0.0220846	total: 15s	remaining: 2.91s
67:	learn: 0.0219621	total: 15.1s	remaining: 2.67s
68:	learn: 0.0218972	total: 15.3s	remaining: 2.44s
69:	learn: 0.0218295	total: 15.4s	remaining: 2.2s
70:	learn: 0.0217558	total: 15.6s	remaining: 1.97s
71:	learn: 0.0216416	total: 15.7s	remaining: 1.75s
72:	learn: 0.0215600	total: 15.9s	remaining: 1.52s
73:	learn: 0.0214659	total: 16s	remaining: 1.3s
74:	learn: 0.0213450	total: 16.3s	remaining: 1.08s
75:	learn: 0.0212646	total: 16.5s	remaining: 867ms
76:	learn: 0.0211462	total: 16.6s	remaining: 648ms
77:	learn: 0.0210556	total: 16.8s	remaining: 431ms
78:	learn: 0.0209648	total: 16.9s	remaining: 214ms
79:	learn: 0.0209107	total: 17.2s	remaining: 0us
train roc_auc: identity_hate 0.988209419816
val roc_auc: identity_hate 0.982597089888
Fold 6 completed, val roc_auc 0.987721613338
Fold 7 in progress...
0:	learn: 0.6251109	total: 130ms	remaining: 10.3s
1:	learn: 0.5462133	total: 247ms	remaining: 9.64s
2:	learn: 0.4977206	total: 365ms	remainin

65:	learn: 0.0213720	total: 9.17s	remaining: 1.95s
66:	learn: 0.0212649	total: 9.32s	remaining: 1.81s
67:	learn: 0.0212074	total: 9.5s	remaining: 1.68s
68:	learn: 0.0211236	total: 9.72s	remaining: 1.55s
69:	learn: 0.0210635	total: 9.86s	remaining: 1.41s
70:	learn: 0.0210098	total: 10s	remaining: 1.27s
71:	learn: 0.0209268	total: 10.2s	remaining: 1.14s
72:	learn: 0.0208726	total: 10.5s	remaining: 1s
73:	learn: 0.0207957	total: 10.6s	remaining: 861ms
74:	learn: 0.0207077	total: 10.8s	remaining: 720ms
75:	learn: 0.0206361	total: 11.1s	remaining: 586ms
76:	learn: 0.0206038	total: 11.3s	remaining: 441ms
77:	learn: 0.0205067	total: 11.5s	remaining: 294ms
78:	learn: 0.0204362	total: 11.6s	remaining: 147ms
79:	learn: 0.0203745	total: 11.9s	remaining: 0us
train roc_auc: severe_toxic 0.994132633299
val roc_auc: severe_toxic 0.990142993009
0:	learn: 0.6205075	total: 153ms	remaining: 12.1s
1:	learn: 0.5537288	total: 287ms	remaining: 11.2s
2:	learn: 0.4966246	total: 412ms	remaining: 10.6s
3:	learn:

64:	learn: 0.0071275	total: 8.99s	remaining: 2.08s
65:	learn: 0.0070592	total: 9.13s	remaining: 1.94s
66:	learn: 0.0070127	total: 9.29s	remaining: 1.8s
67:	learn: 0.0069623	total: 9.43s	remaining: 1.66s
68:	learn: 0.0068654	total: 9.57s	remaining: 1.52s
69:	learn: 0.0068181	total: 9.7s	remaining: 1.39s
70:	learn: 0.0067694	total: 9.84s	remaining: 1.25s
71:	learn: 0.0066815	total: 9.97s	remaining: 1.11s
72:	learn: 0.0066339	total: 10.1s	remaining: 971ms
73:	learn: 0.0065782	total: 10.3s	remaining: 832ms
74:	learn: 0.0065113	total: 10.4s	remaining: 694ms
75:	learn: 0.0064277	total: 10.6s	remaining: 555ms
76:	learn: 0.0063503	total: 10.7s	remaining: 417ms
77:	learn: 0.0062913	total: 10.8s	remaining: 278ms
78:	learn: 0.0062216	total: 11s	remaining: 139ms
79:	learn: 0.0061534	total: 11.1s	remaining: 0us
train roc_auc: threat 0.991269205702
val roc_auc: threat 0.98021368317
0:	learn: 0.6196262	total: 122ms	remaining: 9.65s
1:	learn: 0.5398128	total: 243ms	remaining: 9.46s
2:	learn: 0.4716817

64:	learn: 0.0224524	total: 8.87s	remaining: 2.05s
65:	learn: 0.0223302	total: 9s	remaining: 1.91s
66:	learn: 0.0222538	total: 9.15s	remaining: 1.77s
67:	learn: 0.0222052	total: 9.28s	remaining: 1.64s
68:	learn: 0.0221159	total: 9.42s	remaining: 1.5s
69:	learn: 0.0220265	total: 9.55s	remaining: 1.36s
70:	learn: 0.0219511	total: 9.69s	remaining: 1.23s
71:	learn: 0.0218495	total: 9.83s	remaining: 1.09s
72:	learn: 0.0217998	total: 9.97s	remaining: 956ms
73:	learn: 0.0216541	total: 10.1s	remaining: 819ms
74:	learn: 0.0215751	total: 10.3s	remaining: 684ms
75:	learn: 0.0215313	total: 10.4s	remaining: 546ms
76:	learn: 0.0214796	total: 10.5s	remaining: 410ms
77:	learn: 0.0213968	total: 10.7s	remaining: 273ms
78:	learn: 0.0213217	total: 10.8s	remaining: 137ms
79:	learn: 0.0212406	total: 10.9s	remaining: 0us
train roc_auc: identity_hate 0.987866215912
val roc_auc: identity_hate 0.981672469249
Fold 7 completed, val roc_auc 0.987204079106
Fold 8 in progress...
0:	learn: 0.6233314	total: 113ms	rema

62:	learn: 0.0194831	total: 8.69s	remaining: 2.35s
63:	learn: 0.0193935	total: 8.83s	remaining: 2.21s
64:	learn: 0.0193432	total: 8.98s	remaining: 2.07s
65:	learn: 0.0192957	total: 9.11s	remaining: 1.93s
66:	learn: 0.0192749	total: 9.26s	remaining: 1.8s
67:	learn: 0.0192382	total: 9.39s	remaining: 1.66s
68:	learn: 0.0191992	total: 9.53s	remaining: 1.52s
69:	learn: 0.0191309	total: 9.66s	remaining: 1.38s
70:	learn: 0.0190520	total: 9.81s	remaining: 1.24s
71:	learn: 0.0189578	total: 9.95s	remaining: 1.1s
72:	learn: 0.0188655	total: 10.1s	remaining: 968ms
73:	learn: 0.0188178	total: 10.2s	remaining: 830ms
74:	learn: 0.0187384	total: 10.4s	remaining: 693ms
75:	learn: 0.0186626	total: 10.5s	remaining: 554ms
76:	learn: 0.0186130	total: 10.7s	remaining: 416ms
77:	learn: 0.0185635	total: 10.8s	remaining: 277ms
78:	learn: 0.0184941	total: 11s	remaining: 139ms
79:	learn: 0.0184037	total: 11.1s	remaining: 0us
train roc_auc: severe_toxic 0.99451527944
val roc_auc: severe_toxic 0.989922560692
0:	le

62:	learn: 0.0061606	total: 8.56s	remaining: 2.31s
63:	learn: 0.0060982	total: 8.69s	remaining: 2.17s
64:	learn: 0.0060332	total: 8.83s	remaining: 2.04s
65:	learn: 0.0059745	total: 8.97s	remaining: 1.9s
66:	learn: 0.0059365	total: 9.11s	remaining: 1.77s
67:	learn: 0.0058924	total: 9.25s	remaining: 1.63s
68:	learn: 0.0058404	total: 9.39s	remaining: 1.5s
69:	learn: 0.0058026	total: 9.52s	remaining: 1.36s
70:	learn: 0.0057488	total: 9.66s	remaining: 1.22s
71:	learn: 0.0057044	total: 9.79s	remaining: 1.09s
72:	learn: 0.0056529	total: 9.93s	remaining: 952ms
73:	learn: 0.0056210	total: 10.1s	remaining: 816ms
74:	learn: 0.0055892	total: 10.2s	remaining: 680ms
75:	learn: 0.0055391	total: 10.3s	remaining: 544ms
76:	learn: 0.0054956	total: 10.5s	remaining: 409ms
77:	learn: 0.0054477	total: 10.6s	remaining: 272ms
78:	learn: 0.0054144	total: 10.8s	remaining: 136ms
79:	learn: 0.0053728	total: 10.9s	remaining: 0us
train roc_auc: threat 0.992442738955
val roc_auc: threat 0.981112438665
0:	learn: 0.60

62:	learn: 0.0219960	total: 8.72s	remaining: 2.35s
63:	learn: 0.0218705	total: 8.86s	remaining: 2.21s
64:	learn: 0.0217631	total: 9s	remaining: 2.08s
65:	learn: 0.0216506	total: 9.13s	remaining: 1.94s
66:	learn: 0.0215837	total: 9.28s	remaining: 1.8s
67:	learn: 0.0214450	total: 9.41s	remaining: 1.66s
68:	learn: 0.0213409	total: 9.56s	remaining: 1.52s
69:	learn: 0.0212128	total: 9.7s	remaining: 1.39s
70:	learn: 0.0211547	total: 9.85s	remaining: 1.25s
71:	learn: 0.0209699	total: 9.98s	remaining: 1.11s
72:	learn: 0.0208755	total: 10.1s	remaining: 972ms
73:	learn: 0.0207468	total: 10.3s	remaining: 833ms
74:	learn: 0.0206807	total: 10.4s	remaining: 695ms
75:	learn: 0.0206282	total: 10.6s	remaining: 556ms
76:	learn: 0.0204669	total: 10.7s	remaining: 418ms
77:	learn: 0.0204190	total: 10.9s	remaining: 278ms
78:	learn: 0.0203745	total: 11s	remaining: 139ms
79:	learn: 0.0202954	total: 11.1s	remaining: 0us
train roc_auc: identity_hate 0.992204371262
val roc_auc: identity_hate 0.982343578177
Fold 

60:	learn: 0.0208684	total: 8.46s	remaining: 2.63s
61:	learn: 0.0208093	total: 8.6s	remaining: 2.5s
62:	learn: 0.0207344	total: 8.74s	remaining: 2.36s
63:	learn: 0.0206821	total: 8.87s	remaining: 2.22s
64:	learn: 0.0206395	total: 9.02s	remaining: 2.08s
65:	learn: 0.0205721	total: 9.15s	remaining: 1.94s
66:	learn: 0.0204947	total: 9.3s	remaining: 1.8s
67:	learn: 0.0204092	total: 9.45s	remaining: 1.67s
68:	learn: 0.0203659	total: 9.6s	remaining: 1.53s
69:	learn: 0.0202818	total: 9.73s	remaining: 1.39s
70:	learn: 0.0202450	total: 9.87s	remaining: 1.25s
71:	learn: 0.0202049	total: 10s	remaining: 1.11s
72:	learn: 0.0201634	total: 10.2s	remaining: 973ms
73:	learn: 0.0201434	total: 10.3s	remaining: 834ms
74:	learn: 0.0200900	total: 10.4s	remaining: 695ms
75:	learn: 0.0200225	total: 10.6s	remaining: 556ms
76:	learn: 0.0199494	total: 10.7s	remaining: 417ms
77:	learn: 0.0198968	total: 10.8s	remaining: 278ms
78:	learn: 0.0198264	total: 11s	remaining: 139ms
79:	learn: 0.0197808	total: 11.1s	remain

60:	learn: 0.0084647	total: 8.44s	remaining: 2.63s
61:	learn: 0.0083849	total: 8.57s	remaining: 2.49s
62:	learn: 0.0083353	total: 8.72s	remaining: 2.35s
63:	learn: 0.0082732	total: 8.85s	remaining: 2.21s
64:	learn: 0.0082057	total: 9.01s	remaining: 2.08s
65:	learn: 0.0081403	total: 9.14s	remaining: 1.94s
66:	learn: 0.0080419	total: 9.29s	remaining: 1.8s
67:	learn: 0.0079746	total: 9.43s	remaining: 1.66s
68:	learn: 0.0078631	total: 9.57s	remaining: 1.52s
69:	learn: 0.0078065	total: 9.71s	remaining: 1.39s
70:	learn: 0.0077751	total: 9.86s	remaining: 1.25s
71:	learn: 0.0077363	total: 10s	remaining: 1.11s
72:	learn: 0.0076682	total: 10.1s	remaining: 973ms
73:	learn: 0.0076245	total: 10.3s	remaining: 834ms
74:	learn: 0.0075852	total: 10.4s	remaining: 695ms
75:	learn: 0.0075521	total: 10.6s	remaining: 556ms
76:	learn: 0.0074730	total: 10.7s	remaining: 417ms
77:	learn: 0.0074047	total: 10.8s	remaining: 278ms
78:	learn: 0.0073497	total: 11s	remaining: 139ms
79:	learn: 0.0073004	total: 11.1s	re

60:	learn: 0.0202182	total: 8.48s	remaining: 2.64s
61:	learn: 0.0200409	total: 8.61s	remaining: 2.5s
62:	learn: 0.0198925	total: 8.78s	remaining: 2.37s
63:	learn: 0.0197888	total: 8.91s	remaining: 2.23s
64:	learn: 0.0196166	total: 9.05s	remaining: 2.09s
65:	learn: 0.0194873	total: 9.19s	remaining: 1.95s
66:	learn: 0.0193695	total: 9.34s	remaining: 1.81s
67:	learn: 0.0192727	total: 9.47s	remaining: 1.67s
68:	learn: 0.0191443	total: 9.63s	remaining: 1.53s
69:	learn: 0.0190652	total: 9.76s	remaining: 1.39s
70:	learn: 0.0189669	total: 9.9s	remaining: 1.25s
71:	learn: 0.0188518	total: 10s	remaining: 1.11s
72:	learn: 0.0187494	total: 10.2s	remaining: 977ms
73:	learn: 0.0186609	total: 10.3s	remaining: 837ms
74:	learn: 0.0185461	total: 10.5s	remaining: 698ms
75:	learn: 0.0184545	total: 10.6s	remaining: 559ms
76:	learn: 0.0183473	total: 10.8s	remaining: 419ms
77:	learn: 0.0182522	total: 10.9s	remaining: 280ms
78:	learn: 0.0181875	total: 11s	remaining: 140ms
79:	learn: 0.0180693	total: 11.2s	rem

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold


def fix_pred(probas):
    """Stack column 1 of several 2-D probability arrays side by side.

    Parameters
    ----------
    probas : iterable of numpy.ndarray
        Each array must be 2-D with at least two columns and share the same
        number of rows. Column 1 is presumably the positive-class
        probability from ``predict_proba`` — confirm against callers.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_rows, number_of_input_arrays).
    """
    # Build a list first: np.hstack rejects generators on recent NumPy
    # releases (and only warned about them on older ones).
    second_columns = [proba[:, 1][:, np.newaxis] for proba in probas]
    return np.hstack(second_columns)


# Imported once up front instead of on every fold iteration.
from catboost import CatBoostClassifier

# K-fold training of one binary CatBoost model per label on the stacked
# probability features. Produces:
#   roc_aucs   - mean validation ROC AUC per fold
#   train_pred - out-of-fold predictions for every training row
#   test_pred  - test predictions averaged over the folds
kfold = KFold(n_splits=args["n_splits"])
roc_aucs = []
train_pred = np.zeros(train_labels.shape)
test_pred = np.zeros((test_features.shape[0], train_labels.shape[1]))
# KFold.split yields (train_indices, test_indices) in that order; unpacking
# them the other way round fits each model on only 1/n_splits of the data.
for i, (train_index, val_index) in enumerate(kfold.split(train)):
    print("Fold %s in progress..." % i)
    train_features_ = train_features[train_index, :]
    train_labels_ = train_labels[train_index, :]
    val_features_ = train_features[val_index, :]
    val_labels_ = train_labels[val_index, :]

    val_roc_auc = []
    for j in range(len(LABEL_COLUMNS)):
        # Alternative tried: ExtraTreesClassifier(min_samples_leaf=50, n_jobs=-1)
        model = CatBoostClassifier(loss_function='Logloss', iterations=80)
        model.fit(train_features_, train_labels_[:, j])
        # predict_proba returns one column per class; column 1 is the score.
        train_pred_ = model.predict_proba(train_features_)[:, 1]
        val_pred_ = model.predict_proba(val_features_)[:, 1]
        print("train roc_auc:", LABEL_COLUMNS[j], roc_auc_score(train_labels_[:, j], train_pred_))
        val_roc_auc.append(roc_auc_score(val_labels_[:, j], val_pred_))
        print("val roc_auc:", LABEL_COLUMNS[j], val_roc_auc[-1])

        # Each training row is in exactly one validation fold, so it is
        # written once here and needs no averaging afterwards.
        train_pred[val_index, j] = val_pred_
        test_pred[:, j] += model.predict_proba(test_features)[:, 1]
    roc_aucs.append(np.mean(val_roc_auc))
    print("Fold %s completed, val roc_auc %s" % (i, roc_aucs[-1]))

# Only the test predictions were summed over folds; turn the sum into a mean.
test_pred = test_pred / args["n_splits"]


In [31]:
# Attach the fold-averaged test predictions to the test table, one column per
# label, in LABEL_COLUMNS order.
for i, label in enumerate(LABEL_COLUMNS):
    test[label] = test_pred[:, i]

# Use an explicit timestamp format: str(datetime.now()) contains a space and
# colons, which are awkward in shells and invalid in Windows file names.
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
output = "output/%s-fasttext-baseline-submission.txt" % timestamp
cols = ["id"] + LABEL_COLUMNS
# The default quote character is used: with quotechar=" " any field that needed
# quoting would have been wrapped in spaces, which CSV readers cannot undo.
test[cols].to_csv(output, sep=",", index=False, float_format="%.16f")