In [18]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score, roc_auc_score
from sklearn.metrics import confusion_matrix

from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier

import warnings
from copy import deepcopy
import gc
from collections import Counter

%matplotlib inline

In [31]:
from tqdm import tqdm

In [2]:
# Switch the kernel's working directory to the parent folder; the relative
# 'data/...' paths read below are presumably resolved from there.
# Not idempotent: every re-run of this cell climbs one more directory level.
%cd ../

In [21]:
# Columns that are handed to the model as categorical features.
cat_features = [
    "type",
    "priority",
    "is_privatecategory",
    "class",
    "is_in_yandex",
    "mailctg",
    "directctg",
    "mailtype",
]


def to_cat_features(df):
    """Convert every column listed in ``cat_features`` to Python strings.

    The conversion is written back into ``df`` itself, and that same object is
    returned so the call can be wrapped directly around a loader.
    """
    for column in cat_features:
        # str() is applied element by element, so a float NaN becomes the text 'nan'.
        df[column] = df[column].map(str)
    return df
        
# Read the prepared train / test feature tables and stringify their
# categorical columns as part of the load.
df = pd.read_csv('data/train_prepared.csv').pipe(to_cat_features)
test_df = pd.read_csv('data/test_df_prepared.csv').pipe(to_cat_features)
# Targets for the training rows are stored in their own file
# (presumably row-aligned with `df` -- verify against the preparation step).
y = pd.read_csv('data/train_y_prepared.csv')

In [24]:
# Identifier table for the test rows; its `id` column is copied into the
# submission file further down.
test_ids = pd.read_csv('data/test_ids.csv')

In [22]:
def score(pred, y_true):
    """Blend recall and single-threshold ROC AUC for hard binary predictions.

    The result is ``0.1 * recall + 0.9 * (1 + TPR - FPR) / 2``, where the
    second term is the area under the ROC curve that has a single operating
    point (equivalently, balanced accuracy).

    Parameters
    ----------
    pred : array-like
        Predicted labels, 0/1 (or bool).
    y_true : array-like
        Ground-truth labels, 0/1.

    Returns
    -------
    float
        The weighted score. If ``y_true`` contains no positives or no
        negatives, the corresponding rate is 0/0 and the result is NaN.
    """
    # Pin the label set: without it, inputs in which only one class occurs
    # yield a 1x1 matrix and the four-way unpack below raises ValueError.
    tn, fp, fn, tp = confusion_matrix(y_true, pred, labels=[0, 1]).ravel()
    tpr = tp / (tp + fn)  # true-positive rate, i.e. recall
    fpr = fp / (fp + tn)  # false-positive rate
    single_point_auc = (1 + tpr - fpr) / 2
    return 0.1 * tpr + 0.9 * single_point_auc
    

def predict(models, X, threshold=0.5):
    """Average the models' class-1 probabilities and binarize the result.

    Parameters
    ----------
    models : iterable
        Fitted classifiers exposing ``predict_proba``.
    X : feature table passed unchanged to every model.
    threshold : float
        A row is flagged when its mean class-1 probability is strictly
        greater than this value.

    Returns
    -------
    Boolean array with one entry per row of ``X``.
    """
    # Column 1 of predict_proba is taken from each model; tqdm shows progress.
    positive_probs = [member.predict_proba(X)[:, 1] for member in tqdm(models)]
    ensemble_mean = np.array(positive_probs).mean(0)
    return ensemble_mean > threshold

In [32]:
# Container for the fitted ensemble members; the training loop appends to it.
models = []

In [33]:
#!c1.32
# Train one CatBoost model per seed. The seed drives both the 80/20
# train/validation split and the model's own randomness, so every ensemble
# member sees a different subset of the training data.
#
# The list is reset here so that re-running this cell rebuilds the ensemble
# instead of appending ten more models to whatever a previous run left behind.
models = []
for state in range(0, 10):
    X_train, X_val, y_train, y_val = train_test_split(df, y, test_size=0.2, random_state=state)
    model = CatBoostClassifier(
        random_state=state,
        max_depth=8,
        verbose=200,
        iterations=2000,
        auto_class_weights="Balanced",
        cat_features=cat_features,
    )
    model.fit(X_train, y_train)
    models.append(model)
    # Hold-out score of this single model at predict()'s default threshold.
    print(f'MODEL {state} score: {score(predict([model], X_val), y_val)}')

Learning rate set to 0.203601
0:	learn: 0.5058721	total: 1.02s	remaining: 34m 1s
200:	learn: 0.2626192	total: 2m 58s	remaining: 26m 40s
400:	learn: 0.2557737	total: 5m 57s	remaining: 23m 47s
600:	learn: 0.2508552	total: 8m 55s	remaining: 20m 45s
800:	learn: 0.2461723	total: 11m 58s	remaining: 17m 56s
1000:	learn: 0.2425140	total: 14m 56s	remaining: 14m 54s
1200:	learn: 0.2385799	total: 18m 1s	remaining: 11m 59s
1400:	learn: 0.2352684	total: 21m 4s	remaining: 9m
1600:	learn: 0.2320549	total: 24m 6s	remaining: 6m
1800:	learn: 0.2291993	total: 27m 11s	remaining: 3m
1999:	learn: 0.2267698	total: 30m 14s	remaining: 0us


100%|██████████| 1/1 [00:04<00:00,  4.50s/it]


MODEL 0 score: 0.8800285685116038
Learning rate set to 0.203601
0:	learn: 0.5060251	total: 1.09s	remaining: 36m 17s
200:	learn: 0.2634472	total: 2m 57s	remaining: 26m 30s
400:	learn: 0.2563013	total: 5m 59s	remaining: 23m 54s
600:	learn: 0.2508113	total: 9m 3s	remaining: 21m 4s
800:	learn: 0.2461862	total: 12m 5s	remaining: 18m 6s
1000:	learn: 0.2423104	total: 15m 8s	remaining: 15m 6s
1200:	learn: 0.2389621	total: 18m 10s	remaining: 12m 5s
1400:	learn: 0.2356340	total: 21m 12s	remaining: 9m 3s
1600:	learn: 0.2324825	total: 24m 12s	remaining: 6m 2s
1800:	learn: 0.2295577	total: 27m 15s	remaining: 3m
1999:	learn: 0.2268689	total: 30m 18s	remaining: 0us


100%|██████████| 1/1 [00:05<00:00,  5.02s/it]


MODEL 1 score: 0.8821934647840843
Learning rate set to 0.203601
0:	learn: 0.5156549	total: 1.05s	remaining: 34m 55s
200:	learn: 0.2629709	total: 2m 58s	remaining: 26m 39s
400:	learn: 0.2561551	total: 6m 3s	remaining: 24m 8s
600:	learn: 0.2507522	total: 9m 7s	remaining: 21m 14s
800:	learn: 0.2465767	total: 12m 10s	remaining: 18m 13s
1000:	learn: 0.2421846	total: 15m 13s	remaining: 15m 11s
1200:	learn: 0.2386007	total: 18m 13s	remaining: 12m 7s
1400:	learn: 0.2352184	total: 21m 16s	remaining: 9m 5s
1600:	learn: 0.2321830	total: 24m 18s	remaining: 6m 3s
1800:	learn: 0.2294298	total: 27m 21s	remaining: 3m 1s
1999:	learn: 0.2267854	total: 30m 24s	remaining: 0us


100%|██████████| 1/1 [00:04<00:00,  4.84s/it]


MODEL 2 score: 0.880273491099677
Learning rate set to 0.203601
0:	learn: 0.5031985	total: 1.01s	remaining: 33m 35s
200:	learn: 0.2635886	total: 2m 59s	remaining: 26m 46s
400:	learn: 0.2567511	total: 6m 1s	remaining: 24m 3s
600:	learn: 0.2514082	total: 9m 4s	remaining: 21m 7s
800:	learn: 0.2466996	total: 12m 7s	remaining: 18m 8s
1000:	learn: 0.2427439	total: 15m 10s	remaining: 15m 8s
1200:	learn: 0.2390649	total: 18m 13s	remaining: 12m 7s
1400:	learn: 0.2356883	total: 21m 17s	remaining: 9m 6s
1600:	learn: 0.2326224	total: 24m 21s	remaining: 6m 4s
1800:	learn: 0.2296472	total: 27m 24s	remaining: 3m 1s
1999:	learn: 0.2268622	total: 30m 26s	remaining: 0us


100%|██████████| 1/1 [00:04<00:00,  4.69s/it]


MODEL 3 score: 0.8815842384828032
Learning rate set to 0.203601
0:	learn: 0.5044193	total: 1.08s	remaining: 35m 56s
200:	learn: 0.2635488	total: 2m 57s	remaining: 26m 24s
400:	learn: 0.2565223	total: 5m 59s	remaining: 23m 54s
600:	learn: 0.2511236	total: 9m 2s	remaining: 21m 2s
800:	learn: 0.2463340	total: 12m 5s	remaining: 18m 5s
1000:	learn: 0.2421857	total: 15m 9s	remaining: 15m 7s
1200:	learn: 0.2384157	total: 18m 14s	remaining: 12m 8s
1400:	learn: 0.2349971	total: 21m 21s	remaining: 9m 7s
1600:	learn: 0.2319159	total: 24m 26s	remaining: 6m 5s
1800:	learn: 0.2291178	total: 27m 27s	remaining: 3m 2s
1999:	learn: 0.2265896	total: 30m 30s	remaining: 0us


100%|██████████| 1/1 [00:04<00:00,  4.73s/it]


MODEL 4 score: 0.8796945486235345
Learning rate set to 0.203601
0:	learn: 0.5077758	total: 1.04s	remaining: 34m 44s
200:	learn: 0.2634102	total: 2m 58s	remaining: 26m 37s
400:	learn: 0.2563555	total: 5m 58s	remaining: 23m 48s
600:	learn: 0.2506082	total: 9m	remaining: 20m 57s
800:	learn: 0.2457419	total: 12m 1s	remaining: 18m
1000:	learn: 0.2416947	total: 15m 4s	remaining: 15m 3s
1200:	learn: 0.2382177	total: 18m 7s	remaining: 12m 3s
1400:	learn: 0.2351475	total: 21m 10s	remaining: 9m 3s
1600:	learn: 0.2321686	total: 24m 13s	remaining: 6m 2s
1800:	learn: 0.2293287	total: 27m 15s	remaining: 3m
1999:	learn: 0.2266814	total: 30m 20s	remaining: 0us


100%|██████████| 1/1 [00:04<00:00,  4.71s/it]


MODEL 5 score: 0.8797338086673869
Learning rate set to 0.203601
0:	learn: 0.5143228	total: 1.13s	remaining: 37m 40s
200:	learn: 0.2627467	total: 2m 58s	remaining: 26m 41s
400:	learn: 0.2556657	total: 6m 4s	remaining: 24m 13s
600:	learn: 0.2502051	total: 9m 8s	remaining: 21m 16s
800:	learn: 0.2456188	total: 12m 11s	remaining: 18m 15s
1000:	learn: 0.2416186	total: 15m 14s	remaining: 15m 12s
1200:	learn: 0.2376984	total: 18m 17s	remaining: 12m 9s
1400:	learn: 0.2345110	total: 21m 20s	remaining: 9m 7s
1600:	learn: 0.2313337	total: 24m 23s	remaining: 6m 4s
1800:	learn: 0.2284735	total: 27m 27s	remaining: 3m 2s
1999:	learn: 0.2260154	total: 30m 29s	remaining: 0us


100%|██████████| 1/1 [00:05<00:00,  5.05s/it]


MODEL 6 score: 0.8787479216299292
Learning rate set to 0.203601
0:	learn: 0.5151665	total: 1.04s	remaining: 34m 31s
200:	learn: 0.2627427	total: 2m 57s	remaining: 26m 27s
400:	learn: 0.2554107	total: 6m 1s	remaining: 24m
600:	learn: 0.2500660	total: 9m 6s	remaining: 21m 11s
800:	learn: 0.2455827	total: 12m 9s	remaining: 18m 11s
1000:	learn: 0.2413687	total: 15m 14s	remaining: 15m 12s
1200:	learn: 0.2378064	total: 18m 17s	remaining: 12m 10s
1400:	learn: 0.2344652	total: 21m 20s	remaining: 9m 7s
1600:	learn: 0.2315729	total: 24m 25s	remaining: 6m 5s
1800:	learn: 0.2287015	total: 27m 29s	remaining: 3m 2s
1999:	learn: 0.2261968	total: 30m 32s	remaining: 0us


100%|██████████| 1/1 [00:04<00:00,  4.94s/it]


MODEL 7 score: 0.8783892167440385
Learning rate set to 0.203601
0:	learn: 0.5048246	total: 1.02s	remaining: 33m 52s
200:	learn: 0.2627404	total: 2m 59s	remaining: 26m 46s
400:	learn: 0.2557730	total: 6m 2s	remaining: 24m 5s
600:	learn: 0.2506454	total: 9m 5s	remaining: 21m 9s
800:	learn: 0.2460070	total: 12m 9s	remaining: 18m 12s
1000:	learn: 0.2418513	total: 15m 15s	remaining: 15m 13s
1200:	learn: 0.2385438	total: 18m 19s	remaining: 12m 11s
1400:	learn: 0.2352600	total: 21m 26s	remaining: 9m 9s
1600:	learn: 0.2320909	total: 24m 29s	remaining: 6m 6s
1800:	learn: 0.2292739	total: 27m 32s	remaining: 3m 2s
1999:	learn: 0.2265579	total: 30m 32s	remaining: 0us


100%|██████████| 1/1 [00:04<00:00,  4.81s/it]


MODEL 8 score: 0.8786705070941863
Learning rate set to 0.203601
0:	learn: 0.5175608	total: 998ms	remaining: 33m 14s
200:	learn: 0.2626942	total: 3m	remaining: 26m 54s
400:	learn: 0.2556921	total: 6m 3s	remaining: 24m 10s
600:	learn: 0.2506298	total: 9m 8s	remaining: 21m 16s
800:	learn: 0.2462032	total: 12m 12s	remaining: 18m 16s
1000:	learn: 0.2421990	total: 15m 17s	remaining: 15m 15s
1200:	learn: 0.2384633	total: 18m 22s	remaining: 12m 13s
1400:	learn: 0.2351628	total: 21m 24s	remaining: 9m 9s
1600:	learn: 0.2320707	total: 24m 29s	remaining: 6m 6s
1800:	learn: 0.2292808	total: 27m 33s	remaining: 3m 2s
1999:	learn: 0.2267517	total: 30m 38s	remaining: 0us


100%|██████████| 1/1 [00:04<00:00,  4.99s/it]


MODEL 9 score: 0.8790399312798443


In [34]:
# Ensemble prediction on the test set, converted from bool to 0/1 integers.
# NOTE(review): 0.41 differs from the default 0.5 used for the per-model
# validation scores; how it was chosen is not shown here -- confirm.
test_preds = predict(models, test_df, threshold=0.41).astype(int)

100%|██████████| 10/10 [04:54<00:00, 29.45s/it]


In [35]:
# Pair each test identifier with its predicted 0/1 label.
submission = pd.DataFrame({'id': test_ids['id'], 'label': test_preds})

In [36]:
# Write the submission to the current working directory, without the index column.
submission.to_csv("submission.csv", index=False)

In [37]:
# Sanity check: test_preds holds 0/1 integers, so the mean is the share of test rows predicted positive.
test_preds.mean()

0.18442375