In [49]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score, roc_auc_score
from sklearn.metrics import confusion_matrix

from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier

import warnings
from copy import deepcopy
import gc
from collections import Counter

%matplotlib inline

In [52]:
from tqdm import tqdm

In [None]:
# Move the kernel's working directory one level up; the later cells use
# relative paths such as 'data/...' and 'submission.csv'.
# NOTE(review): not idempotent - every re-run of this cell climbs one more level.
%cd ../

In [64]:
# Columns that are treated as categorical throughout the notebook.
cat_features = [
    "type",
    "priority",
    "is_privatecategory",
    "class",
    "is_in_yandex",
    "mailctg",
    "directctg",
    "mailtype",
]


def to_cat_features(df):
    """Convert every column listed in ``cat_features`` to its ``str`` form.

    The conversion is done in place on ``df`` (each listed column is
    overwritten with ``str(value)`` for every cell, so missing values become
    the text ``'nan'``/``'None'``); the same frame object is returned so the
    call can be chained.
    """
    for column in cat_features:
        df[column] = df[column].map(str)
    return df
        
# Read the prepared train and test feature tables and stringify their
# categorical columns; the training target table is read unchanged.
df = pd.read_csv('data/train_prepared.csv').pipe(to_cat_features)
test_df = pd.read_csv('data/test_df_prepared.csv').pipe(to_cat_features)
y = pd.read_csv('data/train_y_prepared.csv')

In [65]:
# Row identifiers for the test set; the `id` column is used later as the
# `id` column of the submission frame.
test_ids = pd.read_csv('data/test_ids.csv')

In [66]:
def score(pred, y_true):
    """Blend recall and balanced accuracy of hard binary predictions.

    The value is ``0.1 * TPR + 0.9 * (1 + TPR - FPR) / 2`` where TPR is the
    recall on the positive class and FPR is the false-positive rate.

    Parameters
    ----------
    pred : array-like
        Predicted labels. Entries equal to 1 (``True`` included) count as
        positive, everything else as negative.
    y_true : array-like
        Ground-truth labels with the same convention and the same number of
        entries as ``pred``. Multi-dimensional input (for example a
        one-column DataFrame) is flattened.

    Returns
    -------
    float
        The blended score, or ``nan`` when ``y_true`` has no positive or no
        negative entries, because TPR or FPR is then undefined.

    Raises
    ------
    ValueError
        If ``pred`` and ``y_true`` do not contain the same number of entries.
    """
    actual = np.ravel(np.asarray(y_true)) == 1
    guessed = np.ravel(np.asarray(pred)) == 1
    if actual.shape != guessed.shape:
        raise ValueError(
            f"pred and y_true must have the same length, got {guessed.size} and {actual.size}"
        )

    # Count the four outcomes with boolean masks. Unpacking
    # confusion_matrix(...).ravel() into four names breaks when only one class
    # is present, because the matrix is then 1x1 instead of 2x2.
    tp = int(np.sum(guessed & actual))
    fp = int(np.sum(guessed & ~actual))
    fn = int(np.sum(~guessed & actual))
    tn = int(np.sum(~guessed & ~actual))

    # A class missing from y_true makes its rate 0/0; report that explicitly
    # instead of relying on a numpy division warning.
    if tp + fn == 0 or fp + tn == 0:
        return float("nan")

    tpr = tp / (tp + fn)
    fpr = fp / (fp + tn)
    balanced = (1 + tpr - fpr) / 2
    return 0.1 * tpr + 0.9 * balanced
    

def predict(models, X):
    """Average the second-column probabilities of several fitted models.

    Parameters
    ----------
    models : iterable
        Fitted estimators exposing ``predict_proba(X)`` that returns a 2-D
        array. Column 1 is used, i.e. the second class (the positive class
        if the models are binary classifiers - TODO confirm).
    X : object
        Feature matrix, passed unchanged to every model.

    Returns
    -------
    numpy.ndarray
        Element-wise mean over all models of ``predict_proba(X)[:, 1]``.

    Raises
    ------
    ValueError
        If ``models`` is empty. Averaging nothing would otherwise yield a
        silent ``nan`` that propagates into downstream results.
    """
    # Materialise once so generators are supported and emptiness can be checked.
    models = list(models)
    if not models:
        raise ValueError("predict() requires at least one fitted model")

    per_model = [model.predict_proba(X)[:, 1] for model in tqdm(models)]
    return np.array(per_model).mean(0)

In [76]:
# Ensemble members; the training cell appends one fitted model per seed.
# Re-run this cell before retraining to avoid accumulating duplicate models.
models = []

In [78]:
# Column names passed to CatBoostClassifier(cat_features=...) in the training cell.
# NOTE(review): this repeats the list already defined next to to_cat_features
# earlier in the notebook; two copies can drift apart - consider keeping one.
cat_features = ["type", "priority", "is_privatecategory", "class", "is_in_yandex", "mailctg", "directctg", "mailtype"]

In [79]:
#!c1.32
# Train one CatBoost model per seed. The seed controls both the 80/20
# train/validation split and the model's own randomness. Every fitted model is
# appended to the shared `models` list and then scored on its hold-out part
# using a 0.41 probability cut-off.
for state in range(1):
    X_train, X_val, y_train, y_val = train_test_split(
        df,
        y,
        test_size=0.2,
        random_state=state,
    )
    model = CatBoostClassifier(
        iterations=5000,
        max_depth=8,
        auto_class_weights="Balanced",
        cat_features=cat_features,
        random_state=state,
        verbose=200,
    )
    model.fit(X_train, y_train)
    models.append(model)

    val_labels = predict([model], X_val) > 0.41
    print(f'MODEL {state} score: {score(val_labels, y_val)}')

Learning rate set to 0.087876
0:	learn: 0.5993345	total: 1.07s	remaining: 1h 29m 37s
200:	learn: 0.2658352	total: 2m 21s	remaining: 56m 23s
400:	learn: 0.2605043	total: 4m 51s	remaining: 55m 39s
600:	learn: 0.2568905	total: 7m 22s	remaining: 53m 59s
800:	learn: 0.2538383	total: 9m 54s	remaining: 51m 57s
1000:	learn: 0.2511654	total: 12m 25s	remaining: 49m 38s
1200:	learn: 0.2485577	total: 14m 57s	remaining: 47m 20s
1400:	learn: 0.2461018	total: 17m 30s	remaining: 44m 57s
1600:	learn: 0.2440183	total: 20m	remaining: 42m 29s
1800:	learn: 0.2419682	total: 22m 31s	remaining: 40m 1s
2000:	learn: 0.2399645	total: 25m 3s	remaining: 37m 33s
2200:	learn: 0.2381489	total: 27m 35s	remaining: 35m 5s
2400:	learn: 0.2364467	total: 30m 7s	remaining: 32m 36s
2600:	learn: 0.2348199	total: 32m 40s	remaining: 30m 8s
2800:	learn: 0.2332478	total: 35m 13s	remaining: 27m 39s
3000:	learn: 0.2318066	total: 37m 45s	remaining: 25m 8s
3200:	learn: 0.2303742	total: 40m 16s	remaining: 22m 37s
3400:	learn: 0.228883

100%|██████████| 1/1 [00:02<00:00,  2.88s/it]


MODEL 0 score: 0.8872347168501412


In [80]:
# Averaged second-column probabilities from every model in `models`, one value per test row.
test_preds = predict(models, test_df)

100%|██████████| 1/1 [00:34<00:00, 34.43s/it]


In [81]:
# Pair each test id with its averaged prediction.
# NOTE(review): `label` holds raw probabilities here, whereas validation scoring
# thresholds them at 0.41 - confirm which form the submission format expects.
submission = pd.DataFrame({'id': test_ids.id, 'label': test_preds})

In [82]:
# Write the submission to the current working directory, without the DataFrame index column.
submission.to_csv("submission.csv", index=False)

In [83]:
# Sanity check: mean predicted probability over the whole test set.
test_preds.mean()

0.16262175058022063