In [20]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score, roc_auc_score
from sklearn.metrics import confusion_matrix

from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier

import warnings
from copy import deepcopy
import gc
from collections import Counter

%matplotlib inline

In [19]:
%pip install seaborn

Defaulting to user installation because normal site-packages is not writeable
Collecting seaborn
  Downloading seaborn-0.12.1-py3-none-any.whl (288 kB)
     |████████████████████████████████| 288 kB 3.2 MB/s            
Installing collected packages: seaborn
Successfully installed seaborn-0.12.1
You should consider upgrading via the '/usr/local/bin/python3 -m pip install --upgrade pip' command.


In [None]:
# Move the working directory one level up; presumably so that the relative
# "data/..." paths used by the following cells resolve.
# NOTE: the path is relative, so every re-run of this cell climbs one more
# level — execute it once per kernel session.
%cd ../

In [None]:
# Read the training table (it carries the `label` column) and the test table.
df = pd.read_csv("data/train.csv")
test_data = pd.read_csv("data/test.csv")

# Keep an independent copy of the test ids: the prepared test features no
# longer contain the "id" column, but the submission file needs it.
test_ids = test_data["id"].copy(deep=True)

In [None]:
# Preview the first three rows of the training table.
df.head(3)

In [None]:
# Preview the first three rows of the test table.
test_data.head(3)

In [None]:
# The ten most frequent operator indices, most common first.
# Values of `index_oper` equal to a single space are mapped to '0' before the
# float -> int conversion.
# NOTE(review): the frequencies are taken from the test table, not the
# training table — confirm this is intentional.
top_operators = [
    oper
    for oper, _freq in Counter(
        test_data['index_oper'].replace(' ', '0').astype(float).astype(int)
    ).most_common(10)
]

In [None]:
# Separate the target from the features, then hold out 20% of the rows for
# validation. The seed is fixed so the split is reproducible.
y = df["label"]
X = df.drop(columns="label")
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Columns that are handed to the model as categorical features.
cat_features = ["type", "priority", "is_privatecategory", "class", "is_in_yandex", "mailctg", "directctg", "mailtype"]


def prepare_data(df, operators=None):
    """Build the model-ready feature frame from a raw train/test frame.

    The input frame is left untouched; a new frame is returned. The steps are:

    * drop the columns that are not used as features
      ("oper_type + oper_attr", "id", "index_oper", "name_mfi", "mailrank");
    * append one boolean ``is_<operator>_operator`` column per entry of
      ``operators``, flagging rows whose ``index_oper`` equals that operator;
    * turn ``is_return`` into a boolean (``True`` where the value is ``'Y'``);
    * cast every column listed in ``cat_features`` to ``str``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing the dropped columns, ``is_return`` and every
        column named in ``cat_features``.
    operators : iterable of int, optional
        Operator indices to create indicator columns for. Defaults to the
        notebook-level ``top_operators`` list.

    Returns
    -------
    pandas.DataFrame
        The prepared features: the remaining original columns in their
        original order, followed by the operator indicator columns.

    Raises
    ------
    KeyError
        If a required column is missing — for example when the function is
        applied to a frame that has already been prepared.
    """
    if operators is None:
        operators = top_operators

    # Convert the operator index once instead of once per operator. Values
    # equal to a single space are treated as operator 0.
    oper_index = df["index_oper"].replace(" ", "0").astype(float).astype(int)

    # drop() without inplace returns a new frame, so the caller's frame (which
    # may be a slice produced by train_test_split) is never modified.
    prepared = df.drop(
        columns=["oper_type + oper_attr", "id", "index_oper", "name_mfi", "mailrank"]
    )

    for operator in operators:
        prepared[f"is_{operator}_operator"] = oper_index == operator

    # NOTE(review): the recorded feature importances list
    # 'unknown_opers_ratio' and 'certain_mailtype_ratio', so the cached CSVs
    # appear to have been produced with the two lines below enabled — confirm
    # whether they should be restored.
    # prepared['unknown_opers_ratio'] = df.total_qty_oper_login_0 / df.total_qty_over_index_and_type.clip(lower=1)
    # prepared['certain_mailtype_ratio'] = df.total_qty_over_index_and_type / df.total_qty_over_index.clip(lower=1)

    prepared["is_return"] = prepared["is_return"] == "Y"

    # map(str) stringifies every value, including missing ones ('nan').
    for cat_feature in cat_features:
        prepared[cat_feature] = prepared[cat_feature].map(str)

    return prepared

In [None]:
# Run the same feature preparation on the training split, the validation
# split and the test table.
# NOTE: X_train / X_val are rebound to the prepared frames, so this cell
# cannot be re-run without first re-running the split cell.
X_train = prepare_data(X_train)
X_val = prepare_data(X_val)
test_df = prepare_data(test_data)

In [None]:
# Preview the prepared training features.
X_train.head()

In [None]:
# Profile every prepared column: its name, the number of distinct values, then
# the ten most frequent values with their share of the training rows.
for column in X_train.columns:
    print(column)
    print(X_train[column].nunique())
    print(
        '\n'.join(
            f"{value!s} | {hits / len(X_train)!s}"
            for value, hits in Counter(X_train[column]).most_common(10)
        )
    )
    print()

In [None]:
# Cache the prepared feature frames so that training can start from disk
# without repeating the preparation.
X_train.to_csv('data/X_train_prepared.csv', index=False)
X_val.to_csv('data/X_val_prepared.csv', index=False)
test_df.to_csv('data/test_df_prepared.csv', index=False)

# The reload cell further down also reads the targets and the test ids from
# disk; write them here as well so that cell works from a fresh kernel.
# header=True keeps the column names ('label', 'id') that later cells rely on.
y_train.to_csv('data/y_train_prepared.csv', index=False, header=True)
y_val.to_csv('data/y_val_prepared.csv', index=False, header=True)
test_ids.to_csv('data/test_ids.csv', index=False, header=True)

In [21]:
# Columns treated as categorical by the model. The same list is defined in the
# feature-preparation cell above; the two must stay identical.
cat_features = [
    "type",
    "priority",
    "is_privatecategory",
    "class",
    "is_in_yandex",
    "mailctg",
    "directctg",
    "mailtype",
]


def to_cat_features(df):
    """Cast every column named in ``cat_features`` to ``str``, in place.

    The frame passed in is modified directly and nothing is returned. Missing
    values are stringified as well (they become ``'nan'``).
    """
    for name in cat_features:
        df[name] = df[name].map(str)
        
# Reload the cached, already-prepared data. All six CSV files must exist on
# disk before this cell is run.
X_train = pd.read_csv('data/X_train_prepared.csv')
y_train = pd.read_csv('data/y_train_prepared.csv')
X_val = pd.read_csv('data/X_val_prepared.csv')
y_val = pd.read_csv('data/y_val_prepared.csv')
test_df = pd.read_csv('data/test_df_prepared.csv')
test_ids = pd.read_csv('data/test_ids.csv')

# Re-cast the categorical columns to str in place — presumably because
# read_csv re-infers numeric dtypes for them; confirm against the data.
to_cat_features(X_train)
to_cat_features(X_val)
to_cat_features(test_df)

In [22]:
# Gradient-boosted classifier: 500 trees of depth 8, automatically balanced
# class weights, progress logged every 10 iterations, fixed seed.
model = CatBoostClassifier(
    iterations=500,
    max_depth=8,
    auto_class_weights="Balanced",
    cat_features=cat_features,
    random_state=0,
    verbose=10,
)

In [23]:
# Train on the prepared training split. No eval_set is passed, so the log
# below reports the training loss ("learn") only.
model.fit(X_train, y_train)

Learning rate set to 0.5
0:	learn: 0.3585839	total: 6.49s	remaining: 53m 57s
10:	learn: 0.2752267	total: 1m 3s	remaining: 47m 3s
20:	learn: 0.2715113	total: 1m 55s	remaining: 44m 5s
30:	learn: 0.2694254	total: 2m 52s	remaining: 43m 25s
40:	learn: 0.2672894	total: 3m 48s	remaining: 42m 36s
50:	learn: 0.2657499	total: 4m 46s	remaining: 42m 1s
60:	learn: 0.2645060	total: 5m 44s	remaining: 41m 19s
70:	learn: 0.2635137	total: 6m 45s	remaining: 40m 47s
80:	learn: 0.2624049	total: 7m 47s	remaining: 40m 18s
90:	learn: 0.2616070	total: 8m 45s	remaining: 39m 20s
100:	learn: 0.2608752	total: 9m 44s	remaining: 38m 27s
110:	learn: 0.2601321	total: 10m 43s	remaining: 37m 35s
120:	learn: 0.2592801	total: 11m 41s	remaining: 36m 35s
130:	learn: 0.2585278	total: 12m 41s	remaining: 35m 46s
140:	learn: 0.2579322	total: 13m 41s	remaining: 34m 51s
150:	learn: 0.2570517	total: 14m 39s	remaining: 33m 53s
160:	learn: 0.2563530	total: 15m 39s	remaining: 32m 57s
170:	learn: 0.2555996	total: 16m 37s	remaining: 31

<catboost.core.CatBoostClassifier at 0x7f43251e9c10>

In [24]:
def score_model(model, X, y, threshold=0.5):  # 0.41 is the best for now
    """Score ``model`` on ``(X, y)`` at a given probability cut-off.

    The positive-class probability (column 1 of ``model.predict_proba(X)``) is
    binarised with ``> threshold`` and the result is::

        0.1 * recall + 0.9 * (1 + TPR - FPR) / 2

    where ``(1 + TPR - FPR) / 2`` is the ROC AUC of the hard 0/1 predictions.

    Parameters
    ----------
    model : object
        Fitted classifier exposing ``predict_proba(X)`` with the
        positive-class probability in column 1.
    X : array-like
        Features passed straight to ``model.predict_proba``.
    y : array-like
        True labels; a single-column DataFrame is accepted. Values are
        interpreted by truthiness — assumes 0/1 (or boolean) labels.
    threshold : float, default 0.5
        A row is predicted positive when its probability is strictly greater
        than this value.

    Returns
    -------
    float
        The blended score. A rate whose denominator is zero (no positives or
        no negatives in ``y``) is taken as 0.0 instead of producing NaN.

    Raises
    ------
    ValueError
        If ``y`` and the predictions do not have the same length.
    """
    predicted = np.asarray(model.predict_proba(X))[:, 1] > threshold
    actual = np.asarray(y).ravel().astype(bool)
    if actual.shape != predicted.shape:
        raise ValueError(
            f"y has {actual.shape[0]} labels but the model produced "
            f"{predicted.shape[0]} predictions"
        )

    # Count the four confusion-matrix cells directly, so the function also
    # works when only one class is present in the labels or predictions.
    tp = int(np.count_nonzero(actual & predicted))
    fn = int(np.count_nonzero(actual & ~predicted))
    fp = int(np.count_nonzero(~actual & predicted))
    tn = int(np.count_nonzero(~actual & ~predicted))

    tpr = tp / (tp + fn) if (tp + fn) else 0.0  # recall
    fpr = fp / (fp + tn) if (fp + tn) else 0.0

    r_score = tpr
    ra_score = (1 + tpr - fpr) / 2

    return 0.1 * r_score + 0.9 * ra_score

In [25]:
# Validation score at the 0.41 cut-off (the same threshold is applied to the
# test predictions further down).
# NOTE(review): if 0.41 was tuned on this same validation split, the score
# shown here is optimistic — confirm.
score_model(model, X_val, y_val, 0.41)

0.8840026836817507

In [26]:
# Feature names paired with their importance, most important first.
sorted(
    zip(X_train.columns, model.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)

[('certain_mailtype_ratio', 15.686615436454819),
 ('total_qty_oper_login_1', 13.188057552648171),
 ('type', 11.856385381449073),
 ('unknown_opers_ratio', 8.909660627780923),
 ('total_qty_oper_login_0', 7.8812074675152015),
 ('transport_pay', 5.115043587303872),
 ('weight', 4.6978397687130125),
 ('weight_mfi', 4.6581770462982375),
 ('dist_qty_oper_login_1', 4.646656111179125),
 ('mailctg', 4.513098042295464),
 ('price_mfi', 4.4276947343336674),
 ('is_in_yandex', 2.9474906977214834),
 ('priority', 2.342027477352435),
 ('total_qty_over_index_and_type', 2.0237921189571306),
 ('total_qty_over_index', 1.6064799677101456),
 ('is_wrong_phone_number', 1.1814280123353083),
 ('is_wrong_rcpn_name', 0.924350428616659),
 ('is_102968_operator', 0.8726412761293425),
 ('is_102102_operator', 0.6519816682245936),
 ('is_200980_operator', 0.5934794596726344),
 ('class', 0.5147154154170315),
 ('directctg', 0.2455587794031684),
 ('is_102152_operator', 0.23658849182603542),
 ('is_privatecategory', 0.117755881

In [27]:
# Hard 0/1 test predictions: positive when the class-1 probability is
# strictly above the 0.41 cut-off used for validation.
test_preds = (model.predict_proba(test_df)[:, 1] > 0.41).astype(int)

In [28]:
# Pair every test id with its predicted label; relies on test_ids and test_df
# being in the same row order.
submission = pd.DataFrame({'id': test_ids.id, 'label': test_preds})

In [29]:
# Write the submission file (columns: id, label) to the working directory.
submission.to_csv("submission.csv", index=False)

In [30]:
# Share of test rows predicted positive — a quick sanity check.
test_preds.mean()

0.1888275