In [31]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score, roc_auc_score
from sklearn.metrics import confusion_matrix

from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier

import warnings
from copy import deepcopy
import gc
from collections import Counter

%matplotlib inline

In [19]:
%pip install seaborn

Defaulting to user installation because normal site-packages is not writeable
Collecting seaborn
  Downloading seaborn-0.12.1-py3-none-any.whl (288 kB)
     |████████████████████████████████| 288 kB 3.2 MB/s            
Installing collected packages: seaborn
Successfully installed seaborn-0.12.1
You should consider upgrading via the '/usr/local/bin/python3 -m pip install --upgrade pip' command.


In [None]:
# NOTE: the target is relative, so this cell is not idempotent — every re-run moves one more level up.
# Later cells read and write paths under "data/" relative to the resulting working directory.
%cd ../

In [None]:
# Load the raw train and test tables (paths are relative to the current working directory).
df = pd.read_csv("data/train.csv")
test_data = pd.read_csv("data/test.csv")

# Keep an independent copy of the test ids: prepare_data removes the `id` column later on.
test_ids = test_data["id"].copy()

In [None]:
# Quick look at the first rows of the raw training table.
df.head(n=3)

In [None]:
# Quick look at the first rows of the raw test table.
test_data.head(n=3)

In [None]:
# The ten most frequent operator indices in the test table.
# Blank entries (' ') are mapped to 0 before the float -> int conversion.
top_operators = [
    operator
    for operator, _ in Counter(
        test_data['index_oper'].replace(' ', '0').astype(float).astype(int)
    ).most_common(10)
]

In [None]:
# Separate the target column from the features, then hold out 20% of the rows
# for validation. The seed is fixed so the split is reproducible.
y = df["label"]
X = df.drop(columns="label")
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Columns that are stringified so they can be treated as categorical features.
cat_features = ["type", "priority", "is_privatecategory", "class", "is_in_yandex", "mailctg", "directctg", "mailtype"]

def prepare_data(df, operators=None):
    """Build the model-ready feature table from a raw data frame.

    The input frame is left untouched: all work happens on a new frame, so the
    function can be called repeatedly on the same raw data.

    Steps:
      * drop the "oper_type + oper_attr" column;
      * add one boolean `is_<operator>_operator` column per entry of `operators`;
      * drop "id", "index_oper", "name_mfi" and "mailrank";
      * turn `is_return` into a boolean (True where the value is 'Y');
      * convert every column listed in `cat_features` to `str`.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table containing the columns named above.
    operators : iterable of int, optional
        Operator indices to build indicator columns for. Defaults to the
        module-level `top_operators` list.

    Returns
    -------
    pandas.DataFrame
        A new frame with the prepared features.

    Raises
    ------
    KeyError
        If one of the expected columns is missing from `df`.
    """
    if operators is None:
        operators = top_operators

    # `drop` returns a new frame, so the caller's data is never modified.
    prepared = df.drop(columns=["oper_type + oper_attr"])

    # Convert the operator index once instead of once per operator.
    # Blank entries (' ') are mapped to 0.
    operator_index = prepared["index_oper"].replace(" ", "0").astype(float).astype(int)
    for operator in operators:
        prepared[f"is_{operator}_operator"] = operator_index == operator

    # NOTE(review): the two ratio features below are disabled here, yet the saved
    # feature-importance output of this notebook lists both of them — confirm
    # which variant the prepared CSV files were generated with.
    # prepared['unknown_opers_ratio'] = prepared.total_qty_oper_login_0 / prepared.total_qty_over_index_and_type.clip(lower=1)
    # prepared['certain_mailtype_ratio'] = prepared.total_qty_over_index_and_type / prepared.total_qty_over_index.clip(lower=1)

    prepared = prepared.drop(columns=["id", "index_oper", "name_mfi", "mailrank"])

    prepared["is_return"] = prepared["is_return"] == "Y"

    for cat_feature in cat_features:
        prepared[cat_feature] = prepared[cat_feature].apply(str)

    return prepared

In [None]:
# Run the same feature preparation on the train split, the validation split and the test table.
X_train, X_val, test_df = map(prepare_data, (X_train, X_val, test_data))

In [None]:
# Inspect the first rows of the prepared training features.
X_train.head(n=5)

In [None]:
# Per-column summary: name, number of distinct values, and the ten most common
# values with their share of the training rows, followed by a blank line.
for column in X_train.columns:
    print(
        column,
        X_train[column].nunique(),
        '\n'.join(
            f"{value!s} | {hits / len(X_train)!s}"
            for value, hits in Counter(X_train[column]).most_common(10)
        ),
        '',
        sep='\n',
    )

In [None]:
# Persist every artefact the modelling cells reload from disk. The targets and
# the test ids are written too; without them the later
# `pd.read_csv('data/y_train_prepared.csv')`, `pd.read_csv('data/y_val_prepared.csv')`
# and `pd.read_csv('data/test_ids.csv')` calls have no file to read on a fresh run.
X_train.to_csv('data/X_train_prepared.csv', index=False)
X_val.to_csv('data/X_val_prepared.csv', index=False)
test_df.to_csv('data/test_df_prepared.csv', index=False)

# Series are written with their name as the header ("label" / "id"),
# so they load back as single-column frames.
y_train.to_csv('data/y_train_prepared.csv', index=False)
y_val.to_csv('data/y_val_prepared.csv', index=False)
test_ids.to_csv('data/test_ids.csv', index=False)

In [32]:
# Names of the columns the model treats as categorical.
cat_features = [
    "type",
    "priority",
    "is_privatecategory",
    "class",
    "is_in_yandex",
    "mailctg",
    "directctg",
    "mailtype",
]


def to_cat_features(df):
    """Convert every column listed in `cat_features` to strings, in place.

    The frame is modified directly and nothing is returned. A `KeyError` is
    raised if one of the listed columns is missing from `df`.
    """
    for name in cat_features:
        df[name] = df[name].map(str)
        
# Reload the prepared feature tables from disk.
X_train, X_val, test_df = (
    pd.read_csv(path)
    for path in (
        'data/X_train_prepared.csv',
        'data/X_val_prepared.csv',
        'data/test_df_prepared.csv',
    )
)

# Reload the targets and the test ids (each comes back as a DataFrame).
y_train, y_val, test_ids = (
    pd.read_csv(path)
    for path in (
        'data/y_train_prepared.csv',
        'data/y_val_prepared.csv',
        'data/test_ids.csv',
    )
)

# The categorical columns are converted back to strings after the CSV round trip.
to_cat_features(X_train)
to_cat_features(X_val)
to_cat_features(test_df)

In [33]:
# CatBoost classifier: fixed seed, depth-8 trees, up to 2000 iterations,
# "Balanced" automatic class weights, progress logged every 10th iteration.
model = CatBoostClassifier(
    iterations=2000,
    max_depth=8,
    auto_class_weights="Balanced",
    cat_features=cat_features,
    random_state=0,
    verbose=10,
)

In [34]:
# Train on the prepared training split (long-running: see the timing in the log below).
model.fit(X=X_train, y=y_train)

Learning rate set to 0.203601
0:	learn: 0.4943808	total: 6.19s	remaining: 3h 26m 11s
10:	learn: 0.2877284	total: 55.8s	remaining: 2h 48m 17s
20:	learn: 0.2775276	total: 1m 46s	remaining: 2h 47m 13s
30:	learn: 0.2744802	total: 2m 33s	remaining: 2h 42m 59s
40:	learn: 0.2725872	total: 3m 24s	remaining: 2h 42m 49s
50:	learn: 0.2709979	total: 4m 15s	remaining: 2h 42m 49s
60:	learn: 0.2703430	total: 4m 59s	remaining: 2h 38m 25s
70:	learn: 0.2698407	total: 5m 43s	remaining: 2h 35m 34s
80:	learn: 0.2688627	total: 6m 29s	remaining: 2h 33m 40s
90:	learn: 0.2679981	total: 7m 16s	remaining: 2h 32m 26s
100:	learn: 0.2671641	total: 8m 7s	remaining: 2h 32m 48s
110:	learn: 0.2664046	total: 8m 57s	remaining: 2h 32m 30s
120:	learn: 0.2658268	total: 9m 49s	remaining: 2h 32m 41s
130:	learn: 0.2651059	total: 10m 41s	remaining: 2h 32m 27s
140:	learn: 0.2644698	total: 11m 31s	remaining: 2h 32m 1s
150:	learn: 0.2639872	total: 12m 22s	remaining: 2h 31m 35s
160:	learn: 0.2633767	total: 13m 13s	remaining: 2h 31m

<catboost.core.CatBoostClassifier at 0x7fc9309ad070>

In [35]:
def score_model(model, X, y, threshold=0.5):
    """Score thresholded predictions as `0.1 * recall + 0.9 * ROC AUC`.

    The positive-class probability from `model.predict_proba` is cut at
    `threshold` to get hard 0/1 predictions. Recall is the true-positive rate.
    The ROC AUC term is computed for the hard predictions, which reduces to
    `(1 + TPR - FPR) / 2`.

    Parameters
    ----------
    model : object exposing `predict_proba(X)` with the positive class in column 1
    X : features passed to `model.predict_proba`
    y : true binary labels encoded as 0 / 1
    threshold : float, default 0.5
        Probability above which a row is predicted positive. The notebook
        evaluates and submits with 0.41.

    Returns
    -------
    float
        The combined score. It is NaN when `y` contains no positive or no
        negative rows, because the corresponding rate is undefined.
    """
    # Cast to int so the predictions share the 0/1 label space of `y`.
    pred = (model.predict_proba(X)[:, 1] > threshold).astype(int)

    # `labels=[0, 1]` keeps the matrix 2x2 even if a class is absent from both
    # `y` and `pred`; otherwise the four-way unpacking below would fail.
    tn, fp, fn, tp = confusion_matrix(y, pred, labels=[0, 1]).ravel()

    tpr = tp / (tp + fn)  # recall
    fpr = fp / (fp + tn)

    roc_auc = (1 + tpr - fpr) / 2

    return 0.1 * tpr + 0.9 * roc_auc

In [36]:
# Validation score at the decision threshold that is also used for the submission.
score_model(model, X_val, y_val, threshold=0.41)

0.8838221465348826

In [37]:
# (feature, importance) pairs, most important first.
sorted(
    zip(X_train.columns, model.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)

[('certain_mailtype_ratio', 11.764077716537898),
 ('unknown_opers_ratio', 9.946982218334423),
 ('type', 9.214546717917129),
 ('total_qty_oper_login_0', 9.09671569569094),
 ('dist_qty_oper_login_1', 8.470916066367279),
 ('total_qty_oper_login_1', 8.197543807359237),
 ('transport_pay', 6.808829923968521),
 ('weight', 6.316821916946282),
 ('price_mfi', 5.54810322299647),
 ('weight_mfi', 5.531178873778957),
 ('mailctg', 4.195875729539586),
 ('total_qty_over_index', 4.053495057495931),
 ('total_qty_over_index_and_type', 2.7969675402956593),
 ('is_in_yandex', 1.7080008732258787),
 ('is_wrong_rcpn_name', 1.384759938966418),
 ('is_wrong_phone_number', 1.380027030433976),
 ('priority', 0.989736067403538),
 ('is_102102_operator', 0.6695382853543752),
 ('class', 0.6096994575404641),
 ('is_620984_operator', 0.272430571423585),
 ('is_102152_operator', 0.2641279567208704),
 ('directctg', 0.23065287127614592),
 ('is_102998_operator', 0.21636021433323885),
 ('is_200980_operator', 0.11142358069171557),

In [38]:
# Hard 0/1 predictions for the test set: positive-class probability above 0.41.
test_preds = np.greater(model.predict_proba(test_df)[:, 1], 0.41).astype(int)

In [39]:
# Two-column submission table: the test ids next to the predicted labels.
submission = test_ids[['id']].assign(label=test_preds)

In [40]:
# Write the submission file without the index column.
submission.to_csv(path_or_buf="submission.csv", index=False)

In [41]:
# Share of test rows predicted positive.
np.mean(test_preds)

0.18151075