In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score, roc_auc_score, make_scorer

from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier

from functools import reduce
from copy import deepcopy
import gc
from collections import Counter

%matplotlib inline

import warnings
warnings.filterwarnings('ignore') 

%cd ../

/home/chervovn04/Programming/hackathons/2022/digital_breakout_885303


In [2]:
df = pd.read_csv("data/train.csv")
test_data = pd.read_csv("data/test.csv")
test_ids = deepcopy(test_data.id)

X = df.drop(columns=["label"])
y = df.label
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

In [3]:
top_operators = [obj for obj, count in Counter(test_data['index_oper'].replace(' ', '0').astype(float).astype(int)).most_common()][:10]

In [5]:
useless = []
cat_features = ["type", "is_privatecategory", "class", "is_in_yandex", "mailctg", "directctg", "mailtype"]
cat_features = list(set(cat_features) - set(useless))

def prepare_data(df):
    
    oper_type, oper_attr = zip(*df['oper_type + oper_attr'].map(lambda x: x.split('_')))
    
    df['is_negative_oper_attr'] = pd.Series(oper_attr) == '-1'
    df['specialization_ratio'] = df.total_qty_over_index_and_type / df.total_qty_over_index.clip(lower=1)
    
    for operator in top_operators:
        df[f'is_{operator}_operator'] = df['index_oper'].replace(' ', '0').astype(float).astype(int) == operator
    
    replace_zeros = ['priority', 'is_privatecategory', 'is_in_yandex', 'mailtype', 'directctg']
    for column in replace_zeros:
        df[column][df[column] == 0] = df[column].mode()[0]
        df[column][df[column] == '0'] = df[column].mode()[0]
    
    df.priority = 1 * (df.priority == 7503.) + 2 * (df.priority == 7504.) + 3 * (df.priority == 7506.)
    
    df.index_oper = df.index_oper.replace(' ', '0').astype(float).astype(int)
    
    for cat_feature in cat_features:
        df[cat_feature] = df[cat_feature].apply(lambda text: str(text))
        
    df['name_mfi_count'] = df.name_mfi.apply(lambda name: 0 if name == '0' else len(name.split(',')))
    
    df.is_return = df.is_return.apply(lambda text: text == 'Y')
        
    df.drop(columns=["oper_type + oper_attr"], inplace=True)
    df.drop(columns=["id", "name_mfi"], inplace=True)
    df.drop(columns=['mailrank'], inplace=True)
    df.drop(columns=useless, inplace=True, errors='ignore')
    
    return df

In [6]:
X_train = prepare_data(X_train)
X_val = prepare_data(X_val)
test_df = prepare_data(test_data)

In [7]:
X_train.to_csv('data/X_train_prepared.csv', index=False)
X_val.to_csv('data/X_val_prepared.csv', index=False)
y_train.to_csv('data/y_train_prepared.csv', index=False)
y_val.to_csv('data/y_val_prepared.csv', index=False)
test_df.to_csv('data/test_df_prepared.csv', index=False)
y_train.to_csv('data/y_train_prepared.csv', index=False)
y_val.to_csv('data/y_val_prepared.csv', index=False)

In [2]:
useless = []
cat_features = ["type", "is_privatecategory", "class", "is_in_yandex", "mailctg", "directctg", "mailtype"]
cat_features = list(set(cat_features) - set(useless))

cat_features = list(set(cat_features) - set(useless))
def to_cat_features(df):
    for cat_feature in cat_features:
        df[cat_feature] = df[cat_feature].apply(lambda text: str(text))
        
X_train = pd.read_csv('data/X_train_prepared.csv')
y_train = pd.read_csv('data/y_train_prepared.csv')
X_val = pd.read_csv('data/X_val_prepared.csv')
y_val = pd.read_csv('data/y_val_prepared.csv')
test_df = pd.read_csv('data/test_df_prepared.csv')
test_ids = pd.read_csv('data/test_ids.csv')


to_cat_features(X_train)
to_cat_features(X_val)
to_cat_features(test_df)

In [3]:
model = CatBoostClassifier(random_state=0, max_depth=8, verbose=10, iterations=100, auto_class_weights="Balanced", cat_features=cat_features)

In [4]:
model.fit(X_train, y_train)

Learning rate set to 0.5
0:	learn: 0.3829357	total: 2.07s	remaining: 3m 25s
10:	learn: 0.2754711	total: 19.4s	remaining: 2m 37s
20:	learn: 0.2693772	total: 36.6s	remaining: 2m 17s
30:	learn: 0.2672621	total: 52.8s	remaining: 1m 57s
40:	learn: 0.2650067	total: 1m 9s	remaining: 1m 40s
50:	learn: 0.2630939	total: 1m 26s	remaining: 1m 23s
60:	learn: 0.2616673	total: 1m 44s	remaining: 1m 7s
70:	learn: 0.2603776	total: 2m 2s	remaining: 50s
80:	learn: 0.2591975	total: 2m 19s	remaining: 32.8s
90:	learn: 0.2581488	total: 2m 37s	remaining: 15.6s
99:	learn: 0.2573083	total: 2m 53s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7f7e979abd60>

In [5]:
def score_model(model, X, y, threshold=0.5):  # 0.225 is the best for now
    pred = model.predict_proba(X)[:, 1] > threshold
    recall = recall_score(y, pred, average="macro")
    roc_auc = roc_auc_score(y,  pred, multi_class='ovo')
    score = 0.1 * recall + 0.9 * roc_auc
    return score

In [6]:
score_model(model, X_val, y_val, 0.35)

0.883223911627167

In [8]:
score_model(model, X_train, y_train, 0.35) 

0.887055985158118

In [9]:
sorted(list(zip(X_train.columns, model.feature_importances_)), key = lambda x: -x[1])

[('total_qty_oper_login_1', 16.182123567702078),
 ('type', 14.168856594329933),
 ('specialization_ratio', 13.409770563616348),
 ('total_qty_oper_login_0', 9.384602898303886),
 ('dist_qty_oper_login_1', 8.847208304876755),
 ('index_oper', 6.545243746605591),
 ('transport_pay', 6.053554418700786),
 ('mailctg', 4.479989715935074),
 ('total_qty_over_index', 3.0680351163350474),
 ('price_mfi', 2.5509365538777593),
 ('is_in_yandex', 2.5179956597393716),
 ('weight', 2.4313147949509997),
 ('is_wrong_phone_number', 1.791369119902237),
 ('total_qty_over_index_and_type', 1.7682210561334746),
 ('name_mfi_count', 1.7123867400897255),
 ('weight_mfi', 1.4147060334822652),
 ('priority', 1.1655472380681304),
 ('is_102968_operator', 1.1261252858319184),
 ('is_wrong_rcpn_name', 0.6839804080500996),
 ('is_102976_operator', 0.312395991887884),
 ('class', 0.1556466397739297),
 ('directctg', 0.15067115782810034),
 ('is_negative_oper_attr', 0.024972773308771547),
 ('is_return', 0.016733552602785147),
 ('is_wr

In [10]:
test_preds = (model.predict_proba(test_df)[:, 1] > 0.35).astype(int)
submission = pd.DataFrame({'id': test_ids.id, 'label': test_preds})
submission.to_csv("submission.csv", index=False)

In [5]:
unrounded_sub = pd.read_csv('submission.csv') 
sub = unrounded_sub
sub.label = (unrounded_sub.label > 0.225).astype(int) 
sub.to_csv('submission_rounded.csv', index=False)

In [6]:
sub.label.mean()

0.23711925