In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score, roc_auc_score
from sklearn.metrics import confusion_matrix

from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier

import warnings
from copy import deepcopy
import gc
from collections import Counter

import warnings
# Silence every warning for the rest of the session. This also hides pandas'
# chained-assignment / SettingWithCopy warnings raised by later cells.
warnings.filterwarnings('ignore')

%matplotlib inline

# Move the working directory one level up so that the relative path
# "data/small_train.csv" used by a later cell resolves.
# NOTE(review): this is relative, so re-running the cell without restarting
# the kernel moves up one more directory each time.
%cd ../

In [2]:
def mask_generator(n, cnt):
    """Yield up to ``cnt`` boolean masks of length ``n``, fewest ``True`` entries first.

    Masks are produced in order of increasing number of ``True`` values
    (0 ones, then 1 one, then 2 ones, ...). Within one group the positions of
    the ``True`` values follow lexicographic order, i.e. the order of
    ``itertools.combinations(range(n), k)``. At most ``2 ** n`` masks exist, so
    the generator stops early when ``cnt`` exceeds that.

    Every yielded mask is a fresh array. The previous implementation yielded
    one shared array and mutated it afterwards, so masks that were collected
    (for example with ``list(...)``) all ended up showing the same content.

    Parameters
    ----------
    n : int
        Length of every mask.
    cnt : int
        Maximum number of masks to yield. Values ``<= 0`` yield nothing.

    Yields
    ------
    numpy.ndarray
        Boolean array of shape ``(n,)``.
    """
    # Local import keeps this block self-contained without touching the import cell.
    from itertools import combinations

    remaining = cnt
    for ones_count in range(n + 1):
        for positions in combinations(range(n), ones_count):
            if remaining <= 0:
                return
            mask = np.zeros(n, dtype=bool)
            # An explicit intp index array keeps the empty combination valid.
            mask[np.array(positions, dtype=np.intp)] = True
            yield mask
            remaining -= 1
        
def score_model(model, X, y, threshold=0.5):
    """Return the blended validation score ``0.1 * recall + 0.9 * roc_auc``.

    Both metrics are computed on hard predictions: column 1 of
    ``model.predict_proba(X)`` compared against ``threshold``. Recall is
    macro-averaged.

    Author's note carried over from the original code: 0.41 is the best
    threshold found so far.
    """
    hard_predictions = model.predict_proba(X)[:, 1] > threshold
    macro_recall = recall_score(y, hard_predictions, average="macro")
    thresholded_auc = roc_auc_score(y, hard_predictions, multi_class='ovo')
    return 0.1 * macro_recall + 0.9 * thresholded_auc

def score_model_probs(model, X, y):
    """Return the ROC AUC of the raw scores in column 1 of ``model.predict_proba(X)``.

    Unlike ``score_model`` no threshold is applied, so the result does not
    depend on a cut-off.
    """
    scores = model.predict_proba(X)[:, 1]
    return roc_auc_score(y, scores, multi_class='ovo')

def get_labels(col, label):
    """Print one line per distinct value of ``col``: value, row count, label rate.

    The label rate is the sum of ``label`` over the matching rows divided by
    the number of matching rows.
    """
    for category in col.unique():
        in_category = col == category
        n_rows = in_category.sum()
        print(category, n_rows, label[in_category].sum() / n_rows)

In [6]:
# Load the training sample (path is relative to the current working directory).
df = pd.read_csv("data/small_train.csv")

# The five most frequent operator indices; a blank ' ' index is counted as 0.
top_operators = [
    operator
    for operator, _ in Counter(
        df['index_oper'].replace(' ', '0').astype(float).astype(int)
    ).most_common()[:5]
]

In [31]:
cat_features = ["type", "is_privatecategory", "class", "is_in_yandex", "mailctg", "directctg", "mailtype"]

def prepare_data(df):
    """Clean and feature-engineer a raw feature frame; returns the same frame.

    The frame is modified in place and also returned. Because columns are
    dropped, calling this a second time on the same frame raises ``KeyError``.

    Steps, in order:
      1. drop the ``"oper_type + oper_attr"`` column;
      2. in ``priority``, ``is_privatecategory``, ``is_in_yandex``, ``mailtype``
         and ``directctg``, overwrite entries equal to ``0`` and then entries
         equal to ``'0'`` with the column mode (recomputed before each pass);
      3. encode ``priority`` 7503 / 7504 / 7506 as 1 / 2 / 3, anything else as 0;
      4. parse ``index_oper`` to int, treating a single blank ``' '`` as 0;
      5. convert every column listed in ``cat_features`` to ``str``;
      6. add ``name_mfi_count``: 0 when ``name_mfi == '0'``, otherwise the
         number of comma-separated items;
      7. drop ``id``, ``name_mfi`` and ``mailrank``;
      8. turn ``is_return`` into a boolean (``== 'Y'``).

    Zero replacement uses ``.loc`` rather than chained indexing
    (``df[col][mask] = ...``). Chained assignment may write to a temporary
    copy, and is a silent no-op under pandas Copy-on-Write.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw features containing the columns named above.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``, transformed.
    """
    df.drop(columns=["oper_type + oper_attr"], inplace=True)

    # for operator in top_operators:
        # df[f'is_{operator}_operator'] = df['index_oper'].replace(' ', '0').astype(float).astype(int) == operator

    replace_zeros = ['priority', 'is_privatecategory', 'is_in_yandex', 'mailtype', 'directctg']
    for column in replace_zeros:
        # Numeric zero first, then the string '0'. The mode is taken over the
        # column as it currently is, zeros included, so when the zero
        # placeholder is itself the most frequent value nothing changes.
        for placeholder in (0, '0'):
            df.loc[df[column] == placeholder, column] = df[column].mode()[0]

    df['priority'] = (
        1 * (df['priority'] == 7503.)
        + 2 * (df['priority'] == 7504.)
        + 3 * (df['priority'] == 7506.)
    )

    df['index_oper'] = df['index_oper'].replace(' ', '0').astype(float).astype(int)
    # df['speed'] = df.total_qty_over_index / (max(df.index_oper) - df.index_oper + 1)

    for cat_feature in cat_features:
        # Element-wise str() so every value, including missing ones, becomes a string.
        df[cat_feature] = df[cat_feature].apply(str)

    df['name_mfi_count'] = df['name_mfi'].apply(
        lambda name: 0 if name == '0' else len(name.split(','))
    )
    # df['name_mfi_len'] = df.name_mfi.apply(lambda name: len(name))

    df.drop(columns=["id", "name_mfi", "mailrank"], inplace=True)

    df['is_return'] = df['is_return'] == 'Y'

    return df

# Separate the target from the features, then hold out 20% for validation.
y = df["label"]
X = df.drop(columns=["label"])
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Each split is cleaned independently (prepare_data works on the frame it is given).
X_train, X_val = prepare_data(X_train), prepare_data(X_val)

In [48]:
# Main classifier: 100 balanced-class boosting iterations, progress logged every 10.
model = CatBoostClassifier(
    iterations=100,
    max_depth=6,
    auto_class_weights="Balanced",
    cat_features=cat_features,
    random_state=0,
    verbose=10,
)
model.fit(X_train, y_train)

Learning rate set to 0.5
0:	learn: 0.4099812	total: 337ms	remaining: 33.4s
10:	learn: 0.2877337	total: 3.29s	remaining: 26.6s
20:	learn: 0.2792744	total: 6.66s	remaining: 25s
30:	learn: 0.2747086	total: 10.6s	remaining: 23.6s
40:	learn: 0.2714930	total: 13.9s	remaining: 20s
50:	learn: 0.2691861	total: 17.2s	remaining: 16.5s
60:	learn: 0.2676017	total: 20.4s	remaining: 13.1s
70:	learn: 0.2657014	total: 23.7s	remaining: 9.68s
80:	learn: 0.2638947	total: 27s	remaining: 6.32s
90:	learn: 0.2622917	total: 30s	remaining: 2.97s
99:	learn: 0.2604435	total: 33s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7f7f9cafd330>

In [49]:
# First line: blended score at threshold 0.45; second line: ROC AUC of the raw scores.
print(
    score_model(model, X_val, y_val, 0.45),
    score_model_probs(model, X_val, y_val),
    sep="\n",
)

0.8821015612191861
0.947484915783278


In [45]:
# (feature, importance) pairs, most important first.
sorted(
    zip(X_train.columns, model.feature_importances_),
    key=lambda name_and_importance: -name_and_importance[1],
)

[('total_qty_oper_login_1', 12.766268924681857),
 ('type', 12.159953095497736),
 ('index_oper', 8.718548243351181),
 ('transport_pay', 7.355907341459322),
 ('price_mfi', 7.2253850749951045),
 ('total_qty_oper_login_0', 6.987125943059096),
 ('total_qty_over_index', 6.5453838487159075),
 ('dist_qty_oper_login_1', 6.445795151856468),
 ('weight', 6.436170957669595),
 ('mailctg', 6.075683596272848),
 ('total_qty_over_index_and_type', 4.796480588936906),
 ('weight_mfi', 4.242539649211898),
 ('is_wrong_rcpn_name', 3.07778346526095),
 ('name_mfi_count', 2.7176775979839016),
 ('is_wrong_phone_number', 1.809445933629861),
 ('priority', 1.7862129270740021),
 ('class', 0.3827111599135438),
 ('is_in_yandex', 0.21451529796716248),
 ('directctg', 0.1802185315606539),
 ('is_return', 0.030957932582459217),
 ('is_wrong_sndr_name', 0.024806849130382093),
 ('is_wrong_address', 0.00965992913098103),
 ('is_privatecategory', 0.006056072866558552),
 ('mailtype', 0.0044617338965267485),
 ('postmark', 0.0002501

In [None]:
# Leave-one-feature-out ablation.
# mask_generator(n, n + 1) yields the all-False mask (nothing dropped, i.e. the
# baseline) followed by the n masks that have exactly one True entry.
#
# `columns` was not defined in any earlier cell, so on a fresh kernel this cell
# raised NameError; it is derived here from the prepared training frame.
# NOTE(review): if a narrower candidate list was intended, set it here instead.
columns = X_train.columns.to_numpy()

for mask in mask_generator(columns.shape[0], columns.shape[0] + 1):
    dropped = columns[mask]
    print(dropped, end='  ->  ')
    X_train_reduced = X_train.drop(columns=dropped)
    X_val_reduced = X_val.drop(columns=dropped)
    # Keep the original cat_features order instead of going through an
    # unordered set, so the list passed to CatBoost is the same on every run.
    dropped_names = set(dropped)
    cat_features_reduced = [name for name in cat_features if name not in dropped_names]
    # A separate name keeps the classifier trained earlier (`model`) intact,
    # so the scoring and feature-importance cells still refer to it afterwards.
    # NOTE(review): auto_class_weights is an empty string here but "Balanced"
    # for the main model - confirm this is intended and accepted by CatBoost.
    ablation_model = CatBoostClassifier(random_state=0, max_depth=8, verbose=0, iterations=30, auto_class_weights="", cat_features=cat_features_reduced)
    ablation_model.fit(X_train_reduced, y_train)
    print(score_model(ablation_model, X_val_reduced, y_val, 0.41))