In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score, roc_auc_score
from sklearn.metrics import confusion_matrix

from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier

import warnings
from copy import deepcopy
import gc
from collections import Counter

%matplotlib inline

In [30]:
def mask_generator(n, cnt):
    """Yield up to ``cnt`` boolean masks of length ``n``, fewest ``True`` values first.

    The all-``False`` mask comes first, then every mask with exactly one
    ``True``, then every mask with exactly two, and so on up to the all-``True``
    mask.  Within a group the ``True`` positions advance in lexicographic
    order, e.g. for ``n=3`` with two ones: ``[1,1,0]``, ``[1,0,1]``, ``[0,1,1]``.

    Each yielded mask is a freshly allocated array, so the results can safely be
    stored (``list(mask_generator(...))``) without later iterations overwriting
    masks that were yielded earlier.

    Args:
        n: Length of every mask.
        cnt: Maximum number of masks to yield.  Nothing is yielded when it is
            zero or negative; at most ``2 ** n`` masks exist.

    Yields:
        ``numpy.ndarray`` of dtype ``bool`` and shape ``(n,)``.
    """
    # Local import keeps the cell self-contained without touching the import cell.
    from itertools import combinations

    produced = 0
    for ones_count in range(n + 1):
        # combinations() enumerates index tuples in lexicographic order, which is
        # exactly the order in which the ones "slide" to the right.
        for true_positions in combinations(range(n), ones_count):
            if produced >= cnt:
                return
            mask = np.zeros(n, dtype=bool)
            # Explicit integer dtype so the empty tuple indexes nothing instead
            # of being interpreted as a float array.
            mask[np.asarray(true_positions, dtype=np.intp)] = True
            yield mask
            produced += 1

In [37]:
def score_model(model, X, y, threshold=0.5):
    """Return the blended validation score ``0.1 * recall + 0.9 * roc_auc``.

    The positive-class probability from ``model.predict_proba`` is turned into
    hard labels with ``> threshold``; both metrics are then computed on those
    labels (macro-averaged recall, and ROC AUC of the thresholded labels).

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        X: Feature matrix passed to ``predict_proba``.
        y: True labels.
        threshold: Probability cut-off for the positive class.  Author's note
            from the original cell: 0.41 is the best found so far.

    Returns:
        The weighted score as a float.
    """
    # NOTE(review): ROC AUC is fed the thresholded labels, not the raw
    # probabilities — presumably to mirror how submissions are scored; confirm.
    positive_proba = model.predict_proba(X)[:, 1]
    hard_labels = positive_proba > threshold
    macro_recall = recall_score(y, hard_labels, average="macro")
    label_auc = roc_auc_score(y, hard_labels, multi_class='ovo')
    return 0.1 * macro_recall + 0.9 * label_auc

In [3]:
%cd ../

/home/chervovn04/Programming/hackathons/2022/digital_breakout_885303


In [5]:
# `warnings` is already imported in the first cell; re-importing it here is harmless.
# NOTE(review): the filter below has no category or module restriction, so it
# hides every warning raised by later cells for the rest of the kernel session.
import warnings
warnings.filterwarnings('ignore')

In [6]:
# The path is relative: it resolves against the working directory selected by
# the earlier `%cd ../` cell, so that cell must run first.
df = pd.read_csv("data/small_train.csv")

In [7]:
df.head(3)

Unnamed: 0,id,oper_type + oper_attr,index_oper,type,priority,is_privatecategory,class,is_in_yandex,is_return,weight,...,dist_qty_oper_login_1,total_qty_oper_login_1,total_qty_oper_login_0,total_qty_over_index_and_type,total_qty_over_index,is_wrong_sndr_name,is_wrong_rcpn_name,is_wrong_phone_number,is_wrong_address,label
0,2793601,14_1,102976.0,ММПО,7503.0,N,0.0,N,N,44.0,...,1089.0,64270133.0,116432632.0,180702765.0,188407812.0,0,0,0,0,0
1,8457088,8_2,614070.0,ГОПС,7503.0,N,3.0,Y,N,20.0,...,15.0,15988.0,3565.0,19553.0,532681.0,0,0,1,0,0
2,9639638,8_13,102976.0,ММПО,7503.0,N,0.0,N,N,122.0,...,914.0,48856658.0,83318932.0,132175590.0,136819803.0,0,0,0,0,0


In [8]:
def get_labels(col, label):
    """Print the row count and label rate for each distinct value of ``col``.

    One line is printed per value, in ``col.unique()`` order, containing: the
    value, the number of rows where ``col`` equals it, and the sum of ``label``
    over those rows divided by that row count.

    Args:
        col: pandas Series whose distinct values define the groups.
        label: pandas Series aligned with ``col`` holding the labels to sum.
    """
    for value in col.unique():
        selected = col == value
        row_count = selected.sum()
        print(value, row_count, label[selected].sum() / row_count)

In [9]:
get_labels(df['label'], df.label) 

0 1165592 0.0
1 34408 1.0


In [11]:
# The five most frequent operator indices, most frequent first.  Blank (' ')
# entries are replaced by '0' before the float -> int conversion, so they are
# counted as operator 0.
# NOTE(review): `df` here is the full frame, before the train/validation split
# performed in a later cell.
top_operators = [
    operator
    for operator, _ in Counter(
        df['index_oper'].replace(' ', '0').astype(float).astype(int)
    ).most_common(5)
]

In [12]:
# Target is the `label` column; every other column is a feature.
y = df["label"]
X = df.drop(columns="label")
# Hold out 20% of the rows for validation with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0, test_size=0.2)

In [13]:
X.is_privatecategory.dtype

dtype('O')

In [14]:
# Columns handed to CatBoost as categorical; prepare_data() stringifies them.
cat_features = ["type", "is_privatecategory", "class", "is_in_yandex", "mailctg", "directctg", "mailtype"]

def prepare_data(df, operators=None):
    """Engineer model features on ``df`` in place and return the same frame.

    Steps, in order:
      1. Drop ``"oper_type + oper_attr"``.
      2. Add one boolean ``is_<operator>_operator`` column per entry of
         ``operators`` (``index_oper`` is converted to int, blanks become 0).
      3. In ``priority``, ``is_privatecategory``, ``is_in_yandex``, ``mailtype``
         and ``directctg``, overwrite ``0`` and then ``'0'`` with the column's
         current mode (presumably these are missing-value placeholders).
      4. Encode ``priority`` as 7503 -> 1, 7504 -> 2, 7506 -> 3, otherwise 0.
      5. Cast every column listed in ``cat_features`` to ``str``.
      6. Add ``goods_count``, ``price_by_weight`` and ``price_by_number``.
      7. Drop ``id``, ``index_oper``, ``name_mfi`` and ``mailrank``.
      8. Turn ``is_return`` into a boolean (``True`` where it equals ``'Y'``).

    Args:
        df: Raw feature frame.  It is MODIFIED IN PLACE, so the function cannot
            be applied twice to the same frame (the dropped columns are gone).
        operators: Operator indices to create indicator columns for.  Defaults
            to the module-level ``top_operators``.

    Returns:
        The same ``df`` object, after modification.
    """
    if operators is None:
        operators = top_operators

    df.drop(columns=["oper_type + oper_attr"], inplace=True)

    # Convert once instead of once per operator; blank entries become operator 0.
    operator_index = df['index_oper'].replace(' ', '0').astype(float).astype(int)
    for operator in operators:
        df[f'is_{operator}_operator'] = operator_index == operator

    replace_zeros = ['priority', 'is_privatecategory', 'is_in_yandex', 'mailtype', 'directctg']
    for column in replace_zeros:
        # Numeric 0 first, then the string '0'; the mode is re-evaluated for the
        # second pass, after the first replacement has been applied.
        for placeholder in (0, '0'):
            is_placeholder = df[column] == placeholder
            if is_placeholder.any():
                # Single .loc assignment instead of chained indexing
                # (df[column][mask] = ...), which may write to a temporary copy
                # and is a silent no-op under pandas Copy-on-Write.
                df.loc[is_placeholder, column] = df[column].mode()[0]

    # Ordinal code for the three known priority values; anything else becomes 0.
    df['priority'] = 1 * (df.priority == 7503.) + 2 * (df.priority == 7504.) + 3 * (df.priority == 7506.)

    for cat_feature in cat_features:
        df[cat_feature] = df[cat_feature].apply(str)

    # '0' is counted as "no goods"; otherwise count the comma-separated names.
    df['goods_count'] = df.name_mfi.apply(lambda name: 0 if name == '0' else len(name.split(',')))
    # clip(lower=1) avoids dividing by zero; the trailing factor forces the
    # ratio to 0 wherever the denominator was actually 0.
    df['price_by_weight'] = df.price_mfi / df.weight_mfi.clip(lower=1) * (df.weight_mfi != 0)
    df['price_by_number'] = df.price_mfi / df.goods_count.clip(lower=1) * (df.goods_count != 0)

    df.drop(columns=["id", "index_oper", "name_mfi"], inplace=True)
    df.drop(columns=['mailrank'], inplace=True)

    df['is_return'] = df.is_return == 'Y'

    return df

In [15]:
# prepare_data modifies the frame it receives (it drops columns in place) and
# returns that same frame, so re-running this cell on already-prepared frames
# fails on the missing columns — re-create the split first.
X_train = prepare_data(X_train)
X_val = prepare_data(X_val)

In [17]:
X_train.head(5) 

Unnamed: 0,type,priority,is_privatecategory,class,is_in_yandex,is_return,weight,mailtype,mailctg,directctg,...,is_wrong_phone_number,is_wrong_address,is_102976_operator,is_102971_operator,is_620984_operator,is_102998_operator,is_102968_operator,goods_count,price_by_weight,price_by_number
137672,ММПО,1,N,0.0,N,False,95.0,5.0,1.0,2.0,...,1,0,True,False,False,False,False,3,2.765957,86.666667
786837,Цех,1,N,0.0,Y,False,33.0,5.0,0.0,2.0,...,1,0,False,False,False,False,False,1,2.5,100.0
559710,П,1,N,2.0,Y,False,1734.0,5.0,1.0,2.0,...,0,0,False,False,False,False,False,1,1.282051,500.0
127788,Цех,3,N,0.0,N,False,98.0,5.0,1.0,2.0,...,0,0,False,False,False,False,False,2,2.823529,120.0
244109,Цех,3,N,0.0,N,False,545.0,5.0,1.0,2.0,...,0,0,False,False,False,False,False,1,1.219512,50.0


In [22]:
# Prepared feature names as a NumPy array, so subsets can be selected with a
# boolean mask (`columns[mask]`) in the ablation loop.
columns = np.array(X_train.columns)

[]  ->  0.8812535792387395
['type']  ->  0.8786821424582967
['priority']  ->  0.8833230087184781
['is_privatecategory']  ->  0.8803297013722126
['class']  ->  0.8824763585492358
['is_in_yandex']  ->  0.8817412619772349
['is_return']  ->  0.8808109459903265
['weight']  ->  0.8806085866047864
['mailtype']  ->  0.8812628644231855
['mailctg']  ->  0.8755337324039194
['directctg']  ->  0.8814802527001262
['transport_pay']  ->  0.8710717099823925
['postmark']  ->  0.8829611843079019
['weight_mfi']  ->  0.8822875698178863
['price_mfi']  ->  0.8815789300566043
['dist_qty_oper_login_1']  ->  0.882206783290521
['total_qty_oper_login_1']  ->  0.8811963727107334
['total_qty_oper_login_0']  ->  0.8802310240157348
['total_qty_over_index_and_type']  ->  0.882201773660054
['total_qty_over_index']  ->  0.8823426261741962
['is_wrong_sndr_name']  ->  0.8826472539725314
['is_wrong_rcpn_name']  ->  0.8821745877307303
['is_wrong_phone_number']  ->  0.8817884517232485
['is_wrong_address']  ->  0.882693017707

In [55]:
# Reference model: 100 depth-8 trees with balanced class weights, trained on
# all prepared features; progress is logged every 10 iterations.
model = CatBoostClassifier(
    iterations=100,
    max_depth=8,
    auto_class_weights="Balanced",
    cat_features=cat_features,
    random_state=0,
    verbose=10,
)
model.fit(X_train, y_train)

Learning rate set to 0.5
0:	learn: 0.3882718	total: 466ms	remaining: 46.1s
10:	learn: 0.2745439	total: 4.51s	remaining: 36.5s
20:	learn: 0.2690307	total: 8.04s	remaining: 30.2s
30:	learn: 0.2632536	total: 12.1s	remaining: 26.9s
40:	learn: 0.2546106	total: 16.9s	remaining: 24.3s
50:	learn: 0.2485691	total: 21.8s	remaining: 20.9s
60:	learn: 0.2420997	total: 26.6s	remaining: 17s
70:	learn: 0.2366152	total: 31.3s	remaining: 12.8s
80:	learn: 0.2312656	total: 36s	remaining: 8.44s
90:	learn: 0.2267284	total: 40.7s	remaining: 4.02s
99:	learn: 0.2228109	total: 44.8s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7fa00e797430>

In [56]:
score_model(model, X_val, y_val, 0.41)

0.8708833082757507

In [34]:
# (feature name, importance) pairs of the fitted model, most important first.
sorted(zip(X_train.columns, model.feature_importances_), key=lambda pair: -pair[1])

[('total_qty_oper_login_1', 14.292974182042817),
 ('type', 12.141837695401),
 ('total_qty_oper_login_0', 10.344742816009823),
 ('total_qty_over_index', 8.06173523532993),
 ('mailctg', 7.286907703502651),
 ('dist_qty_oper_login_1', 7.164508447826465),
 ('transport_pay', 6.111115737145688),
 ('total_qty_over_index_and_type', 5.430940025292454),
 ('weight_mfi', 4.173760764746747),
 ('price_mfi', 4.040431102234267),
 ('weight', 3.661518337461388),
 ('goods_count', 2.720502755151858),
 ('is_in_yandex', 2.6482048656364148),
 ('price_by_number', 2.3755297360133496),
 ('price_by_weight', 2.3295695202802955),
 ('is_wrong_phone_number', 1.931181302683001),
 ('is_102976_operator', 1.500712863414058),
 ('is_wrong_rcpn_name', 1.3927008776850887),
 ('is_102971_operator', 0.9061627618860806),
 ('is_620984_operator', 0.523075843844831),
 ('class', 0.3735305226942957),
 ('directctg', 0.3135162347461095),
 ('priority', 0.21774223531744022),
 ('is_return', 0.03506879110783479),
 ('is_privatecategory', 0.

In [None]:
# Feature ablation: with cnt = n + 1 the generator yields the empty mask (the
# baseline with every feature) followed by the n single-column masks, so each
# iteration retrains a small model with at most one feature removed and prints
# "<dropped columns>  ->  <validation score>".
for mask in mask_generator(columns.shape[0], columns.shape[0] + 1):
    dropped = columns[mask]
    print(dropped, end='  ->  ')
    X_train_reduced = X_train.drop(columns=dropped)
    X_val_reduced = X_val.drop(columns=dropped)
    # Filter instead of a set difference so the categorical features keep the
    # order of `cat_features` on every run.
    dropped_names = set(dropped)
    cat_features_reduced = [name for name in cat_features if name not in dropped_names]
    # Bound to its own name so the fitted reference `model` from the earlier
    # cell is not overwritten by the last ablation model.
    # NOTE(review): auto_class_weights="" is kept exactly as written; the
    # reference model uses "Balanced" — confirm which weighting is intended here.
    ablation_model = CatBoostClassifier(random_state=0, max_depth=8, verbose=0, iterations=30, auto_class_weights="", cat_features=cat_features_reduced)
    ablation_model.fit(X_train_reduced, y_train)
    print(score_model(ablation_model, X_val_reduced, y_val, 0.41))