In [1]:
import gc
import itertools
import warnings
from collections import Counter
from copy import deepcopy

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score, roc_auc_score
from sklearn.model_selection import train_test_split

%matplotlib inline

In [2]:
def mask_generator(n, cnt):
    """Yield up to ``cnt`` boolean masks of length ``n``, fewest ``True`` entries first.

    The all-``False`` mask comes first, then every mask with exactly one
    ``True``, then every mask with two, and so on.  Inside one group the
    ``True`` positions advance lexicographically (for ``n=3`` and two ones:
    ``110``, ``101``, ``011``).

    Every yielded array is a fresh object, so the masks stay valid when they
    are collected (``list(mask_generator(n, cnt))``) rather than consumed one
    at a time.

    Args:
        n: Length of every mask.
        cnt: Maximum number of masks to produce.  Values ``<= 0`` produce
            nothing; at most ``2 ** n`` masks exist, so larger values simply
            exhaust the generator.

    Yields:
        ``numpy.ndarray`` of dtype ``bool`` and shape ``(n,)``.
    """
    if cnt <= 0:
        return

    # combinations() enumerates index tuples in lexicographic order; chaining
    # the groups by size gives "no ones, one one, two ones, ..." lazily.
    position_sets = itertools.chain.from_iterable(
        itertools.combinations(range(n), ones_count) for ones_count in range(n + 1)
    )
    for positions in itertools.islice(position_sets, cnt):
        mask = np.zeros(n, dtype=bool)
        # Explicit integer dtype keeps the empty tuple a valid (empty) index.
        mask[np.array(positions, dtype=np.intp)] = True
        yield mask

In [3]:
def score_model(model, X, y, threshold=0.5):
    """Blend macro recall and ROC AUC of the model's thresholded predictions.

    The positive-class probability (second column of ``predict_proba``) is cut
    at ``threshold`` to obtain hard labels; both metrics are then computed on
    those labels and combined as ``0.1 * recall + 0.9 * roc_auc``.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        X: Feature matrix to score.
        y: True labels aligned with ``X``.
        threshold: Probability cut-off for the positive class.  0.41 was the
            best value found so far.

    Returns:
        The blended score as a float.
    """
    positive_proba = model.predict_proba(X)[:, 1]
    hard_labels = positive_proba > threshold
    macro_recall = recall_score(y, hard_labels, average="macro")
    label_auc = roc_auc_score(y, hard_labels, multi_class='ovo')
    return 0.1 * macro_recall + 0.9 * label_auc

def score_model_probs(model, X, y):
    """Return the ROC AUC of the model's positive-class probabilities.

    No threshold is applied: the second column of ``predict_proba`` is passed
    to ``roc_auc_score`` as-is.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        X: Feature matrix to score.
        y: True labels aligned with ``X``.

    Returns:
        ROC AUC as a float.
    """
    positive_proba = model.predict_proba(X)[:, 1]
    return roc_auc_score(y, positive_proba, multi_class='ovo')

In [4]:
# Step one directory up so the relative "data/..." path used when loading the
# CSV resolves.  The move is relative, so every re-run of this cell climbs one
# more level; restart the kernel before running it again.
%cd ../

/home/chervovn04/Programming/hackathons/2022/digital_breakout_885303


In [5]:
# `warnings` is already imported in the first cell; the import is repeated here.
# The filter below suppresses every warning category for the rest of the session.
import warnings
warnings.filterwarnings('ignore')

In [6]:
# Load the training sample; the path is relative to the working directory set
# by the `%cd` cell.
df = pd.read_csv("data/small_train.csv")

In [7]:
# Peek at the first rows of the raw frame.
df.head(3)

Unnamed: 0,id,oper_type + oper_attr,index_oper,type,priority,is_privatecategory,class,is_in_yandex,is_return,weight,...,dist_qty_oper_login_1,total_qty_oper_login_1,total_qty_oper_login_0,total_qty_over_index_and_type,total_qty_over_index,is_wrong_sndr_name,is_wrong_rcpn_name,is_wrong_phone_number,is_wrong_address,label
0,2793601,14_1,102976.0,ММПО,7503.0,N,0.0,N,N,44.0,...,1089.0,64270133.0,116432632.0,180702765.0,188407812.0,0,0,0,0,0
1,8457088,8_2,614070.0,ГОПС,7503.0,N,3.0,Y,N,20.0,...,15.0,15988.0,3565.0,19553.0,532681.0,0,0,1,0,0
2,9639638,8_13,102976.0,ММПО,7503.0,N,0.0,N,N,122.0,...,914.0,48856658.0,83318932.0,132175590.0,136819803.0,0,0,0,0,0


In [8]:
def get_labels(col, label):
    """Print the row count and mean ``label`` for each distinct value of ``col``.

    One line is printed per value returned by ``col.unique()``:
    ``<value> <rows with that value> <sum of label over those rows / rows>``.

    Args:
        col: Series whose distinct values define the groups.
        label: Series aligned with ``col`` holding the target.
    """
    for distinct in col.unique():
        rows = col == distinct
        n_rows = rows.sum()
        print(distinct, n_rows, label[rows].sum() / n_rows)

In [9]:
# Class balance check: grouping the target by itself prints each class with its row count.
get_labels(df['label'], df['label'])

0 1165592 0.0
1 34408 1.0


In [10]:
# The five most frequent operator indices in the whole frame; a blank
# `index_oper` (' ') is counted as operator 0.
top_operators = [
    operator
    for operator, _ in Counter(
        df['index_oper'].replace(' ', '0').astype(float).astype(int)
    ).most_common(5)
]

In [11]:
# Separate the target from the features and hold out 20% of the rows for validation.
y = df["label"]
X = df.drop(columns="label")
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=0, test_size=0.2
)

In [12]:
# Inspect how pandas parsed the `is_privatecategory` column.
X.is_privatecategory.dtype

dtype('O')

In [13]:
# Columns handed to CatBoost as categorical; prepare_data() casts them to str.
cat_features = ["type", "is_privatecategory", "class", "is_in_yandex", "mailctg", "directctg", "mailtype"]


def _count_goods(name):
    """Count comma-separated items in a ``name_mfi`` cell; '0' or a non-string counts as none."""
    if not isinstance(name, str) or name == '0':
        return 0
    return len(name.split(','))


def prepare_data(df, operators=None):
    """Build the model-ready feature frame from a raw slice of the training data.

    The input frame is left untouched; a new frame is returned.

    Steps, in order:
      * add one boolean ``is_<operator>_operator`` column per entry of
        ``operators`` (a blank ``index_oper`` is treated as operator 0 here);
      * in ``priority``, ``is_privatecategory``, ``is_in_yandex``, ``mailtype``
        and ``directctg`` replace the placeholders ``0`` / ``'0'`` with the
        column's most frequent value;
      * recode ``priority`` 7503 / 7504 / 7506 to 1 / 2 / 3 (anything else 0);
      * cast ``index_oper`` to int (a blank becomes 1);
      * cast every column listed in ``cat_features`` to ``str``;
      * add ``goods_count``, the number of comma-separated items in ``name_mfi``;
      * drop ``oper_type + oper_attr``, ``id``, ``name_mfi``,
        ``dist_qty_oper_login_1`` and ``mailrank``;
      * turn ``is_return`` into a boolean (``'Y'`` -> ``True``).

    Args:
        df: Raw feature frame containing the columns named above.
        operators: Operator indices to flag.  Defaults to the notebook-level
            ``top_operators`` list.

    Returns:
        A new ``DataFrame`` with the transformed features.

    Raises:
        KeyError: If a required column is missing, e.g. when the function is
            applied to an already prepared frame.
    """
    if operators is None:
        operators = top_operators

    # Work on a private copy so the caller's frame (often a slice produced by
    # train_test_split) is never modified.
    prepared = df.copy()

    # Parse index_oper once instead of once per operator.
    operator_index = prepared['index_oper'].replace(' ', '0').astype(float).astype(int)
    for operator in operators:
        prepared[f'is_{operator}_operator'] = operator_index == operator

    for column in ['priority', 'is_privatecategory', 'is_in_yandex', 'mailtype', 'directctg']:
        values = prepared[column]
        placeholder = (values == 0) | (values == '0')
        if placeholder.any():
            # Single .loc assignment: chained indexing (df[col][mask] = ...)
            # is unreliable and a no-op under pandas Copy-on-Write.
            prepared.loc[placeholder, column] = values.mode()[0]

    priority = prepared['priority']
    prepared['priority'] = 1 * (priority == 7503.) + 2 * (priority == 7504.) + 3 * (priority == 7506.)

    # NOTE(review): a blank maps to 1 here but to 0 for the operator flags
    # above; this mirrors the original behaviour - confirm it is intended.
    prepared['index_oper'] = prepared['index_oper'].replace(' ', '1').astype(float).astype(int)

    for cat_feature in cat_features:
        prepared[cat_feature] = prepared[cat_feature].apply(str)

    prepared['goods_count'] = prepared['name_mfi'].apply(_count_goods)

    prepared = prepared.drop(
        columns=["oper_type + oper_attr", "id", "name_mfi", "dist_qty_oper_login_1", "mailrank"]
    )

    prepared['is_return'] = prepared['is_return'] == 'Y'

    return prepared

In [14]:
# Run both splits through the same feature pipeline.  The names are rebound to
# the prepared frames, so re-running this cell feeds prepared data back in and
# fails on the columns that were already dropped.
X_train, X_val = prepare_data(X_train), prepare_data(X_val)

In [15]:
# Peek at the prepared training features.
X_train.head(5)

Unnamed: 0,index_oper,type,priority,is_privatecategory,class,is_in_yandex,is_return,weight,mailtype,mailctg,...,is_wrong_sndr_name,is_wrong_rcpn_name,is_wrong_phone_number,is_wrong_address,is_102976_operator,is_102971_operator,is_620984_operator,is_102998_operator,is_102968_operator,goods_count
137672,102976,ММПО,1,N,0.0,N,False,95.0,5.0,1.0,...,0,0,1,0,True,False,False,False,False,3
786837,140980,Цех,1,N,0.0,Y,False,33.0,5.0,0.0,...,0,0,1,0,False,False,False,False,False,1
559710,360000,П,1,N,2.0,Y,False,1734.0,5.0,1.0,...,0,0,0,0,False,False,False,False,False,1
127788,102102,Цех,3,N,0.0,N,False,98.0,5.0,1.0,...,0,0,0,0,False,False,False,False,False,2
244109,102102,Цех,3,N,0.0,N,False,545.0,5.0,1.0,...,0,0,0,0,False,False,False,False,False,1


In [16]:
# Baseline: class-balanced CatBoost on the prepared training frame.
model = CatBoostClassifier(
    iterations=100,
    max_depth=8,
    auto_class_weights="Balanced",
    cat_features=cat_features,
    random_state=0,
    verbose=10,
)
model.fit(X_train, y_train)

Learning rate set to 0.5
0:	learn: 0.4004887	total: 419ms	remaining: 41.5s
10:	learn: 0.2811177	total: 3.6s	remaining: 29.1s
20:	learn: 0.2742479	total: 6.71s	remaining: 25.3s
30:	learn: 0.2706186	total: 9.61s	remaining: 21.4s
40:	learn: 0.2656841	total: 12.8s	remaining: 18.4s
50:	learn: 0.2615171	total: 16s	remaining: 15.3s
60:	learn: 0.2575713	total: 19.1s	remaining: 12.2s
70:	learn: 0.2545579	total: 22.3s	remaining: 9.11s
80:	learn: 0.2509518	total: 25.8s	remaining: 6.06s
90:	learn: 0.2481281	total: 29.4s	remaining: 2.91s
99:	learn: 0.2458057	total: 32.4s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7f74691df670>

In [17]:
# Validation metrics: blended score at the 0.41 cut-off, then threshold-free ROC AUC.
print(score_model(model, X_val, y_val, threshold=0.41))
print(score_model_probs(model, X_val, y_val))

0.8783960081816737
0.9451670114831643


In [34]:
# Rank features by importance, highest first.  The names come from the fitted
# model itself rather than from X_train.columns, so each importance stays
# paired with the right feature even if `model` or `X_train` has been rebound
# to a different column set since training.
sorted(zip(model.feature_names_, model.feature_importances_), key=lambda pair: pair[1], reverse=True)

[('total_qty_oper_login_1', 16.153419389571567),
 ('type', 11.736205829665298),
 ('dist_qty_oper_login_1', 9.517529254490938),
 ('total_qty_oper_login_0', 9.022842629139694),
 ('transport_pay', 6.384577113567003),
 ('total_qty_over_index', 6.212506271355333),
 ('weight', 5.504152879779317),
 ('index_oper', 5.471207432554614),
 ('mailctg', 5.082134844114865),
 ('price_mfi', 4.568811078714823),
 ('is_in_yandex', 4.0333794563436065),
 ('total_qty_over_index_and_type', 3.7561549414510864),
 ('goods_count', 3.052060870150288),
 ('weight_mfi', 2.629642802060001),
 ('is_wrong_phone_number', 2.027617103472018),
 ('is_102971_operator', 1.230029229556383),
 ('class', 1.139059339835674),
 ('is_wrong_rcpn_name', 1.1382379372316962),
 ('is_620984_operator', 0.6013477964229106),
 ('priority', 0.3133496325830855),
 ('directctg', 0.25871912516720325),
 ('is_102976_operator', 0.09946835833638423),
 ('is_return', 0.02888056468783035),
 ('is_wrong_sndr_name', 0.019059033306320812),
 ('is_wrong_address', 

In [None]:
# Ablation study: retrain a small CatBoost model with a subset of `columns`
# removed and print the validation score for each subset.  Asking for
# len(columns) + 1 masks yields the empty mask (nothing dropped) followed by
# every single-column mask.
# NOTE(review): `columns` is not defined in any cell shown above; it is indexed
# with a boolean mask and exposes `.shape`, so it is presumably a NumPy array or
# pandas Index of candidate column names - define it before running this cell.
# NOTE(review): `model` is rebound on every iteration, replacing the baseline
# model fitted earlier.
for mask in mask_generator(columns.shape[0], columns.shape[0] + 1):
    dropped = columns[mask]
    print(dropped, end='  ->  ')
    X_train_reduced = X_train.drop(columns=dropped)
    X_val_reduced = X_val.drop(columns=dropped)
    # Preserve the order of cat_features (a set difference would shuffle it between runs).
    dropped_names = set(dropped)
    cat_features_reduced = [name for name in cat_features if name not in dropped_names]
    # The class weighting matches the baseline model so the scores at the 0.41
    # threshold are comparable; the previous empty string is not one of
    # CatBoost's documented auto_class_weights values.
    model = CatBoostClassifier(
        random_state=0,
        max_depth=8,
        verbose=0,
        iterations=30,
        auto_class_weights="Balanced",
        cat_features=cat_features_reduced,
    )
    model.fit(X_train_reduced, y_train)
    print(score_model(model, X_val_reduced, y_val, 0.41))