In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score, roc_auc_score

from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier

import warnings
from copy import deepcopy
import gc
from collections import Counter

%matplotlib inline

In [2]:
def mask_generator(n, cnt):
    """Yield up to ``cnt`` distinct 0/1 masks of length ``n``.

    Masks come out in order of increasing number of ones, starting with the
    all-zero mask. Masks with the same number of ones are ordered
    lexicographically by the positions that are set, e.g. for ``n=3``:
    ``000, 100, 010, 001, 110, 101, 011, 111``.

    Every yielded mask is a freshly allocated array, so callers can collect
    them (``list(mask_generator(...))``) without later iterations
    overwriting earlier results.

    Args:
        n: Length of each mask.
        cnt: Maximum number of masks to yield. Nothing is yielded when
            ``cnt`` is not positive; at most ``2 ** n`` masks exist.

    Yields:
        One-dimensional ``numpy.ndarray`` of length ``n`` with integer dtype.
    """
    # Imported locally because the notebook's import cell does not provide it.
    from itertools import combinations

    if cnt <= 0:
        return

    yielded = 0
    for ones_count in range(n + 1):
        # combinations() enumerates index tuples in lexicographic order.
        for positions in combinations(range(n), ones_count):
            mask = np.zeros(n, dtype=int)
            if positions:
                mask[list(positions)] = 1
            yield mask
            yielded += 1
            if yielded >= cnt:
                return

In [3]:
# Suppress every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so library deprecation and data-handling
# warnings are hidden as well — consider narrowing it to specific categories.
warnings.filterwarnings('ignore') 

In [4]:
# Move the working directory one level up; the cells below use relative "data/..." paths.
# NOTE(review): not idempotent — re-running this cell climbs another level.
%cd ../

/home/chervovn04/Programming/hackathons/2022/digital_breakout_885303


In [5]:
# Read the raw train and test tables, and keep an independent copy of the test ids.
df = pd.read_csv("data/train.csv")
test_data = pd.read_csv("data/test.csv")
test_ids = test_data["id"].copy()

In [6]:
# Keep the test set's raw 'oper_type + oper_attr' column in its own one-column frame.
test_opers = test_data[['oper_type + oper_attr']]

In [7]:
# Preview the first three rows of the raw training frame.
df.head(3)

Unnamed: 0,id,oper_type + oper_attr,index_oper,type,priority,is_privatecategory,class,is_in_yandex,is_return,weight,...,dist_qty_oper_login_1,total_qty_oper_login_1,total_qty_oper_login_0,total_qty_over_index_and_type,total_qty_over_index,is_wrong_sndr_name,is_wrong_rcpn_name,is_wrong_phone_number,is_wrong_address,label
0,6818780,1043_-1,628629.0,Участок,7503.0,N,0.0,Y,N,87.0,...,42.0,720176.0,58950.0,779126.0,8290896.0,0,0,0,0,0
1,9907176,1023_-1,102976.0,ММПО,7503.0,N,0.0,N,N,107.0,...,914.0,48856658.0,83318932.0,132175590.0,136819803.0,0,0,0,0,0
2,3304275,1018_-1,620962.0,Цех,7503.0,N,0.0,Y,N,50.0,...,62.0,3246292.0,3233068.0,6479360.0,52708071.0,0,1,0,0,0


In [8]:
# Preview the first three rows of the raw test frame.
test_data.head(3)

Unnamed: 0,id,oper_type + oper_attr,index_oper,type,priority,is_privatecategory,class,is_in_yandex,is_return,weight,...,price_mfi,dist_qty_oper_login_1,total_qty_oper_login_1,total_qty_oper_login_0,total_qty_over_index_and_type,total_qty_over_index,is_wrong_sndr_name,is_wrong_rcpn_name,is_wrong_phone_number,is_wrong_address
0,7815282,8_13,102976.0,ММПО,7503.0,N,0.0,N,N,85.0,...,489.0,972.0,68766046.0,91123247.0,159889293.0,164927295.0,0,0,0,0
1,8443555,8_2,238753.0,ГОПС,7503.0,N,4.0,Y,N,21.0,...,186.0,2.0,2895.0,1545.0,4440.0,20623.0,0,1,1,0
2,6352559,1020_-1,618254.0,ГОПС,7503.0,N,4.0,Y,N,388.0,...,500.0,3.0,2751.0,993.0,3744.0,37817.0,0,0,0,0


In [9]:
# The ten most frequent index_oper values in the test set, most common first.
# Blank (' ') entries are counted as operator 0.
top_operators = [
    operator_id
    for operator_id, _ in Counter(
        test_data['index_oper'].replace(' ', '0').astype(float).astype(int)
    ).most_common(10)
]

In [10]:
# Separate the target from the features, then hold out 20% of the rows for validation.
y = df["label"]
X = df.drop(columns="label")
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# del [[X, df]]
# gc.collect()
# X=pd.DataFrame()
# df=pd.DataFrame()

In [11]:
cat_features = ["type", "is_privatecategory", "class", "is_in_yandex", "mailctg", "directctg", "mailtype"]

def prepare_data(df, operators=None):
    """Return a model-ready version of a raw train/validation/test frame.

    The input frame is left untouched; all work happens on a new frame.

    Steps, in order:
      1. Drop the 'oper_type + oper_attr' column.
      2. Add one boolean ``is_<operator>_operator`` column per entry of
         ``operators`` (blank index_oper values count as operator 0).
      3. In 'priority', 'is_privatecategory', 'is_in_yandex', 'mailtype' and
         'directctg', replace ``0`` / ``'0'`` with the column's most frequent
         value, computed on the frame being prepared.
      4. Encode 'priority' as 1 (7503), 2 (7504), 3 (7506) or 0 (anything else).
      5. Cast 'index_oper' to int (blank values become 1).
      6. Cast every column in ``cat_features`` to ``str``.
      7. Add 'goods_count': 0 when name_mfi is '0', otherwise the number of
         comma-separated items in it.
      8. Drop 'id', 'name_mfi' and 'mailrank'; turn 'is_return' into a boolean
         that is True for 'Y'.

    Args:
        df: Raw frame containing the columns referenced above.
        operators: Operator ids to create indicator columns for. Defaults to
            the notebook-level ``top_operators`` list.

    Returns:
        A new ``pandas.DataFrame`` with the prepared features.
    """
    if operators is None:
        operators = top_operators

    # drop() without inplace hands back a new frame, so the caller's data survives.
    df = df.drop(columns=["oper_type + oper_attr"])

    # Loop-invariant: convert the operator ids once rather than once per operator.
    operator_ids = df['index_oper'].replace(' ', '0').astype(float).astype(int)
    for operator in operators:
        df[f'is_{operator}_operator'] = operator_ids == operator

    replace_zeros = ['priority', 'is_privatecategory', 'is_in_yandex', 'mailtype', 'directctg']
    for column in replace_zeros:
        # .loc writes into the frame itself; chained indexing (df[col][mask] = ...)
        # may write to a temporary and is a silent no-op under Copy-on-Write.
        df.loc[df[column] == 0, column] = df[column].mode()[0]
        df.loc[df[column] == '0', column] = df[column].mode()[0]

    df['priority'] = (
        1 * (df['priority'] == 7503.)
        + 2 * (df['priority'] == 7504.)
        + 3 * (df['priority'] == 7506.)
    )

    df['index_oper'] = df['index_oper'].replace(' ', '1').astype(float).astype(int)

    for cat_feature in cat_features:
        df[cat_feature] = df[cat_feature].map(str)

    df['goods_count'] = df['name_mfi'].apply(lambda name: 0 if name == '0' else len(name.split(',')))

    df = df.drop(columns=["id", "name_mfi", "mailrank"])

    df['is_return'] = df['is_return'] == 'Y'

    return df

In [12]:
# Run the same feature preparation over the train, validation and test frames.
X_train, X_val, test_df = (
    prepare_data(X_train),
    prepare_data(X_val),
    prepare_data(test_data),
)

In [13]:
# Inspect the first rows of the prepared training features.
X_train.head()

Unnamed: 0,index_oper,type,priority,is_privatecategory,class,is_in_yandex,is_return,weight,mailtype,mailctg,...,is_102971_operator,is_620984_operator,is_102998_operator,is_102968_operator,is_102102_operator,is_102152_operator,is_140980_operator,is_420306_operator,is_200980_operator,goods_count
2766711,394962,Участок,1,N,0.0,Y,False,215.0,5.0,1.0,...,False,False,False,False,False,False,False,False,False,0
3781936,420302,Цех,1,N,0.0,Y,False,34.0,5.0,1.0,...,False,False,False,False,False,False,False,False,False,1
3824787,108971,Цех,3,N,0.0,N,False,78.0,5.0,1.0,...,False,False,False,False,False,False,False,False,False,1
4768159,108978,Участок,3,N,0.0,N,False,14.0,5.0,1.0,...,False,False,False,False,False,False,False,False,False,1
1873604,690967,Цех,1,N,0.0,Y,False,245.0,5.0,1.0,...,False,False,False,False,False,False,False,False,False,0


In [14]:
# Persist every artefact that the modelling cells further down reload from disk.
# The targets and test ids are written too, because those cells read
# 'data/y_train_prepared.csv', 'data/y_val_prepared.csv' and 'data/test_ids.csv'.
# index=False keeps features and targets aligned by row order.
X_train.to_csv('data/X_train_prepared.csv', index=False)
X_val.to_csv('data/X_val_prepared.csv', index=False)
test_df.to_csv('data/test_df_prepared.csv', index=False)
y_train.to_csv('data/y_train_prepared.csv', index=False)
y_val.to_csv('data/y_val_prepared.csv', index=False)
test_ids.to_csv('data/test_ids.csv', index=False)

In [5]:
cat_features = [
    "type",
    "is_privatecategory",
    "class",
    "is_in_yandex",
    "mailctg",
    "directctg",
    "mailtype",
]

def to_cat_features(df):
    """Cast every column named in ``cat_features`` to ``str``, in place.

    The frame passed in is modified directly; nothing is returned.
    """
    for column in cat_features:
        df[column] = df[column].map(str)
        
# Reload the prepared features, targets and test ids from disk.
X_train, y_train, X_val, y_val, test_df, test_ids = (
    pd.read_csv(path)
    for path in (
        'data/X_train_prepared.csv',
        'data/y_train_prepared.csv',
        'data/X_val_prepared.csv',
        'data/y_val_prepared.csv',
        'data/test_df_prepared.csv',
        'data/test_ids.csv',
    )
)

# Cast the categorical columns of each reloaded frame to str; to_cat_features
# modifies the frame it is given. Presumably needed because reading the CSVs
# back re-infers numeric dtypes for some of these columns — TODO confirm.
to_cat_features(X_train)
to_cat_features(X_val)
to_cat_features(test_df)

In [6]:
# Classifier setup: 100 boosting iterations, depth-8 trees, balanced class
# weights, progress logged every 10 iterations, fixed seed.
model = CatBoostClassifier(
    iterations=100,
    max_depth=8,
    auto_class_weights="Balanced",
    cat_features=cat_features,
    random_state=0,
    verbose=10,
)

In [7]:
# Train on the prepared training split.
model.fit(X_train, y_train)

Learning rate set to 0.5
0:	learn: 0.3863953	total: 1.89s	remaining: 3m 6s
10:	learn: 0.2750081	total: 19.1s	remaining: 2m 34s
20:	learn: 0.2699640	total: 35.7s	remaining: 2m 14s
30:	learn: 0.2681214	total: 50.7s	remaining: 1m 52s
40:	learn: 0.2657576	total: 1m 7s	remaining: 1m 36s
50:	learn: 0.2643680	total: 1m 23s	remaining: 1m 20s
60:	learn: 0.2630543	total: 1m 40s	remaining: 1m 4s
70:	learn: 0.2616516	total: 1m 57s	remaining: 47.8s
80:	learn: 0.2605686	total: 2m 13s	remaining: 31.4s
90:	learn: 0.2595191	total: 2m 30s	remaining: 14.8s
99:	learn: 0.2586633	total: 2m 44s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7f7c3a2b7c40>

In [8]:
def score_model(model, X, y, threshold=0.5):  # 0.41 is the best for now
    """Score a fitted classifier as ``0.1 * recall + 0.9 * roc_auc``.

    Positive-class probabilities from ``model.predict_proba`` are turned into
    hard labels with ``threshold``. Both metrics are then computed on those
    labels: macro-averaged recall and ROC AUC.

    NOTE(review): ROC AUC is fed the thresholded labels, not the
    probabilities, so it depends on ``threshold`` — confirm this mirrors the
    target metric.

    Args:
        model: Fitted estimator exposing ``predict_proba``.
        X: Feature frame to score.
        y: True labels for ``X``.
        threshold: Probability above which a row is predicted positive.

    Returns:
        The weighted score as a float.
    """
    positive_proba = model.predict_proba(X)[:, 1]
    hard_labels = positive_proba > threshold
    macro_recall = recall_score(y, hard_labels, average="macro")
    auc = roc_auc_score(y, hard_labels, multi_class='ovo')
    return 0.1 * macro_recall + 0.9 * auc

In [9]:
# Validation score at a 0.41 decision threshold.
score_model(model, X_val, y_val, 0.41)

0.8843310448636955

In [10]:
# Hard 0/1 test predictions: positive when the class-1 probability exceeds 0.41,
# the same threshold used for the validation score.
test_preds = (model.predict_proba(test_df)[:, 1] > 0.41).astype(int)

In [11]:
# Pair each test id with its predicted label.
submission = pd.DataFrame({'id': test_ids.id, 'label': test_preds})

In [12]:
# Write the submission file without the index column.
submission.to_csv("submission.csv", index=False)

In [13]:
# Fraction of test rows predicted as class 1.
test_preds.mean()

0.2046925

In [14]:
# Feature importances paired with their column names, largest first.
sorted(
    zip(X_train.columns, model.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)

[('total_qty_oper_login_1', 16.23057086258367),
 ('index_oper', 15.873213423932373),
 ('total_qty_oper_login_0', 13.873488752859357),
 ('type', 10.402528333661735),
 ('dist_qty_oper_login_1', 8.013567630482589),
 ('transport_pay', 5.780142449926358),
 ('mailctg', 5.6566503771520384),
 ('total_qty_over_index', 4.820286118555671),
 ('goods_count', 3.2933030245793575),
 ('weight', 2.429991703817829),
 ('total_qty_over_index_and_type', 2.4184710855123166),
 ('weight_mfi', 1.8647883916842143),
 ('priority', 1.7607447717553208),
 ('price_mfi', 1.6070805532669663),
 ('is_102968_operator', 1.3419936140977904),
 ('is_wrong_phone_number', 1.2493553981479892),
 ('is_in_yandex', 0.9850759964551408),
 ('is_wrong_rcpn_name', 0.8179277050032154),
 ('is_102976_operator', 0.6177223698145542),
 ('is_102971_operator', 0.35954805216281643),
 ('is_620984_operator', 0.19853591193065057),
 ('directctg', 0.19232668424840557),
 ('class', 0.15037098658013892),
 ('is_return', 0.0240172880024046),
 ('is_wrong_snd