In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score, roc_auc_score
from sklearn.metrics import confusion_matrix

from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier

import warnings
from copy import deepcopy
import gc
from collections import Counter

%matplotlib inline

In [2]:
# Change the working directory to the parent folder; the cells below read and
# write files through relative paths such as "data/train.csv".
%cd ../

/home/chervovn04/Programming/hackathons/2022/digital_breakout_885303


In [3]:
# Load the raw training and test tables.
df = pd.read_csv("data/train.csv")
test_data = pd.read_csv("data/test.csv")
# Keep an independent (deep) copy of the test ids.
test_ids = test_data["id"].copy(deep=True)

  df = pd.read_csv("data/train.csv")
  test_data = pd.read_csv("data/test.csv")


In [4]:
# Preview the first three rows of the training table.
df.head(3)

Unnamed: 0,id,oper_type + oper_attr,index_oper,type,priority,is_privatecategory,class,is_in_yandex,is_return,weight,...,dist_qty_oper_login_1,total_qty_oper_login_1,total_qty_oper_login_0,total_qty_over_index_and_type,total_qty_over_index,is_wrong_sndr_name,is_wrong_rcpn_name,is_wrong_phone_number,is_wrong_address,label
0,6818780,1043_-1,628629.0,Участок,7503.0,N,0.0,Y,N,87.0,...,42.0,720176.0,58950.0,779126.0,8290896.0,0,0,0,0,0
1,9907176,1023_-1,102976.0,ММПО,7503.0,N,0.0,N,N,107.0,...,914.0,48856658.0,83318932.0,132175590.0,136819803.0,0,0,0,0,0
2,3304275,1018_-1,620962.0,Цех,7503.0,N,0.0,Y,N,50.0,...,62.0,3246292.0,3233068.0,6479360.0,52708071.0,0,1,0,0,0


In [5]:
# Preview the first three rows of the test table.
test_data.head(3)

Unnamed: 0,id,oper_type + oper_attr,index_oper,type,priority,is_privatecategory,class,is_in_yandex,is_return,weight,...,price_mfi,dist_qty_oper_login_1,total_qty_oper_login_1,total_qty_oper_login_0,total_qty_over_index_and_type,total_qty_over_index,is_wrong_sndr_name,is_wrong_rcpn_name,is_wrong_phone_number,is_wrong_address
0,7815282,8_13,102976.0,ММПО,7503.0,N,0.0,N,N,85.0,...,489.0,972.0,68766046.0,91123247.0,159889293.0,164927295.0,0,0,0,0
1,8443555,8_2,238753.0,ГОПС,7503.0,N,4.0,Y,N,21.0,...,186.0,2.0,2895.0,1545.0,4440.0,20623.0,0,1,1,0
2,6352559,1020_-1,618254.0,ГОПС,7503.0,N,4.0,Y,N,388.0,...,500.0,3.0,2751.0,993.0,3744.0,37817.0,0,0,0,0


In [6]:
# The ten most frequent `index_oper` values in the test table, most frequent
# first. Blank (' ') entries are counted as operator 0.
top_operators = [
    operator
    for operator, _ in Counter(
        test_data['index_oper'].replace(' ', '0').astype(float).astype(int)
    ).most_common(10)
]

In [7]:
# Separate the target column from the features, then hold out 20% of the rows
# for validation (fixed seed so the split is reproducible).
y = df["label"]
X = df.drop(columns="label")
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [8]:
cat_features = ["type", "is_privatecategory", "class", "is_in_yandex", "mailctg", "directctg", "mailtype"]

def prepare_data(df, operators=None, categorical=None):
    """Return a prepared copy of ``df``; the input frame is left untouched.

    Steps, in order:
      1. drop the "oper_type + oper_attr" column;
      2. add one boolean ``is_<operator>_operator`` column per operator id;
      3. recode ``priority`` (7503 -> 1, 7504 -> 2, 7506 -> 3, anything else -> 0);
      4. drop "id", "index_oper", "name_mfi" and "mailrank";
      5. turn ``is_return`` into a boolean (True where the value is 'Y');
      6. convert every categorical column to strings.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing all the columns named above.
    operators : iterable of int, optional
        Operator ids to build indicator columns for. Defaults to the
        notebook-level ``top_operators``.
    categorical : iterable of str, optional
        Columns to convert to strings. Defaults to the notebook-level
        ``cat_features``.

    Returns
    -------
    pandas.DataFrame
        A new frame. Because the input is no longer mutated, the function can
        be re-run on the same input and the caller keeps its "id" column.
    """
    if operators is None:
        operators = top_operators
    if categorical is None:
        categorical = cat_features

    # `drop` without `inplace` returns a new frame, so every assignment below
    # affects only the local result and never the caller's data.
    df = df.drop(columns=["oper_type + oper_attr"])

    # Convert the operator index once instead of once per operator.
    # Blank (' ') entries are treated as operator 0.
    operator_index = df['index_oper'].replace(' ', '0').astype(float).astype(int)
    for operator in operators:
        df[f'is_{operator}_operator'] = operator_index == operator

    df["priority"] = 1 * (df.priority == 7503.0) + 2 * (df.priority == 7504.0) + 3 * (df.priority == 7506.0)

    # df['unknown_opers_ratio'] = df.total_qty_oper_login_0 / df.total_qty_over_index_and_type.clip(lower=1)
    # df['certain_mailtype_ratio'] =  df.total_qty_over_index_and_type / df.total_qty_over_index.clip(lower=1)

    df = df.drop(columns=["id", "index_oper", "name_mfi", "mailrank"])

    df["is_return"] = df.is_return == 'Y'

    for cat_feature in categorical:
        df[cat_feature] = df[cat_feature].map(str)

    return df

In [9]:
# Run the same preparation on the training, validation and test frames.
X_train = prepare_data(X_train)
X_val = prepare_data(X_val)
test_df = prepare_data(test_data)

In [10]:
# Inspect the first rows of the prepared training features.
X_train.head()

Unnamed: 0,type,priority,is_privatecategory,class,is_in_yandex,is_return,weight,mailtype,mailctg,directctg,...,is_102976_operator,is_102971_operator,is_620984_operator,is_102998_operator,is_102968_operator,is_102102_operator,is_102152_operator,is_140980_operator,is_420306_operator,is_200980_operator
2766711,Участок,1,N,0.0,Y,False,215.0,5.0,1.0,2.0,...,False,False,False,False,False,False,False,False,False,False
3781936,Цех,1,N,0.0,Y,False,34.0,5.0,1.0,2.0,...,False,False,False,False,False,False,False,False,False,False
3824787,Цех,3,N,0.0,N,False,78.0,5.0,1.0,2.0,...,False,False,False,False,False,False,False,False,False,False
4768159,Участок,3,N,0.0,N,False,14.0,5.0,1.0,2.0,...,False,False,False,False,False,False,False,False,False,False
1873604,Цех,1,N,0.0,Y,False,245.0,5.0,1.0,2.0,...,False,False,False,False,False,False,False,False,False,False


In [11]:
# For each prepared column print its name, its number of distinct values and
# the share of rows held by each of its ten most frequent values, followed by
# a blank separator line.
for column in X_train.columns:
    print(
        column,
        X_train[column].nunique(),
        '\n'.join(
            f"{value!s} | {hits / X_train.shape[0]!s}"
            for value, hits in Counter(X_train[column]).most_common(10)
        ),
        "",
        sep="\n",
    )

type
20
Цех | 0.3488125
ММПО | 0.26914145833333336
Участок | 0.15447354166666666
ГОПС | 0.07737270833333333
ТИ | 0.05089770833333333
П | 0.041024375
МСЦ | 0.037173333333333336
СОПС | 0.012051458333333334
СЦ | 0.0020733333333333333
ОП | 0.00157625

priority
4
1 | 0.6652016666666667
3 | 0.29063083333333334
2 | 0.043621458333333335
0 | 0.0005460416666666667

is_privatecategory
3
N | 0.9990822916666666
Y | 0.00046916666666666667
0 | 0.00044854166666666664

class
6
0.0 | 0.868975625
3.0 | 0.062074375
2.0 | 0.025088958333333335
4.0 | 0.02195333333333333
1.0 | 0.019014375
5.0 | 0.0028933333333333333

is_in_yandex
3
N | 0.5976210416666666
Y | 0.4019304166666667
0 | 0.00044854166666666664

is_return
2
False | 0.9988310416666667
True | 0.0011689583333333333

weight
2806
30.0 | 0.006743125
27.0 | 0.006570208333333333
40.0 | 0.006527916666666666
28.0 | 0.006475208333333334
60.0 | 0.006428125
29.0 | 0.006399166666666667
26.0 | 0.006373958333333333
20.0 | 0.006341041666666667
25.0 | 0.006261875
70.0

In [12]:
# Persist the prepared feature frames so modelling can start from these files.
X_train.to_csv('data/X_train_prepared.csv', index=False)
X_val.to_csv('data/X_val_prepared.csv', index=False)
test_df.to_csv('data/test_df_prepared.csv', index=False)

# Also persist the targets and the test ids: the modelling cell reloads
# data/y_train_prepared.csv, data/y_val_prepared.csv and data/test_ids.csv,
# so they must be written here for a fresh run to succeed.
# Rows are written in the same order as the matching feature frames.
y_train.to_csv('data/y_train_prepared.csv', index=False)
y_val.to_csv('data/y_val_prepared.csv', index=False)
test_ids.to_csv('data/test_ids.csv', index=False)

In [3]:
cat_features = [
    "type",
    "priority",
    "is_privatecategory",
    "class",
    "is_in_yandex",
    "mailctg",
    "directctg",
    "mailtype",
]

def to_cat_features(df):
    """Convert every column listed in ``cat_features`` to strings, in place.

    The frame passed in is modified directly and nothing is returned.
    """
    for column_name in cat_features:
        df[column_name] = df[column_name].map(str)
        
# Reload the prepared features, targets and test ids from disk.
X_train = pd.read_csv('data/X_train_prepared.csv')
y_train = pd.read_csv('data/y_train_prepared.csv')
X_val = pd.read_csv('data/X_val_prepared.csv')
y_val = pd.read_csv('data/y_val_prepared.csv')
test_df = pd.read_csv('data/test_df_prepared.csv')
test_ids = pd.read_csv('data/test_ids.csv')

# Convert the categorical columns of each reloaded frame to strings, in place.
to_cat_features(X_train)
to_cat_features(X_val)
to_cat_features(test_df)

In [4]:
# CatBoost classifier: fixed seed, depth-8 trees, 100 boosting iterations,
# "Balanced" automatic class weights, progress logged every 10 iterations,
# and the columns in `cat_features` treated as categorical.
model = CatBoostClassifier(random_state=0, max_depth=8, verbose=10, iterations=100, auto_class_weights="Balanced", cat_features=cat_features)

In [None]:
# Fit on the training split only; no evaluation set is passed here.
model.fit(X_train, y_train)

  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


Learning rate set to 0.5
0:	learn: 0.3823949	total: 2.17s	remaining: 3m 35s
10:	learn: 0.2766858	total: 23.4s	remaining: 3m 8s
20:	learn: 0.2723407	total: 41.5s	remaining: 2m 36s
30:	learn: 0.2704094	total: 58.4s	remaining: 2m 9s
40:	learn: 0.2680534	total: 1m 17s	remaining: 1m 51s
50:	learn: 0.2662735	total: 1m 36s	remaining: 1m 33s
60:	learn: 0.2649147	total: 1m 56s	remaining: 1m 14s
70:	learn: 0.2637173	total: 2m 15s	remaining: 55.3s
80:	learn: 0.2627047	total: 2m 34s	remaining: 36.3s


In [None]:
def score_model(model, X, y, threshold=0.5):  # 0.41 is the best for now
    """Score a binary classifier at a fixed probability threshold.

    The score is ``0.1 * recall + 0.9 * (1 + TPR - FPR) / 2``, where a row is
    predicted positive when its class-1 probability is strictly greater than
    ``threshold``.

    Parameters
    ----------
    model : object
        Anything exposing ``predict_proba(X)`` that returns an (n, 2) array.
    X : array-like
        Features passed unchanged to ``model.predict_proba``.
    y : array-like
        True labels, 0 for negative and 1 for positive. A one-column frame is
        accepted and flattened.
    threshold : float, default 0.5
        Decision threshold applied to the class-1 probability.

    Returns
    -------
    float
        The combined score. A rate whose denominator is zero (no positives or
        no negatives in ``y``) is taken as 0.0 instead of failing.
    """
    predicted = np.asarray(model.predict_proba(X))[:, 1] > threshold
    actual = np.asarray(y).ravel() == 1

    # Count the four confusion-matrix cells directly so the result is always
    # well defined, even when only one class is present.
    tp = int(np.sum(predicted & actual))
    fn = int(np.sum(~predicted & actual))
    fp = int(np.sum(predicted & ~actual))
    tn = int(np.sum(~predicted & ~actual))

    tpr = tp / (tp + fn) if (tp + fn) else 0.0
    fpr = fp / (fp + tn) if (fp + tn) else 0.0

    r_score = tpr  # recall is the true-positive rate
    ra_score = (1 + tpr - fpr) / 2

    return 0.1 * r_score + 0.9 * ra_score

In [None]:
# Validation score at the 0.41 threshold (the same value is used for the test predictions).
score_model(model, X_val, y_val, 0.41)

In [None]:
# Pair each training column with its importance and list them from the most
# to the least important.
sorted(
    zip(X_train.columns, model.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)

In [None]:
# Hard 0/1 predictions for the test set: positive when the class-1
# probability exceeds 0.41.
test_preds = (model.predict_proba(test_df)[:, 1] > 0.41).astype(int)

In [None]:
# Build the submission from `test_ids` rather than `test_data.id`.
# `test_data` is not loaded in the modelling session, and `prepare_data` drops
# its "id" column in the preparation session.
# `test_ids` is a Series when taken straight from the raw test table and a
# DataFrame when reloaded from data/test_ids.csv.
# NOTE(review): assumes the reloaded file has an "id" column; confirm.
submission_ids = test_ids["id"] if isinstance(test_ids, pd.DataFrame) else test_ids
submission = pd.DataFrame({'id': submission_ids, 'label': test_preds})

In [None]:
# Write the submission file without the index column.
submission.to_csv("submission.csv", index=False)

In [None]:
# Share of test rows predicted positive (mean of the 0/1 predictions).
test_preds.mean()