In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score, roc_auc_score, make_scorer

from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier

from functools import reduce
import warnings
from copy import deepcopy
import gc
from collections import Counter

%matplotlib inline

In [3]:
def mask_generator(n, cnt):
    """Yield up to ``cnt`` distinct 0/1 masks of length ``n``.

    Masks come out in order of increasing number of ones, starting with the
    all-zero mask.  Within one group of equal popcount the positions of the
    ones advance in lexicographic order, e.g. for ``n=3``::

        000, 100, 010, 001, 110, 101, 011, 111

    Every yielded mask is a freshly allocated array, so the results can be
    collected (``list(mask_generator(...))``) without a later mask
    overwriting an earlier one.

    Args:
        n: Length of each mask.
        cnt: Maximum number of masks to yield.  Nothing is yielded when
            ``cnt <= 0``; at most ``2 ** n`` masks exist.

    Yields:
        numpy.ndarray of shape ``(n,)`` with integer dtype, holding 0s and 1s.
    """
    # Local import: the notebook's import cell does not bring in itertools.
    from itertools import combinations

    if cnt <= 0:
        return

    yielded = 0
    for ones_count in range(n + 1):
        # combinations() enumerates index tuples in lexicographic order, which
        # is exactly the order in which the ones travel through the mask.
        for ones_at in combinations(range(n), ones_count):
            mask = np.zeros(n, dtype=int)
            # An explicit integer index array keeps the empty combination
            # (the all-zero mask) valid as a fancy index.
            mask[np.array(ones_at, dtype=np.intp)] = 1
            yield mask
            yielded += 1
            if yielded >= cnt:
                return

In [4]:
# Suppress every Python warning for the rest of the session (applies to all categories and modules).
warnings.filterwarnings('ignore') 

In [5]:
%cd ../

/home/chervovn04/Programming/hackathons/2022/digital_breakout_885303


In [5]:
# Read the raw test and train tables (paths are relative to the current working directory).
test_data = pd.read_csv("data/test.csv")
# Independent copy of the test ids, taken before test_data is processed any further.
test_ids = deepcopy(test_data["id"])
df = pd.read_csv("data/train.csv")

In [6]:
# Separate the target from the features, then hold out 20% of the rows for
# validation; the fixed random_state keeps the split identical between runs.
y = df["label"]
X = df.drop(columns=["label"])
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# del [[X, df]]
# gc.collect()
# X=pd.DataFrame()
# df=pd.DataFrame()

In [7]:
df.info(0)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000000 entries, 0 to 5999999
Data columns (total 29 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   id                             int64  
 1   oper_type + oper_attr          object 
 2   index_oper                     object 
 3   type                           object 
 4   priority                       float64
 5   is_privatecategory             object 
 6   class                          float64
 7   is_in_yandex                   object 
 8   is_return                      object 
 9   weight                         float64
 10  mailtype                       float64
 11  mailctg                        float64
 12  mailrank                       float64
 13  directctg                      float64
 14  transport_pay                  float64
 15  postmark                       float64
 16  name_mfi                       object 
 17  weight_mfi                     float64
 18  pr

In [8]:
# Ten most frequent operator index codes in the test table; a blank code (' ')
# is counted as 0 before the values are converted to integers.
top_operators = [
    oper_code
    for oper_code, _ in Counter(
        test_data['index_oper'].replace(' ', '0').astype(float).astype(int)
    ).most_common(10)
]

In [9]:
useless = []
cat_features = ["type", "is_privatecategory", "class", "is_in_yandex", "mailctg", "directctg", "mailtype"]

# Order-preserving filter: a round-trip through set() would reorder the list
# differently from one kernel session to the next.
cat_features = [name for name in cat_features if name not in useless]


def prepare_data(df, grouped, operators=None):
    """Build the model-ready feature frame from a raw split.

    The input frame is left untouched; a new frame is returned.

    Steps, in order:
      1. add one boolean ``is_<code>_operator`` column per operator code;
      2. left-join ``grouped`` on the raw ``index_oper`` value and fill the
         gaps in every ``*__agg`` column with that column's mean *within this
         frame*;
      3. in a fixed list of columns, replace ``0`` / ``'0'`` with the column's
         current mode;
      4. recode ``priority`` (7503 -> 1, 7504 -> 2, 7506 -> 3, anything else -> 0);
      5. convert ``index_oper`` to int (blank ``' '`` becomes 0);
      6. cast every column in ``cat_features`` to ``str``;
      7. add ``name_mfi_count`` (number of comma-separated items in
         ``name_mfi``, 0 when it equals ``'0'``) and turn ``is_return`` into a
         boolean (``== 'Y'``);
      8. drop ``oper_type + oper_attr``, ``id``, ``name_mfi``, ``mailrank`` and
         whatever is listed in ``useless``.

    Args:
        df: Raw split containing at least the columns touched above.
        grouped: Frame indexed by raw ``index_oper`` values whose columns are
            joined onto ``df``; columns with ``__agg`` in the name get their
            missing values filled.
        operators: Iterable of integer operator codes to flag.  Defaults to
            the module-level ``top_operators``.

    Returns:
        pandas.DataFrame: the prepared copy of ``df``.

    Raises:
        KeyError: if one of the required columns is missing.
    """
    if operators is None:
        operators = top_operators

    # Convert the operator code once instead of once per flagged operator.
    oper_code = df["index_oper"].replace(" ", "0").astype(float).astype(int)
    # assign() returns a new frame, so the caller's frame gains no columns.
    df = df.assign(**{f"is_{operator}_operator": oper_code == operator for operator in operators})

    # The join key is the raw (unconverted) index_oper, matching grouped's index.
    df = df.join(grouped, on="index_oper")
    for column in [name for name in df.columns if "__agg" in name]:
        # Plain assignment instead of fillna(inplace=True) on a column
        # selection, which does not write back under pandas Copy-on-Write.
        df[column] = df[column].fillna(df[column].mean())

    replace_zeros = ["priority", "is_privatecategory", "is_in_yandex", "mailtype", "directctg"]
    for column in replace_zeros:
        # .loc replaces the chained df[col][mask] = ... form; the mode is
        # re-evaluated for the second replacement, as it was before.
        df.loc[df[column] == 0, column] = df[column].mode()[0]
        df.loc[df[column] == "0", column] = df[column].mode()[0]

    df["priority"] = 1 * (df.priority == 7503.) + 2 * (df.priority == 7504.) + 3 * (df.priority == 7506.)

    df["index_oper"] = df["index_oper"].replace(" ", "0").astype(float).astype(int)

    for cat_feature in cat_features:
        df[cat_feature] = df[cat_feature].apply(str)

    df["name_mfi_count"] = df.name_mfi.apply(lambda name: 0 if name == "0" else len(name.split(",")))

    df["is_return"] = df["is_return"] == "Y"

    df = df.drop(columns=["oper_type + oper_attr", "id", "name_mfi", "mailrank"])
    df = df.drop(columns=useless, errors="ignore")

    return df

In [10]:
def nunique(series):
    """Return ``series.nunique()``, i.e. the count reported by the object's own ``nunique`` method."""
    distinct_count = series.nunique()
    return distinct_count

# Per-operator means of the numeric shipment columns, computed on the training
# split only; prepare_data later joins them onto each split via index_oper.
# Zeros are averaged as-is (no 0 -> NaN replacement is applied).
agg_source_columns = ['weight', 'transport_pay', 'weight_mfi', 'price_mfi']
grouped = (
    X_train[['index_oper'] + agg_source_columns]
    .groupby('index_oper')
    # The string alias is the supported spelling; passing np.mean is deprecated in pandas 2.x.
    .agg({name: 'mean' for name in agg_source_columns})
    .rename(columns={name: f'{name}__agg' for name in agg_source_columns})
)
grouped.head()

Unnamed: 0_level_0,weight__agg,transport_pay__agg,weight_mfi__agg,price_mfi__agg,total_qty_over_index__agg
index_oper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0,411.459158,16.435631,195.67203,550.726485,2
101000.0,313.875,20.890536,72.767857,343.553571,2
102002.0,379.800235,15.75808,36.198927,122.166462,3
102007.0,140.477419,1.135226,129.378065,554.656774,3
102102.0,200.257263,8.68763,132.64682,587.42811,4


In [11]:
# Run the same feature preparation on the train, validation and test frames,
# passing the same `grouped` aggregates to each call.
X_train = prepare_data(X_train, grouped)
X_val = prepare_data(X_val, grouped)
test_df = prepare_data(test_data, grouped)

In [12]:
X_train.head() 

Unnamed: 0,index_oper,type,priority,is_privatecategory,class,is_in_yandex,is_return,weight,mailtype,mailctg,...,is_102152_operator,is_140980_operator,is_420306_operator,is_200980_operator,weight__agg,transport_pay__agg,weight_mfi__agg,price_mfi__agg,total_qty_over_index__agg,name_mfi_count
2766711,394962,Участок,1,N,0.0,Y,False,215.0,5.0,1.0,...,False,False,False,False,315.869539,11.423399,139.895263,484.339935,4,0
3781936,420302,Цех,1,N,0.0,Y,False,34.0,5.0,1.0,...,False,False,False,False,176.929353,2.631576,154.856374,1020.345437,2,1
3824787,108971,Цех,3,N,0.0,N,False,78.0,5.0,1.0,...,False,False,False,False,286.965616,14.293944,85.745336,228.12693,4,1
4768159,108978,Участок,3,N,0.0,N,False,14.0,5.0,1.0,...,False,False,False,False,310.262426,18.187504,147.689413,467.707075,5,1
1873604,690967,Цех,1,N,0.0,Y,False,245.0,5.0,1.0,...,False,False,False,False,293.357585,12.270348,159.164065,571.633284,4,0


In [13]:
# Persist the prepared splits so the modelling cells can reload them from disk.
# The test ids are written too: prepare_data drops the `id` column, and the
# submission is later built from data/test_ids.csv.
X_train.to_csv('data/X_train_prepared.csv', index=False)
X_val.to_csv('data/X_val_prepared.csv', index=False)
y_train.to_csv('data/y_train_prepared.csv', index=False)
y_val.to_csv('data/y_val_prepared.csv', index=False)
test_df.to_csv('data/test_df_prepared.csv', index=False)
test_ids.to_csv('data/test_ids.csv', index=False)

In [5]:
useless = []
cat_features = ["type", "is_privatecategory", "class", "is_in_yandex", "mailctg", "directctg", "mailtype"]

# Keep only the categorical columns that are not listed in `useless`.
cat_features = list(set(cat_features).difference(useless))


def to_cat_features(df):
    """Cast every column named in ``cat_features`` to ``str``.

    The frame is modified in place and nothing is returned.
    """
    for column in cat_features:
        df[column] = df[column].apply(str)
        
# Load the prepared splits and the test ids back from the CSV files under data/.
X_train = pd.read_csv('data/X_train_prepared.csv')
y_train = pd.read_csv('data/y_train_prepared.csv')
X_val = pd.read_csv('data/X_val_prepared.csv')
y_val = pd.read_csv('data/y_val_prepared.csv')
test_df = pd.read_csv('data/test_df_prepared.csv')
test_ids = pd.read_csv('data/test_ids.csv')
# X = pd.read_csv('data/X_prepared.csv')
# y = pd.read_csv('data/y_prepared.csv')


# Cast the categorical columns of each frame to str, in place
# (presumably needed because read_csv re-infers the dtypes — TODO confirm).
to_cat_features(X_train)
to_cat_features(X_val)
to_cat_features(test_df)
# to_cat_features(X)

In [6]:
# CatBoost classifier: 100 iterations, depth 8, balanced automatic class weights,
# fixed random_state; verbose=10 logs every 10th iteration, and cat_features
# names the columns to be treated as categorical.
model = CatBoostClassifier(random_state=0, max_depth=8, verbose=10, iterations=100, auto_class_weights="Balanced", cat_features=cat_features)

In [7]:
model.fit(X_train, y_train)

Learning rate set to 0.5
0:	learn: 0.3796545	total: 1.93s	remaining: 3m 10s
10:	learn: 0.2755910	total: 18.7s	remaining: 2m 30s
20:	learn: 0.2689417	total: 35.4s	remaining: 2m 13s
30:	learn: 0.2662813	total: 52.7s	remaining: 1m 57s
40:	learn: 0.2644754	total: 1m 9s	remaining: 1m 40s
50:	learn: 0.2626796	total: 1m 26s	remaining: 1m 23s
60:	learn: 0.2609979	total: 1m 44s	remaining: 1m 6s
70:	learn: 0.2597521	total: 2m 1s	remaining: 49.4s
80:	learn: 0.2585481	total: 2m 18s	remaining: 32.4s
90:	learn: 0.2573145	total: 2m 35s	remaining: 15.4s
99:	learn: 0.2564056	total: 2m 52s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7f703713bb50>

In [8]:
def score_model(model, X, y, threshold=0.5):  # 0.41 is the best for now
    """Score thresholded predictions as ``0.1 * macro recall + 0.9 * ROC AUC``.

    The positive-class probability from ``model.predict_proba`` is binarised
    with ``threshold``; both metrics are then computed on those hard labels,
    not on the probabilities.
    """
    positive_proba = model.predict_proba(X)[:, 1]
    hard_labels = positive_proba > threshold
    macro_recall = recall_score(y, hard_labels, average="macro")
    auc_on_labels = roc_auc_score(y, hard_labels, multi_class='ovo')
    return 0.1 * macro_recall + 0.9 * auc_on_labels

In [9]:
score_model(model, X_val, y_val, 0.35)

0.8828069774163595

In [10]:
# Hard 0/1 test labels: the positive-class probability is cut at 0.35, the same
# threshold that was passed to score_model on the validation split.
test_preds = (model.predict_proba(test_df)[:, 1] > 0.35).astype(int)

In [11]:
# Pair the test ids with the predicted labels in a two-column frame.
submission = pd.DataFrame({'id': test_ids.id, 'label': test_preds})

In [12]:
# Write the submission to the working directory without the DataFrame index.
submission.to_csv("submission.csv", index=False)

In [13]:
test_preds.mean()

0.2122305

In [14]:
sorted(list(zip(X_train.columns, model.feature_importances_)), key = lambda x: -x[1])

[('total_qty_oper_login_1', 16.689211121026513),
 ('type', 10.795959266394197),
 ('index_oper', 9.183171067647415),
 ('transport_pay__agg', 8.835840514061854),
 ('transport_pay', 7.312076865711262),
 ('mailctg', 6.232364069292818),
 ('total_qty_oper_login_0', 5.785993013556491),
 ('total_qty_over_index', 4.064910754524851),
 ('is_in_yandex', 3.106984541355958),
 ('dist_qty_oper_login_1', 3.090233483344795),
 ('total_qty_over_index_and_type', 2.827669462205038),
 ('weight__agg', 2.7059704059654583),
 ('total_qty_over_index__agg', 2.447249137162868),
 ('price_mfi', 2.3834862704788145),
 ('weight', 2.2461044374941497),
 ('weight_mfi__agg', 2.139236190701426),
 ('priority', 1.904231663958938),
 ('name_mfi_count', 1.88871792383795),
 ('price_mfi__agg', 1.6249118031002803),
 ('is_wrong_phone_number', 1.4079766398482636),
 ('weight_mfi', 1.1521495915351303),
 ('is_wrong_rcpn_name', 0.9470555069073645),
 ('is_102976_operator', 0.43802490344416894),
 ('directctg', 0.37811074470230366),
 ('class

In [15]:
# Re-threshold the labels stored in submission.csv at 0.225 and save the result
# under a new name.  assign() builds a separate frame, so `unrounded_sub` keeps
# the values read from disk instead of being overwritten through an alias.
# NOTE(review): the submission.csv written earlier in this notebook already
# holds 0/1 labels, for which this threshold changes nothing — confirm that the
# file read here is expected to contain probabilities.
unrounded_sub = pd.read_csv('submission.csv')
sub = unrounded_sub.assign(label=(unrounded_sub.label > 0.225).astype(int))
sub.to_csv('submission_rounded.csv', index=False)

In [16]:
sub.label.mean()

0.23725725