In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score, roc_auc_score, make_scorer

from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier

from tqdm import tqdm
from functools import reduce
from copy import deepcopy
import gc
from collections import Counter

%matplotlib inline

import warnings
warnings.filterwarnings('ignore') 

%cd ../

/home/chervovn04/Programming/hackathons/2022/digital_breakout_885303


In [2]:
def delete_double_spaces(nm):
    new_nm = ''
    for char in nm:
        if char != ' ' or (len(new_nm) and new_nm[-1] != ' '):
            new_nm += char
    return new_nm.strip()


def label_process(nm):
    nm = nm.lower()
    new_nm = ''
    balance = 0
    for char in nm:
        if char in '({[':
            balance += 1
        elif char in ')}]':
            balance = max(0, balance - 1)
        elif balance == 0:
            new_nm += char
    nm = new_nm
    nm = delete_double_spaces(nm)
    return nm

In [3]:
df = pd.read_csv("data/train.csv")
test_data = pd.read_csv("data/test.csv")
test_ids = deepcopy(test_data.id)

In [4]:
df['processed_name'] = df.name_mfi.apply(label_process)
test_data['processed_name'] = test_data.name_mfi.apply(label_process)

In [5]:
df['item_hash'] = df['name_mfi'].apply(hash) ^ df.weight_mfi.apply(lambda x: hash(str(x))) ^ df.weight.apply(lambda x: hash(str(x))) ^ \
    df.transport_pay.apply(lambda x: hash(str(x)))
test_data['item_hash'] = test_data['item_hash'] = test_data['name_mfi'].apply(hash) ^ test_data.weight_mfi.apply(lambda x: hash(str(x))) ^ test_data.weight.apply(lambda x: hash(str(x))) ^ \
    test_data.transport_pay.apply(lambda x: hash(str(x)))

df['oper_item_hash'] = df.index_oper.apply(lambda x: hash(str(x))) ^ df.item_hash
test_data['oper_item_hash'] = test_data.index_oper.apply(lambda x: hash(str(x))) ^ test_data.item_hash

df['oper_characteristics_hash'] = df.dist_qty_oper_login_1.apply(lambda x: hash(str(x))) ^ df.total_qty_oper_login_1.apply(lambda x: hash(str(x))) ^ df.total_qty_oper_login_0.apply(lambda x: hash(str(x))) ^ \
df.total_qty_over_index_and_type.apply(lambda x: hash(str(x))) ^ df.total_qty_over_index.apply(lambda x: hash(str(x)))
test_data['oper_characteristics_hash'] = test_data.dist_qty_oper_login_1.apply(lambda x: hash(str(x))) ^ test_data.total_qty_oper_login_1.apply(lambda x: hash(str(x))) ^ test_data.total_qty_oper_login_0.apply(lambda x: hash(str(x))) ^ \
test_data.total_qty_over_index_and_type.apply(lambda x: hash(str(x))) ^ test_data.total_qty_over_index.apply(lambda x: hash(str(x)))

In [6]:
hash_counter = Counter(pd.concat([df.item_hash, test_data.item_hash], ignore_index=True))
oper_item_hash_counter = Counter(pd.concat([df.oper_item_hash, test_data.oper_item_hash], ignore_index=True))
name_counter = Counter(pd.concat([df.processed_name, test_data.processed_name], ignore_index=True))
charact_counter = Counter(pd.concat([df.oper_characteristics_hash, test_data.oper_characteristics_hash], ignore_index=True))

In [7]:
id_hash = pd.concat([df[['id', 'item_hash']], test_data[['id', 'item_hash']]], ignore_index=True)
id_hash = id_hash.groupby('item_hash').agg(max).rename(columns={'id': 'last_id'})

In [8]:
df = df.join(id_hash, on='item_hash')
test_data = test_data.join(id_hash, on='item_hash')

In [9]:
X = df.drop(columns=["label"])
y = df.label
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

In [10]:
top_operators = [obj for obj, count in Counter(test_data['index_oper'].replace(' ', '0').astype(float).astype(int)).most_common()][:10]

In [11]:
useless = []
cat_features = ["type", "is_privatecategory", "class", "is_in_yandex", "mailctg", "directctg", "mailtype"]
cat_features = list(set(cat_features) - set(useless))

def prepare_data(df):
    oper_type, oper_attr = zip(*df['oper_type + oper_attr'].map(lambda x: x.split('_')))
    
    # df['is_negative_oper_attr'] = pd.Series(oper_attr) == '-1'
    df['specialization_ratio'] = df.total_qty_over_index_and_type / df.total_qty_over_index.clip(lower=1)
    df['total_over_person'] = df.total_qty_over_index / df.dist_qty_oper_login_1.clip(lower=1)
    df['unknown_oper_ratio'] = df.total_qty_oper_login_0 / df.total_qty_oper_login_1.clip(lower=1)
    
    df['same_hash'] = df.item_hash.apply(lambda h: hash_counter[h])
    df['same_name'] = df.processed_name.apply(lambda name: name_counter[name])
    df['same_oper_item_hash'] = df.oper_item_hash.apply(lambda h: oper_item_hash_counter[h])
    df['same_characteristics_hash'] = df.oper_characteristics_hash.apply(lambda h: charact_counter[h])
    
    df['name_mfi_count'] = df.name_mfi.apply(lambda name: 0 if name == '0' else len(name.split(',')))
    df['name_mfi_len'] = df.name_mfi.apply(len)
    
    df['till_end'] = df.last_id - df.id
    
    for operator in top_operators:
        df[f'is_{operator}_operator'] = df['index_oper'].replace(' ', '0').astype(float).astype(int) == operator
    
    replace_zeros = ['priority', 'is_privatecategory', 'is_in_yandex', 'mailtype', 'directctg']
    for column in replace_zeros:
        df[column][df[column] == 0] = df[column].mode()[0]
        df[column][df[column] == '0'] = df[column].mode()[0]
    
    df.priority = 1 * (df.priority == 7503.) + 2 * (df.priority == 7504.) + 3 * (df.priority == 7506.)
    
    df.index_oper = df.index_oper.replace(' ', '0').astype(float).astype(int)
    
    for cat_feature in cat_features:
        df[cat_feature] = df[cat_feature].apply(lambda text: str(text))
            
    df.is_return = df.is_return.apply(lambda text: text == 'Y')
        
    df.drop(columns=["oper_type + oper_attr"], inplace=True)
    df.drop(columns=["name_mfi", "item_hash", "processed_name", "oper_item_hash", "oper_characteristics_hash"], inplace=True)
    df.drop(columns=['mailrank'], inplace=True)
    df.drop(columns=useless, inplace=True, errors='ignore')
    
    return df

In [12]:
# X_train = prepare_data(X_train)
# X_val = prepare_data(X_val)
X = prepare_data(X)
test_df = prepare_data(test_data)

In [13]:
# X_train.to_csv('data/X_train_prepared.csv', index=False)
# X_val.to_csv('data/X_val_prepared.csv', index=False)
# y_train.to_csv('data/y_train_prepared.csv', index=False)
# y_val.to_csv('data/y_val_prepared.csv', index=False)
X.to_csv('data/X_prepared.csv', index=False)

test_df.to_csv('data/test_df_prepared.csv', index=False)

In [None]:
useless = []
cat_features = ["type", "is_privatecategory", "class", "is_in_yandex", "mailctg", "directctg", "mailtype"]
cat_features = list(set(cat_features) - set(useless))

cat_features = list(set(cat_features) - set(useless))
def to_cat_features(df):
    for cat_feature in cat_features:
        df[cat_feature] = df[cat_feature].apply(lambda text: str(text))
        
X_train = pd.read_csv('data/X_train_prepared.csv')
y_train = pd.read_csv('data/y_train_prepared.csv')
X_val = pd.read_csv('data/X_val_prepared.csv')
y_val = pd.read_csv('data/y_val_prepared.csv')
test_df = pd.read_csv('data/test_df_prepared.csv')
test_ids = pd.read_csv('data/test_ids.csv')

# X_train = pd.read_csv('data/X_prepared.csv') 
# y_train = pd.read_csv('data/y_prepared.csv') 
# test_df = pd.read_csv('data/test_df_prepared.csv')
# test_ids = pd.read_csv('data/test_ids.csv')


to_cat_features(X_train)
to_cat_features(X_val)
to_cat_features(test_df)

In [None]:
model = CatBoostClassifier(random_state=0, max_depth=8, verbose=10, iterations=100, auto_class_weights="Balanced", cat_features=cat_features)

In [None]:
model.fit(X_train, y_train)

Learning rate set to 0.5
0:	learn: 0.3566742	total: 2.06s	remaining: 3m 24s
10:	learn: 0.2360138	total: 19.9s	remaining: 2m 41s
20:	learn: 0.2286112	total: 39.8s	remaining: 2m 29s
30:	learn: 0.2251070	total: 1m 1s	remaining: 2m 16s
40:	learn: 0.2219982	total: 1m 20s	remaining: 1m 55s
50:	learn: 0.2196130	total: 1m 39s	remaining: 1m 35s
60:	learn: 0.2172634	total: 1m 58s	remaining: 1m 15s
70:	learn: 0.2158655	total: 2m 17s	remaining: 56.2s
80:	learn: 0.2142544	total: 2m 36s	remaining: 36.7s
90:	learn: 0.2126560	total: 2m 55s	remaining: 17.4s
99:	learn: 0.2115454	total: 3m 12s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7f5e57276d10>

In [None]:
def score_model(model, X, y, threshold=0.5):  # 0.225 is the best for now
    pred = model.predict_proba(X)[:, 1] > threshold
    recall = recall_score(y, pred, average="macro")
    roc_auc = roc_auc_score(y,  pred, multi_class='ovo')
    score = 0.1 * recall + 0.9 * roc_auc
    return score

In [None]:
score_model(model, X_val, y_val, 0.35)

0.8999372490808518

In [7]:
# score_model(model, X_train, y_train, 0.35)

In [8]:
sorted(list(zip(X_train.columns, model.feature_importances_)), key = lambda x: -x[1])

[('same_oper', 15.67805005398859),
 ('same_oper_item_hash', 9.495493762611071),
 ('total_over_person', 9.431277090218483),
 ('index_oper', 7.179157899104964),
 ('mailctg', 5.8224212506822886),
 ('total_qty_oper_login_1', 5.6441604028619405),
 ('dist_qty_oper_login_1', 5.055100440186739),
 ('total_qty_oper_login_0', 4.909403177030391),
 ('unknown_oper_ratio', 4.888155281322663),
 ('transport_pay', 4.813518167416267),
 ('same_characteristics_hash', 4.298420616711916),
 ('same_hash', 3.2867911780739254),
 ('specialization_ratio', 3.1927291224547414),
 ('total_qty_over_index', 2.9913653138546383),
 ('type', 1.8416800298700953),
 ('weight', 1.7389250998897714),
 ('is_wrong_phone_number', 1.3041108904061613),
 ('price_mfi', 1.2896252273636974),
 ('name_mfi_count', 1.1820823771117184),
 ('same_name', 1.1559751121065787),
 ('total_qty_over_index_and_type', 0.9002550015975972),
 ('priority', 0.7004822703082637),
 ('class', 0.6772394576231933),
 ('weight_mfi', 0.6518677296561304),
 ('is_wrong_rc

In [9]:
test_preds = (model.predict_proba(test_df)[:, 1] > 0.35).astype(int)
submission = pd.DataFrame({'id': test_ids.id, 'label': test_preds})
submission.to_csv("submission.csv", index=False)

In [16]:
sub1 = pd.read_csv('big_submission/7depth.csv')
sub2 = pd.read_csv('big_submission/8depth.csv')
sub3 = pd.read_csv('big_submission/9depth.csv')

sub = sub1.copy()
sub.label = ((sub1.label + sub2.label + sub3.label) / 3 > 0.15).

In [17]:
print(sub1.label.mean(), sub2.label.mean(), sub3.label.mean())
print(sub.label.mean())
sub.to_csv('submission_rounded.csv', index=False)

0.13603199322856416 0.12729435951097007 0.11270382666859656
0.21372625
