## Загрузим нужные библиотеки

In [15]:
import re
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import lightgbm
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import recall_score, roc_auc_score

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

%matplotlib inline

In [2]:
df = pd.read_csv("content/train.csv", low_memory=False)
df_test = pd.read_csv("content/test.csv", low_memory=False)

solution = df_test[['id']]

In [3]:
df.head()

Unnamed: 0,id,oper_type + oper_attr,index_oper,type,priority,is_privatecategory,class,is_in_yandex,is_return,weight,...,dist_qty_oper_login_1,total_qty_oper_login_1,total_qty_oper_login_0,total_qty_over_index_and_type,total_qty_over_index,is_wrong_sndr_name,is_wrong_rcpn_name,is_wrong_phone_number,is_wrong_address,label
0,6818780,1043_-1,628629,Участок,7503.0,N,0.0,Y,N,87.0,...,42.0,720176.0,58950.0,779126.0,8290896.0,0,0,0,0,0
1,9907176,1023_-1,102976,ММПО,7503.0,N,0.0,N,N,107.0,...,914.0,48856658.0,83318932.0,132175590.0,136819803.0,0,0,0,0,0
2,3304275,1018_-1,620962,Цех,7503.0,N,0.0,Y,N,50.0,...,62.0,3246292.0,3233068.0,6479360.0,52708071.0,0,1,0,0,0
3,9020937,1019_-1,344964,Цех,7503.0,N,0.0,Y,N,416.0,...,55.0,2060928.0,653280.0,2714208.0,19562334.0,0,0,0,0,0
4,3082311,1020_-1,629819,Участок,7503.0,N,0.0,Y,N,795.0,...,16.0,316919.0,27911.0,344830.0,4719186.0,0,0,0,0,0


In [22]:
df['count_mfi'] = df['name_mfi'].apply(lambda name_mfi: name_mfi.count(',') + 1)
df_test['count_mfi'] = df_test['name_mfi'].apply(lambda name_mfi: name_mfi.count(',') + 1)

In [23]:
def has_digits(text):
    return bool(re.search('[0-9]', text))

df['name_has_digits'] = df['name_mfi'].apply(has_digits)
df_test['name_has_digits'] = df_test['name_mfi'].apply(has_digits)

In [25]:
def process_df(df):
    df = df.copy()
    drop_cols = [
        'id', 'oper_type + oper_attr', 'name_mfi',
    ]
    
    not_int_cols = []
    
    df["is_in_yandex"] = pd.Categorical(df["is_in_yandex"])
    df["is_in_yandex"].astype('category').cat.codes
    df["is_in_yandex"] = df["is_in_yandex"].cat.codes

    df["is_return"] = pd.Categorical(df["is_return"])
    df["is_return"].astype('category').cat.codes
    df["is_return"] = df["is_return"].cat.codes

    df["type"] = pd.Categorical(df["type"])
    df["type"].astype('category').cat.codes
    df["type"] = df["type"].cat.codes

    df["is_privatecategory"] = pd.Categorical(df["is_privatecategory"])
    df["is_privatecategory"].astype('category').cat.codes
    df["is_privatecategory"] = df["is_privatecategory"].cat.codes

    df['index_oper'] = df['index_oper'].apply(
        lambda index: int(float(index)) if len(index) > 1 else 0
    )
    
    df = df.drop(drop_cols, axis=1)
    
    for col in df.columns.drop(not_int_cols):
        df[col] = df[col].astype(int)
    
    return df

In [26]:
df = process_df(df)
df_test = process_df(df_test)

In [28]:
def add_features(df, df_test):
    df['is_train'] = True
    df_test['is_train'] = False
    df_union = pd.concat([df, df_test])
    
    by_columns = list(df_union.columns.drop(['is_train', 'label']))
    df_union['count_duplicates'] = df_union.groupby(by_columns)['class'].transform('count')

    df = df_union[df_union['is_train'] == True]
    df_test = df_union[df_union['is_train'] == False]
    
    df = df.drop('is_train', axis=1)
    df_test = df_test.drop(['is_train', 'label'], axis=1)
    
    return df, df_test

In [29]:
df, df_test = add_features(df, df_test)

## Выделим выборки

In [30]:
X = df.drop(["label"], axis=1)
y = df["label"].values

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

## Обучение модели

### RandomForest

In [34]:
RandomForest = RandomForestClassifier(
    n_estimators=100,
    max_depth=12,
    max_features=0.5,
    class_weight='balanced',
    verbose=2,
    n_jobs=4
)

RandomForest.fit(X_train, y_train)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.


building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100


[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:  8.2min


building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55 of 100
building tree 56 of 100
building tree 57 of 100
building tree 58 of 100
building tree 59 of 100
building tree 60 of 100
building tree 61 of 100
building tree 62 of 100
building tree 63 of 100
building tree 64 of 100
building tree 65 of 100
building tree 66 of 100
building tree 67 of 100
building tree 68 of 100
building tree 69 of 100
building tree 70 of 100
building tree 71 of 100
building tree 72 of 100
building tree 73 of 100
building tree 74 of 100
building tree 75 of 100building tree 76 of 100

building tree 77 of 100
building tree 78 of 100
building tree 79

[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed: 23.3min finished


In [38]:
pred = RandomForest.predict(X_test)

print("Score", recall_score(y_test, pred, average="macro"))

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    0.8s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    2.2s finished


Score 0.9189754665909402


### CatBoost

In [39]:
cat_features = [
    'type',
    'priority',
    'class',
    'mailtype',
    'mailctg',
    'directctg',
    'is_in_yandex',
    'is_return',
    'is_privatecategory'
]

In [36]:
CatBoost = CatBoostClassifier(
    n_estimators=170,
    cat_features=cat_features,
    auto_class_weights='Balanced',
    verbose=True
)

CatBoost.fit(X_train, y_train)

Learning rate set to 0.5
0:	learn: 0.3274527	total: 3.65s	remaining: 10m 17s
1:	learn: 0.2785860	total: 6.41s	remaining: 8m 58s
2:	learn: 0.2604393	total: 9.84s	remaining: 9m 7s
3:	learn: 0.2487415	total: 12.8s	remaining: 8m 52s
4:	learn: 0.2381397	total: 16.5s	remaining: 9m 3s
5:	learn: 0.2274544	total: 19.4s	remaining: 8m 49s
6:	learn: 0.2231854	total: 22s	remaining: 8m 31s
7:	learn: 0.2189673	total: 24.8s	remaining: 8m 22s
8:	learn: 0.2175243	total: 27.5s	remaining: 8m 11s
9:	learn: 0.2158668	total: 30.1s	remaining: 8m 2s
10:	learn: 0.2136875	total: 32.6s	remaining: 7m 50s
11:	learn: 0.2121437	total: 35.2s	remaining: 7m 42s
12:	learn: 0.2113602	total: 37.7s	remaining: 7m 35s
13:	learn: 0.2102096	total: 40s	remaining: 7m 25s
14:	learn: 0.2090006	total: 42.4s	remaining: 7m 18s
15:	learn: 0.2079429	total: 45s	remaining: 7m 13s
16:	learn: 0.2071713	total: 48s	remaining: 7m 11s
17:	learn: 0.2063489	total: 50.6s	remaining: 7m 7s
18:	learn: 0.2053918	total: 53.6s	remaining: 7m 5s
19:	learn

<catboost.core.CatBoostClassifier at 0x1ee0b5e1b50>

In [40]:
pred = CatBoost.predict(X_test)

print("Score", recall_score(y_test, pred, average="macro"))

Score 0.9192870174111833


In [41]:
CatBoost.get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,total_qty_over_index_and_type,21.257344
1,total_qty_oper_login_1,13.940197
2,count_duplicates,12.719985
3,dist_qty_oper_login_1,8.277198
4,total_qty_oper_login_0,8.141154
5,type,7.362892
6,index_oper,6.07371
7,transport_pay,4.482207
8,mailctg,4.025349
9,total_qty_over_index,3.249103


### LGBM

In [37]:
LGBM = LGBMClassifier(
    class_weight='balanced',
)

LGBM.fit(X_train, y_train)

In [42]:
pred = LGBM.predict(X_test)

print("Score", recall_score(y_test, pred, average="macro"))

Score 0.9183696828201178


In [43]:
for imp, name in sorted(zip(LGBM.feature_importances_, LGBM.feature_name_), key=lambda x: -x[0]):
    print(f'{name:40}{imp:.3f}')

count_duplicates                        414.000
dist_qty_oper_login_1                   303.000
index_oper                              263.000
total_qty_oper_login_0                  253.000
price_mfi                               207.000
type                                    189.000
total_qty_over_index                    182.000
transport_pay                           174.000
weight                                  167.000
weight_mfi                              166.000
total_qty_oper_login_1                  133.000
mailctg                                 127.000
is_wrong_phone_number                   88.000
total_qty_over_index_and_type           64.000
count_mfi                               57.000
class                                   48.000
is_wrong_rcpn_name                      47.000
is_in_yandex                            29.000
directctg                               18.000
is_wrong_sndr_name                      15.000
priority                                14.000
m

### Stacking

In [44]:
clf = StackingClassifier(
    estimators=[
        ('RandomForest', RandomForest),
        ('CatBoost', CatBoost),
        ('LGBM', LGBM),
    ],
    final_estimator=LogisticRegression(class_weight='balanced'),
    cv='prefit'
)

In [45]:
clf.fit(X_test, y_test)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    0.6s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    2.2s finished


In [46]:
clf.final_estimator_.coef_

array([[2.76310325, 3.14001469, 1.67062365]])

## Оценка точности

In [22]:
pred = clf.predict(X_train)

print("TRAIN score", recall_score(y_train, pred, average="macro"))

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    3.0s finished


TRAIN score 0.930746304769984


In [47]:
pred = clf.predict(X_test)

print("TEST score", recall_score(y_test, pred, average="macro"))

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    0.7s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    2.2s finished


TEST score 0.9198558567045984


## Посылка в систему

In [73]:
thr = 0.055

In [74]:
# pred_proba = clf.predict_proba(df_test)[:, 1]
pred = (pred_proba >= thr).astype(int)

In [75]:
pred.mean()

0.2213145

In [76]:
solution['label'] = pred

solution.to_csv('solutions/the_best_solution.csv', index=False, lineterminator='\n')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  solution['label'] = pred
