In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from tqdm import tqdm
%matplotlib inline

from preprocessors import GlobalPreprocessor, MeanTargetEncoder

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# Чтение и описание данных

In [2]:
# Load the labelled training contracts and preview the first rows.
train = pd.read_csv(filepath_or_buffer="data/insclass_train.csv")
train.head(n=5)

Unnamed: 0,variable_1,variable_2,variable_3,variable_4,variable_5,variable_6,variable_7,variable_8,variable_9,variable_10,...,variable_20,variable_21,variable_22,variable_23,variable_24,variable_25,variable_26,variable_27,variable_28,target
0,w200,0,0,14,q2,98.0,,0.0,,0,...,C,j2,h45,0,0.0,0,1,19.323463,t1,0
1,w160,0,0,7,q11,106.0,,0.0,,0,...,C,j33,h234,0,1.0,0,1,41.1779,t1,0
2,w200,0,0,4,q3,123.0,,0.0,,0,...,B,j12,h28,0,0.0,0,1,3.614395,t1,0
3,w200,0,0,9,q3,102.0,,0.0,,0,...,C,j12,h64,0,1.0,0,0,49.041674,t1,0
4,w200,0,0,18,q20,117.0,,0.0,,0,...,C,j111,h991,0,1.0,0,0,17.909612,t1,0


### Описание полей
В обучающем наборе данных для каждого договора (всего 151406) известны следующие поля:

**variable_1** - агрегированный коэффициент бонус-малус (повышающий или понижающий стоимость полиса в зависимости от аварийности в предыдущие периоды);

15 различных значений типа "w*". В теории очень важный показатель. One-hot? Mean target? Frequency?

**variable_2** - индикатор расторжения договора по инициативе страхователя (клиента);

Бинарное значение. 98% нулей (не расторгали договор). Оставляем столбец.

**variable_3** - индикатор расторжения договора по инициативе страховщика (страховой компании);

Удивительно, но страховщик ни разу не расторгал договор. Можно убрать столбец.

**variable_4** - идентификатор года выпуска транспортного средства;

Странное натуральное значение, меньше 100. Оставить как есть + frequency. 

**variable_5** - идентификатор страны - производителя транспортного средства;

Значения вида "q*". Примерно 40 категорий. LabelEncoder, mean-target, frequency?

**variable_6** - мощность двигателя в лошадиных силах;

Число, чаще всего целое. Порядка 1000 различных значений. Есть NaNы (совсем мало). Is_nan + оставить (заполнить средним) + frequency.  

**variable_7** - объем двигателя в куб. см;

Число, чаще всего целое. Порядка 700 различных значений. Есть NaNы (2/3 от всех значений). Is_nan + оставить (заполнить средним) + frequency. 

**variable_8** - идентификатор стороны расположения руля (левый или правый);

80% леворульные ("0"), меньше 1% праворульные, остальные NaNы. Is_nan + оставить (заполнить нулем).

**variable_9** - пробег транспортного средства, покрываемый гарантией производителя;

Много NaNов. Числовое значение, различных значений меньше 100. Is_nan + оставить (заполнить нулем).

**variable_10** - индикатор действия гарантии на транспортное средство;

Много автомобилей с уже недействующей гарантией ("0"). Оставить как есть.

**variable_11** - "мультидрайв" - индикатор допуска к управлению транспортным средством более одного водителя;

Много автомобилей лишь с одним водителем ("0"). Оставить как есть.

**variable_12** - возраст транспортного средства (в мес.);

Числовое значение. Есть немного NaNов. Is_nan + Оставить как есть (заполнить средним).

**variable_13** - возраст водителя с максимальным стажем;

Число. Есть немного NaNов. Is_nan + Оставить как есть (заполнить средним).

**variable_14** - коэффициент возраст-стаж;

Числовое значение, при этом всего 4 различных значения. Есть немного NaNов. Is_nan + one-hot (заполнить уникальным) + frequency.

**variable_15** - коэффициент краткосрочности;

Всего 13 значений, не являющихся NaNами. Is_nan

**variable_16** - коэффициент мощности;

Числовое значение, при этом всего 6 различных значений. Есть немного NaNов. Is_nan + one-hot (заполнить уникальным) + frequency.

**variable_17** - коэффициент "мультидрайв";

Числовое значение, при этом всего 2 различных значений. Есть немного NaNов. Is_nan + one-hot (заполнить уникальным) + frequency.

**variable_18** - территориальный коэффициент;

Числовое значение, ~20 различных. Есть немного NaNов. Is_nan + оставить (заполнить уникальным) + frequency.

**variable_19** - коэффициент "КНДР";

Числовое значение, ~12 различных. LabelEncoder + frequency.

**variable_20** - идентификатор канала продаж;

Значения "A", "B", "C". Есть немного NaNов. Заполнить самым частым + LabelEncoder + frequency.

**variable_21** - марка транспортного средства;

Значения "j*", всего 192. LabelEncoder + frequency + mean_encoding?

**variable_22** - модель транспортного средства;

Значения "h*", всего ~1500. LabelEncoder + frequency + mean_encoding?

**variable_23** - индикатор отечественных транспортных средств;

Бинарное значение, больше иномарок ("0"). Оставить как есть.

**variable_24** - пол водителя с максимальным коэффициентом "возраст-стаж";

Бинарное значение, есть немного NaNов. Is_nan + заполнить самым частым.

**variable_25** - индикатор пролонгации;

Бинарное значение, большинство "0". Оставить как есть.

**variable_26** - индикатор совпадения собственника транспортного средства и водителя;

Бинарное значение, большинство "1". Оставить как есть.

**variable_27** - стаж водителя с максимальным коэффициентом "возраст-стаж";

Числовое значение, всего 73 различных. Есть немного NaNов. Is_nan + заполнить самым частым + frequency + оставить. 

**variable_28** - тип транспортного средства;

Категории типа "t*". Всего 5 различных значений. LabelEncoder + frequency + one-hot.

**target** - рискованно ли заключать сделку с клиентом (бинарный признак)

Около 90% имеют значения "0" (нерискованная сделка).

In [3]:
# train.groupby(["variable_14"])[["target"]].count() #/ 151406

In [4]:
# train.describe()

In [5]:
# train.info()

In [6]:
# Load the unlabelled contracts that predictions are produced for and preview them.
test = pd.read_csv(filepath_or_buffer="data/insclass_test.csv")
test.head(n=5)

Unnamed: 0,id,variable_1,variable_2,variable_3,variable_4,variable_5,variable_6,variable_7,variable_8,variable_9,...,variable_19,variable_20,variable_21,variable_22,variable_23,variable_24,variable_25,variable_26,variable_27,variable_28
0,1,w200,0,0,12,q1,105.0,1598.0,0.0,,...,154.244863,C,j1,h24,0,1.0,0,1,12.200897,t1
1,2,w160,0,0,19,q2,165.0,2500.0,,,...,148.905712,C,j40,h568,0,1.0,1,1,22.133735,t1
2,3,w160,0,0,6,q3,107.0,1396.0,,,...,122.974203,C,j12,h28,0,1.0,0,1,0.975189,t1
3,4,w200,0,0,11,q4,80.9,,0.0,,...,154.244863,C,j6,h62,1,1.0,0,1,0.975189,t1
4,5,w200,0,0,4,q3,197.0,,0.0,,...,148.905712,C,j12,h69,0,0.0,0,1,0.975189,t1


In [7]:
# test.groupby(["variable_16"])[["id"]].count() #/ 22624

In [8]:
# test.info()

In [9]:
# Feature columns are named variable_1 ... variable_28.
feature_columns = ["variable_{}".format(number) for number in range(1, 29)]

# Pull plain NumPy arrays out of the frames; the target is flattened to 1-D.
X_train = train.loc[:, feature_columns].values
y_train = train["target"].values.flatten()
X_test = test.loc[:, feature_columns].values

# Предобработка

In [10]:
# Quick look at the raw feature matrix before any preprocessing is applied.
pd.DataFrame(data=X_train).head(n=10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,27
0,w200,0,0,14,q2,98.0,,0,,0,...,127.113,C,j2,h45,0,0,0,1,19.3235,t1
1,w160,0,0,7,q11,106.0,,0,,0,...,148.906,C,j33,h234,0,1,0,1,41.1779,t1
2,w200,0,0,4,q3,123.0,,0,,0,...,154.245,B,j12,h28,0,0,0,1,3.6144,t1
3,w200,0,0,9,q3,102.0,,0,,0,...,131.068,C,j12,h64,0,1,0,0,49.0417,t1
4,w200,0,0,18,q20,117.0,,0,,0,...,131.068,C,j111,h991,0,1,0,0,17.9096,t1
5,w160,0,0,27,q0,70.0,,0,,0,...,148.906,C,j11,h504,0,0,0,1,19.3235,t1
6,w200,0,0,4,q1,184.0,,0,,0,...,148.906,B,j31,h132,0,1,0,1,3.6144,t1
7,w145,0,0,15,q2,90.0,,0,,0,...,102.173,A,j15,h169,0,1,0,1,17.9096,t1
8,w200,0,0,6,q4,97.9,,0,,0,...,154.245,C,j14,h215,1,0,0,1,17.9096,t1
9,w160,0,0,25,q2,74.0,,0,,0,...,131.068,C,j4,h6,0,1,0,0,33.1444,t1


In [11]:
# Fit the project preprocessor on the raw training features and the target.
preprocessor = GlobalPreprocessor()
preprocessor.fit(X_train, y_train)
# Transform the training features once and wrap the result in a DataFrame purely
# for inspection; X_new / new_df are not used by the modelling cells below.
# NOTE(review): the NanHandler / encoder lines in the cell output appear to be
# logged by the preprocessor itself - confirm in preprocessors.py.
X_new = preprocessor.transform(X_train)
new_df = pd.DataFrame(X_new)
new_df.head(10)

NanHandler(zero):: idx=7, value=0, column=
NanHandler(zero):: idx=8, value=0, column=
NanHandler(zero):: idx=14, value=0, column=
NanHandler(mean):: idx=5, value=117.93774103572484, column=
NanHandler(mean):: idx=6, value=1865.9097315616762, column=
NanHandler(mean):: idx=11, value=133.0999555680707, column=
NanHandler(mean):: idx=12, value=49.9751241304109, column=
NanHandler(unique):: idx=13, value=-1, column=
NanHandler(unique):: idx=15, value=-1, column=
NanHandler(unique):: idx=16, value=-1, column=
NanHandler(unique):: idx=17, value=-1, column=
NanHandler(most_frequent):: idx=19, value=C, column=
NanHandler(most_frequent):: idx=23, value=1.0, column=
NanHandler(most_frequent):: idx=26, value=7.88327128357425, column=
CategoricalFrequencyEncoder:: idx=0 len=15
CategoricalFrequencyEncoder:: idx=3 len=62
CategoricalFrequencyEncoder:: idx=4 len=39
CategoricalFrequencyEncoder:: idx=5 len=1122
CategoricalFrequencyEncoder:: idx=6 len=743
CategoricalFrequencyEncoder:: idx=13 len=5
Catego

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,82,83,84,85,86,87,88,89,90,91
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,43869.0,23581.0,109540.0,8103.0,948.0,7045.0,149989.0,80.985224,98.648082,80.985224
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,26714.0,42824.0,109540.0,1005.0,487.0,708.0,149989.0,80.985224,118.116608,80.985224
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,14302.0,20911.0,16468.0,14608.0,4526.0,8166.0,149989.0,80.985224,162.514016,80.985224
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,43869.0,17728.0,109540.0,14608.0,3753.0,517.0,149989.0,80.985224,118.116608,80.985224
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,20203.0,17728.0,109540.0,49.0,8.0,5408.0,149989.0,80.985224,118.116608,80.985224
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,26714.0,42824.0,109540.0,7718.0,12.0,7045.0,149989.0,80.985224,80.985224,80.985224
6,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,20203.0,42824.0,16468.0,4037.0,1091.0,8166.0,149989.0,80.985224,214.256572,80.985224
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,16860.0,8218.0,25398.0,4421.0,237.0,5408.0,149989.0,80.985224,98.648082,80.985224
8,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,43869.0,20911.0,109540.0,4529.0,586.0,5408.0,149989.0,80.985224,98.648082,80.985224
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,20203.0,17728.0,109540.0,12217.0,2776.0,1216.0,149989.0,80.985224,98.648082,80.985224


In [12]:
# new_df.info()

In [13]:
# new_df[[2]].describe()

In [14]:
# new_df.groupby([7])[[1]].count() #/ 151406

In [15]:
# Produce the model-ready feature matrices with the already fitted preprocessor.
#
# The raw arrays are re-extracted from the source frames on every run instead of
# transforming the current X_train / X_test in place. The previous form
# (X_train = preprocessor.transform(X_train)) was not idempotent: re-running the
# cell fed already-transformed data back into the preprocessor.
X_train = preprocessor.transform(train[feature_columns].values)
X_test = preprocessor.transform(test[feature_columns].values)
pd.DataFrame(X_test).head(10)

NanHandler(zero):: idx=7, value=0, column=
NanHandler(zero):: idx=8, value=0, column=
NanHandler(zero):: idx=14, value=0, column=
NanHandler(mean):: idx=5, value=117.93774103572484, column=
NanHandler(mean):: idx=6, value=1865.9097315616762, column=
NanHandler(mean):: idx=11, value=133.0999555680707, column=
NanHandler(mean):: idx=12, value=49.9751241304109, column=
NanHandler(unique):: idx=13, value=-1, column=
NanHandler(unique):: idx=15, value=-1, column=
NanHandler(unique):: idx=16, value=-1, column=
NanHandler(unique):: idx=17, value=-1, column=
NanHandler(most_frequent):: idx=19, value=C, column=
NanHandler(most_frequent):: idx=23, value=1.0, column=
NanHandler(most_frequent):: idx=26, value=7.88327128357425, column=
SubstitutiveCategoricalEncoder:: idx=13, column=
SubstitutiveCategoricalEncoder:: idx=15, column=
SubstitutiveCategoricalEncoder:: idx=16, column=
NanHandler(zero):: idx=7, value=0, column=
NanHandler(zero):: idx=8, value=0, column=
NanHandler(zero):: idx=14, value=0

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,82,83,84,85,86,87,88,89,90,91
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,43869.0,20911.0,109540.0,5359.0,2525.0,6758.0,149989.0,80.985224,118.116608,80.985224
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,26714.0,42824.0,109540.0,596.0,203.0,5539.0,149989.0,80.985224,214.256572,80.985224
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,11594.0,17575.0,109540.0,14608.0,4526.0,3117.0,149989.0,242.904217,118.116608,80.985224
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,43869.0,20911.0,109540.0,21961.0,1420.0,3117.0,149989.0,273.413449,98.648082,80.985224
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,26714.0,42824.0,109540.0,14608.0,611.0,3117.0,149989.0,80.985224,214.256572,80.985224
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,11594.0,17575.0,25398.0,12217.0,2776.0,10038.0,149989.0,80.985224,118.116608,80.985224
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2109.0,3728.0,109540.0,7718.0,4590.0,10038.0,149989.0,80.985224,118.116608,80.985224
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,43869.0,23581.0,109540.0,21961.0,3810.0,100.0,149989.0,80.985224,98.648082,80.985224
8,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,11594.0,17575.0,25398.0,6524.0,888.0,831.0,149989.0,80.985224,98.648082,80.985224
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2109.0,3728.0,109540.0,207.0,62.0,5539.0,149989.0,80.985224,98.648082,80.985224


# Подбор модели

In [16]:
# Constructor arguments for the boosting model: fixed random_state, silent logging.
# Overrides tried earlier and currently disabled:
#   max_depth=5, learning_rate=0.05, n_estimators=150
best_params = dict(random_state=6, logging_level='Silent')

In [17]:
# Single-step pipeline wrapping CatBoost, configured from best_params.
# Alternatives left disabled: a MeanTargetEncoder(features=[46, 47]) step placed
# before the model, and XGBClassifier(**best_params) as the model itself.
clf = Pipeline([('model', CatBoostClassifier(**best_params))])

In [18]:
# Hyper-parameter grid for the search. It is empty, so the search below scores a
# single candidate: the pipeline exactly as configured above.
# Candidate grids kept for reference (currently disabled):
#   model__max_depth:     [15, 10, 9, 8, 7, 6, 5, 4, 3, 2]
#   model__learning_rate: [0.01, 0.05, 0.1, 0.2, 0.3, 0.5, 1]
#   model__n_estimators:  [1000]
#   model__booster:       ['gbtree', 'gblinear', 'dart']
params = dict()

In [19]:
# 3-fold cross-validated search over `params`, scored by ROC AUC.
# Train scores are kept so over-fitting is visible in cv_results_.
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=params,
    scoring='roc_auc',
    n_jobs=-2,
    cv=3,
    verbose=2,
    return_train_score=True,
)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................................. , total= 3.0min
[CV] ................................................. , total= 3.0min
[CV] ................................................. , total= 3.0min


[Parallel(n_jobs=-2)]: Done   3 out of   3 | elapsed:  3.1min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('model', <catboost.core.CatBoostClassifier object at 0x7f1c6aef4e80>)]),
       fit_params=None, iid=True, n_jobs=-2, param_grid={},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='roc_auc', verbose=2)

In [20]:
# Report the best mean CV score and the parameter set that achieved it, one per
# line, then display the refitted best pipeline as the cell result.
print(grid_search.best_score_, grid_search.best_params_, sep="\n")
grid_search.best_estimator_

0.7392586044513577
{}


Pipeline(memory=None,
     steps=[('model', <catboost.core.CatBoostClassifier object at 0x7f1c6c976d30>)])

In [21]:
# Full per-candidate cross-validation report (fit/score times, per-split scores).
pd.DataFrame(data=grid_search.cv_results_)

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,180.580482,0.845526,0.739259,0.796751,{},1,0.744858,0.796006,0.736009,0.797412,0.736909,0.796836,0.128187,0.017027,0.003976,0.000577


# Предсказание

In [22]:
# The search above ran with refit=True, so grid_search.best_estimator_ is already
# the winning configuration refitted on the full training set. Reuse it instead
# of calling clf.fit(X_train, y_train) again:
#   * the bare `clf` only carries best_params, so refitting it would silently
#     discard whatever the grid search selected once `params` is non-empty;
#   * it avoids repeating a multi-minute fit.
clf = grid_search.best_estimator_
# Class probabilities for the test contracts; column 1 is what gets submitted.
y_test = clf.predict_proba(X_test)

In [23]:
# Use the sample submission as a template and overwrite its target column with
# the predicted probability taken from column 1 of predict_proba.
answer = pd.read_csv("data/insclass_sample.csv")
# Predictions are matched to template rows purely by position, so make sure the
# row counts agree before anything is written.
if len(answer) != len(y_test):
    raise ValueError(
        "sample submission has {} rows but {} predictions were produced".format(
            len(answer), len(y_test)
        )
    )
# Direct 1-D assignment; the unused y_answer copy and the (-1, 1) reshape of the
# original cell were unnecessary.
answer["target"] = y_test[:, 1]
answer.to_csv("answer.csv", index=False)