In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from tqdm import tqdm
%matplotlib inline

from preprocessors import GlobalPreprocessor, MeanTargetEncoder

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

from sklearn.linear_model import ElasticNet, Ridge
from sklearn.svm import SVR
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

# Чтение и описание данных

In [2]:
# Load the training set; the trailing expression previews the first rows.
train = pd.read_csv("data/insvalue_train.csv")
train.head()

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9,column_10,column_11,column_12,column_13,column_14,column_15,column_16,column_17,column_18,target
0,2,15.065199,a1,b0,b0,53.0,q2,1.0,2,249.0,23,1,58.025786,1,c4,1,k,3.095619,3105288.0
1,1,12.200897,a3,b0,b0,6.0,q1,2100.0,4,123.0,4,3,81.549097,1,c0,0,a,1.665975,375255.7
2,2,50.337125,a1,b4,b4,10.155556,q1,1.0,2,181.0,9,0,76.689904,1,c3,1,j,4.265557,991012.3
3,1,20.731525,a1,b86,b0,0.0,q2,60000.0,7,211.0,29,0,79.125201,1,c4,0,f,5.201468,938985.0
4,2,51.628454,a1,b0,b0,37.0,q1,48065.0,7,209.0,19,0,81.549097,1,c1,0,h,6.136154,776107.5


### Описание полей
**column_1** - пол водителя;

Если "1", то девушка; если "2", то мужчина. (из предположения, что мужчин-водителей больше). Можно оставить как есть

**column_2** - стаж вождения;

От 0 до 82.75 лет; есть немного пропусков (NaN) — 86 на всю выборку (заменить нулями?). Явно нуждается в StandardScaler.

**column_3** - семейное положение водителя;

Значения "a1" ... "a7", one-hot? (или mean-encoding)

**column_4** - город регистрации водителя;

Множество различных мест "b0" ... "bXXXX" (больше 1000!) => 
1) Либо просто перекодировать
2) Либо mean-encoding
3) Либо выделить отдельно "большие города"

**column_5** - город регистрации договора;

Ситуация похожа на column_4. Идея: выудить признак, совпадают ли эти 2 колонки.

**column_6** - максимальная просрочка по выплатам;

Около 20% клиентов добропорядочные: просрочек нет вообще (вынести их в отдельный признак?). Для остальных применить StandardScaler?

**column_7** - тип двигателя;

Значения "q1" ... "q7", причём первое самое популярное (~80%), второе встречается реже (~20%), остальные "эксклюзивные". One-hot? (или mean-encoding)

**column_8** - пробег транспортного средства;

Ярко выраженные пики при круглых числах. В тесте (но не в трейне!) есть пропуски (NaN) — заменить нулями? Просто StandardScaler?

**column_9** - год производства транспортного средства;

Значения от 1 до 11 (что это за года?), причем если построить график, то он будет иметь вид пика в значении 6, отдаленно напоминает гауссиану. Тут либо mean-encoding, либо one-hot, либо что-то нормальное.

**column_10** - мощность двигателя в лошадиных силах;

Число от 77 до почти 2000. StandardScaler?

**column_11** - количество договоров клиента;

Число от 0 до 239, сильно смещено в 0. StandardScaler?

**column_12** - количество договоров транспортного средства;

Число от 0 до 22, сильно смещено в 0. StandardScaler?

**column_13** - возраст клиента;

Число от 26 до 113. StandardScaler?

**column_14** - тип транспортного средства;

Всегда 1, можно дропать.

**column_15** - страна-производитель транспортного средства;

Значения вида "c#" (например, "c4"), всего 8 различных значений. Можно сделать one-hot или mean-encoding.

**column_16** - наличие телематических систем, установленных на транспортное средство;

Булево значение, можно оставить как есть.

**column_17** - марка и модель автомобиля (задача усложняется тем, что в выборке почти для всех марок присутствуют разные модели, представляющие их, т.е. марка X в выборке представлена моделями x1,x2, хотя в явном виде эта информация в данных не присутствует);

11 различных значений. Можно сделать one-hot или mean-encoding.

**column_18** - технический параметр транспортного средства.

Значения от -0.6 до 8.6. Нужен StandardScaler.

**target**

Значения от $2 \cdot 10^5$ до $4 \cdot 10^6$. Вывод: предсказываем как есть.

In [3]:
# train.groupby(["column_17"])[["target"]].count()

In [4]:
# train.describe()

In [5]:
# train.info()

In [6]:
# Load the test set; the trailing expression previews the first rows.
test = pd.read_csv("data/insvalue_test.csv")
test.head()

Unnamed: 0,id,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9,column_10,column_11,column_12,column_13,column_14,column_15,column_16,column_17,column_18
0,1,1,35.842308,a1,b0,b0,7.0,q1,17001.0,6,184.0,31,6,74.24288,1,c4,0,k,1.472249
1,2,2,59.29387,a1,b0,b0,0.0,q1,54000.0,5,209.0,24,0,86.363908,1,c8,0,h,4.056287
2,3,2,24.920571,a1,b0,b0,36.0,q1,48123.0,8,209.44,4,0,69.312262,1,c8,0,h,0.429722
3,4,1,10.763031,a3,b0,b0,37.0,q1,35000.0,7,245.0,13,5,56.753996,1,c4,1,k,4.079482
4,5,2,23.530077,a1,b42,b42,2.0,q1,33872.0,6,170.0,3,1,71.783787,1,c1,0,e,5.796073


In [7]:
# test.info()

In [8]:
# Names of the 18 raw feature columns: column_1 ... column_18.
feature_columns = ["column_{}".format(idx) for idx in range(1, 19)]

# NumPy arrays of the features for both splits, plus the 1-D target vector.
X_train = train[feature_columns].values
X_test = test[feature_columns].values
y_train = train["target"].values.flatten()

# Разберемся с марками и моделями

In [9]:
# from sklearn.preprocessing import LabelEncoder

# def get_groups_by_mark(train):
#     groups = []
#     for group_name in set(train[["column_17"]].values.flatten()):
#         indexes = [7, 9, 10, 15, 18]
#         columns = [("column_" + str(i)) for i in indexes]
# #         columns += ["target"]
#         subset = train[columns][train["column_17"] == group_name].copy()
#         subset = subset.rename(index=str, columns={"column_7": "engine_type", 
#                                                    "column_9": "year_production",
#                                                    "column_10": "engine_power",
#                                                    "column_15": "country_production",
#                                                    "column_18": "tech_coeff",
#                                                   })
#         subset[["engine_type"]] = subset[["engine_type"]].apply(LabelEncoder().fit_transform)
#         subset[["country_production"]] = subset[["country_production"]].apply(LabelEncoder().fit_transform)
# #         subset = subset[location]
#         groups.append((group_name, subset))
#     return groups

In [10]:
# groups = get_groups_by_mark(train)

In [11]:
# i = 0
# for group, subset in groups:
#     print(group, len(subset))
#     i += len(subset)
# assert i == len(train)

In [12]:
# import seaborn as sns
# sns.set(style="ticks")

# df = groups[3][1]
# sns.pairplot(df)

In [13]:
# from sklearn.cluster import MeanShift, estimate_bandwidth

# for i, (group, subset) in enumerate(groups): 
#     ms = MeanShift(bin_seeding=True)
#     X = groups[i][1].values
#     ms.fit(X)
#     labels = ms.labels_
#     cluster_centers = ms.cluster_centers_

#     labels_unique = np.unique(labels)
#     n_clusters_ = len(labels_unique)

#     print("number of estimated clusters : {} for group {}".format(n_clusters_, i))

Вывод: из картинок это не совсем очевидно, но раз уж есть такая информация, то почему бы не попробовать провести еще дополнительную кластеризацию для разделения автомобилей внутри одной марки на модели.

# Предобработка

In [14]:
# Preview the raw feature matrix before preprocessing (columns are positional here).
pd.DataFrame(X_train).head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,2,15.0652,a1,b0,b0,53.0,q2,1,2,249,23,1,58.0258,1,c4,1,k,3.09562
1,1,12.2009,a3,b0,b0,6.0,q1,2100,4,123,4,3,81.5491,1,c0,0,a,1.66598
2,2,50.3371,a1,b4,b4,10.1556,q1,1,2,181,9,0,76.6899,1,c3,1,j,4.26556
3,1,20.7315,a1,b86,b0,0.0,q2,60000,7,211,29,0,79.1252,1,c4,0,f,5.20147
4,2,51.6285,a1,b0,b0,37.0,q1,48065,7,209,19,0,81.5491,1,c1,0,h,6.13615
5,2,26.3053,a1,b0,b0,46.0,q1,10000,4,170,25,0,75.4679,1,c1,0,e,4.72041
6,2,20.7315,a1,b0,b0,15.0,q1,19000,3,125,12,0,54.199,1,c1,0,f,6.114
7,2,16.4901,a3,b0,b0,55.0,q1,60000,7,180,10,0,59.2939,1,c4,0,c,3.97441
8,2,20.7315,a3,b0,b0,17.0,q2,60000,5,241,35,3,56.754,1,c1,0,i,1.29377
9,2,17.9096,a2,b0,b0,44.0,q1,30000,4,181,16,1,46.4381,1,c1,0,j,0.962209


In [15]:
# Build the combined matrix from the source DataFrames rather than from
# X_train / X_test: this cell rebinds those names to the transformed arrays,
# so reading them back on a re-run would preprocess already-encoded features.
n_train = len(train)
X_united = np.vstack((train[feature_columns].values, test[feature_columns].values))

# The preprocessor is fitted on the train and test rows together, so both
# splits go through one and the same fitted transformation.
preprocessor = GlobalPreprocessor()
X_united = preprocessor.fit(X_united).transform(X_united)

# vstack keeps row order: the first n_train rows are train, the rest are test.
X_train = X_united[:n_train]
X_test = X_united[n_train:]
assert X_train.shape[0] == len(train) and X_test.shape[0] == len(test)

pd.DataFrame(X_train).head(10)

NanHandler(zero):: idx=7, value=0, column=
NanHandler(most_frequent):: idx=1, value=20.7315254450123, column=
CategoricalFrequencyEncoder:: idx=1 len=62
CategoricalFrequencyEncoder:: idx=2 len=7
CategoricalFrequencyEncoder:: idx=3 len=872
CategoricalFrequencyEncoder:: idx=4 len=838
CategoricalFrequencyEncoder:: idx=6 len=7
CategoricalFrequencyEncoder:: idx=7 len=4254
CategoricalFrequencyEncoder:: idx=8 len=11
CategoricalFrequencyEncoder:: idx=9 len=239
CategoricalFrequencyEncoder:: idx=10 len=136
CategoricalFrequencyEncoder:: idx=11 len=21
CategoricalFrequencyEncoder:: idx=12 len=71
CategoricalFrequencyEncoder:: idx=14 len=8
CategoricalFrequencyEncoder:: idx=16 len=11
Clusterizer(MeanShift):: 4 clusters for mark 0
Clusterizer(MeanShift):: 4 clusters for mark 1
Clusterizer(MeanShift):: 2 clusters for mark 2
Clusterizer(MeanShift):: 2 clusters for mark 3
Clusterizer(MeanShift):: 4 clusters for mark 4
Clusterizer(MeanShift):: 2 clusters for mark 5
Clusterizer(MeanShift):: 3 clusters for m

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.726156,0.201292,0.046569,0.046853,0.016328,0.185568,0.030881,0.384872,0.079864,700.0
1,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.726156,0.798282,0.171618,0.094026,0.039612,0.074717,0.016541,0.130444,0.112271,0.0
2,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.012104,0.798282,0.046569,0.064885,0.051539,0.494587,0.022433,0.161147,0.129415,500.0
3,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.726156,0.201292,0.145316,0.024385,0.009548,0.494587,0.018741,0.384872,0.116175,101.0
4,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.726156,0.798282,0.145316,0.016931,0.022326,0.494587,0.016541,0.254641,0.041103,1001.0
5,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.726156,0.798282,0.171618,0.079154,0.01395,0.494587,0.023427,0.254641,0.204451,800.0
6,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.726156,0.798282,0.120576,0.006638,0.034927,0.494587,0.036169,0.254641,0.116175,100.0
7,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.726156,0.798282,0.145316,0.00859,0.042807,0.494587,0.030881,0.384872,0.121038,300.0
8,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.726156,0.201292,0.213502,0.009477,0.005892,0.074717,0.031484,0.254641,0.023888,601.0
9,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.726156,0.798282,0.171618,0.064885,0.024385,0.185568,0.031484,0.254641,0.129415,500.0


С трудом, но, вроде как, преобразование данных работает.

In [16]:
# X_test was already transformed together with X_train in the preprocessing
# cell, so no separate transform call is needed here; just preview the result.
pd.DataFrame(X_test).head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.726156,0.798282,0.250949,0.016292,0.008554,0.008448,0.023498,0.384872,0.079864,701.0
1,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.726156,0.798282,0.213502,0.016931,0.014801,0.494587,0.0093,0.032123,0.041103,1000.0
2,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.726156,0.798282,0.038654,0.007454,0.039612,0.494587,0.02261,0.032123,0.041103,1000.0
3,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.726156,0.798282,0.145316,0.02222,0.033543,0.016292,0.031484,0.384872,0.079864,701.0
4,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.002556,0.798282,0.250949,0.079154,0.044511,0.185568,0.021545,0.254641,0.204451,800.0
5,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.726156,0.201292,0.250949,0.000319,0.044511,0.494587,0.033578,0.254641,0.041103,1003.0
6,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.726156,0.798282,0.145316,0.022007,0.042807,0.166507,0.016754,0.384872,0.121038,300.0
7,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.010152,0.798282,0.171618,0.02222,0.0011,0.185568,0.0093,0.254641,0.116175,100.0
8,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.726156,0.201292,0.250949,0.002911,0.003514,0.494587,0.022149,0.019735,0.02655,200.0
9,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.010294,0.798282,0.250949,0.094026,0.046392,0.185568,0.030881,0.130444,0.072516,400.0


In [17]:
# columns = [i for i in range(11, 21)]
# # columns
# for maximum in np.max(X_test, axis=0):
#     print(maximum)
# # pd.DataFrame(X_test).describe()[columns]

In [18]:
# preprocessor.preprocessors[1].encoders[2][1].inverse_transform(481)

# Подбор модели

In [19]:
# Hyper-parameters passed to the regressor when the pipeline is built.
best_params = dict(
    random_state=3,
    # XGBoost params
    max_depth=11,
    learning_rate=0.04,
    n_estimators=100,
    min_child_weight=2,
    subsample=0.83,
    colsample_bytree=0.9433,
    # CatBoost params (disabled; kept for reference)
    # loss_function='RMSE',
    # one_hot_max_size=15,
)

In [20]:
# Single-step pipeline around the regressor. The step name "model" is the
# prefix used by the "model__*" keys of the parameter grid.
clf = Pipeline(
    steps=[
        # ("mean_target", MeanTargetEncoder(features=[46, 47])),
        ("model", XGBRegressor(**best_params)),
        # ("model", CatBoostRegressor(**best_params)),
    ]
)

In [21]:
# Parameter grid for GridSearchCV. Every entry is currently commented out, so
# the grid is empty and the search only cross-validates the pipeline with the
# parameters it was constructed with. Uncomment entries to actually search.
params = {
#     'model__max_depth': [9, 10, 11, 12, 13],
#     'model__learning_rate': [0.01, 0.03, 0.04, 0.05, 0.07, 0.1],
#     'model__n_estimators': [200, 110, 100, 90, 50],    
#     'model__min_child_weight': [1, 2, 3, 5],  
#     'model__subsample': np.linspace(0.83, 1, 7),    
#     'model__colsample_bytree': np.linspace(0.83, 1, 7),    
}

In [22]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Both arguments are converted to NumPy arrays. Each error is taken
    relative to the corresponding true value, so a zero in ``y_true``
    produces a division by zero (inf/nan in the result).
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_errors = np.abs((actual - predicted) / actual)
    return np.mean(relative_errors) * 100

# 5-fold cross-validated search over `params`, scored by MAPE.
# greater_is_better=False makes the scorer negate the metric, so the reported
# scores are negative and values closer to zero are better.
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=params,
    scoring=make_scorer(mean_absolute_percentage_error, greater_is_better=False),
    cv=5,
    n_jobs=-2,
    verbose=2,
    return_train_score=True,
)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................................. , total=  10.3s
[CV] ................................................. , total=  11.1s
[CV] ................................................. , total=  12.1s
[CV] ................................................. , total=  12.3s
[CV] ................................................. , total=  13.1s


[Parallel(n_jobs=-2)]: Done   5 out of   5 | elapsed:   13.5s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('model', XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.9433, gamma=0, learning_rate=0.04,
       max_delta_step=0, max_depth=11, min_child_weight=2, missing=None,
       n_estimators=100, n_jobs=1, nthread=None, objective='reg:linear',
       random_state=3, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=0.83))]),
       fit_params=None, iid=True, n_jobs=-2, param_grid={},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(mean_absolute_percentage_error, greater_is_better=False),
       verbose=2)

In [23]:
# Best mean cross-validation score and the parameters that achieved it.
# The scorer was built with greater_is_better=False, hence the negative value.
print(grid_search.best_score_)
print(grid_search.best_params_)
grid_search.best_estimator_

-8.544390027001599
{}


Pipeline(memory=None,
     steps=[('model', XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.9433, gamma=0, learning_rate=0.04,
       max_delta_step=0, max_depth=11, min_child_weight=2, missing=None,
       n_estimators=100, n_jobs=1, nthread=None, objective='reg:linear',
       random_state=3, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=0.83))])

In [24]:
# Full cross-validation report: timings and per-split train/test scores.
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,split3_test_score,split3_train_score,split4_test_score,split4_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,11.696712,0.089898,-8.54439,-5.785148,{},1,-8.614507,-5.816792,-8.604861,-5.789403,-8.67644,-5.760864,-8.354384,-5.809469,-8.471744,-5.749214,1.006597,0.017407,0.116093,0.026427


# Predict answer

In [25]:
# Carry the grid-search winner over to `clf` before the final fit. Without
# this, `clf` would be refit with its construction-time parameters and any
# tuning done by the search would be silently ignored. With an empty grid,
# best_params_ is {} and this call changes nothing.
clf.set_params(**grid_search.best_params_)

# Fit on the whole training set and predict the test targets.
clf.fit(X_train, y_train)
y_test = clf.predict(X_test)

In [26]:
# Reuse the sample submission as a template and overwrite its target column
# with the predictions (presumably its rows are in test-set order; verify).
answer = pd.read_csv("data/insvalue_sample.csv")

# Fail loudly instead of writing a misaligned submission.
assert len(answer) == len(y_test), "prediction count must match the sample submission"

# Plain single-column assignment: a list key with a 1-D array behaves
# differently across pandas versions.
answer["target"] = y_test
answer.to_csv("answer.csv", index=False)

In [27]:
# np.argmax(clf.named_steps["model"].feature_importances_)

In [28]:
# xgboost.plot_importance(clf.named_steps["model"])