In [17]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt 

from catboost import CatBoost, Pool, CatBoostRegressor

%matplotlib inline

# Baseline

## One hot encoding

In [5]:
one_hot_data = pd.read_csv('data/one_hot_data.csv')
one_hot_data.head()

Unnamed: 0,price,year,mileage,owners_num,volume,horsepower,city kazan,city moskva,city nizhniy_novgorod,city novosibirsk,...,colour серый,colour синий,colour фиолетовый,colour чёрный,transmission механическая,transmission роботизированная,drive полный,doc_unique Оригинал,doc_unique Растаможен,engine_type Дизель
0,500000.0,2015,60500.0,2,1.6,123.0,1,0,0,0,...,0,0,0,0,1,0,0,1,0,0
1,405000.0,2012,130000.0,2,1.6,123.0,1,0,0,0,...,0,0,0,1,1,0,0,1,0,0
2,630000.0,2017,67000.0,1,1.6,123.0,1,0,0,0,...,0,0,0,0,1,0,0,1,0,0
3,620000.0,2016,80000.0,1,1.6,123.0,1,0,0,0,...,0,0,0,0,1,0,0,1,0,0
4,729000.0,2017,150000.0,3,1.6,123.0,1,0,0,0,...,0,0,0,1,1,0,0,1,0,0


In [7]:
y_one_hot = one_hot_data['price'].copy()
X_one_hot = one_hot_data.drop(['price'], axis=1)

X_train_oh, X_test_oh, y_train_oh, y_test_oh = train_test_split(X_one_hot, y_one_hot, test_size=0.3, shuffle=False)

del y_one_hot
del X_one_hot

In [15]:
%%time

lin_reg_one_hot = LinearRegression()
lin_reg_one_hot.fit(X_train_oh, y_train_oh)

print("Coef of determination:",lin_reg_one_hot.score(X_test_oh, y_test_oh))

Coef of determination: 0.8301673079196885
Wall time: 28.5 ms


## Label encoding

In [10]:
label_enc_data = pd.read_csv('data/label_encoded_data.csv')
label_enc_data.head(5)

Unnamed: 0,city,brand,model,price,year,mileage,body_type,colour,transmission,drive,owners_num,doc_unique,engine_type,volume,horsepower
0,1,1,3,500000.0,2015,60500.0,1,10,1,0,2,1,0,1.6,123.0
1,1,1,3,405000.0,2012,130000.0,1,14,1,0,2,1,0,1.6,123.0
2,1,1,3,630000.0,2017,67000.0,1,1,1,0,1,1,0,1.6,123.0
3,1,1,3,620000.0,2016,80000.0,1,10,1,0,1,1,0,1.6,123.0
4,1,1,3,729000.0,2017,150000.0,1,14,1,0,3,1,0,1.6,123.0


In [11]:
y_lab_enc = label_enc_data['price'].copy()
X_lab_enc = label_enc_data.drop(['price'], axis=1)

X_train_lab, X_test_lab, y_train_lab, y_test_lab = train_test_split(X_lab_enc, y_lab_enc, test_size=0.3, shuffle=False)

del y_lab_enc
del X_lab_enc

In [25]:
%%time

lin_reg_lab = LinearRegression()
lin_reg_lab.fit(X_train_lab, y_train_lab)

print("Coef of determination:", lin_reg_lab.score(X_test_lab, y_test_lab))

Coef of determination: 0.7947316862545974
Wall time: 11 ms


# Cat Boost

## Label encoding

In [32]:
%%time

model = CatBoostRegressor(iterations=100,
                           depth=2,
                           learning_rate=1,
                           loss_function='RMSE')

model.fit(X_train_lab, y_train_lab)

predictions = model.predict(X_test_lab)
print("Coef of determination:", r2_score(y_test_lab, predictions))

0:	learn: 136841.5352777	total: 4.1ms	remaining: 406ms
1:	learn: 121135.4714184	total: 6.44ms	remaining: 316ms
2:	learn: 114956.7991799	total: 9.26ms	remaining: 299ms
3:	learn: 105696.3982835	total: 11.6ms	remaining: 278ms
4:	learn: 101334.6188801	total: 13.6ms	remaining: 259ms
5:	learn: 94617.1365279	total: 16.2ms	remaining: 254ms
6:	learn: 90429.2001273	total: 18.6ms	remaining: 246ms
7:	learn: 88336.7631366	total: 20.4ms	remaining: 235ms
8:	learn: 85916.0488555	total: 22.3ms	remaining: 226ms
9:	learn: 83941.9835226	total: 24.7ms	remaining: 222ms
10:	learn: 81938.5482372	total: 26.7ms	remaining: 216ms
11:	learn: 80147.3565331	total: 28.9ms	remaining: 212ms
12:	learn: 79063.1966273	total: 31.3ms	remaining: 210ms
13:	learn: 78019.2015713	total: 33.5ms	remaining: 206ms
14:	learn: 77172.3628940	total: 35.7ms	remaining: 202ms
15:	learn: 76167.2444379	total: 37.7ms	remaining: 198ms
16:	learn: 75175.5097598	total: 39.9ms	remaining: 195ms
17:	learn: 74461.6863295	total: 42ms	remaining: 191ms


## One hot encoding

In [33]:
%%time

model = CatBoostRegressor(iterations=100,
                           depth=2,
                           learning_rate=1,
                           loss_function='RMSE')

model.fit(X_train_oh, y_train_oh)

predictions = model.predict(X_test_oh)
print("Coef of determination:", r2_score(y_test_oh, predictions))

0:	learn: 136922.5320341	total: 3.05ms	remaining: 302ms
1:	learn: 120424.7256349	total: 5.39ms	remaining: 264ms
2:	learn: 112372.6470837	total: 7.39ms	remaining: 239ms
3:	learn: 107499.0824175	total: 9.36ms	remaining: 225ms
4:	learn: 101712.3293017	total: 11.5ms	remaining: 219ms
5:	learn: 96090.4088825	total: 13.8ms	remaining: 216ms
6:	learn: 93829.3033738	total: 16.2ms	remaining: 215ms
7:	learn: 90043.4942096	total: 18.5ms	remaining: 213ms
8:	learn: 87352.1338811	total: 20.7ms	remaining: 209ms
9:	learn: 85410.5205977	total: 22.6ms	remaining: 204ms
10:	learn: 83924.3137468	total: 24.9ms	remaining: 201ms
11:	learn: 82638.0983696	total: 27ms	remaining: 198ms
12:	learn: 81070.1804175	total: 29ms	remaining: 194ms
13:	learn: 79849.9420125	total: 31.3ms	remaining: 192ms
14:	learn: 78675.1458256	total: 33.3ms	remaining: 189ms
15:	learn: 77957.7948670	total: 35.7ms	remaining: 187ms
16:	learn: 76134.8378725	total: 37.9ms	remaining: 185ms
17:	learn: 74922.7446670	total: 40.2ms	remaining: 183ms
1