# Определение стоимости автомобилей

Сервис по продаже автомобилей с пробегом «Не бит, не крашен» разрабатывает приложение для привлечения новых клиентов. В нём можно быстро узнать рыночную стоимость своего автомобиля. В вашем распоряжении исторические данные: технические характеристики, комплектации и цены автомобилей. Вам нужно построить модель для определения стоимости. 

Заказчику важны:

- качество предсказания;
- скорость предсказания;
- время обучения.

## Подготовка данных

In [5]:
import pandas as pd
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor 
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
import time
import warnings

def line():
    print('-----------------------------------------------')

In [6]:
# Загрузка

data = pd.read_csv('/datasets/autos.csv')
display(data)
line()

data.info()

Unnamed: 0,DateCrawled,Price,VehicleType,RegistrationYear,Gearbox,Power,Model,Kilometer,RegistrationMonth,FuelType,Brand,NotRepaired,DateCreated,NumberOfPictures,PostalCode,LastSeen
0,2016-03-24 11:52:17,480,,1993,manual,0,golf,150000,0,petrol,volkswagen,,2016-03-24 00:00:00,0,70435,2016-04-07 03:16:57
1,2016-03-24 10:58:45,18300,coupe,2011,manual,190,,125000,5,gasoline,audi,yes,2016-03-24 00:00:00,0,66954,2016-04-07 01:46:50
2,2016-03-14 12:52:21,9800,suv,2004,auto,163,grand,125000,8,gasoline,jeep,,2016-03-14 00:00:00,0,90480,2016-04-05 12:47:46
3,2016-03-17 16:54:04,1500,small,2001,manual,75,golf,150000,6,petrol,volkswagen,no,2016-03-17 00:00:00,0,91074,2016-03-17 17:40:17
4,2016-03-31 17:25:20,3600,small,2008,manual,69,fabia,90000,7,gasoline,skoda,no,2016-03-31 00:00:00,0,60437,2016-04-06 10:17:21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
354364,2016-03-21 09:50:58,0,,2005,manual,0,colt,150000,7,petrol,mitsubishi,yes,2016-03-21 00:00:00,0,2694,2016-03-21 10:42:49
354365,2016-03-14 17:48:27,2200,,2005,,0,,20000,1,,sonstige_autos,,2016-03-14 00:00:00,0,39576,2016-04-06 00:46:52
354366,2016-03-05 19:56:21,1199,convertible,2000,auto,101,fortwo,125000,3,petrol,smart,no,2016-03-05 00:00:00,0,26135,2016-03-11 18:17:12
354367,2016-03-19 18:57:12,9200,bus,1996,manual,102,transporter,150000,3,gasoline,volkswagen,no,2016-03-19 00:00:00,0,87439,2016-04-07 07:15:26


-----------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 354369 entries, 0 to 354368
Data columns (total 16 columns):
DateCrawled          354369 non-null object
Price                354369 non-null int64
VehicleType          316879 non-null object
RegistrationYear     354369 non-null int64
Gearbox              334536 non-null object
Power                354369 non-null int64
Model                334664 non-null object
Kilometer            354369 non-null int64
RegistrationMonth    354369 non-null int64
FuelType             321474 non-null object
Brand                354369 non-null object
NotRepaired          283215 non-null object
DateCreated          354369 non-null object
NumberOfPictures     354369 non-null int64
PostalCode           354369 non-null int64
LastSeen             354369 non-null object
dtypes: int64(7), object(9)
memory usage: 43.3+ MB


In [7]:
# Обработка

print(data.isnull().sum())
line()

# Удаление дубликатов

print('Количество дубликатов до удаления:', data.duplicated().sum())
data.drop_duplicates(inplace = True)
print('Количество дубликатов после удаления:', data.duplicated().sum())

DateCrawled              0
Price                    0
VehicleType          37490
RegistrationYear         0
Gearbox              19833
Power                    0
Model                19705
Kilometer                0
RegistrationMonth        0
FuelType             32895
Brand                    0
NotRepaired          71154
DateCreated              0
NumberOfPictures         0
PostalCode               0
LastSeen                 0
dtype: int64
-----------------------------------------------
Количество дубликатов до удаления: 4
Количество дубликатов после удаления: 0


In [8]:
# Приведем дату к правильному типу

data['DateCrawled'] = pd.to_datetime(data['DateCrawled'],format= '%Y-%m-%d %H:%M:%S') 
data['DateCreated'] = pd.to_datetime(data['DateCreated'],format= '%Y-%m-%d %H:%M:%S') 
data['LastSeen'] = pd.to_datetime(data['LastSeen'],format= '%Y-%m-%d %H:%M:%S') 

Комментарий: Данные загружены, все типы столбцов соответствуют их значениям (кроме даты), дубликаты удалены, но есть одно большое НО и это пропуски. Когда в данных на которых необходимо провести обучение есть пропуски рекомендуют количественные переменные заменять медианой а категориальные заменять классом количество которого преобладает в признаке. И тут сразу минусы. Во-первых таких данных много заменяя их на вышеперечисленное мы можем запутать алгоритмы, во-вторых это все же автомобильный рынок и нельзя работать с "битыми" данными. Поэтому все что нам остается, это просто удалить все пропуски

In [9]:
# Удалим пропуски

data = data.dropna().reset_index(drop=True)
print(data.isnull().sum())

DateCrawled          0
Price                0
VehicleType          0
RegistrationYear     0
Gearbox              0
Power                0
Model                0
Kilometer            0
RegistrationMonth    0
FuelType             0
Brand                0
NotRepaired          0
DateCreated          0
NumberOfPictures     0
PostalCode           0
LastSeen             0
dtype: int64


In [10]:
# Подготовим данные для обучения
# Выполним прямое кодирование

data_ohe = (
    pd.get_dummies(data, drop_first = True, 
    columns =['VehicleType', 'Gearbox', 'Model', 'FuelType', 'Brand', 'NotRepaired'])
)

display(data_ohe)

Unnamed: 0,DateCrawled,Price,RegistrationYear,Power,Kilometer,RegistrationMonth,DateCreated,NumberOfPictures,PostalCode,LastSeen,...,Brand_seat,Brand_skoda,Brand_smart,Brand_subaru,Brand_suzuki,Brand_toyota,Brand_trabant,Brand_volkswagen,Brand_volvo,NotRepaired_yes
0,2016-03-17 16:54:04,1500,2001,75,150000,6,2016-03-17,0,91074,2016-03-17 17:40:17,...,0,0,0,0,0,0,0,1,0,0
1,2016-03-31 17:25:20,3600,2008,69,90000,7,2016-03-31,0,60437,2016-04-06 10:17:21,...,0,1,0,0,0,0,0,0,0,0
2,2016-04-04 17:36:23,650,1995,102,150000,10,2016-04-04,0,33775,2016-04-06 19:17:07,...,0,0,0,0,0,0,0,0,0,1
3,2016-04-01 20:48:51,2200,2004,109,150000,8,2016-04-01,0,67112,2016-04-05 18:18:39,...,0,0,0,0,0,0,0,0,0,0
4,2016-03-21 18:54:38,0,1980,50,40000,7,2016-03-21,0,19348,2016-03-25 16:47:58,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245805,2016-04-02 20:37:03,3999,2005,3,150000,5,2016-04-02,0,81825,2016-04-06 20:47:12,...,0,0,0,0,0,0,0,0,0,0
245806,2016-03-19 19:53:49,3200,2004,225,150000,5,2016-03-19,0,96465,2016-03-19 20:44:43,...,1,0,0,0,0,0,0,0,0,1
245807,2016-03-27 20:36:20,1150,2000,0,150000,3,2016-03-27,0,26624,2016-03-29 10:17:23,...,0,0,0,0,0,0,0,0,0,0
245808,2016-03-05 19:56:21,1199,2000,101,125000,3,2016-03-05,0,26135,2016-03-11 18:17:12,...,0,0,1,0,0,0,0,0,0,0


In [11]:
# Работа с датой (выделим месяц)

data_ohe['DateCrawled'] = data_ohe['DateCrawled'].dt.month
data_ohe['DateCreated'] = data_ohe['DateCreated'].dt.month
data_ohe['LastSeen'] = data_ohe['LastSeen'].dt.month

display(data_ohe)

Unnamed: 0,DateCrawled,Price,RegistrationYear,Power,Kilometer,RegistrationMonth,DateCreated,NumberOfPictures,PostalCode,LastSeen,...,Brand_seat,Brand_skoda,Brand_smart,Brand_subaru,Brand_suzuki,Brand_toyota,Brand_trabant,Brand_volkswagen,Brand_volvo,NotRepaired_yes
0,3,1500,2001,75,150000,6,3,0,91074,3,...,0,0,0,0,0,0,0,1,0,0
1,3,3600,2008,69,90000,7,3,0,60437,4,...,0,1,0,0,0,0,0,0,0,0
2,4,650,1995,102,150000,10,4,0,33775,4,...,0,0,0,0,0,0,0,0,0,1
3,4,2200,2004,109,150000,8,4,0,67112,4,...,0,0,0,0,0,0,0,0,0,0
4,3,0,1980,50,40000,7,3,0,19348,3,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245805,4,3999,2005,3,150000,5,4,0,81825,4,...,0,0,0,0,0,0,0,0,0,0
245806,3,3200,2004,225,150000,5,3,0,96465,3,...,1,0,0,0,0,0,0,0,0,1
245807,3,1150,2000,0,150000,3,3,0,26624,3,...,0,0,0,0,0,0,0,0,0,0
245808,3,1199,2000,101,125000,3,3,0,26135,3,...,0,0,1,0,0,0,0,0,0,0


In [12]:
# Выполним разделение выборки на обучающу валидационную и тестовую

features = data_ohe.drop(['Price'], axis = 1)
target = data_ohe['Price']

# Получение пропорции 60:40

features_train, features_40, target_train, target_40 = train_test_split(
    features, target, test_size = 0.40, random_state = 12345)

# Разделение 40 % на две части

features_valid, features_test, target_valid, target_test = train_test_split(
    features_40, target_40, test_size = 0.50, random_state = 12345)

print('Проверка размерности')
line()
print(features_train.shape)
print(features_valid.shape)
print(features_test.shape)

print(target_train.shape)
print(target_valid.shape)
print(target_test.shape)
line()

Проверка размерности
-----------------------------------------------
(147486, 310)
(49162, 310)
(49162, 310)
(147486,)
(49162,)
(49162,)
-----------------------------------------------


In [13]:
# Масштабирование признаков
from pandas.core.common import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

numeric = (
    ['DateCrawled', 'RegistrationYear', 'Power', 'Kilometer', 
    'RegistrationMonth', 'DateCreated', 'NumberOfPictures', 'PostalCode', 'LastSeen']
)

scaler = StandardScaler()
scaler.fit(features_train[numeric])

features_train[numeric] = scaler.transform(features_train[numeric])
features_valid[numeric] = scaler.transform(features_valid[numeric])
features_test[numeric] = scaler.transform(features_test[numeric])

display(features_train)

Unnamed: 0,DateCrawled,RegistrationYear,Power,Kilometer,RegistrationMonth,DateCreated,NumberOfPictures,PostalCode,LastSeen,VehicleType_convertible,...,Brand_seat,Brand_skoda,Brand_smart,Brand_subaru,Brand_suzuki,Brand_toyota,Brand_trabant,Brand_volkswagen,Brand_volvo,NotRepaired_yes
74637,-0.444471,0.822534,-0.208875,-1.817243,-0.341119,-0.428257,0.0,-1.472942,0.843217,0,...,0,0,0,0,0,0,0,0,0,0
70793,-0.444471,-0.148556,-0.264338,0.612146,0.520556,-0.428257,0.0,0.004576,0.843217,0,...,0,0,0,0,0,0,0,0,0,0
22262,-0.444471,0.660686,-0.278204,-1.277379,-0.915570,-0.428257,0.0,-0.897723,-1.185935,0,...,0,0,0,0,0,0,0,0,0,1
44936,-0.444471,1.631776,0.137774,-2.627040,0.807781,-0.428257,0.0,1.741639,0.843217,0,...,0,0,0,0,0,0,0,0,0,0
222747,2.249866,-0.634101,0.345762,0.612146,-1.490020,2.202066,0.0,-0.079194,0.843217,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85412,-0.444471,0.336989,0.297232,0.612146,0.520556,-0.428257,0.0,1.085432,0.843217,1,...,0,0,0,0,0,0,0,0,0,0
133249,-0.444471,-1.281494,-0.458461,0.612146,-0.341119,-0.428257,0.0,0.118151,-1.185935,0,...,0,0,0,0,0,0,0,1,0,0
130333,2.249866,-2.090736,-0.028618,-0.737515,1.095006,2.202066,0.0,-1.339304,0.843217,1,...,0,0,0,0,0,0,0,0,0,0
77285,-0.444471,0.013293,0.297232,0.612146,0.807781,-0.428257,0.0,0.066389,0.843217,0,...,0,0,0,0,0,0,0,0,0,0


## Обучение моделей

In [18]:
models = ([LGBMRegressor(),
           CatBoostRegressor(iterations=100, depth = 10),
           XGBRegressor()]
           )

results = [[],[],[]]

for i in range(len(models)):
    
    start_time_fit = time.time()
    model = models[i]
    model.fit(features_train, target_train)
    results[i].append(time.time() - start_time_fit)
    
    start_time_predict = time.time()
    predict = model.predict(features_valid)
    results[i].append(time.time() - start_time_fit)
    
    RMSE = mean_squared_error(target_valid, predict) ** 0.5
    R2_score = r2_score(target_valid, predict)
    results[i].append(RMSE)
    results[i].append(R2_score)

0:	learn: 4615.3321027	total: 187ms	remaining: 18.5s
1:	learn: 4516.0880810	total: 395ms	remaining: 19.4s
2:	learn: 4419.6045411	total: 683ms	remaining: 22.1s
3:	learn: 4327.0303503	total: 891ms	remaining: 21.4s
4:	learn: 4238.3691031	total: 1.18s	remaining: 22.4s
5:	learn: 4152.4001558	total: 1.38s	remaining: 21.7s
6:	learn: 4068.7923710	total: 1.68s	remaining: 22.3s
7:	learn: 3988.5794252	total: 1.89s	remaining: 21.8s
8:	learn: 3911.5578692	total: 2.09s	remaining: 21.1s
9:	learn: 3836.2605927	total: 2.38s	remaining: 21.4s
10:	learn: 3764.7867020	total: 2.59s	remaining: 20.9s
11:	learn: 3696.0871971	total: 2.87s	remaining: 21.1s
12:	learn: 3628.9891650	total: 3.08s	remaining: 20.6s
13:	learn: 3564.5262132	total: 3.37s	remaining: 20.7s
14:	learn: 3502.3706957	total: 3.58s	remaining: 20.3s
15:	learn: 3443.2645205	total: 3.87s	remaining: 20.3s
16:	learn: 3385.9242753	total: 4.07s	remaining: 19.9s
17:	learn: 3330.7412759	total: 4.28s	remaining: 19.5s
18:	learn: 3277.9065578	total: 4.57s	r

  if getattr(data, 'base', None) is not None and \




In [19]:
data_for_cat = data

data_for_cat['DateCrawled'] = data_for_cat['DateCrawled'].dt.month
data_for_cat['DateCreated'] = data_for_cat['DateCreated'].dt.month
data_for_cat['LastSeen'] = data_for_cat['LastSeen'].dt.month

display(data_for_cat)

Unnamed: 0,DateCrawled,Price,VehicleType,RegistrationYear,Gearbox,Power,Model,Kilometer,RegistrationMonth,FuelType,Brand,NotRepaired,DateCreated,NumberOfPictures,PostalCode,LastSeen
0,3,1500,small,2001,manual,75,golf,150000,6,petrol,volkswagen,no,3,0,91074,3
1,3,3600,small,2008,manual,69,fabia,90000,7,gasoline,skoda,no,3,0,60437,4
2,4,650,sedan,1995,manual,102,3er,150000,10,petrol,bmw,yes,4,0,33775,4
3,4,2200,convertible,2004,manual,109,2_reihe,150000,8,petrol,peugeot,no,4,0,67112,4
4,3,0,sedan,1980,manual,50,other,40000,7,petrol,volkswagen,no,3,0,19348,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245805,4,3999,wagon,2005,manual,3,3er,150000,5,gasoline,bmw,no,4,0,81825,4
245806,3,3200,sedan,2004,manual,225,leon,150000,5,petrol,seat,yes,3,0,96465,3
245807,3,1150,bus,2000,manual,0,zafira,150000,3,petrol,opel,no,3,0,26624,3
245808,3,1199,convertible,2000,auto,101,fortwo,125000,3,petrol,smart,no,3,0,26135,3


In [20]:
features = data_for_cat.drop(['Price'], axis = 1).reset_index(drop = True)
target = data_for_cat['Price'].reset_index(drop = True)

# Получение пропорции 60:40

features_train, features_40, target_train, target_40 = train_test_split(
    features, target, test_size = 0.40, random_state = 12345)

# Разделение 40 % на две части

features_valid, features_test, target_valid, target_test = train_test_split(
    features_40, target_40, test_size = 0.50, random_state = 12345)

In [21]:
display(features_train)

Unnamed: 0,DateCrawled,VehicleType,RegistrationYear,Gearbox,Power,Model,Kilometer,RegistrationMonth,FuelType,Brand,NotRepaired,DateCreated,NumberOfPictures,PostalCode,LastSeen
74637,3,small,2008,manual,90,corsa,60000,5,petrol,opel,no,3,0,13359,4
70793,3,bus,2002,manual,82,kangoo,150000,8,gasoline,renault,no,3,0,51580,4
22262,3,small,2007,manual,80,corsa,80000,3,petrol,opel,yes,3,0,28239,3
44936,3,wagon,2013,manual,140,other,30000,9,petrol,chevrolet,no,3,0,96515,4
222747,4,wagon,1999,manual,170,e_klasse,150000,1,petrol,mercedes_benz,no,4,0,49413,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85412,3,convertible,2005,auto,163,clk,150000,8,petrol,mercedes_benz,no,3,0,79540,4
133249,3,small,1995,manual,54,polo,150000,5,petrol,volkswagen,no,3,0,54518,3
130333,4,convertible,1990,manual,116,mx_reihe,100000,10,petrol,mazda,no,4,0,16816,4
77285,3,sedan,2003,manual,163,a4,150000,9,gasoline,audi,no,3,0,53179,4


In [23]:
best_score = 0
best_RMSE = 0
best_step = 0
best_learning_rate_step = 0
best_depth_step = 0

for step in range(50, 300, 50):
    for learning_rate_step in range(1, 2, 1):
        for depth_step in range(1, 5, 1):
            model = CatBoostRegressor(iterations=step,
                                      learning_rate=learning_rate_step,
                                      depth=depth_step)
            model.fit(features_train, target_train, cat_features)
            predict = model.predict(features_valid)
            RMSE = mean_squared_error(target_valid, predict) ** 0.5
            R2_score = r2_score(target_valid, predict)
            
            if best_score < R2_score:
                best_score = R2_score
                best_RMSE = RMSE
                best_step = step
                best_learning_rate_step = learning_rate_step
                best_depth_step = depth_step

0:	learn: 3615.1155439	total: 32.1ms	remaining: 1.57s
1:	learn: 3230.9100207	total: 209ms	remaining: 5.02s
2:	learn: 3123.2043833	total: 313ms	remaining: 4.91s
3:	learn: 3056.6422608	total: 411ms	remaining: 4.73s
4:	learn: 2952.4138643	total: 510ms	remaining: 4.59s
5:	learn: 2893.9526341	total: 609ms	remaining: 4.46s
6:	learn: 2839.2287014	total: 709ms	remaining: 4.36s
7:	learn: 2788.2429397	total: 809ms	remaining: 4.25s
8:	learn: 2754.3666604	total: 907ms	remaining: 4.13s
9:	learn: 2710.3560995	total: 1s	remaining: 4.01s
10:	learn: 2675.9440094	total: 1.1s	remaining: 3.91s
11:	learn: 2642.7110624	total: 1.2s	remaining: 3.8s
12:	learn: 2596.6856531	total: 1.21s	remaining: 3.46s
13:	learn: 2574.6706864	total: 1.31s	remaining: 3.38s
14:	learn: 2556.3161540	total: 1.41s	remaining: 3.29s
15:	learn: 2542.5004755	total: 1.51s	remaining: 3.21s
16:	learn: 2530.1662725	total: 1.6s	remaining: 3.11s
17:	learn: 2502.9148863	total: 1.72s	remaining: 3.06s
18:	learn: 2483.4137272	total: 1.8s	remainin

Комментарий: На сколько я понял необходимо просто передать номера столбцов которые являются категориальными и обучение проходит в принципе также

In [28]:
print('Лучший R2_score и MSE:', best_score, best_RMSE)
print('Количество итераций:', best_step)
print('Шаг градиента:', best_learning_rate_step)
print('Глубина:', best_depth_step)

Лучший R2_score и MSE: 0.8710732418983942 1699.9066365635185
Количество итераций: 250
Шаг градиента: 1
Глубина: 4


## Анализ моделей

In [48]:
results = pd.DataFrame(results)
results.columns = ['Time_fit', 'Time_predict', 'RMSE', 'R2_score']
results.index = ['LGBMRegressor', 'CatBoostRegressor', 'XGBRegressor']

display(results)

Unnamed: 0,Time_fit,Time_predict,RMSE,R2_score
LGBMRegressor,3.805643,4.50588,1699.782811,0.871092
CatBoostRegressor,29.277522,29.29389,1927.135129,0.834302
XGBRegressor,126.134422,126.76809,1932.797898,0.833327


Комментарий: Самой точной является модель LGBMRegressor, но если подкрутить гиперпараметры то CatBoost может приблизится к его результатам но ооочень долго придется ждать. Тут конечно вопрос в том, что у нас важнее точность или скорость в зависимости от этого и следует выбирать модель, но все же победителем является LGBMRegressor

In [10]:
# Проверим, что получится на обычном дереве решений

model = RandomForestRegressor(random_state=12345, n_estimators=5, max_depth = 5)
model.fit(features_train, target_train)
prediction = model.predict(features_valid)
result = r2_score(target_valid, prediction)

print('R2_score', result)

R2_score 0.7377897434747027


In [18]:
# Решающее дерево

model = DecisionTreeRegressor(max_depth = 20)
model.fit(features_train, target_train)
prediction = model.predict(features_valid)
result = r2_score(target_valid, prediction)

print('R2_score', result)

R2_score 0.8122792667908592


Комментарий: Обычные модели деревьев не превзошли показатели бустинга поэтому однозначно можно говорить о том, что бустинг по точности гораздо лучше чем стандартные модели, но опять же все зависит от решаемой задачи