# Определение стоимости автомобилей

Сервис по продаже автомобилей с пробегом «Не бит, не крашен» разрабатывает приложение для привлечения новых клиентов. В нём можно быстро узнать рыночную стоимость своего автомобиля. В вашем распоряжении исторические данные: технические характеристики, комплектации и цены автомобилей. Вам нужно построить модель для определения стоимости. 

Заказчику важны:

- качество предсказания;
- скорость предсказания;
- время обучения.

## Подготовка данных

In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import lightgbm as lgb
import xgboost

from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import ( OneHotEncoder)
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import make_scorer
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import cross_val_score
pd.options.mode.chained_assignment = None

In [2]:
frame = pd.read_csv('/datasets/autos.csv')
frame.info()
frame.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 354369 entries, 0 to 354368
Data columns (total 16 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   DateCrawled        354369 non-null  object
 1   Price              354369 non-null  int64 
 2   VehicleType        316879 non-null  object
 3   RegistrationYear   354369 non-null  int64 
 4   Gearbox            334536 non-null  object
 5   Power              354369 non-null  int64 
 6   Model              334664 non-null  object
 7   Kilometer          354369 non-null  int64 
 8   RegistrationMonth  354369 non-null  int64 
 9   FuelType           321474 non-null  object
 10  Brand              354369 non-null  object
 11  Repaired           283215 non-null  object
 12  DateCreated        354369 non-null  object
 13  NumberOfPictures   354369 non-null  int64 
 14  PostalCode         354369 non-null  int64 
 15  LastSeen           354369 non-null  object
dtypes: int64(7), object(

Unnamed: 0,DateCrawled,Price,VehicleType,RegistrationYear,Gearbox,Power,Model,Kilometer,RegistrationMonth,FuelType,Brand,Repaired,DateCreated,NumberOfPictures,PostalCode,LastSeen
0,2016-03-24 11:52:17,480,,1993,manual,0,golf,150000,0,petrol,volkswagen,,2016-03-24 00:00:00,0,70435,2016-04-07 03:16:57
1,2016-03-24 10:58:45,18300,coupe,2011,manual,190,,125000,5,gasoline,audi,yes,2016-03-24 00:00:00,0,66954,2016-04-07 01:46:50
2,2016-03-14 12:52:21,9800,suv,2004,auto,163,grand,125000,8,gasoline,jeep,,2016-03-14 00:00:00,0,90480,2016-04-05 12:47:46
3,2016-03-17 16:54:04,1500,small,2001,manual,75,golf,150000,6,petrol,volkswagen,no,2016-03-17 00:00:00,0,91074,2016-03-17 17:40:17
4,2016-03-31 17:25:20,3600,small,2008,manual,69,fabia,90000,7,gasoline,skoda,no,2016-03-31 00:00:00,0,60437,2016-04-06 10:17:21
5,2016-04-04 17:36:23,650,sedan,1995,manual,102,3er,150000,10,petrol,bmw,yes,2016-04-04 00:00:00,0,33775,2016-04-06 19:17:07
6,2016-04-01 20:48:51,2200,convertible,2004,manual,109,2_reihe,150000,8,petrol,peugeot,no,2016-04-01 00:00:00,0,67112,2016-04-05 18:18:39
7,2016-03-21 18:54:38,0,sedan,1980,manual,50,other,40000,7,petrol,volkswagen,no,2016-03-21 00:00:00,0,19348,2016-03-25 16:47:58
8,2016-04-04 23:42:13,14500,bus,2014,manual,125,c_max,30000,8,petrol,ford,,2016-04-04 00:00:00,0,94505,2016-04-04 23:42:13
9,2016-03-17 10:53:50,999,small,1998,manual,101,golf,150000,0,,volkswagen,,2016-03-17 00:00:00,0,27472,2016-03-31 17:17:06


In [3]:
frame.describe()

Unnamed: 0,Price,RegistrationYear,Power,Kilometer,RegistrationMonth,NumberOfPictures,PostalCode
count,354369.0,354369.0,354369.0,354369.0,354369.0,354369.0,354369.0
mean,4416.656776,2004.234448,110.094337,128211.172535,5.714645,0.0,50508.689087
std,4514.158514,90.227958,189.850405,37905.34153,3.726421,0.0,25783.096248
min,0.0,1000.0,0.0,5000.0,0.0,0.0,1067.0
25%,1050.0,1999.0,69.0,125000.0,3.0,0.0,30165.0
50%,2700.0,2003.0,105.0,150000.0,6.0,0.0,49413.0
75%,6400.0,2008.0,143.0,150000.0,9.0,0.0,71083.0
max,20000.0,9999.0,20000.0,150000.0,12.0,0.0,99998.0


In [4]:
frame.isnull().sum()

DateCrawled              0
Price                    0
VehicleType          37490
RegistrationYear         0
Gearbox              19833
Power                    0
Model                19705
Kilometer                0
RegistrationMonth        0
FuelType             32895
Brand                    0
Repaired             71154
DateCreated              0
NumberOfPictures         0
PostalCode               0
LastSeen                 0
dtype: int64

In [5]:
drop_price = frame[frame['Price'] == 0]

drop_price.shape

(10772, 16)

In [6]:
frame = frame[frame['Price'] != 0]

In [7]:
drop_date = frame.loc[frame['RegistrationYear']>=2020 | (frame['RegistrationYear'] <= 1930), 'RegistrationYear']
drop_date.shape

(343597,)

In [8]:
frame = frame[frame['RegistrationYear']<=2023]

Цена = 0 в признаках, нас не интересует по определнию поставленной задачи (~3% данных)

Дата регистрации выше текущей - определенно ошибка в данных

In [9]:
frame['Gearbox'] = frame['Gearbox'].replace('manual',0)
frame['Gearbox'] = frame['Gearbox'].replace('auto',1)

In [10]:
frame['Gearbox'].isnull().sum()

17289

In [11]:
frame.loc[(frame['Power'] > 1000) | (frame['Power'] <= 0), 'Power'] = None
frame['Power'] = frame['Power'].fillna(frame.groupby('Model')['Power'].transform('median'))
frame = frame.loc[~frame['Power'].isna()]
frame['Power'] = frame['Power'].astype('int64')

In [12]:
df_with_date = frame.copy()
frame = frame.drop(["DateCrawled","DateCreated","LastSeen","NumberOfPictures","PostalCode"],axis = 1)

In [13]:
frame["VehicleType"] = frame["VehicleType"].fillna(frame["VehicleType"].mode().values[0])
frame["Gearbox"] = frame["Gearbox"].fillna(frame["Gearbox"].mode().values[0])
frame["Model"] = frame["Model"].fillna(frame["Model"].mode().values[0])
frame["FuelType"] = frame["FuelType"].fillna(frame["FuelType"].mode().values[0])
frame["Repaired"] = frame["Repaired"].fillna("no")

Вывод

Проанализируем значения числовых признаков перед применения к ним алгоритма очитки выбросов

Можно заметить, что в столбце RegistrationYear присутствуют аномальные значения, предлагаю установить границы, в котором может находится значение этого столбца. А именно, год регистрации не может быть больше 2020, и не может быть меньше 1930. Все показания выше или ниже этих значений приведем к максимальной или минимальной границе (признак не нуждается в поиске выбросов)

Можно заметить, что в столбце RegistrationMonth, встречается значение 0, которое корректнее было бы изменить на 1 (признак не нуждается в поиске выбросов)

Так же в Power присутствуют аномальные значения (20 000 лошадиных сил), ограничим их мощностью БелАЗ(а), т.е. 3500 (признак нуждается в поиске выбросов)

Столбец Kilometer имеет достаточно реальные значения, поэтому смысла искать там выбросы нет (признак не нуждается в поиске выбросов)

Значения из столбца Price обладают высоким стандартным отклонениям, что пагубно будет влиять на работу алгоритма (признак нуждается в поиске выбросов)

Такие признаки как NumberOfPictures и PostalCode (признаки не нуждается в поиске выбросов) , так как не влияют на реальную стоимость автомобиля (их мы просто удалим)

In [14]:
# RegistrationYear 
def Balance_RegistrationYear(value):
    if value > 2020:
        return 2020
    elif value < 1930:
        return 1930
    else:
        return value
frame["RegistrationYear"] = frame["RegistrationYear"].apply(Balance_RegistrationYear)
# RegistrationMonth
frame.loc[frame['RegistrationMonth'] == 0, 'RegistrationMonth'] = 1
# Power
frame.loc[frame['Power'] > 3500, 'Power'] = 3500

In [15]:
def remove_ouliers(frame,column):
    q25=np.array(frame[column].quantile(0.25))
    
    q75=np.array(frame[column].quantile(0.75))
    first_part=q25-1.5*(q75-q25)
    second_part=q75+1.5*(q75-q25)
    del_index = []
    for index_value, value in zip(frame[column].index,frame[column]):
        if second_part <= value or value <= first_part:
            del_index.append(index_value)
    
    print('Количество строк, выбранных для удаления ' + str(column)+":",len(del_index))
    return del_index

In [16]:
array_num_col = ["Price","Power"]
count = 0 
for column in array_num_col:
    index_del = remove_ouliers(frame,column)
    count += len(index_del)
    frame = frame.drop(index_del,axis = 0)
print("Было удалено:", count)

Количество строк, выбранных для удаления Price: 18867
Количество строк, выбранных для удаления Power: 6007
Было удалено: 24874


In [17]:
frame.corr()

Unnamed: 0,Price,RegistrationYear,Gearbox,Power,Kilometer,RegistrationMonth
Price,1.0,0.387145,0.19563,0.441707,-0.313266,0.089682
RegistrationYear,0.387145,1.0,0.009337,0.082341,-0.168279,0.030813
Gearbox,0.19563,0.009337,1.0,0.357443,0.019841,0.03636
Power,0.441707,0.082341,0.357443,1.0,0.161163,0.0461
Kilometer,-0.313266,-0.168279,0.019841,0.161163,1.0,0.000594
RegistrationMonth,0.089682,0.030813,0.03636,0.0461,0.000594,1.0


Выводы

Пропуски были только в категорийных признаках, поэтому заполнения выполнял модой

Дубликатов внутри признаков найдено не было

Дубликаты объектов: было найдено 5 штук - удалены

Проверена корреляция: все в рамках разумных пределов, наиболее коррелирующим с целевим признаком является признак отвечающий за мощность

Данные были очищены от выбросов, суммарно было удалено 73 145 объектов, что составляет около 20%. Это достаточно большой процент, поэтому в случае низких результатов модели имеет смысл изменить(увеличить) коэффициент при межквартильном размахе, что уменьшит общий объем объектов идентифицирующих как выбросы (при коэффициенте равном 2 сумма количества удаленных объектов уменьшится ~20 000 (приблизительно 5% от изначальной выборки))

In [18]:
frame.duplicated().sum()

30213

In [19]:
frame = frame.drop_duplicates()
frame.duplicated().sum()

0

Разделим и закодируем выборки

In [20]:
frame_ohe = frame.copy()
frame_ohe = pd.get_dummies(frame_ohe)



frame_light = frame.copy()
frame_light = frame_light.astype({"VehicleType":'category',
                                  "Gearbox":'category',
                                  "Model":'category',
                                  "FuelType":'category',
                                  "Brand":'category',
                                  "Repaired":'category'})



In [21]:
frame_ohe

Unnamed: 0,Price,RegistrationYear,Gearbox,Power,Kilometer,RegistrationMonth,VehicleType_bus,VehicleType_convertible,VehicleType_coupe,VehicleType_other,...,Brand_smart,Brand_sonstige_autos,Brand_subaru,Brand_suzuki,Brand_toyota,Brand_trabant,Brand_volkswagen,Brand_volvo,Repaired_no,Repaired_yes
0,480,1993,0.0,101,150000,1,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
2,9800,2004,1.0,163,125000,8,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,1500,2001,0.0,75,150000,6,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
4,3600,2008,0.0,69,90000,7,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
5,650,1995,0.0,102,150000,10,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
354361,5250,2016,1.0,150,150000,12,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
354362,3200,2004,0.0,225,150000,5,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
354366,1199,2000,1.0,101,125000,3,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0
354367,9200,1996,0.0,102,150000,3,1,0,0,0,...,0,0,0,0,0,0,1,0,1,0


In [22]:
frame_light.dtypes


Price                   int64
VehicleType          category
RegistrationYear        int64
Gearbox              category
Power                   int64
Model                category
Kilometer               int64
RegistrationMonth       int64
FuelType             category
Brand                category
Repaired             category
dtype: object

Разделим данные на выборки

In [23]:
frame.columns

Index(['Price', 'VehicleType', 'RegistrationYear', 'Gearbox', 'Power', 'Model',
       'Kilometer', 'RegistrationMonth', 'FuelType', 'Brand', 'Repaired'],
      dtype='object')

In [24]:
#frame.dtypes
frame['Gearbox'] = frame['Gearbox'].astype(object)

## Обучение моделей

OHE - Необходимо для линейной регресии и случайного леса

In [25]:

df_temp3 = frame.copy() # сет данных до OneHotEncoder(OHE/ohe) -кодирования
data_ohe = pd.get_dummies(frame, drop_first = True)
#data_ohe.head()

features_ohe = data_ohe.drop('Price',axis=1)
target_ohe = data_ohe['Price']

x_train_ohe, x_temp, y_train_ohe, y_temp = train_test_split(\
                                                    features_ohe, target_ohe,\
                                                    test_size=.3, random_state=12345)
x_test_ohe, x_valid_ohe, y_test_ohe, y_valid_ohe = train_test_split(\
                                                    x_temp, y_temp,\
                                                    test_size=.5, random_state=12345)

In [26]:
#Названия признаков
categorical_features = ['VehicleType', 'Gearbox', 'Model', 'FuelType', 'Brand', 'Repaired']
kol = list((set(frame.columns)-set(['Price']))-set(categorical_features))

OE - Кодирование категориальных признаков для бустинговых моделей

In [27]:

from sklearn.preprocessing import OrdinalEncoder as enc

frame = frame.reset_index(drop=True)
features = frame.drop('Price',axis=1)
target = frame['Price']

# temp_0 = features[categorical_features]
# enc = OrdinalEncoder(categories = 'auto')
# temp = enc.fit_transform(temp_0)
# enc.categories_
# features = pd.concat([pd.DataFrame(data = temp, columns = temp_0.columns), features[kol]],axis=1)
# pd.DataFrame(data = temp, columns = temp_0.columns)

In [28]:
x_train_oe, x_temp, y_train_oe, y_temp = train_test_split(\
                                                    features, target,\
                                                   test_size=.3, random_state=12345)
x_test_oe, x_valid_oe, y_test_oe, y_valid_oe = train_test_split(\
                                                    x_temp, y_temp,\
                                                    test_size=.5, random_state=12345)

In [29]:

enc = OrdinalEncoder(handle_unknown = "ignore")

enc.fit(x_train_oe[categorical_features])

x_train_oe[categorical_features] = enc.transform(x_train_oe[categorical_features])
x_test_oe[categorical_features] = enc.transform(x_test_oe[categorical_features])
x_valid_oe[categorical_features] = enc.transform(x_valid_oe[categorical_features])

#temp = enc.fit_transform(temp_0)
#enc.categories_
#features = pd.concat([pd.DataFrame(data = temp, columns = temp_0.columns), features[kol]],axis=1)
#pd.DataFrame(data = temp, columns = temp_0.columns)

In [30]:
pd.DataFrame(x_train_oe, columns = features.columns).head()

Unnamed: 0,VehicleType,RegistrationYear,Gearbox,Power,Model,Kilometer,RegistrationMonth,FuelType,Brand,Repaired
214074,4.0,2017,0.0,130,165.0,150000,12,6.0,25.0,0.0
90394,4.0,1997,0.0,101,115.0,150000,1,6.0,38.0,0.0
557,1.0,2003,1.0,192,115.0,150000,8,6.0,2.0,0.0
115418,4.0,2002,0.0,170,115.0,150000,2,6.0,2.0,0.0
53089,4.0,2001,1.0,198,240.0,150000,1,6.0,13.0,0.0


In [31]:
#Проверка размеров
for i in [x_train_oe, x_test_oe, y_train_oe, y_test_oe, x_valid_oe, y_valid_oe]:
    print(i.shape)

(198366, 10)
(42507, 10)
(198366,)
(42507,)
(42508, 10)
(42508,)


Подготовка данных для обучения лин.регресии, деревом и случайным лесом

In [32]:

column_all_ohe = features_ohe.columns

column_all = features.columns

column_all_ohe

Index(['RegistrationYear', 'Power', 'Kilometer', 'RegistrationMonth',
       'VehicleType_convertible', 'VehicleType_coupe', 'VehicleType_other',
       'VehicleType_sedan', 'VehicleType_small', 'VehicleType_suv',
       ...
       'Brand_skoda', 'Brand_smart', 'Brand_sonstige_autos', 'Brand_subaru',
       'Brand_suzuki', 'Brand_toyota', 'Brand_trabant', 'Brand_volkswagen',
       'Brand_volvo', 'Repaired_yes'],
      dtype='object', length=304)

Масштабирование

In [33]:
scaler_ohe = StandardScaler()
x_train_ohe[kol] = scaler_ohe.fit_transform(x_train_ohe[kol])
x_test_ohe[kol] =  scaler_ohe.transform(x_test_ohe[kol][kol])
x_valid_ohe[kol] = scaler_ohe.transform(x_valid_ohe[kol])
x_valid_ohe

Unnamed: 0,RegistrationYear,Power,Kilometer,RegistrationMonth,VehicleType_convertible,VehicleType_coupe,VehicleType_other,VehicleType_sedan,VehicleType_small,VehicleType_suv,...,Brand_skoda,Brand_smart,Brand_sonstige_autos,Brand_subaru,Brand_suzuki,Brand_toyota,Brand_trabant,Brand_volkswagen,Brand_volvo,Repaired_yes
134182,-0.262178,-0.860901,0.552749,-1.090951,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
215109,1.968532,0.885370,0.552749,-1.372799,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
193250,0.295500,-0.744483,0.552749,1.163832,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
264148,0.853177,-0.045975,-1.403596,0.318288,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
145252,0.156080,0.722384,0.552749,-0.245407,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43415,1.271435,-0.255527,-1.683074,0.881984,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
280169,-3.468823,-1.442991,-0.145946,-0.527255,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
170276,0.295500,-0.860901,0.552749,0.881984,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
26768,-0.401597,-0.069258,0.552749,-1.372799,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1


In [34]:

scaler = StandardScaler()
x_train_oe[kol] = scaler.fit_transform(x_train_oe[kol])
x_test_oe[kol] =  scaler.transform(x_test_oe[kol])
x_valid_oe[kol] = scaler.transform(x_valid_oe[kol])
x_valid_oe

Unnamed: 0,VehicleType,RegistrationYear,Gearbox,Power,Model,Kilometer,RegistrationMonth,FuelType,Brand,Repaired
113473,4.0,-0.262178,0.0,-0.860901,115.0,0.552749,-1.090951,6.0,38.0,0.0
178025,4.0,1.968532,0.0,0.885370,11.0,0.552749,-1.372799,6.0,2.0,0.0
160864,5.0,0.295500,0.0,-0.744483,175.0,0.552749,1.163832,6.0,9.0,0.0
215830,0.0,0.853177,0.0,-0.045975,115.0,-1.403596,0.318288,2.0,38.0,0.0
122442,0.0,0.156080,0.0,0.722384,236.0,0.552749,-0.245407,2.0,4.0,0.0
...,...,...,...,...,...,...,...,...,...,...
37662,4.0,1.271435,0.0,-0.255527,148.0,-1.683074,0.881984,6.0,27.0,0.0
228003,5.0,-3.468823,0.0,-1.442991,165.0,-0.145946,-0.527255,6.0,38.0,0.0
142604,5.0,0.295500,0.0,-0.860901,172.0,0.552749,0.881984,2.0,38.0,0.0
23299,4.0,-0.401597,0.0,-0.069258,148.0,0.552749,-1.372799,6.0,27.0,1.0


### Линейная регрессия

In [37]:
%%time
model_linReg = LinearRegression()
model_linReg.fit(x_train_ohe, y_train_ohe)
predict_model_linReg = model_linReg.predict(x_test_ohe)
rmse_model_linReg = mse(y_valid_ohe, predict_valid_linReg)**(0.5)
mae_model_linReg = mean_absolute_error(y_valid_ohe, predict_model_linReg)
print('rmse =', rmse_model_linReg)
print('mae =', mae_model_linReg)

rmse = 2187.29964916146
mae = 1573.831044060865
CPU times: user 28.7 s, sys: 38.6 s, total: 1min 7s
Wall time: 1min 7s


In [40]:
%%time 
print("CV_RMSE", cross_val_score(model_linReg, x_train_ohe, y_train_ohe, cv=5, verbose=0, scoring = 'neg_root_mean_squared_error').mean())

CV_RMSE -2202.992274241244
CPU times: user 1min 42s, sys: 2min 14s, total: 3min 57s
Wall time: 3min 58s


 ### RandomForest 

In [41]:
%%time
model_RandForest_oe = RandomForestRegressor()
model_RandForest_oe.fit(x_train_oe, y_train_oe)
predict_model_RandForest_oe = model_RandForest_oe.predict(x_valid_oe)
rmse_model_RandForest_oe = mse(y_valid_oe, predict_model_RandForest_oe)**(0.5)
mae_model_RandForest_oe = mean_absolute_error(y_valid_oe, predict_model_RandForest_oe)
print('rmse =', rmse_model_RandForest_oe)
print('mae =', mae_model_RandForest_oe)

rmse = 1416.8639890690213
mae = 918.6496661551722
CPU times: user 1min 11s, sys: 1.92 s, total: 1min 13s
Wall time: 1min 13s


In [36]:
%%time
print(cross_val_score(model_RandForest_oe, x_train_oe, y_train_oe, cv=5, verbose=0, scoring = 'neg_root_mean_squared_error').mean())

-1427.7897620271015
CPU times: user 4min 43s, sys: 4.72 s, total: 4min 47s
Wall time: 4min 49s


### SGD regressor

In [40]:
%%time
model_SGD_ohe = SGDRegressor(max_iter=100,)
model_SGD_ohe.fit(x_train_ohe, y_train_ohe)
predict_model_SGD_ohe = model_SGD_ohe.predict(x_valid_ohe)
rmse_model_SGD_ohe = mse(y_valid_ohe, predict_model_SGD_oe)**(0.5)
mae_model_SGD_ohe = mean_absolute_error(y_valid_ohe, predict_model_SGD_ohe)
print('rmse =', rmse_model_SGD_ohe)
print('mae =', mae_model_SGD_ohe)

rmse = 2217.1753948778987
mae = 1595.2806633421444
CPU times: user 29.9 s, sys: 701 ms, total: 30.6 s
Wall time: 30.6 s




In [41]:
%%time
print(cross_val_score(model_SGD_ohe, x_train_ohe, y_train_ohe, cv=5, verbose=0, scoring = 'neg_root_mean_squared_error').mean())



-2208.1327847643847
CPU times: user 1min 59s, sys: 2.01 s, total: 2min 1s
Wall time: 2min 2s




### LightGBM

In [42]:
trainX,testX,trainY,testY = train_test_split(frame_ohe.drop("Price",axis = 1),
                                             frame_ohe["Price"],
                                             test_size = 0.25,
                                             random_state = 42)

trainX_wo_ohe,testX_wo_ohe,trainY_wo_ohe,testY_wo_ohe = train_test_split(frame.drop("Price",axis = 1),
                                             frame["Price"],
                                             test_size = 0.25,
                                             random_state = 42)
(trainX_wo_ohe_light,
 testX_wo_ohe_light,
 trainY_wo_ohe_light,
 testY_wo_ohe_light) = train_test_split(frame_light.drop("Price",axis = 1),
                                             frame_light["Price"],
                                             test_size = 0.25,
                                             random_state = 42)

In [None]:
%%time
# LightGBM
lgb_train = lgb.Dataset(trainX_wo_ohe_light, trainY_wo_ohe_light)
# lgb_test = lgb.Dataset(testX_wo_ohe_light, testY_wo_ohe_light, reference=lgb_train)
parameters_light = {'metric': 'rmse', 'max_depth': 4,"random_state": 42,"learning_rate":0.1}
light = lgb.train(parameters_light,
                lgb_train,
                num_boost_round=1000,
                valid_sets=[lgb_train], # lgb_test],
                verbose_eval=100)




You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 648
[LightGBM] [Info] Number of data points in the train set: 212535, number of used features: 10
[LightGBM] [Info] Start training from score 3782.977872
[100]	training's rmse: 1416.78
[200]	training's rmse: 1367.77


In [42]:
%%time
LightGBM_rmse = mean_squared_error(testY,light.predict(testX_wo_ohe_light),squared=False)
print("LightGBM:",LightGBM_rmse )

LightGBM: 1314.4109024286072
CPU times: user 7.8 s, sys: 0 ns, total: 7.8 s
Wall time: 7.82 s


## Анализ моделей

Исходя из полученных показателей качества: значения метрики RMSE и времени - следует, что LightGBM выполняется по суммарному значению итераций 1 минуту, против самой быстрой модели SGD, которая выполняется менее чем за 1 секунду. Большее количество гиперпараметров использовать не удается, т.к. слишком много времени уходит.

In [49]:
index_table = 0
table = pd.DataFrame(columns=['MODEL',"RMSE",'CV_SCORE RMSE','ВРЕМЯ ОБУЧЕНИЯ','ВРЕМЯ ПРЕДСКАЗАНИЯ'])


index_table +=1
table.loc[index_table] = ['SGD regressor', rmse_model_SGD_oe, cross_val_score(
    model_SGD_oe, x_test, y_test, cv=5, verbose=0, error_score = 
                                                           rmse_model_SGD_oe).mean(), ' 0.728 sec ', "1.12 sec"]

index_table +=1
table.loc[index_table] = ['LightGBM', LightGBM_rmse ,"N\A", '1 min 3 sec','7.82 sec']

index_table +=1
table.loc[index_table] = ['Linear Reg.',rmse_model_linReg, cross_val_score(
    model_linReg, x_test, y_test, cv=5, verbose=0, error_score =
                                                         rmse_model_linReg).mean(), '16.5 sec',' 0.797  sec']

index_table +=1
table.loc[index_table] = ['RandForest',rmse_model_RandForest_oe, cross_val_score(
    model_RandForest_oe, x_test, y_test, cv=5, verbose=0, error_score =
                                                        rmse_model_RandForest_oe).mean(), '1 min 1 sec', "45.6 sec"]

table

Unnamed: 0,MODEL,RMSE,CV_SCORE RMSE,ВРЕМЯ ОБУЧЕНИЯ,ВРЕМЯ ПРЕДСКАЗАНИЯ
1,SGD regressor,2441.825182,0.485622,0.728 sec,1.12 sec
2,LightGBM,1314.410902,N\A,1 min 3 sec,7.82 sec
3,Linear Reg.,2187.299649,0.487162,16.5 sec,0.797 sec
4,RandForest,1430.485876,0.797021,1 min 1 sec,45.6 sec
