In [218]:
# С catboost может возникнуть ошибка, потому что его нужно предвариетльно загружать. 
# pip install catboost
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

In [219]:
import warnings
warnings.filterwarnings('ignore')

In [220]:
import pandas as pd
import numpy as np
from sklearn import metrics

In [221]:
data_table = pd.read_csv('./price_prediction.csv')
data_table.head()

Unnamed: 0,name,manufacturer,year,age,mileage,engine,transmission,price
0,Mazda MX5,Mazda,2007,14,63131,Petrol,Manual,7499
1,Jaguar XF,Jaguar,2010,11,61890,Petrol,Automatic,7775
2,Audi A6,Audi,2012,9,129170,Diesel,Automatic,6950
3,Nissan Qashqai,Nissan,2013,8,44900,Petrol,Automatic,7790
4,MINI Mini,Mini,2017,4,32012,Petrol,Manual,15999


In [222]:
#drop the name and year columns because it is irrelevant in our model building
data_table = data_table.drop(['name', 'year'], axis=1)

In [223]:
data_table.head()

Unnamed: 0,manufacturer,age,mileage,engine,transmission,price
0,Mazda,14,63131,Petrol,Manual,7499
1,Jaguar,11,61890,Petrol,Automatic,7775
2,Audi,9,129170,Diesel,Automatic,6950
3,Nissan,8,44900,Petrol,Automatic,7790
4,Mini,4,32012,Petrol,Manual,15999


In [224]:
data_table['manufacturer'].unique()

array(['Mazda', 'Jaguar', 'Audi', 'Nissan', 'Mini', 'Mercedes-Benz',
       'Volkswagen', 'Vauxhall', 'Toyota', 'Skoda', 'Ford', 'Chevrolet',
       'BMW', 'Suzuki', 'Renault', 'Peugeot', 'Citroen', 'Volvo', 'Fiat',
       'DS', 'Dacia', 'Abarth', 'Smart', 'Seat', 'Mitsubishi', 'MG',
       'Lexus', 'Land-Rover', 'Kia', 'Jeep', 'Hyundai', 'Honda',
       'Maserati', 'Subaru', 'Porsche', 'Infiniti', 'Bentley',
       'Alfa-Romero', 'Chrysler', 'Isuzu'], dtype=object)

In [225]:
# Кодируем категориальные переменные
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, accuracy_score, mean_squared_error
carBrand = LabelEncoder()
fuelType = LabelEncoder()
transmissionType = LabelEncoder()

In [226]:
data_table['manufacturer'] = carBrand.fit_transform(data_table['manufacturer'])
data_table['engine'] = fuelType.fit_transform(data_table['engine'])
data_table['transmission'] = transmissionType.fit_transform(data_table['transmission'])

In [227]:
data_table.head(10)

Unnamed: 0,manufacturer,age,mileage,engine,transmission,price
0,23,14,63131,3,1,7499
1,16,11,61890,3,0,7775
2,2,9,129170,0,0,6950
3,27,8,44900,3,0,7790
4,25,4,32012,3,1,15999
5,24,6,33050,0,0,10995
6,38,10,62000,0,1,7250
7,37,7,44000,3,1,5990
8,36,8,172000,2,0,6290
9,32,9,74000,0,1,4450


In [228]:
X = data_table.drop('price', axis=1)

# Логарифмируем цену
y = np.log(data_table['price'])

In [229]:
# Делим выборку на обучающую и тестовую
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=40)

In [230]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2239, 5), (747, 5), (2239,), (747,))

In [231]:
# Нормализуем значения в X_train и X_test с использованием MinMaxScaler
from sklearn.preprocessing import MinMaxScaler
norm = MinMaxScaler().fit(X_train)

X_train = norm.transform(X_train)
X_test = norm.transform(X_test)

X_train
X_test

array([[0.71794872, 0.02083333, 0.01571282, 0.75      , 1.        ],
       [0.28205128, 0.0625    , 0.1102987 , 0.75      , 0.5       ],
       [0.30769231, 0.14583333, 0.24122082, 0.75      , 0.5       ],
       ...,
       [0.97435897, 0.20833333, 0.95392929, 0.        , 0.        ],
       [0.94871795, 0.125     , 0.0269866 , 0.75      , 0.5       ],
       [0.92307692, 0.10416667, 0.02454756, 0.75      , 0.        ]])

In [232]:
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
# Задаем список анализируемых моделей
models = {
    'linear': LinearRegression(),
    'ridge': Ridge(random_state=123),
    'extree':ExtraTreesRegressor(random_state = 123),
    'lgbm':LGBMRegressor(random_state = 123),
    'xgboost':XGBRegressor(random_state = 123),
    'rf' : RandomForestRegressor(random_state = 123)
 }

In [257]:
def test_models(models: dict) -> pd.DataFrame:

    test_table = {}
    names, train_scores, r2s, maes, rmses = [], [], [], [], []
    for name, evaluation in models.items():
        # Обучаем
        evaluation.fit(X_train, y_train)

        # Делаем предсказание
        y_pred = evaluation.predict(X_test)

        ts_metric = evaluation.score(X_train, y_train)
        r2_metric = metrics.r2_score(y_test, y_pred)
        mae_metric = metrics.mean_absolute_error(y_test, y_pred)
        mse_metric = metrics.mean_squared_error(y_test, y_pred)
        rmse_metric = np.sqrt(mse_metric)

        names.append(name)
        train_scores.append(ts_metric)
        r2s.append(r2_metric)
        maes.append(mae_metric)
        rmses.append(rmse_metric)

    test_table["model"] = names
    test_table["train_score"] = train_scores
    test_table["r2"] = r2s
    test_table["mae"] = maes
    test_table["rmse"] = rmses

    result_table = pd.DataFrame(test_table)
    result_table = result_table.sort_values("rmse")
    return result_table
    

In [258]:
test_models(models)

Unnamed: 0,model,train_score,r2,mae,rmse
3,lgbm,0.868353,0.797178,0.186938,0.251137
4,xgboost,0.949535,0.769511,0.197808,0.267718
5,rf,0.961061,0.746753,0.214018,0.280624
2,extree,0.999987,0.717887,0.227945,0.296186
0,linear,0.552253,0.597345,0.278478,0.35385
1,ridge,0.549262,0.582249,0.285156,0.360422


In [259]:
# Сетка параметров для GridSearchCV
grid_var = {'max_depth': [3,4,5],'n_estimators':[100, 200, 300]}

model = GridSearchCV (estimator = LGBMRegressor(random_state = 123), param_grid = grid_var, scoring ='neg_root_mean_squared_error', cv = 5)

In [260]:
model.fit(X_train,y_train, verbose = False)
#model.fit(X_train,y_train)

In [261]:
# Делаем предсказание на тест. выборке
y_pred = model.predict(X_test)

In [262]:
# Таблица с метриками модели, подобранной с помощью GridSearchCV
grid_model = pd.DataFrame({
    'model': ['LGBM'],
    'r2': [metrics.r2_score(y_test, y_pred)],
    'mae': [mean_absolute_error(y_test, y_pred)],
    'rmse': [np.sqrt(metrics.mean_squared_error(y_test, y_pred))]
    })
grid_model

Unnamed: 0,model,r2,mae,rmse
0,LGBM,0.805569,0.189618,0.245887


In [263]:
# Пример предсказания на тестовых данных
test_sample = [['Mini', 4, 32012, 'Petrol', 'Manual']]
test_sample = np.array(test_sample)
test_sample

array([['Mini', '4', '32012', 'Petrol', 'Manual']], dtype='<U21')

In [264]:
test_sample[:,0] = carBrand.transform(test_sample[:,0])
test_sample[:,3] = fuelType.transform(test_sample[:,3])
test_sample[:,4] = transmissionType.transform(test_sample[:,4])
test_sample

array([['25', '4', '32012', '3', '1']], dtype='<U21')

In [265]:
test_sample = norm.transform(test_sample)
test_sample

array([[0.64102564, 0.08333333, 0.1735023 , 0.75      , 0.5       ]])

In [266]:
price = model.predict(test_sample)
coins_price = np.exp(price) + 1
coins_price = round(coins_price[0])
coins_price

11192

In [267]:
# Сохраняем модель в формате pickle, чтобы получить доступ к ней с сервера
import pickle
data = {"model": model, "normalization": norm}
with open('./price_prediction.pkl', 'wb') as file:
    pickle.dump(data, file)