In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Load the cars dataset from a path relative to the notebook's working directory.
# NOTE(review): the file name suggests features are already encoded and outliers removed — confirm.
cars = pd.read_csv('../Data sets/cars_encoded_no_outliers.csv')

In [None]:
# Separate the regression target (`price`) from the predictor columns.
y = cars['price']
X = cars.drop(columns='price')

# Натренируем несколько моделей и проверим их перфоманс
1. Linear regression
2. Lasso regression
3. Ridge regression
4. Decision Tree regression
5. Random Forest regression
6. Gradient Boosting regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate

In [None]:
def fit_evaluate(estimator):
    """Cross-validate ``estimator`` on the notebook-level ``X`` and ``y``.

    Runs a 4-fold cross-validation scored with negated RMSE.

    Parameters
    ----------
    estimator
        Regressor object handed to ``cross_validate``.

    Returns
    -------
    tuple
        ``(mean_rmse, fold_estimators)``: the mean of the per-fold test
        scores with the sign flipped back to positive, and the per-fold
        estimators returned by ``cross_validate``.

    Notes
    -----
    Train scores are requested from ``cross_validate`` but are not part of
    the return value.
    """
    cv_options = dict(
        cv=4,
        scoring="neg_root_mean_squared_error",
        return_train_score=True,
        return_estimator=True,
    )
    cv_results = cross_validate(estimator, X, y, **cv_options)
    # The scorer yields negative RMSE, so multiply by -1 to report a positive error.
    mean_rmse = cv_results['test_score'].mean() * -1
    return mean_rmse, cv_results['estimator']

In [None]:
# `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2, so passing
# it raises a TypeError on current versions. It is dropped rather than replaced:
# plain least squares has no penalty term, so rescaling the features does not
# change the fitted predictions (up to numerical effects) and the RMSE stays
# comparable.
linear_regression = LinearRegression(n_jobs=-1)
lin_reg_test_score, lin_reg_tup = fit_evaluate(linear_regression)

In [None]:
# Keep the estimator fitted on the first CV fold. Indexing (instead of
# unpacking into `_`) does not break if the number of folds changes and leaves
# IPython's `_` (last cell output) untouched.
lin_reg = lin_reg_tup[0]

In [None]:
# Mean cross-validated RMSE of the linear regression.
lin_reg_test_score

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
# Fix the seed so the hold-out split, and every score measured on it, is the
# same after "Restart Kernel -> Run All".
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
def grid_search(estimator, param_grid, X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val):
    """Tune ``estimator`` with a 4-fold grid search and score the winner.

    Parameters
    ----------
    estimator
        Regressor to tune.
    param_grid
        Mapping of parameter names to candidate values for ``GridSearchCV``.
    X_train, y_train
        Data the grid search is fitted on.
    X_val, y_val
        Hold-out data used to score the selected model.

    Returns
    -------
    tuple
        ``(best_estimator, val_rmse)``: the search's ``best_estimator_`` and
        its RMSE on ``X_val`` / ``y_val``.

    Notes
    -----
    The default data arguments are bound when this cell is executed. After
    re-creating the train/validation split, re-run this cell as well (or pass
    the data explicitly), otherwise the previous split is used.
    """
    search = GridSearchCV(
        estimator,
        param_grid=param_grid,
        scoring="neg_root_mean_squared_error",
        cv=4,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)

    tuned_model = search.best_estimator_
    val_predictions = tuned_model.predict(X_val)
    val_rmse = np.sqrt(mean_squared_error(y_val, val_predictions))
    return tuned_model, val_rmse

In [None]:
from sklearn.linear_model import Lasso

In [None]:
# Candidate Lasso penalties 0.1, 0.2, ..., 2.9.
# alpha=0 is excluded on purpose: it turns Lasso into unpenalised least
# squares, which scikit-learn advises against fitting with this estimator
# (poor convergence, warnings). That case is covered by LinearRegression.
param_grid = {
    'alpha': np.arange(0.1, 3.0, 0.1)
}

In [None]:
# Tune the Lasso penalty; keep the selected model and its hold-out RMSE.
lasso_reg, lasso_reg_test_score = grid_search(estimator=Lasso(), param_grid=param_grid)

In [None]:
# Hold-out RMSE of the Lasso model selected by the grid search.
lasso_reg_test_score

In [None]:
from sklearn.linear_model import Ridge

In [None]:
# Candidate Ridge penalties 0.1, 0.2, ..., 2.9.
# alpha=0 is excluded on purpose: it is equivalent to ordinary least squares,
# and scikit-learn advises against fitting that case with Ridge for numerical
# reasons. That case is covered by LinearRegression.
param_grid = {
    'alpha': np.arange(0.1, 3.0, 0.1)
}

In [None]:
# Tune the Ridge penalty; keep the selected model and its hold-out RMSE.
ridge_reg, ridge_reg_test_score = grid_search(estimator=Ridge(), param_grid=param_grid)

In [None]:
# Hold-out RMSE of the Ridge model selected by the grid search.
ridge_reg_test_score

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Search space for the decision tree.
# `min_samples_split` is intentionally not searched: every `min_samples_leaf`
# candidate is >= 4, so a node needs at least 8 samples before it can be split
# and the former candidates (3, 5, 7) could never change the fitted tree.
# Leaving them out makes the grid three times smaller with the same models.
param_grid = {
    'max_depth': range(2, 10, 4),
    'min_samples_leaf': range(4, 10, 2),
    'max_leaf_nodes': range(5, 20, 5)
}

In [None]:
# Tune the decision tree; keep the selected model and its hold-out RMSE.
# The seed makes tie-breaking between equally good splits repeatable.
ds_reg, ds_reg_test_res = grid_search(DecisionTreeRegressor(random_state=42), param_grid)

In [None]:
# Hold-out RMSE of the decision tree selected by the grid search.
ds_reg_test_res

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Search space for the random forest.
# `min_samples_split` is intentionally not searched: every `min_samples_leaf`
# candidate is >= 4, so a node needs at least 8 samples before it can be split
# and the former candidates (3, 5, 7) could never change the fitted trees.
# Leaving them out cuts the number of (expensive) forest fits by a factor of 3.
param_grid = {
    'n_estimators': range(1000, 4000, 1000),
    'max_depth': range(2, 10, 4),
    'min_samples_leaf': range(4, 10, 2),
    'max_leaf_nodes': range(5, 20, 5)
}

In [None]:
# Tune the random forest; keep the selected model and its hold-out RMSE.
# The seed fixes the bootstrap samples so the search result is repeatable.
rf_reg, rf_reg_test_res = grid_search(RandomForestRegressor(random_state=42), param_grid)

In [None]:
# Hold-out RMSE of the random forest selected by the grid search.
rf_reg_test_res

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# Search space for gradient boosting.
# `min_samples_split` is intentionally not searched: every `min_samples_leaf`
# candidate is >= 4, so a node needs at least 8 samples before it can be split
# and the former candidates (3, 5, 7) could never change the fitted trees.
# Leaving them out cuts the number of (expensive) boosting fits by a factor of 3.
param_grid = {
    'n_estimators': range(1000, 4000, 1000),
    'max_depth': range(2, 10, 4),
    'min_samples_leaf': range(4, 10, 2),
    'max_leaf_nodes': range(5, 20, 5)
}

In [None]:
# Tune gradient boosting; keep the selected model and its hold-out RMSE.
# The seed makes the fitted ensemble repeatable between runs.
grad_reg, grad_reg_test_res = grid_search(GradientBoostingRegressor(random_state=42), param_grid)

In [None]:
# Hold-out RMSE of the gradient boosting model selected by the grid search.
grad_reg_test_res

In [None]:
# Parallel lists: the score at position i in `res` belongs to the label at
# position i in `names`.
# NOTE(review): the linear regression score is a mean over CV folds, while the
# other scores are measured on the hold-out split.
res = [
    lin_reg_test_score,
    lasso_reg_test_score,
    ridge_reg_test_score,
    rf_reg_test_res,
    grad_reg_test_res,
    ds_reg_test_res,
]
names = [
    "Linear Regressor",
    "Lasso Regressor",
    "Ridge Regressor",
    "Random Forest Regressor",
    "Gradient Boosting Regressor",
    "Decision Tree Regressor",
]

In [None]:
# One row per model: display name and its RMSE.
result_models_df = pd.DataFrame(
    data={'names': names, 'test_score': res},
)

In [None]:
# Bar chart comparing the RMSE of all trained models.
# The Axes gets its own name: `estimators` is used further down for the list
# of models passed to the stacking ensemble, so it is not reused here.
fig, all_scores_ax = plt.subplots(figsize=(20, 10))
sns.barplot(x='names', y='test_score', data=result_models_df, ax=all_scores_ax)
all_scores_ax.set_title("All estimators test score")
all_scores_ax.set_xlabel("Model")
all_scores_ax.set_ylabel("RMSE (lower is better)");

In [None]:
from sklearn.ensemble import StackingRegressor

In [None]:
# Base learners for the stacking ensemble and their display names (same order).
names = [
    "Ridge Regressor",
    "Random Forest Regressor",
    "Gradient Boosting Regressor",
]
estimators = [ridge_reg, rf_reg, grad_reg]

In [None]:
# Pair every display name with its model: a list of (name, estimator) tuples.
estimators_n_name = [(label, model) for label, model in zip(names, estimators)]
estimators_n_name

In [None]:
# Stack the tuned base learners; a 3000-tree random forest combines their
# predictions. The final estimator is seeded so repeated runs give the same
# ensemble and the same reported score.
stacking_reg = StackingRegressor(
    estimators_n_name,
    final_estimator=RandomForestRegressor(n_estimators=3000, random_state=42),
    cv=4,
    n_jobs=-1,
)

In [None]:
# Fit the stacking ensemble on the training split.
stacking_reg.fit(X_train, y_train)

In [None]:
# Hold-out RMSE of the stacked model (square root of the validation MSE).
stacking_reg_test_score = np.sqrt(
    mean_squared_error(y_true=y_val, y_pred=stacking_reg.predict(X_val))
)
stacking_reg_test_score

In [None]:
# Hold-out RMSE of the three base learners followed by the stacked model.
res = [
    ridge_reg_test_score,
    rf_reg_test_res,
    grad_reg_test_res,
    stacking_reg_test_score,
]

In [None]:
# Rebuild the label list instead of appending in place, so re-running this
# cell cannot add "Stacking Regressor" twice and break the one-to-one match
# with `res`.
names = [name for name, _ in estimators_n_name] + ["Stacking Regressor"]

In [None]:
# One row per model (base learners + stacking) with its hold-out RMSE.
result_models_df = pd.DataFrame(data={'names': names, 'test_score': res})

# Bar chart of those scores.
plt.figure(figsize=(20, 10))
best_estimators_score = sns.barplot(data=result_models_df, x='names', y='test_score')
best_estimators_score.set_title("Best estimators test scores");

## Выведем свойство `feature_importances_` у моделей
Это поможет нам понять, какие фичи важнее остальных.

In [None]:
# Feature importances of the tuned random forest, one row per feature.
models_perfomance = pd.DataFrame(
    {
        "Name": X.columns,
        "Value": rf_reg.feature_importances_,
        "Model_Name": "Random Forest Regressor",
    }
)

In [None]:
# Feature importances of the tuned gradient boosting model, one row per feature.
grad_reg_perfomance = pd.DataFrame(
    {
        "Name": X.columns,
        "Value": grad_reg.feature_importances_,
        "Model_Name": "Gradient Boosting Regressor",
    }
)

In [None]:
# Stack both importance tables into one long frame for plotting.
# `DataFrame.append` was removed in pandas 2.0, so `pd.concat` is used instead.
# Rows of a model that is about to be added are filtered out first, so
# re-running this cell does not duplicate the gradient boosting rows.
already_added = models_perfomance["Model_Name"].isin(grad_reg_perfomance["Model_Name"])
models_perfomance = pd.concat(
    [models_perfomance[~already_added], grad_reg_perfomance],
    ignore_index=True,
)

In [None]:
# Grouped bar chart: one group per feature, one bar per model.
plt.figure(figsize=(20, 10))
grad_reg_perfomance_bar_plot = sns.barplot(
    data=models_perfomance, x='Name', y='Value', hue='Model_Name'
)
# Re-apply the feature names as tick labels, rotated so they do not overlap.
feature_labels = models_perfomance['Name'].unique()
grad_reg_perfomance_bar_plot.set_xticklabels(feature_labels, rotation=30)
grad_reg_perfomance_bar_plot.set_title("Gradient Boosting Regressor and Random Forest Regressor Feature Importance");

### Наиболее важными фичами для моделей были:
1. Кол-во лошадиных сил
2. Год выпуска
3. Пробег
4. Сегмент
5. Тип двигателя