In [9]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor 
from sklearn import datasets
from sklearn import tree
from matplotlib import pyplot as plt
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from math import sqrt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor

In [10]:
df = pd.read_csv('../data/dataset_regression.csv')
df.drop(['Unnamed: 0'], axis=1, inplace=True)
df.isnull().sum().sum()

0

In [11]:
y = df["price_usd"]
X = df.drop(["price_usd"], axis=1)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3)

In [19]:
gradient_boosting_regressor_optimal_params = RandomizedSearchCV(GradientBoostingRegressor(),
                                                                 {
                                                                    'loss': ['squared_error', 'huber', 'absolute_error', 'quantile'],
                                                                    'n_estimators': np.arange(50, 200),
                                                                    'criterion': ['friedman_mse', 'squared_error'],
                                                                    'min_samples_split': np.arange(2, 10),
                                                                    'min_samples_leaf': np.arange(1, 9),
                                                                    'min_weight_fraction_leaf': np.arange(0, 0.5, 0.01),
                                                                    'max_depth': np.arange(3, 4),
                                                                    'alpha': np.arange(0, 1, 0.01)
                                                                 },
                                                                 n_iter=4,
                                                                 n_jobs=4,
                                                                 random_state=71
                                                                 ).fit(X_train, y_train)
gradient_boosting_regressor_optimal_params.best_params_

{'n_estimators': 161,
 'min_weight_fraction_leaf': 0.08,
 'min_samples_split': 3,
 'min_samples_leaf': 1,
 'max_depth': 3,
 'loss': 'absolute_error',
 'criterion': 'squared_error',
 'alpha': 0.67}

In [20]:
y_predict = gradient_boosting_regressor_optimal_params.predict(X_test)
y_predict

array([ 2306.87022547,  7364.53101347, 13109.68112526, ...,
        2193.25764468, 12275.07748275, 17219.00001329])

In [21]:
errors = {'MAE': mean_absolute_error(y_predict, y_test), 
          'RMSE': mean_squared_error(y_predict, y_test),
          'MSE': mean_squared_error(y_predict, y_test)**0.5,
          'MAPE': mean_absolute_percentage_error(y_predict, y_test),
          'R^2': gradient_boosting_regressor_optimal_params.score(X_test, y_test)}
errors

{'MAE': 1499.030318994498,
 'RMSE': 8913522.750792518,
 'MSE': 2985.55233596608,
 'MAPE': 0.28643815732960204,
 'R^2': 0.7828987429953077}