In [None]:
import joblib

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error

In [None]:
from xgboost import XGBRegressor

In [None]:
from scipy.stats import uniform
from scipy.stats import randint


In [None]:
# Read the dataset from CSV, using the file's first column as the row index.
df = pd.read_csv('data3_0505.csv', index_col=0)

In [None]:
# Separate the regression target (the 'price' column) from the feature matrix
# (every other column of df).
y = df['price']
X = df.drop(columns=['price'])

### Training/Test Set Split and MinMaxScaler

In [None]:
# Hold out 25% of the rows as the test set; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=7633)

In [None]:
# Fit the min-max scaler on the training split only, then echo the fitted
# scaler as the cell's output.
scaler = MinMaxScaler().fit(X_train)
scaler

MinMaxScaler()

In [None]:
# Rescale both splits with the scaler fitted on the training data, keeping each
# split's original row index. Column labels are not carried over: the rebuilt
# frames get default integer column names.
# NOTE(review): this rebinds X_train / X_test to their scaled versions, so
# running this cell a second time would scale already-scaled data.
X_train, X_test = (
    pd.DataFrame(scaler.transform(split), index=split.index)
    for split in (X_train, X_test)
)

#### Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Baseline: ordinary least squares fitted on the log of the price.
model_lr = LinearRegression().fit(X_train, np.log(y_train))

# The model predicts in log space; exponentiate to get back to price units.
pred_train = np.exp(model_lr.predict(X_train))
pred_test = np.exp(model_lr.predict(X_test))

# r-squared is scored against the log target the model was fitted on, while
# RMSE and MAPE compare the back-transformed predictions with the raw prices.
report = [
    ('training set r-squared: ', model_lr.score(X_train, np.log(y_train))),
    ('test set r-squared:     ', model_lr.score(X_test, np.log(y_test))),
    ('training set rmse:      ', np.sqrt(mean_squared_error(y_train, pred_train))),
    ('test set rmse:          ', np.sqrt(mean_squared_error(y_test, pred_test))),
    ('training set mape:      ', mean_absolute_percentage_error(y_train, pred_train)),
    ('test set mape:          ', mean_absolute_percentage_error(y_test, pred_test)),
]
for label, value in report:
    print(label, value)

training set r-squared:  0.8570159254983258
test set r-squared:      0.8641137742202539
training set rmse:       158393.27341934544
test set rmse:           227402.86456328136
training set mape:       0.15249827480967895
test set mape:           0.14902217604324036


#### RandomizedSearchCV SVR

In [None]:
from sklearn.svm import SVR

In [None]:
# Candidate values for the search: C and gamma each take the seven powers of
# ten from 1e-3 to 1e3.
param_grid_svr = {
    'C': np.logspace(-3, 3, 7),
    'gamma': np.logspace(-3, 3, 7),
}

# RBF-kernel SVR whose C and gamma are tuned below.
model_svr = SVR(kernel='rbf')

# Sample 10 (C, gamma) combinations and score each with 5-fold
# cross-validation on all available cores; random_state fixes the sampling.
random_search_svr = RandomizedSearchCV(
    model_svr,
    param_grid_svr,
    n_iter=10,
    cv=5,
    n_jobs=-1,
    random_state=42,
)

# Run the search against the log of the price.
random_search_svr.fit(X_train, np.log(y_train))

# Mean cross-validated score of the winning combination.
print("Best score: ", random_search_svr.best_score_)

# Winning SVR, as refitted by the search.
best_svr_log = random_search_svr.best_estimator_

# The model predicts in log space; exponentiate to get back to price units.
pred_train = np.exp(best_svr_log.predict(X_train))
pred_test = np.exp(best_svr_log.predict(X_test))

# r-squared is scored on the log target; RMSE and MAPE use raw prices.
report = [
    ('training set r-squared: ', best_svr_log.score(X_train, np.log(y_train))),
    ('test set r-squared:     ', best_svr_log.score(X_test, np.log(y_test))),
    ('training set rmse:      ', np.sqrt(mean_squared_error(y_train, pred_train))),
    ('test set rmse:          ', np.sqrt(mean_squared_error(y_test, pred_test))),
    ('training set mape:      ', mean_absolute_percentage_error(y_train, pred_train)),
    ('test set mape:          ', mean_absolute_percentage_error(y_test, pred_test)),
]
for label, value in report:
    print(label, value)

# Persisting the tuned SVR is disabled; uncomment to write it to disk.
# joblib.dump(best_svr_log, 'best_svr_log_0505.joblib')

Best score:  0.8922618494995372
training set r-squared:  0.9247470135646605
test set r-squared:      0.9024839395704978
training set rmse:       85921.355889983
test set rmse:           131303.7090388297
training set mape:       0.1071762330973566
test set mape:           0.11947261438914024


['best_svr_log_0505.joblib']

In [None]:
# Show the (C, gamma) combination selected by the randomized search.
# NOTE(review): the recorded output reports C=1000.0, the largest value in the
# searched range, so a wider range for C may be worth trying.
print(random_search_svr.best_params_)

{'gamma': 0.1, 'C': 1000.0}


In [None]:
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import StackingRegressor

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Load two previously saved base models from disk. These files are not created
# anywhere in the visible part of this notebook, so they must already exist in
# the working directory.
# SECURITY: joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts that come from a trusted source.
best_xgb = joblib.load('best_xgb_log_0506_9195.joblib')
best_mlp = joblib.load('best_mlp_0507.joblib')

In [None]:
# (name, estimator) pairs used as the base learners of the stacking ensemble:
# the two models loaded from disk plus the SVR tuned earlier in this notebook.
best_models = [('xgb', best_xgb), ('mlp', best_mlp), ('svr', best_svr_log)]

In [None]:
# Stacking ensemble: the base learners in best_models feed a Ridge
# meta-regressor, which is trained on their 5-fold cross-validated predictions.
stack = StackingRegressor(estimators=best_models, final_estimator=Ridge(), cv=5)

In [None]:
# Regularisation strengths to try for the Ridge meta-regressor; the
# 'final_estimator__' prefix routes the parameter to the stack's final estimator.
param_grid = {'final_estimator__alpha': [0.01, 0.1, 1, 10]}

In [None]:
# Exhaustive search over param_grid, scoring each candidate with 5-fold
# cross-validation on all available cores. The estimator being searched is the
# whole stack, so every candidate/fold fit refits the stacking ensemble.
stack_search = GridSearchCV(stack, param_grid, cv=5, n_jobs=-1)


In [None]:
# Run the grid search (and with it the stacking ensemble) on the log of the price.
# NOTE(review): the recorded output shows repeated lbfgs "iterations reached
# limit" warnings during this fit, presumably from the MLP base learner;
# consider raising its max_iter and confirming it converges.
stack_search.fit(X_train, np.log(y_train))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

GridSearchCV(cv=5,
             estimator=StackingRegressor(cv=5,
                                         estimators=[('xgb',
                                                      XGBRegressor(base_score=0.5,
                                                                   booster='gbtree',
                                                                   callbacks=None,
                                                                   colsample_bylevel=1,
                                                                   colsample_bynode=1,
                                                                   colsample_bytree=0.3,
                                                                   early_stopping_rounds=None,
                                                                   enable_categorical=False,
                                                                   eval_metric=None,
                                                                   feature_types=None

In [None]:
# Show the Ridge alpha selected by the grid search.
print(stack_search.best_params_)

{'final_estimator__alpha': 0.1}


In [None]:
# Evaluate the best stack found by the grid search.
tuned_stack = stack_search.best_estimator_

# The stack predicts in log space; exponentiate to get back to price units.
pred_train = np.exp(tuned_stack.predict(X_train))
pred_test = np.exp(tuned_stack.predict(X_test))

# r-squared is scored on the log target; RMSE and MAPE use raw prices.
report = [
    ('training set r-squared: ', tuned_stack.score(X_train, np.log(y_train))),
    ('test set r-squared:     ', tuned_stack.score(X_test, np.log(y_test))),
    ('training set rmse:      ', np.sqrt(mean_squared_error(y_train, pred_train))),
    ('test set rmse:          ', np.sqrt(mean_squared_error(y_test, pred_test))),
    ('training set mape:      ', mean_absolute_percentage_error(y_train, pred_train)),
    ('test set mape:          ', mean_absolute_percentage_error(y_test, pred_test)),
]
for label, value in report:
    print(label, value)

training set r-squared:  0.951564749493026
test set r-squared:      0.9198386903336616
training set rmse:       72649.74741942894
test set rmse:           117446.71732378469
training set mape:       0.0839570871730442
test set mape:           0.10687460220777367


In [None]:
# get the best StackingRegressor
best_stack = stack_search.best_estimator_

In [None]:
# Saving the tuned stack is disabled (the recorded output shows it was run at
# least once); uncomment to write the model to disk again.
# joblib.dump(best_stack, 'best_stack_0507_9198+1069.joblib')

['best_stack_0507_9198+1069.joblib']

In [None]:
# Evaluate best_stack on both splits.
# NOTE(review): best_stack is the same object as stack_search.best_estimator_,
# which an earlier cell already scores, so this repeats that evaluation. The
# recorded numbers differ slightly, which suggests the outputs come from
# different runs.

# The stack predicts in log space; exponentiate to get back to price units.
pred_train = np.exp(best_stack.predict(X_train))
pred_test = np.exp(best_stack.predict(X_test))

# r-squared is scored on the log target; RMSE and MAPE use raw prices.
report = [
    ('training set r-squared: ', best_stack.score(X_train, np.log(y_train))),
    ('test set r-squared:     ', best_stack.score(X_test, np.log(y_test))),
    ('training set rmse:      ', np.sqrt(mean_squared_error(y_train, pred_train))),
    ('test set rmse:          ', np.sqrt(mean_squared_error(y_test, pred_test))),
    ('training set mape:      ', mean_absolute_percentage_error(y_train, pred_train)),
    ('test set mape:          ', mean_absolute_percentage_error(y_test, pred_test)),
]
for label, value in report:
    print(label, value)

training set r-squared:  0.9481188577013246
test set r-squared:      0.918874459100527
training set rmse:       72450.97793478268
test set rmse:           117162.33131669447
training set mape:       0.0872553418310514
test set mape:           0.10764618386715775


In [None]:
# Saving the stack under a second filename is disabled; uncomment to write it.
# NOTE(review): this filename differs from the one in the earlier commented-out
# dump of best_stack; confirm which artifact name is intended.
# joblib.dump(best_stack, 'best_stack_0505.joblib')