In [None]:
import joblib
import numpy as np
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error
from xgboost import XGBRegressor

In [None]:
from scipy.stats import uniform
from scipy.stats import randint


In [None]:
# Load the prepared dataset; the first CSV column becomes the row index.
df = pd.read_csv(
    'data3_0505.csv',
    index_col=0,
)

In [None]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21613 entries, 0 to 21612
Data columns (total 30 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   price           21613 non-null  float64
 1   bedrooms        21613 non-null  int64  
 2   bathrooms       21613 non-null  float64
 3   sqft_living     21613 non-null  int64  
 4   sqft_lot        21613 non-null  int64  
 5   floors          21613 non-null  float64
 6   waterfront      21613 non-null  int64  
 7   view            21613 non-null  int64  
 8   condition       21613 non-null  int64  
 9   grade           21613 non-null  int64  
 10  sqft_above      21613 non-null  int64  
 11  sqft_basement   21613 non-null  int64  
 12  yr_built        21613 non-null  int64  
 13  yr_renovated    21613 non-null  int64  
 14  lat             21613 non-null  float64
 15  long            21613 non-null  float64
 16  sqft_living15   21613 non-null  int64  
 17  sqft_lot15      21613 non-null 

In [None]:
# Separate the regression target (price) from the predictor columns.
y = df['price']
X = df.drop(columns=['price'])

### Training/Test Set Split and MinMaxScaler

In [None]:
# Hold out 25% of the rows as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=7633,
)

In [None]:
# Fit the min-max scaler on the training rows only, so the test rows do not
# influence the learned feature ranges. The trailing expression displays the
# fitted scaler, as the original `scaler.fit(...)` call did.
scaler = MinMaxScaler().fit(X_train)
scaler

MinMaxScaler()

In [None]:
# Rescale both splits with the scaler fitted on the training data and wrap the
# results in DataFrames that keep the original row index. Column labels are not
# passed through, so the scaled frames get default integer column names.
# NOTE: X_train / X_test are overwritten, so re-running this cell without
# re-running the split would scale data that has already been scaled.
X_train, X_test = (
    pd.DataFrame(scaler.transform(split), index=split.index)
    for split in (X_train, X_test)
)

## RandomizedSearchCV XGB

In [None]:
# Discrete candidate values sampled by the randomized searches over XGBRegressor.
# colsample_bytree runs from 0.1 to 1.0 in steps of 0.1; because it is built with
# a floating-point arange, some entries carry rounding noise (e.g. 0.30000000000000004).
xgb_param_dist = dict(
    learning_rate=[0.03, 0.04, 0.05, 0.06, 0.07],
    max_depth=[5, 6, 7],
    n_estimators=[400, 500, 600],
    colsample_bytree=np.arange(0.1, 1.1, 0.1),
    reg_alpha=[1, 3, 5],
    reg_lambda=[3, 5, 7],
)

In [None]:
# XGBoost regressor with library-default settings; it is the base estimator
# handed to the randomized searches (model_search and model_search_log).
model = XGBRegressor()

In [None]:
# Randomized hyper-parameter search for the XGBoost regressor on the raw price
# target, scored with 5-fold cross-validation on all available cores.
# n_iter=10 is RandomizedSearchCV's default, spelled out to match the other
# searches in this notebook. random_state=42 (the seed those searches use) fixes
# which candidates are drawn from xgb_param_dist, so best_params_ and the
# reported metrics are reproducible across runs.
model_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=xgb_param_dist,
    n_iter=10,
    cv=5,
    n_jobs=-1,
    random_state=42,
)

In [None]:
# Run the search on the scaled training features against the untransformed price target.
model_search.fit(X_train, y_train)

RandomizedSearchCV(cv=5,
                   estimator=XGBRegressor(base_score=None, booster=None,
                                          callbacks=None,
                                          colsample_bylevel=None,
                                          colsample_bynode=None,
                                          colsample_bytree=None,
                                          early_stopping_rounds=None,
                                          enable_categorical=False,
                                          eval_metric=None, feature_types=None,
                                          gamma=None, gpu_id=None,
                                          grow_policy=None,
                                          importance_type=None,
                                          interaction_constraints=None,
                                          learning_rate=...
                                          min_child_weight=None, missing=nan,
                             

In [None]:
# Show the hyper-parameter combination with the best cross-validated score.
print(model_search.best_params_)

{'reg_lambda': 5, 'reg_alpha': 3, 'n_estimators': 500, 'max_depth': 6, 'learning_rate': 0.06, 'colsample_bytree': 0.30000000000000004}


In [None]:
# Keep the estimator the search selected with the winning parameters.
best_xgb = model_search.best_estimator_

In [None]:
# Predictions on both splits, reused by the RMSE and MAPE figures below.
pred_train = best_xgb.predict(X_train)
pred_test = best_xgb.predict(X_test)

# Gather every (label, value) pair, then print them in the fixed order
# R-squared, RMSE, MAPE -- training split before test split each time.
xgb_metrics = [
    ('training set r-squared: ', best_xgb.score(X_train, y_train)),
    ('test set r-squared:     ', best_xgb.score(X_test, y_test)),
    ('training set rmse:      ', np.sqrt(mean_squared_error(y_train, pred_train))),
    ('test set rmse:          ', np.sqrt(mean_squared_error(y_test, pred_test))),
    ('training set mape:      ', mean_absolute_percentage_error(y_train, pred_train)),
    ('test set mape:          ', mean_absolute_percentage_error(y_test, pred_test)),
]
for label, value in xgb_metrics:
    print(label, value)

training set r-squared:  0.9742728269817792
test set r-squared:      0.9093028511552357
training set rmse:       58449.9822487445
test set rmse:           112973.15818153936
training set mape:       0.08862337699680914
test set mape:           0.11299003587536398


In [None]:
# Second randomized search over the same XGBoost parameter space, this time for
# the log-transformed price target (see the fit call that follows).
# n_iter=10 is RandomizedSearchCV's default, written out to match the other
# searches in this notebook. random_state=42 (the seed those searches use) fixes
# which candidates are sampled, so the selected parameters are reproducible.
model_search_log = RandomizedSearchCV(
    estimator=model,
    param_distributions=xgb_param_dist,
    n_iter=10,
    cv=5,
    n_jobs=-1,
    random_state=42,
)

In [None]:
# Fit against log(price); predictions from this search are therefore in log space.
model_search_log.fit(X_train, np.log(y_train))

RandomizedSearchCV(cv=5,
                   estimator=XGBRegressor(base_score=None, booster=None,
                                          callbacks=None,
                                          colsample_bylevel=None,
                                          colsample_bynode=None,
                                          colsample_bytree=None,
                                          early_stopping_rounds=None,
                                          enable_categorical=False,
                                          eval_metric=None, feature_types=None,
                                          gamma=None, gpu_id=None,
                                          grow_policy=None,
                                          importance_type=None,
                                          interaction_constraints=None,
                                          learning_rate=...
                                          min_child_weight=None, missing=nan,
                             

In [None]:
# Show the best hyper-parameters found for the log-target model.
print(model_search_log.best_params_)

{'reg_lambda': 5, 'reg_alpha': 1, 'n_estimators': 500, 'max_depth': 7, 'learning_rate': 0.04, 'colsample_bytree': 0.5}


In [None]:
# Keep the best estimator from the log-target search; it predicts log(price).
best_xgb_log = model_search_log.best_estimator_

In [None]:
# Undo the log transform so RMSE and MAPE are measured on the original price scale.
pred_train = np.exp(best_xgb_log.predict(X_train))
pred_test = np.exp(best_xgb_log.predict(X_test))

# R-squared is scored against log(price), i.e. on the scale the model was
# trained on, while RMSE and MAPE use the back-transformed predictions above.
xgb_log_metrics = [
    ('training set r-squared: ', best_xgb_log.score(X_train, np.log(y_train))),
    ('test set r-squared:     ', best_xgb_log.score(X_test, np.log(y_test))),
    ('training set rmse:      ', np.sqrt(mean_squared_error(y_train, pred_train))),
    ('test set rmse:          ', np.sqrt(mean_squared_error(y_test, pred_test))),
    ('training set mape:      ', mean_absolute_percentage_error(y_train, pred_train)),
    ('test set mape:          ', mean_absolute_percentage_error(y_test, pred_test)),
]
for label, value in xgb_log_metrics:
    print(label, value)

training set r-squared:  0.9514713053000792
test set r-squared:      0.9181730567603775
training set rmse:       72355.35138612155
test set rmse:           121580.84223753597
training set mape:       0.08433775819376059
test set mape:           0.10783065960394303


In [None]:
# Persist the log-target XGBoost model. It was fitted on log(price), so its
# predictions must be passed through np.exp to get prices back.
joblib.dump(best_xgb_log, 'best_xgb_log_0505.joblib')

['best_xgb_log_0505.joblib']

### KernelRidge

In [None]:
from sklearn.kernel_ridge import KernelRidge

In [None]:
# Search space: seven log-spaced values from 1e-3 to 1e3 for both alpha and gamma.
param_grid_KR = dict(
    alpha=np.logspace(-3, 3, 7),
    gamma=np.logspace(-3, 3, 7),
)

# Kernel ridge regressor using an RBF kernel.
model_KR = KernelRidge(kernel='rbf')

# Evaluate 10 sampled settings with 5-fold cross-validation on all cores;
# the fixed seed makes the sampled candidates repeatable.
random_search_KR = RandomizedSearchCV(
    estimator=model_KR,
    param_distributions=param_grid_KR,
    n_iter=10,
    cv=5,
    n_jobs=-1,
    random_state=42,
)

# The target is log(price), so the model is trained in log space.
random_search_KR.fit(X_train, np.log(y_train))

print("Best hyperparameters: ", random_search_KR.best_params_)
print("Best score: ", random_search_KR.best_score_)

best_KernelRidge_log = random_search_KR.best_estimator_

# Back-transform predictions so RMSE and MAPE are on the original price scale.
pred_train = np.exp(best_KernelRidge_log.predict(X_train))
pred_test = np.exp(best_KernelRidge_log.predict(X_test))

# R-squared is scored against log(price); the other metrics use the
# back-transformed predictions.
kr_metrics = [
    ('training set r-squared: ', best_KernelRidge_log.score(X_train, np.log(y_train))),
    ('test set r-squared:     ', best_KernelRidge_log.score(X_test, np.log(y_test))),
    ('training set rmse:      ', np.sqrt(mean_squared_error(y_train, pred_train))),
    ('test set rmse:          ', np.sqrt(mean_squared_error(y_test, pred_test))),
    ('training set mape:      ', mean_absolute_percentage_error(y_train, pred_train)),
    ('test set mape:          ', mean_absolute_percentage_error(y_test, pred_test)),
]
for label, value in kr_metrics:
    print(label, value)

# Save the fitted log-target model; the returned file list is the cell output.
joblib.dump(best_KernelRidge_log, 'best_kr_log_0505.joblib')

Best hyperparameters:  {'gamma': 1.0, 'alpha': 0.1}
Best score:  0.5025412902064414
training set r-squared:  -2.192623329720447
test set r-squared:      -2.083761174785212
training set rmse:       142951.6879019184
test set rmse:           371429.76996746095
training set mape:       0.1295081037707068
test set mape:           0.16279750183436106


['best_kr_log_0505.joblib']

In [None]:
# Report the kernel ridge model's R-squared on the log(price) scale for both splits.
for label, features, target in (
    ('training set r-squared: ', X_train, y_train),
    ('test set r-squared:     ', X_test, y_test),
):
    print(label, best_KernelRidge_log.score(features, np.log(target)))

training set r-squared:  0.8933392320316527
test set r-squared:      0.5154397140143625


#### RandomizedSearchCV KNN

In [None]:
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# Search space: every neighbour count from 1 to 30.
param_grid_knn = dict(n_neighbors=range(1, 31))

# k-nearest-neighbours regressor with default settings.
model_knn = KNeighborsRegressor()

# Evaluate 10 sampled neighbour counts with 5-fold cross-validation on all cores;
# the fixed seed makes the sampled candidates repeatable.
random_search_knn = RandomizedSearchCV(
    estimator=model_knn,
    param_distributions=param_grid_knn,
    n_iter=10,
    cv=5,
    n_jobs=-1,
    random_state=42,
)

# The target is log(price), so the model is trained in log space.
random_search_knn.fit(X_train, np.log(y_train))

# Only the best cross-validated score is printed; the chosen n_neighbors is
# available as random_search_knn.best_params_.
print("Best score: ", random_search_knn.best_score_)

best_knn_log = random_search_knn.best_estimator_

# Back-transform predictions so RMSE and MAPE are on the original price scale.
pred_train = np.exp(best_knn_log.predict(X_train))
pred_test = np.exp(best_knn_log.predict(X_test))

# R-squared is scored against log(price); the other metrics use the
# back-transformed predictions.
knn_metrics = [
    ('training set r-squared: ', best_knn_log.score(X_train, np.log(y_train))),
    ('test set r-squared:     ', best_knn_log.score(X_test, np.log(y_test))),
    ('training set rmse:      ', np.sqrt(mean_squared_error(y_train, pred_train))),
    ('test set rmse:          ', np.sqrt(mean_squared_error(y_test, pred_test))),
    ('training set mape:      ', mean_absolute_percentage_error(y_train, pred_train)),
    ('test set mape:          ', mean_absolute_percentage_error(y_test, pred_test)),
]
for label, value in knn_metrics:
    print(label, value)

# Save the fitted log-target model; the returned file list is the cell output.
joblib.dump(best_knn_log, 'best_knn_log_0505.joblib')



Best score:  0.7932777109919785
training set r-squared:  -2.192623823572037
test set r-squared:      -2.0837609553845398
training set rmse:       172078.59633296877
test set rmse:           197271.6451630098
training set mape:       0.15162098360822165
test set mape:           0.16337972428614167


['best_knn_log_0505.joblib']

In [None]:
# Report the KNN model's R-squared on the log(price) scale for both splits.
for label, features, target in (
    ('training set r-squared: ', X_train, y_train),
    ('test set r-squared:     ', X_test, y_test),
):
    print(label, best_knn_log.score(features, np.log(target)))

training set r-squared:  0.8435536253143543
test set r-squared:      0.8169335202708206


#### RandomizedSearchCV SVR

In [None]:
from sklearn.svm import SVR

In [None]:
# Search space: seven log-spaced values from 1e-3 to 1e3 for both C and gamma.
param_grid_svr = dict(
    C=np.logspace(-3, 3, 7),
    gamma=np.logspace(-3, 3, 7),
)

# Support vector regressor using an RBF kernel.
model_svr = SVR(kernel='rbf')

# Evaluate 10 sampled settings with 5-fold cross-validation on all cores;
# the fixed seed makes the sampled candidates repeatable.
random_search_svr = RandomizedSearchCV(
    estimator=model_svr,
    param_distributions=param_grid_svr,
    n_iter=10,
    cv=5,
    n_jobs=-1,
    random_state=42,
)

# The target is log(price), so the model is trained in log space.
random_search_svr.fit(X_train, np.log(y_train))

# Only the best cross-validated score is printed here; the chosen settings are
# available as random_search_svr.best_params_.
print("Best score: ", random_search_svr.best_score_)

best_svr_log = random_search_svr.best_estimator_

# Back-transform predictions so RMSE and MAPE are on the original price scale.
pred_train = np.exp(best_svr_log.predict(X_train))
pred_test = np.exp(best_svr_log.predict(X_test))

# R-squared is scored against log(price); the other metrics use the
# back-transformed predictions.
svr_metrics = [
    ('training set r-squared: ', best_svr_log.score(X_train, np.log(y_train))),
    ('test set r-squared:     ', best_svr_log.score(X_test, np.log(y_test))),
    ('training set rmse:      ', np.sqrt(mean_squared_error(y_train, pred_train))),
    ('test set rmse:          ', np.sqrt(mean_squared_error(y_test, pred_test))),
    ('training set mape:      ', mean_absolute_percentage_error(y_train, pred_train)),
    ('test set mape:          ', mean_absolute_percentage_error(y_test, pred_test)),
]
for label, value in svr_metrics:
    print(label, value)

# Save the fitted log-target model; the returned file list is the cell output.
joblib.dump(best_svr_log, 'best_svr_log_0505.joblib')

Best score:  0.8922618494995372
training set r-squared:  0.9247470135646605
test set r-squared:      0.9024839395704978
training set rmse:       85921.355889983
test set rmse:           131303.7090388297
training set mape:       0.1071762330973566
test set mape:           0.11947261438914024


['best_svr_log_0505.joblib']

In [None]:
# Show the C / gamma combination selected by the SVR search.
print(random_search_svr.best_params_)

{'gamma': 0.1, 'C': 1000.0}


#### RandomizedSearchCV MLP

In [None]:
from sklearn.neural_network import MLPRegressor
from scipy.stats import reciprocal

In [None]:
# Search space for the MLP: one hidden layer of 50-500 units, with the L2
# penalty (alpha) and the initial learning rate each drawn from a reciprocal
# (log-uniform) distribution between 1e-4 and 1e-2.
param_grid_mlp = {
    'hidden_layer_sizes': [(50,), (100,), (200,), (300,), (400,), (500,)],
    'alpha': reciprocal(1e-4, 1e-2),
    'learning_rate_init': reciprocal(1e-4, 1e-2),
}

# ReLU network trained with Adam for at most 500 iterations.
# random_state=42 fixes the network's own randomness (e.g. weight initialisation).
# Without it the search below is not reproducible even though the
# RandomizedSearchCV itself is seeded, because every fit starts from different
# random weights.
model_mlp = MLPRegressor(activation='relu', solver='adam', max_iter=500, random_state=42)

# Evaluate 10 sampled settings with 5-fold cross-validation on all cores.
random_search_mlp = RandomizedSearchCV(estimator=model_mlp, param_distributions=param_grid_mlp,
                                   n_iter=10, cv=5, n_jobs=-1, random_state=42)

# The target is log(price), so the network is trained in log space.
random_search_mlp.fit(X_train, np.log(y_train))

print("Best score: ", random_search_mlp.best_score_)

best_mlp_log = random_search_mlp.best_estimator_

# Back-transform predictions so RMSE and MAPE are on the original price scale.
pred_train = np.exp(best_mlp_log.predict(X_train))
pred_test = np.exp(best_mlp_log.predict(X_test))
# R-squared is scored against log(price); RMSE and MAPE use the back-transformed predictions.
print('training set r-squared: ', best_mlp_log.score(X_train, np.log(y_train)))
print('test set r-squared:     ', best_mlp_log.score(X_test, np.log(y_test)))
print('training set rmse:      ', np.sqrt(mean_squared_error(y_train, pred_train)))
print('test set rmse:          ', np.sqrt(mean_squared_error(y_test, pred_test)))
print('training set mape:      ', mean_absolute_percentage_error(y_train, pred_train))
print('test set mape:          ', mean_absolute_percentage_error(y_test, pred_test))



Best score:  0.8848314758817685
training set r-squared:  0.8836447419579355
test set r-squared:      0.8878801470723188
training set rmse:       125907.6980874654
test set rmse:           130012.05710282196
training set mape:       0.13298862223614344
test set mape:           0.13003709103798688


In [None]:
# Show the hidden-layer size, alpha and initial learning rate selected by the MLP search.
print(random_search_mlp.best_params_)

{'alpha': 0.00020511104188433973, 'hidden_layer_sizes': (200,), 'learning_rate_init': 0.000828891686688514}


In [None]:
# Persist the log-target MLP model; its predictions are log(price).
joblib.dump(best_mlp_log, 'best_mlp_log_0505.joblib')

In [None]:
# Named base learners for the stacking ensemble, as (name, estimator) pairs in
# the order knn, svr, xgb, mlp. The kernel ridge model is not included.
best_models = list({
    'knn': best_knn_log,
    'svr': best_svr_log,
    'xgb': best_xgb_log,
    'mlp': best_mlp_log,
}.items())


In [None]:
from sklearn.linear_model import Ridge
from sklearn.ensemble import StackingRegressor

In [None]:
# Stacking ensemble: the tuned base regressors feed a Ridge final estimator,
# with the ensemble's internal cross-validation set to 5 folds.
stack = StackingRegressor(estimators=best_models, final_estimator=Ridge(), cv=5)

In [None]:
# Search space for the stack's Ridge final estimator: alpha drawn uniformly
# from [0, 10], and the intercept either fitted or not.
param_distributions = dict(
    final_estimator__alpha=uniform(loc=0.0, scale=10.0),
    final_estimator__fit_intercept=[True, False],
)

In [None]:
# Randomized search over the stacking ensemble's final-estimator settings:
# 10 sampled candidates, each scored with 5-fold cross-validation on all cores.
# random_state=42 (the seed used by the other seeded searches in this notebook)
# fixes the alpha values drawn from the continuous uniform distribution.
# Without it every run would evaluate a different set of candidates.
stack_search = RandomizedSearchCV(
    estimator=stack,
    param_distributions=param_distributions,
    n_iter=10,
    cv=5,
    n_jobs=-1,
    random_state=42,
)


In [None]:
# Run the randomized search over the stacking ensemble, fitting on the scaled
# training features against log(price).
stack_search.fit(X_train, np.log(y_train))







RandomizedSearchCV(cv=5,
                   estimator=StackingRegressor(cv=5,
                                               estimators=[('knn',
                                                            KNeighborsRegressor(n_neighbors=9)),
                                                           ('svr',
                                                            SVR(C=1000.0,
                                                                gamma=0.1)),
                                                           ('xgb',
                                                            XGBRegressor(base_score=0.5,
                                                                         booster='gbtree',
                                                                         callbacks=None,
                                                                         colsample_bylevel=1,
                                                                         colsample_bynode=1,
                 

In [None]:
# Keep the stacking ensemble selected by the search; it predicts log(price).
best_stack = stack_search.best_estimator_

In [None]:
pred_train = np.exp(best_stack.predict(X_train))
pred_test = np.exp(best_stack.predict(X_test))
print('training set r-squared: ', best_stack.score(X_train, np.log(y_train)))
print('test set r-squared:     ', best_stack.score(X_test, np.log(y_test)))
print('training set rmse:      ', np.sqrt(mean_squared_error(y_train, pred_train)))
print('test set rmse:          ', np.sqrt(mean_squared_error(y_test, pred_test)))
print('training set mape:      ', mean_absolute_percentage_error(y_train, pred_train))
print('test set mape:          ', mean_absolute_percentage_error(y_test, pred_test))

training set r-squared:  0.9481188577013246
test set r-squared:      0.918874459100527
training set rmse:       72450.97793478268
test set rmse:           117162.33131669447
training set mape:       0.0872553418310514
test set mape:           0.10764618386715775


In [None]:
# Persist the fitted stacking ensemble. It was trained on log(price), so its
# predictions must be passed through np.exp to get prices back.
joblib.dump(best_stack, 'best_stack_0505.joblib')