In [None]:
import joblib

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Load the modelling table; the first CSV column is used as the row index.
csv_path = 'data3_0505.csv'
df = pd.read_csv(csv_path, index_col=0)

In [None]:
# Separate the regression target from the predictors.
target_column = 'price'
y = df[target_column]
X = df.drop(columns=[target_column])

### Training/Test Set Split and MinMaxScaler

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
split_options = {'test_size': 0.25, 'random_state': 7633}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Fit the min-max scaler on the training split only, so the held-out rows
# do not influence the scaling parameters.
scaler = MinMaxScaler()
scaler.fit(X_train)

In [None]:
# Replace both feature frames with their scaled versions, keeping the row index.
# Only `index` is passed on, so the scaled frames get default integer column
# labels instead of the original column names.
# NOTE: this overwrites X_train / X_test, so re-running the cell would scale
# already-scaled data.
X_train, X_test = (
    pd.DataFrame(scaler.transform(part), index=part.index)
    for part in (X_train, X_test)
)

### RandomizedSearchCV Random Forest

In [None]:
# Candidate hyperparameters sampled by RandomizedSearchCV
rf_param_dist = {
    'n_estimators': [100, 200, 300, 400, 500],  # Number of trees
    'max_depth': [None, 10, 20, 30, 40],        # Maximum depth of trees
    'min_samples_split': [2, 5, 10],            # Minimum samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],              # Minimum samples required to be at a leaf node
    # Number of features to consider at each split.
    # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3. For
    # RandomForestRegressor it meant "use all features", which is spelled 1.0.
    # It stays first in the list so the list positions are unchanged.
    'max_features': [1.0, 'sqrt', 'log2'],
    'bootstrap': [True, False]                  # Whether bootstrap samples are used when building trees
}

# Base estimator; the seed makes tree construction reproducible
rf_model = RandomForestRegressor(random_state=42)

# Randomized search: 20 sampled settings, each scored with 4-fold CV on MSE
rf_random_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=rf_param_dist,
    n_iter=20,               # Number of parameter settings sampled
    cv=4,                    # 4-fold cross-validation
    scoring='neg_mean_squared_error',
    random_state=42,
    n_jobs=-1                # Use all available processors
)

# The model is trained on log(price), so the search MSE is measured on the log scale
rf_random_search.fit(X_train, np.log(y_train))

# Best model (refit on the whole training split by RandomizedSearchCV)
best_rf_log = rf_random_search.best_estimator_

# Back-transform predictions from log scale to prices
pred_train = np.exp(best_rf_log.predict(X_train))
pred_test = np.exp(best_rf_log.predict(X_test))

# R-squared is reported on the log scale (what the model was fitted on);
# RMSE and MAPE are reported on the original price scale.
print('training set r-squared: ', best_rf_log.score(X_train, np.log(y_train)))
print('test set r-squared:     ', best_rf_log.score(X_test, np.log(y_test)))
print('training set rmse:      ', np.sqrt(mean_squared_error(y_train, pred_train)))
print('test set rmse:          ', np.sqrt(mean_squared_error(y_test, pred_test)))
print('training set mape:      ', mean_absolute_percentage_error(y_train, pred_train))
print('test set mape:          ', mean_absolute_percentage_error(y_test, pred_test))


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(


training set r-squared:  0.9819377236513135
test set r-squared:      0.9027655723772603
training set rmse:       58057.15315649944
test set rmse:           141273.7032235313
training set mape:       0.05046143847373636
test set mape:           0.11698165842131433


### RandomizedSearchCV DecisionTreeRegressor

In [None]:
# Search space for the single decision tree
dt_param_dist = dict(
    max_depth=[None, 10, 20, 30, 40],        # depth limit (None = unlimited)
    min_samples_split=[2, 5, 10, 20],        # samples needed to split an internal node
    min_samples_leaf=[1, 2, 4, 8],           # samples needed at a leaf
    max_features=[None, 'sqrt', 'log2'],     # features considered per split
)

# Seeded base estimator
dt_model = DecisionTreeRegressor(random_state=42)

# 20 sampled settings, 4-fold CV, ranked by (negated) MSE, all cores
dt_search_options = dict(
    n_iter=20,
    cv=4,
    scoring='neg_mean_squared_error',
    random_state=42,
    n_jobs=-1,
)
dt_random_search = RandomizedSearchCV(dt_model, dt_param_dist, **dt_search_options)

# Train against log(price)
dt_random_search.fit(X_train, np.log(y_train))

# Winner of the search
best_dt_log = dt_random_search.best_estimator_

# Predictions mapped back from log scale to prices
pred_train, pred_test = (
    np.exp(best_dt_log.predict(features)) for features in (X_train, X_test)
)

# R-squared is scored against log(price); RMSE and MAPE use the price scale.
# Each metric is computed right before it is printed.
metric_rows = (
    ('training set r-squared: ', lambda: best_dt_log.score(X_train, np.log(y_train))),
    ('test set r-squared:     ', lambda: best_dt_log.score(X_test, np.log(y_test))),
    ('training set rmse:      ', lambda: np.sqrt(mean_squared_error(y_train, pred_train))),
    ('test set rmse:          ', lambda: np.sqrt(mean_squared_error(y_test, pred_test))),
    ('training set mape:      ', lambda: mean_absolute_percentage_error(y_train, pred_train)),
    ('test set mape:          ', lambda: mean_absolute_percentage_error(y_test, pred_test)),
)
for label, compute in metric_rows:
    print(label, compute())


training set r-squared:  0.9245675676615596
test set r-squared:      0.8357026706796978
training set rmse:       109046.17964093712
test set rmse:           162793.9042886265
training set mape:       0.10550816798000565
test set mape:           0.15612923397713668


### XGBoost: RandomizedSearchCV grid and GridSearchCV refinement

In [None]:
#### Original parameter grid for RandomizedSearchCV

xgb_param_dist = {
    'learning_rate': [0.03, 0.04, 0.05, 0.06, 0.07],
    'max_depth': [5, 6, 7],
    'n_estimators': [400, 500, 600],
    # 0.1, 0.2, ..., 1.0 built from integers and rounded, so the candidates are
    # exact one-decimal values (a float-step np.arange yields values such as
    # 0.30000000000000004 and its length depends on rounding).
    'colsample_bytree': [round(0.1 * k, 1) for k in range(1, 11)],
    'reg_alpha': [1, 3, 5],
    'reg_lambda': [3, 5, 7]
}

In [None]:
# Parameter grid for the first GridSearchCV round (3 values per parameter,
# 3**6 = 729 combinations)
param_grid = {
    'learning_rate': [0.05, 0.06, 0.07],
    'max_depth': [5, 6, 7],
    'n_estimators': [450, 500, 550],
    # Explicit values instead of np.arange(0.2, 0.5, 0.1): a float-step arange
    # produces 0.30000000000000004 and its length depends on rounding.
    'colsample_bytree': [0.2, 0.3, 0.4],
    'reg_alpha': [2, 3, 4],
    'reg_lambda': [4, 5, 6]
}


In [None]:
# Base XGBoost regressor for the first grid-search round; every parameter set
# here is also in param_grid, so the search overrides it per candidate.
model = XGBRegressor(
    reg_lambda=5,
    reg_alpha=3,
    n_estimators=500,
    max_depth=6,
    learning_rate=0.06,
    colsample_bytree=0.3,
)

In [None]:
# Exhaustive 5-fold search over param_grid on all cores. No `scoring` is given,
# so candidates are ranked by the estimator's own `score` method.
model_search_log = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Run the grid search on the scaled training features against log(price).
model_search_log.fit(X_train, np.log(y_train))

Fitting 5 folds for each of 729 candidates, totalling 3645 fits


GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    callbacks=None, colsample_bylevel=None,
                                    colsample_bynode=None, colsample_bytree=0.3,
                                    early_stopping_rounds=None,
                                    enable_categorical=False, eval_metric=None,
                                    feature_types=None, gamma=None, gpu_id=None,
                                    grow_policy=None, importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=0.06, ma...
                                    max_depth=6, max_leaves=None,
                                    min_child_weight=None, missing=nan,
                                    monotone_constraints=None, n_estimators=500,
                                    n_jobs=None, num_parallel_tree=None,
                                    

In [None]:
# Show the parameter combination that scored best in cross-validation.
print(model_search_log.best_params_)

{'colsample_bytree': 0.30000000000000004, 'learning_rate': 0.06, 'max_depth': 7, 'n_estimators': 550, 'reg_alpha': 2, 'reg_lambda': 6}


In [None]:
# Keep the best estimator from the search (trained on log(price)).
best_xgb_log = model_search_log.best_estimator_

In [None]:
# Predictions mapped back from log scale to prices.
pred_train, pred_test = (
    np.exp(best_xgb_log.predict(features)) for features in (X_train, X_test)
)

# R-squared is scored against log(price); RMSE and MAPE use the price scale.
# Each metric is computed right before it is printed.
metric_rows = (
    ('training set r-squared: ', lambda: best_xgb_log.score(X_train, np.log(y_train))),
    ('test set r-squared:     ', lambda: best_xgb_log.score(X_test, np.log(y_test))),
    ('training set rmse:      ', lambda: np.sqrt(mean_squared_error(y_train, pred_train))),
    ('test set rmse:          ', lambda: np.sqrt(mean_squared_error(y_test, pred_test))),
    ('training set mape:      ', lambda: mean_absolute_percentage_error(y_train, pred_train)),
    ('test set mape:          ', lambda: mean_absolute_percentage_error(y_test, pred_test)),
)
for label, compute in metric_rows:
    print(label, compute())

training set r-squared:  0.9519741197164792
test set r-squared:      0.9192446044219327
training set rmse:       74766.35000405362
test set rmse:           120609.19385731044
training set mape:       0.08367631684636473
test set mape:           0.10717089609238625


In [None]:
# Persist the first-round best model to disk.
# NOTE(review): the "9192" suffix presumably encodes the test R-squared (0.9192) — confirm.
joblib.dump(best_xgb_log, 'best_xgb_log_0506_9192.joblib')

['best_xgb_log_0506_9192.joblib']

In [None]:
# Second GridSearchCV round: a narrower grid (learning_rate is not searched here).
param_grid = dict(
    max_depth=[7, 8],
    n_estimators=[550, 600],
    colsample_bytree=[0.25, 0.3, 0.35],
    reg_alpha=[1.5, 2, 2.5],
    reg_lambda=[5.5, 6, 6.5],
)


In [None]:
# Base XGBoost regressor for the second round. learning_rate is not in this
# round's param_grid, so the 0.06 set here is the value actually used.
model = XGBRegressor(
    reg_lambda=6,
    reg_alpha=2,
    n_estimators=550,
    max_depth=7,
    learning_rate=0.06,
    colsample_bytree=0.3,
)

In [None]:
# Exhaustive 5-fold search over param_grid on all cores. No `scoring` is given,
# so candidates are ranked by the estimator's own `score` method.
model_search_log = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Run the second-round grid search against log(price).
model_search_log.fit(X_train, np.log(y_train))

Fitting 5 folds for each of 108 candidates, totalling 540 fits


GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    callbacks=None, colsample_bylevel=None,
                                    colsample_bynode=None, colsample_bytree=0.3,
                                    early_stopping_rounds=None,
                                    enable_categorical=False, eval_metric=None,
                                    feature_types=None, gamma=None, gpu_id=None,
                                    grow_policy=None, importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=0.06, ma...
                                    max_cat_to_onehot=None, max_delta_step=None,
                                    max_depth=7, max_leaves=None,
                                    min_child_weight=None, missing=nan,
                                    monotone_constraints=None, n_estimators=550,
                            

In [None]:
# Show the parameter combination that scored best in cross-validation.
print(model_search_log.best_params_)

{'colsample_bytree': 0.3, 'max_depth': 7, 'n_estimators': 600, 'reg_alpha': 2, 'reg_lambda': 6}


In [None]:
# Replace the previous best model with the winner of this search round.
best_xgb_log = model_search_log.best_estimator_

In [None]:
# Predictions mapped back from log scale to prices.
pred_train, pred_test = (
    np.exp(best_xgb_log.predict(features)) for features in (X_train, X_test)
)

# R-squared is scored against log(price); RMSE and MAPE use the price scale.
# Each metric is computed right before it is printed.
metric_rows = (
    ('training set r-squared: ', lambda: best_xgb_log.score(X_train, np.log(y_train))),
    ('test set r-squared:     ', lambda: best_xgb_log.score(X_test, np.log(y_test))),
    ('training set rmse:      ', lambda: np.sqrt(mean_squared_error(y_train, pred_train))),
    ('test set rmse:          ', lambda: np.sqrt(mean_squared_error(y_test, pred_test))),
    ('training set mape:      ', lambda: mean_absolute_percentage_error(y_train, pred_train)),
    ('test set mape:          ', lambda: mean_absolute_percentage_error(y_test, pred_test)),
)
for label, compute in metric_rows:
    print(label, compute())

training set r-squared:  0.9538955070530486
test set r-squared:      0.9193857002194192
training set rmse:       73188.7344169428
test set rmse:           120298.3705110394
training set mape:       0.08195083553787552
test set mape:           0.10707397737459101


In [None]:
# NOTE(review): left disabled. The filename is the one already written after
# the first grid-search round, so enabling this as-is would overwrite that file.
# joblib.dump(best_xgb_log, 'best_xgb_log_0506_9192.joblib')

In [None]:
# Third GridSearchCV round: only the number of boosting rounds is varied.
param_grid = dict(n_estimators=[600, 650, 700])


In [None]:
# Base XGBoost regressor for the third round. Only n_estimators is in this
# round's param_grid, so every other value set here is used as-is.
model = XGBRegressor(
    reg_lambda=6,
    reg_alpha=2,
    n_estimators=600,
    max_depth=7,
    learning_rate=0.06,
    colsample_bytree=0.3,
)

In [None]:
# Exhaustive 5-fold search over param_grid on all cores. No `scoring` is given,
# so candidates are ranked by the estimator's own `score` method.
model_search_log = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Run the third-round grid search against log(price).
model_search_log.fit(X_train, np.log(y_train))

Fitting 5 folds for each of 3 candidates, totalling 15 fits


GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    callbacks=None, colsample_bylevel=None,
                                    colsample_bynode=None, colsample_bytree=0.3,
                                    early_stopping_rounds=None,
                                    enable_categorical=False, eval_metric=None,
                                    feature_types=None, gamma=None, gpu_id=None,
                                    grow_policy=None, importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=0.06, max_bin=None,
                                    max_cat_threshold=None,
                                    max_cat_to_onehot=None, max_delta_step=None,
                                    max_depth=7, max_leaves=None,
                                    min_child_weight=None, missing=nan,
                                    monot

In [None]:
# Show the n_estimators value that scored best in cross-validation.
print(model_search_log.best_params_)

{'n_estimators': 700}


In [None]:
# Replace the previous best model with the winner of this search round.
best_xgb_log = model_search_log.best_estimator_

In [None]:
# Predictions mapped back from log scale to prices.
pred_train, pred_test = (
    np.exp(best_xgb_log.predict(features)) for features in (X_train, X_test)
)

# R-squared is scored against log(price); RMSE and MAPE use the price scale.
# Each metric is computed right before it is printed.
metric_rows = (
    ('training set r-squared: ', lambda: best_xgb_log.score(X_train, np.log(y_train))),
    ('test set r-squared:     ', lambda: best_xgb_log.score(X_test, np.log(y_test))),
    ('training set rmse:      ', lambda: np.sqrt(mean_squared_error(y_train, pred_train))),
    ('test set rmse:          ', lambda: np.sqrt(mean_squared_error(y_test, pred_test))),
    ('training set mape:      ', lambda: mean_absolute_percentage_error(y_train, pred_train)),
    ('test set mape:          ', lambda: mean_absolute_percentage_error(y_test, pred_test)),
)
for label, compute in metric_rows:
    print(label, compute())

training set r-squared:  0.9572883349077336
test set r-squared:      0.9195335721578998
training set rmse:       69895.63161204052
test set rmse:           119799.14741726019
training set mape:       0.07884935493327737
test set mape:           0.10696072300390187


In [None]:
# Persist the third-round best model to disk.
# NOTE(review): the "9195" suffix presumably encodes the test R-squared (0.9195) — confirm.
joblib.dump(best_xgb_log, 'best_xgb_log_0506_9195.joblib')

['best_xgb_log_0506_9195.joblib']

In [None]:
# Fourth GridSearchCV round: check larger numbers of boosting rounds.
param_grid = dict(n_estimators=[700, 750, 800])


In [None]:
# Base XGBoost regressor for the fourth round, seeded with the best parameters
# of the previous round. n_estimators is 700 (the previous round's winner and
# the lower edge of this round's grid), not the stale 600. The grid overrides
# this value for every candidate, so the change does not affect search results.
model = XGBRegressor(**{'reg_lambda': 6,
                        'reg_alpha': 2,
                        'n_estimators': 700,
                        'max_depth': 7,
                        'learning_rate': 0.06,
                        'colsample_bytree': 0.3})

In [None]:
# Exhaustive 5-fold search over param_grid on all cores. No `scoring` is given,
# so candidates are ranked by the estimator's own `score` method.
model_search_log = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Run the fourth-round grid search against log(price).
model_search_log.fit(X_train, np.log(y_train))

Fitting 5 folds for each of 3 candidates, totalling 15 fits


GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    callbacks=None, colsample_bylevel=None,
                                    colsample_bynode=None, colsample_bytree=0.3,
                                    early_stopping_rounds=None,
                                    enable_categorical=False, eval_metric=None,
                                    feature_types=None, gamma=None, gpu_id=None,
                                    grow_policy=None, importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=0.06, max_bin=None,
                                    max_cat_threshold=None,
                                    max_cat_to_onehot=None, max_delta_step=None,
                                    max_depth=7, max_leaves=None,
                                    min_child_weight=None, missing=nan,
                                    monot

In [None]:
# Show the n_estimators value that scored best in cross-validation.
print(model_search_log.best_params_)

{'n_estimators': 700}


In [None]:
# Replace the previous best model with the winner of this search round.
best_xgb_log = model_search_log.best_estimator_

In [None]:
# Predictions mapped back from log scale to prices.
pred_train, pred_test = (
    np.exp(best_xgb_log.predict(features)) for features in (X_train, X_test)
)

# R-squared is scored against log(price); RMSE and MAPE use the price scale.
# Each metric is computed right before it is printed.
metric_rows = (
    ('training set r-squared: ', lambda: best_xgb_log.score(X_train, np.log(y_train))),
    ('test set r-squared:     ', lambda: best_xgb_log.score(X_test, np.log(y_test))),
    ('training set rmse:      ', lambda: np.sqrt(mean_squared_error(y_train, pred_train))),
    ('test set rmse:          ', lambda: np.sqrt(mean_squared_error(y_test, pred_test))),
    ('training set mape:      ', lambda: mean_absolute_percentage_error(y_train, pred_train)),
    ('test set mape:          ', lambda: mean_absolute_percentage_error(y_test, pred_test)),
)
for label, compute in metric_rows:
    print(label, compute())

training set r-squared:  0.9572883349077336
test set r-squared:      0.9195335721578998
training set rmse:       69895.63161204052
test set rmse:           119799.14741726019
training set mape:       0.07884935493327737
test set mape:           0.10696072300390187


In [None]:
# NOTE(review): left disabled. The filename is the one already written after
# the third grid-search round, and this round selected the same n_estimators
# (700), so there is presumably nothing new to save.
# joblib.dump(best_xgb_log, 'best_xgb_log_0506_9195.joblib')