### Importing Libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Importing ML Libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error,root_mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.tree import DecisionTreeRegressor



### Reading CSV File

In [3]:
df = pd.read_csv('Singapore_Flat_Cleaned.csv')

In [4]:
df = df.sample(50000,random_state=101)

In [5]:
df = df.reset_index(drop = True)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   flat_type            50000 non-null  float64
 1   block_0              50000 non-null  float64
 2   block_1              50000 non-null  float64
 3   block_2              50000 non-null  float64
 4   street_name_0        50000 non-null  float64
 5   street_name_1        50000 non-null  float64
 6   floor_area_sqm       50000 non-null  float64
 7   flat_model           50000 non-null  float64
 8   lease_commence_date  50000 non-null  float64
 9   resale_price         50000 non-null  float64
 10  year                 50000 non-null  float64
 11  start_storey_range   50000 non-null  float64
 12  end_storey_range     50000 non-null  float64
 13  town_PUNGGOL         50000 non-null  float64
 14  town_SENGKANG        50000 non-null  float64
dtypes: float64(15)
memory usage: 5.7 MB


In [7]:
df.describe()

Unnamed: 0,flat_type,block_0,block_1,block_2,street_name_0,street_name_1,floor_area_sqm,flat_model,lease_commence_date,resale_price,year,start_storey_range,end_storey_range,town_PUNGGOL,town_SENGKANG
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,3.01278,0.0283,0.0954,0.26314,0.03928,0.38936,95.671686,9.33158,1988.2026,318003.4,2006.02026,6.65616,8.67012,0.0221,0.03474
std,0.950871,0.16583,0.29377,0.440342,0.194262,0.48761,25.875773,3.800958,10.570316,167803.3,9.230124,4.791012,4.791033,0.14701,0.183122
min,0.0,0.0,0.0,0.0,0.0,0.0,28.0,0.0,1966.0,5600.0,1990.0,1.0,3.0,0.0,0.0
25%,2.0,0.0,0.0,0.0,0.0,0.0,73.0,5.0,1981.0,192000.0,1998.0,4.0,6.0,0.0,0.0
50%,3.0,0.0,0.0,0.0,0.0,0.0,93.0,9.0,1986.0,295000.0,2005.0,7.0,9.0,0.0,0.0
75%,4.0,0.0,0.0,1.0,0.0,1.0,113.0,12.0,1996.0,415000.0,2013.0,10.0,12.0,0.0,0.0
max,6.0,1.0,1.0,1.0,1.0,1.0,280.0,20.0,2020.0,1385000.0,2024.0,49.0,51.0,1.0,1.0


In [8]:
df.shape

(50000, 15)

In [9]:
df.columns

Index(['flat_type', 'block_0', 'block_1', 'block_2', 'street_name_0',
       'street_name_1', 'floor_area_sqm', 'flat_model', 'lease_commence_date',
       'resale_price', 'year', 'start_storey_range', 'end_storey_range',
       'town_PUNGGOL', 'town_SENGKANG'],
      dtype='object')

In [10]:
df.head()

Unnamed: 0,flat_type,block_0,block_1,block_2,street_name_0,street_name_1,floor_area_sqm,flat_model,lease_commence_date,resale_price,year,start_storey_range,end_storey_range,town_PUNGGOL,town_SENGKANG
0,4.0,0.0,0.0,0.0,0.0,0.0,118.0,5.0,1980.0,390000.0,2006.0,19.0,21.0,0.0,0.0
1,4.0,0.0,0.0,1.0,0.0,1.0,124.0,5.0,1996.0,440000.0,2018.0,4.0,6.0,0.0,0.0
2,2.0,0.0,0.0,0.0,0.0,0.0,74.0,9.0,1985.0,181000.0,2001.0,4.0,6.0,0.0,0.0
3,5.0,0.0,1.0,0.0,0.0,0.0,150.0,3.0,1995.0,485000.0,1999.0,10.0,12.0,0.0,0.0
4,3.0,0.0,0.0,0.0,0.0,0.0,99.0,12.0,1981.0,361000.0,2015.0,1.0,3.0,0.0,0.0


In [11]:
X = df.drop('resale_price', axis = 1)
y = df['resale_price']

In [12]:
def run_regression(pipeline, param_grid, X, y):
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)
    
    X_train = pipeline.named_steps['scaler'].fit_transform(X_train)
    
    X_test = pipeline.named_steps['scaler'].transform(X_test)
    
    grid = GridSearchCV(pipeline, param_grid, cv = 5, scoring='neg_mean_squared_error')
    
    grid.fit(X_train, y_train)
    
    print('\nBest Parameters: ',grid.best_params_)
    
    y_pred = grid.predict(X_test)
    
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = root_mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print('Mean Absolute Error (MAE): ', mae)
    print('Mean Squared Error (MSE): ', mse)
    print('Root Mean Squared Error (RMSE): ', rmse)
    print('R-squared (R2): ', r2)

### Elastic Net Regression

In [13]:
pipeline_elasticnet = Pipeline([
    ('scaler', StandardScaler()),
    ('elasticnet', ElasticNet())
])

param_grid_elasticnet = {
    'elasticnet__alpha': [0.1, 1, 10], # Strength of regularization penalty
    'elasticnet__l1_ratio': [0.1, 0.5, 0.9], # Balance between L2 and L1
    'elasticnet__max_iter': [10000] # Max iterations the optimization algorithm should run during training
}

run_regression(pipeline_elasticnet, param_grid_elasticnet, X, y)


Best Parameters:  {'elasticnet__alpha': 0.1, 'elasticnet__l1_ratio': 0.9, 'elasticnet__max_iter': 10000}
Mean Absolute Error (MAE):  60142.69767835745
Mean Squared Error (MSE):  6612291906.815933
Root Mean Squared Error (RMSE):  81316.00031245963
R-squared (R2):  0.7635708014040145


### Random Forest Regressor

In [14]:
pipeline_rf = Pipeline([
    ('scaler', StandardScaler()),
    ('rf', RandomForestRegressor())
])

param_grid_rf = {
    'rf__n_estimators': [10, 20], # No. of Trees
    'rf__max_depth': [None, 5], # Maximum depth of a Tree
    'rf__min_samples_split': [2, 5], # Min samples required to split an internal node
    'rf__min_samples_leaf': [1, 2] # Min samples required to be in a leaf node
}

run_regression(pipeline_rf, param_grid_rf, X, y)


Best Parameters:  {'rf__max_depth': None, 'rf__min_samples_leaf': 2, 'rf__min_samples_split': 2, 'rf__n_estimators': 20}
Mean Absolute Error (MAE):  33710.07425689229
Mean Squared Error (MSE):  2606282859.6811357
Root Mean Squared Error (RMSE):  51051.76646974261
R-squared (R2):  0.9068097149199228


### Decision Tree Regressor

In [16]:
pipeline_decision_tree = Pipeline([
    ('scaler', StandardScaler()),  
    ('decision_tree', DecisionTreeRegressor())
])

param_grid_decision_tree = {
    'decision_tree__max_depth': [None, 10, 20, 30],  
    'decision_tree__min_samples_split': [2, 5, 10],   
    'decision_tree__min_samples_leaf': [1, 2, 4]      
}

run_regression(pipeline_decision_tree, param_grid_decision_tree, X, y)


Best Parameters:  {'decision_tree__max_depth': 10, 'decision_tree__min_samples_leaf': 2, 'decision_tree__min_samples_split': 10}
Mean Absolute Error (MAE):  39131.206640802375
Mean Squared Error (MSE):  3489659371.087043
Root Mean Squared Error (RMSE):  59073.3389194063
R-squared (R2):  0.8752236924645427


### Gradient Boosting Regressor

In [17]:
pipeline_gb = Pipeline([
    ('scaler', StandardScaler()),
    ('gb', GradientBoostingRegressor())
])

param_grid_gb = {
    'gb__n_estimators': [10, 20],
    'gb__learning_rate': [0.01, 0.1], # Step size at eaach iteration moving towards a min loss function
    'gb__max_depth': [3, 4, 5],
    'gb__min_samples_split': [2, 5],
    'gb__min_samples_leaf': [1, 2]
}

run_regression(pipeline_gb, param_grid_gb, X, y)


Best Parameters:  {'gb__learning_rate': 0.1, 'gb__max_depth': 5, 'gb__min_samples_leaf': 2, 'gb__min_samples_split': 5, 'gb__n_estimators': 20}
Mean Absolute Error (MAE):  44414.86306403085
Mean Squared Error (MSE):  4105836838.171308
Root Mean Squared Error (RMSE):  64076.80421315742
R-squared (R2):  0.8531916426414747


### LightGBM Regressor

In [13]:
pipeline_lightgbm = Pipeline([
    ('scaler', StandardScaler()),  
    ('lightgbm', LGBMRegressor())
])

param_grid_lightgbm = {
    'lightgbm__learning_rate': [0.05, 0.1, 0.2],
    'lightgbm__n_estimators': [50, 100, 200],
    'lightgbm__max_depth': [5, 10, -1] # -1 means no limit on the depth
}

run_regression(pipeline_lightgbm, param_grid_lightgbm, X, y)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001662 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 324
[LightGBM] [Info] Number of data points in the train set: 28000, number of used features: 14
[LightGBM] [Info] Start training from score 317876.449946
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000886 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 323
[LightGBM] [Info] Number of data points in the train set: 28000, number of used features: 14
[LightGBM] [Info] Start training from score 316955.186728
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000790 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is n

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000959 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 324
[LightGBM] [Info] Number of data points in the train set: 28000, number of used features: 14
[LightGBM] [Info] Start training from score 317876.449946
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000775 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 323
[LightGBM] [Info] Number of data points in the train set: 28000, number of used features: 14
[LightGBM] [Info] Start training from score 316955.186728


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000836 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 326
[LightGBM] [Info] Number of data points in the train set: 28000, number of used features: 14
[LightGBM] [Info] Start training from score 318130.329594
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001072 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 325
[LightGBM] [Info] Number of data points in the train set: 28000, number of used features: 14
[LightGBM] [Info] Start training from score 317517.475768


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000808 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 327
[LightGBM] [Info] Number of data points in the train set: 28000, number of used features: 14
[LightGBM] [Info] Start training from score 317376.867589
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000846 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 324
[LightGBM] [Info] Number of data points in the train set: 28000, number of used features: 14
[LightGBM] [Info] Start training from score 317876.449946


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000857 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 323
[LightGBM] [Info] Number of data points in the train set: 28000, number of used features: 14
[LightGBM] [Info] Start training from score 316955.186728


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001059 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 326
[LightGBM] [Info] Number of data points in the train set: 28000, number of used features: 14
[LightGBM] [Info] Start training from score 318130.329594


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000844 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 325
[LightGBM] [Info] Number of data points in the train set: 28000, number of used features: 14
[LightGBM] [Info] Start training from score 317517.475768
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000814 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 327
[LightGBM] [Info] Number of data points in the train set: 28000, number of used features: 14
[LightGBM] [Info] Start training from score 317376.867589


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000771 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 324
[LightGBM] [Info] Number of data points in the train set: 28000, number of used features: 14
[LightGBM] [Info] Start training from score 317876.449946


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000846 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 323
[LightGBM] [Info] Number of data points in the train set: 28000, number of used features: 14
[LightGBM] [Info] Start training from score 316955.186728
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000790 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 326
[LightGBM] [Info] Number of data points in the train set: 28000, number of used features: 14
[LightGBM] [Info] Start training from score 318130.329594
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000774 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is n

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003694 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 326
[LightGBM] [Info] Number of data points in the train set: 28000, number of used features: 14
[LightGBM] [Info] Start training from score 318130.329594
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000885 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 325
[LightGBM] [Info] Number of data points in the train set: 28000, number of used features: 14
[LightGBM] [Info] Start training from score 317517.475768
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000869 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] 

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000753 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 323
[LightGBM] [Info] Number of data points in the train set: 28000, number of used features: 14
[LightGBM] [Info] Start training from score 316955.186728
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000784 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 326
[LightGBM] [Info] Number of data points in the train set: 28000, number of used features: 14
[LightGBM] [Info] Start training from score 318130.329594


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000868 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 325
[LightGBM] [Info] Number of data points in the train set: 28000, number of used features: 14
[LightGBM] [Info] Start training from score 317517.475768
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000747 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 327
[LightGBM] [Info] Number of data points in the train set: 28000, number of used features: 14
[LightGBM] [Info] Start training from score 317376.867589
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000746 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is n

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000835 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 323
[LightGBM] [Info] Number of data points in the train set: 28000, number of used features: 14
[LightGBM] [Info] Start training from score 316955.186728
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000812 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 326
[LightGBM] [Info] Number of data points in the train set: 28000, number of used features: 14
[LightGBM] [Info] Start training from score 318130.329594


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000864 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 325
[LightGBM] [Info] Number of data points in the train set: 28000, number of used features: 14
[LightGBM] [Info] Start training from score 317517.475768
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000814 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 327
[LightGBM] [Info] Number of data points in the train set: 28000, number of used features: 14
[LightGBM] [Info] Start training from score 317376.867589


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000939 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 324
[LightGBM] [Info] Number of data points in the train set: 28000, number of used features: 14
[LightGBM] [Info] Start training from score 317876.449946
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000784 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 323
[LightGBM] [Info] Number of data points in the train set: 28000, number of used features: 14
[LightGBM] [Info] Start training from score 316955.186728


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000862 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 326
[LightGBM] [Info] Number of data points in the train set: 28000, number of used features: 14
[LightGBM] [Info] Start training from score 318130.329594


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000771 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 325
[LightGBM] [Info] Number of data points in the train set: 28000, number of used features: 14
[LightGBM] [Info] Start training from score 317517.475768


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001233 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 327
[LightGBM] [Info] Number of data points in the train set: 28000, number of used features: 14
[LightGBM] [Info] Start training from score 317376.867589
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000896 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 324
[LightGBM] [Info] Number of data points in the train set: 28000, number of used features: 14
[LightGBM] [Info] Start training from score 317876.449946


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000718 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 323
[LightGBM] [Info] Number of data points in the train set: 28000, number of used features: 14
[LightGBM] [Info] Start training from score 316955.186728
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000745 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 326
[LightGBM] [Info] Number of data points in the train set: 28000, number of used features: 14
[LightGBM] [Info] Start training from score 318130.329594
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000851 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is n

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000775 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 325
[LightGBM] [Info] Number of data points in the train set: 28000, number of used features: 14
[LightGBM] [Info] Start training from score 317517.475768
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000755 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 327
[LightGBM] [Info] Number of data points in the train set: 28000, number of used features: 14
[LightGBM] [Info] Start training from score 317376.867589
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000854 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is n

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000827 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 325
[LightGBM] [Info] Number of data points in the train set: 28000, number of used features: 14
[LightGBM] [Info] Start training from score 317517.475768
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000813 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 327
[LightGBM] [Info] Number of data points in the train set: 28000, number of used features: 14
[LightGBM] [Info] Start training from score 317376.867589
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000929 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is n

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000758 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 323
[LightGBM] [Info] Number of data points in the train set: 28000, number of used features: 14
[LightGBM] [Info] Start training from score 316955.186728
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000769 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 326
[LightGBM] [Info] Number of data points in the train set: 28000, number of used features: 14
[LightGBM] [Info] Start training from score 318130.329594


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000763 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 325
[LightGBM] [Info] Number of data points in the train set: 28000, number of used features: 14
[LightGBM] [Info] Start training from score 317517.475768
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000815 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 327
[LightGBM] [Info] Number of data points in the train set: 28000, number of used features: 14
[LightGBM] [Info] Start training from score 317376.867589
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000755 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is n

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000801 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 323
[LightGBM] [Info] Number of data points in the train set: 28000, number of used features: 14
[LightGBM] [Info] Start training from score 316955.186728


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000813 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 326
[LightGBM] [Info] Number of data points in the train set: 28000, number of used features: 14
[LightGBM] [Info] Start training from score 318130.329594


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000791 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 325
[LightGBM] [Info] Number of data points in the train set: 28000, number of used features: 14
[LightGBM] [Info] Start training from score 317517.475768
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000771 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 327
[LightGBM] [Info] Number of data points in the train set: 28000, number of used features: 14
[LightGBM] [Info] Start training from score 317376.867589


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000766 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 324
[LightGBM] [Info] Number of data points in the train set: 28000, number of used features: 14
[LightGBM] [Info] Start training from score 317876.449946
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000752 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 323
[LightGBM] [Info] Number of data points in the train set: 28000, number of used features: 14
[LightGBM] [Info] Start training from score 316955.186728


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000823 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 326
[LightGBM] [Info] Number of data points in the train set: 28000, number of used features: 14
[LightGBM] [Info] Start training from score 318130.329594
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000985 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 325
[LightGBM] [Info] Number of data points in the train set: 28000, number of used features: 14
[LightGBM] [Info] Start training from score 317517.475768
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000788 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is n

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000835 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 325
[LightGBM] [Info] Number of data points in the train set: 28000, number of used features: 14
[LightGBM] [Info] Start training from score 317517.475768
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000782 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 327
[LightGBM] [Info] Number of data points in the train set: 28000, number of used features: 14
[LightGBM] [Info] Start training from score 317376.867589
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000792 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is n


Best Parameters:  {'lightgbm__learning_rate': 0.2, 'lightgbm__max_depth': 10, 'lightgbm__n_estimators': 200}
Mean Absolute Error (MAE):  31260.518126512703
Mean Squared Error (MSE):  2138824579.3888588
Root Mean Squared Error (RMSE):  46247.4278137591
R-squared (R2):  0.9235241594943729


### XGBoost Regressor

In [14]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('xgb', XGBRegressor())
])

param_grid = {
    'xgb__n_estimators': [100, 200, 300], 
    'xgb__learning_rate': [0.01, 0.1, 0.2], 
    'xgb__max_depth': [3, 4, 5] 
}

run_regression(pipeline, param_grid, X, y)


Best Parameters:  {'xgb__learning_rate': 0.2, 'xgb__max_depth': 5, 'xgb__n_estimators': 300}
Mean Absolute Error (MAE):  31010.584877603516
Mean Squared Error (MSE):  2091790605.4283772
Root Mean Squared Error (RMSE):  45736.09740050387
R-squared (R2):  0.925205906901622


### Out of all the models, XGBoost performed better in terms of overall metrics such as MAE, MSE, RMSE, R2

In [15]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('xgb', XGBRegressor())
])

param_grid = {
    'xgb__n_estimators': [300], 
    'xgb__learning_rate': [0.2], 
    'xgb__max_depth': [5] 
}

run_regression(pipeline, param_grid, X, y)


Best Parameters:  {'xgb__learning_rate': 0.2, 'xgb__max_depth': 5, 'xgb__n_estimators': 300}
Mean Absolute Error (MAE):  31010.584877603516
Mean Squared Error (MSE):  2091790605.4283772
Root Mean Squared Error (RMSE):  45736.09740050387
R-squared (R2):  0.925205906901622


### Saving the model

In [16]:
import joblib

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

pipeline.fit(X_train, y_train)

joblib.dump(pipeline, 'regression_model.joblib')

['regression_model.joblib']

In [17]:
# Load the model from the file
loaded_model = joblib.load('regression_model.joblib')

# Use the loaded model for predictions
loaded_model.predict(X_test)

array([143792.36, 223571.02, 649770.4 , ..., 170395.73, 248380.97,
       138325.33], dtype=float32)