In [1]:
# 0.Import the needed modules/packages 
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import sklearn.datasets as skdatasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
import os, datetime
import time

In [2]:
data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
boston_features = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
boston_labels = raw_df.values[1::2, 2]

In [78]:
SEED = 123
x_train,x_test,y_train,y_test = train_test_split(boston_features, boston_labels, test_size=0.2, random_state=SEED)

In [79]:
from sklearn.preprocessing import StandardScaler
standardizer = StandardScaler()
for i in [x_train,x_test]:
    standardizer.fit(i)
    i= standardizer.transform(i)

In [5]:
# Using linear regressor
from sklearn.linear_model import LinearRegression
model_l = LinearRegression()
now = time.time() #(seconds)
model_l.fit(x_train, y_train)
print(time.time()-now)

0.019435405731201172


In [6]:
# Using SKlearn to do statistical analysics
# mae(y_pred, y_test)
# matthews_corrcoef(y_true, y_pred)
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae

In [7]:
pred_l = model_l.predict(x_test)
mae_l = mae(pred_l, y_test)
mse_l = mse(pred_l, y_test)
corr_l = np.corrcoef(pred_l, y_test)[0,1]
print("mae = {:.3f}".format(mae_l))
print("mse = {:.3f}".format(mse_l))
print("correlation = {:.3f}".format(corr_l))

mae = 3.660
mse = 28.192
correlation = 0.815


In [8]:
table_l = pd.DataFrame({"True Price" : y_test, "Predicted Price": model_l.predict(x_test)})
table_l.head()

Unnamed: 0,True Price,Predicted Price
0,15.0,16.0033
1,26.6,27.794474
2,45.4,39.267695
3,20.8,18.326136
4,34.9,30.454875


In [9]:
# Using random forest regressor
from sklearn.ensemble import RandomForestRegressor
model_RF = RandomForestRegressor()
now = time.time()
model_RF.fit(x_train, y_train)
print(time.time()-now)

0.45168423652648926


In [10]:
pred_RF = model_RF.predict(x_test)
mae_RF = mae(pred_RF, y_test)
mse_RF = mse(pred_RF, y_test)
corr_RF = np.corrcoef(pred_RF, y_test)[0,1]
print("mae = {:.3f}".format(mae_RF))
print("mse = {:.3f}".format(mse_RF))
print("correlation = {:.3f}".format(corr_RF))

mae = 2.499
mse = 19.231
correlation = 0.882


In [11]:
table_RF = pd.DataFrame({"True Price" : y_test, "Predicted Price": model_RF.predict(x_test)})
table_RF.head()

Unnamed: 0,True Price,Predicted Price
0,15.0,31.913
1,26.6,27.88
2,45.4,47.983
3,20.8,21.135
4,34.9,30.936


In [12]:
# Using K-Nearest Neighbor regressor
from sklearn.neighbors import KNeighborsRegressor
model_KNN = KNeighborsRegressor()
now = time.time()
model_KNN.fit(x_train, y_train)
print(time.time()-now)

0.001462697982788086


In [13]:
pred_KNN = model_KNN.predict(x_test)
mae_KNN = mae(pred_KNN, y_test)
mse_KNN = mse(pred_KNN, y_test)
corr_KNN = np.corrcoef(pred_KNN, y_test)[0,1]
print("mae = {:.3f}".format(mae_KNN))
print("mse = {:.3f}".format(mse_KNN))
print("correlation = {:.3f}".format(corr_KNN))

mae = 4.153
mse = 37.430
correlation = 0.743


In [14]:
table_KNN = pd.DataFrame({"True Price" : y_test, "Predicted Price": model_RF.predict(x_test)})
table_KNN.head()

Unnamed: 0,True Price,Predicted Price
0,15.0,31.913
1,26.6,27.88
2,45.4,47.983
3,20.8,21.135
4,34.9,30.936


In [15]:
# Using support vector machine
from sklearn.svm import SVR
model_SVR = SVR()
now = time.time()
model_SVR.fit(x_train, y_train)
print(time.time()-now)

0.03123307228088379


In [16]:
pred_SVR = model_SVR.predict(x_test)
mae_SVR = mae(pred_SVR, y_test)
mse_SVR = mse(pred_SVR, y_test)
corr_SVR = np.corrcoef(pred_SVR, y_test)[0,1]
print("mae = {:.3f}".format(mae_SVR))
print("mse = {:.3f}".format(mse_SVR))
print("correlation = {:.3f}".format(corr_SVR))

mae = 5.561
mse = 74.233
correlation = 0.440


In [17]:
table_SVR = pd.DataFrame({"True Price" : y_test, "Predicted Price": model_RF.predict(x_test)})
table_SVR.head()

Unnamed: 0,True Price,Predicted Price
0,15.0,31.913
1,26.6,27.88
2,45.4,47.983
3,20.8,21.135
4,34.9,30.936


In [18]:
# SGD Regressor
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
now = time.time()
model_SGD = make_pipeline(StandardScaler(), SGDRegressor(max_iter=1000, tol=1e-3, early_stopping=True, n_iter_no_change=100))
model_SGD.fit(x_train, y_train)
print(time.time()-now)

0.2778902053833008


In [19]:
pred_SGD = model_SGD.predict(x_test)
mae_SGD = mae(pred_SGD, y_test)
mse_SGD = mse(pred_SGD, y_test)
corr_SGD = np.corrcoef(pred_SGD, y_test)[0,1]
print("mae = {:.3f}".format(mae_SGD))
print("mse = {:.3f}".format(mse_SGD))
print("correlation = {:.3f}".format(corr_SGD))

mae = 3.682
mse = 28.226
correlation = 0.814


In [20]:
table_SGD = pd.DataFrame({"True Price" : y_test, "Predicted Price": model_SGD.predict(x_test)})
table_SGD

Unnamed: 0,True Price,Predicted Price
0,15.0,16.148443
1,26.6,27.847513
2,45.4,39.375237
3,20.8,18.423487
4,34.9,30.159950
...,...,...
97,31.5,31.373880
98,23.3,26.560149
99,33.3,36.300826
100,17.5,16.540990


In [21]:
# Using Bayesian Regressor
from sklearn.linear_model import BayesianRidge as BR
model_BR = BR()
now = time.time()
model_BR.fit(x_train, y_train)
print(time.time()-now)

0.0033674240112304688


In [22]:
pred_BR = model_BR.predict(x_test)
mae_BR = mae(pred_BR, y_test)
mse_BR = mse(pred_BR, y_test)
corr_BR = np.corrcoef(pred_BR, y_test)[0,1]
print("mae = {:.3f}".format(mae_BR))
print("mse = {:.3f}".format(mse_BR))
print("correlation = {:.3f}".format(corr_BR))

mae = 3.740
mse = 29.803
correlation = 0.802


In [23]:
table_BR = pd.DataFrame({"True Price" : y_test, "Predicted Price": model_BR.predict(x_test)})
table_BR

Unnamed: 0,True Price,Predicted Price
0,15.0,15.200026
1,26.6,27.553841
2,45.4,38.344317
3,20.8,17.778715
4,34.9,30.596764
...,...,...
97,31.5,31.567388
98,23.3,29.527391
99,33.3,35.758091
100,17.5,16.856841


In [24]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.0.6-cp37-none-manylinux1_x86_64.whl (76.6 MB)
[K     |████████████████████████████████| 76.6 MB 63.0 MB/s 
Installing collected packages: catboost
Successfully installed catboost-1.0.6


In [102]:
# Using CatBoost Regressor
from catboost import CatBoostRegressor as CR
model_CR = CR(learning_rate=0.032, depth=7, iterations=2000, early_stopping_rounds=300, use_best_model=True)
now = time.time()
model_CR.fit(x_train, y_train, eval_set=(x_test, y_test))
print(time.time()-now)

0:	learn: 9.0260370	test: 8.9705282	best: 8.9705282 (0)	total: 4.08ms	remaining: 8.15s
1:	learn: 8.8596001	test: 8.8073052	best: 8.8073052 (1)	total: 7.85ms	remaining: 7.85s
2:	learn: 8.6865624	test: 8.6415225	best: 8.6415225 (2)	total: 11.5ms	remaining: 7.64s
3:	learn: 8.5303259	test: 8.5196486	best: 8.5196486 (3)	total: 15.3ms	remaining: 7.63s
4:	learn: 8.3909854	test: 8.3885833	best: 8.3885833 (4)	total: 18.9ms	remaining: 7.55s
5:	learn: 8.2333715	test: 8.2170467	best: 8.2170467 (5)	total: 22.3ms	remaining: 7.42s
6:	learn: 8.0951048	test: 8.0904828	best: 8.0904828 (6)	total: 25.8ms	remaining: 7.35s
7:	learn: 7.9445392	test: 7.9557880	best: 7.9557880 (7)	total: 29.6ms	remaining: 7.38s
8:	learn: 7.7977058	test: 7.8180918	best: 7.8180918 (8)	total: 33.3ms	remaining: 7.37s
9:	learn: 7.6695811	test: 7.7165036	best: 7.7165036 (9)	total: 36.6ms	remaining: 7.28s
10:	learn: 7.5271517	test: 7.5956187	best: 7.5956187 (10)	total: 40ms	remaining: 7.23s
11:	learn: 7.3936463	test: 7.4819415	best: 

In [103]:
pred_CR = model_CR.predict(x_test)
mae_CR = mae(pred_CR, y_test)
mse_CR = mse(pred_CR, y_test)
corr_CR = np.corrcoef(pred_CR, y_test)[0,1]
print("mae = {:.3f}".format(mae_CR))
print("mse = {:.3f}".format(mse_CR))
print("correlation = {:.3f}".format(corr_CR))

mae = 1.962
mse = 9.398
correlation = 0.942


In [27]:
# Using Extreme Gradient Boosting Regressor
from xgboost.sklearn import XGBRegressor as XGB
model_XGB= XGB()
now = time.time()
model_XGB.fit(x_train, y_train)
print(time.time()-now)

0.16559743881225586


In [28]:
pred_XGB = model_XGB.predict(x_test)
mae_XGB = mae(pred_XGB, y_test)
mse_XGB = mse(pred_XGB, y_test)
corr_XGB = np.corrcoef(pred_XGB, y_test)[0,1]
print("mae = {:.3f}".format(mae_XGB))
print("mse = {:.3f}".format(mse_XGB))
print("correlation = {:.3f}".format(corr_XGB))

mae = 2.403
mse = 15.113
correlation = 0.905


In [29]:
# Using Light Gradient Boosting Machines Regressor
from lightgbm import LGBMRegressor as LGBM
model_LGBM = LGBM()
now = time.time()
model_LGBM.fit(x_train, y_train)
print(time.time()-now)

0.0621335506439209


In [30]:
pred_LGBM = model_LGBM.predict(x_test)
mae_LGBM = mae(pred_LGBM, y_test)
mse_LGBM = mse(pred_LGBM, y_test)
corr_LGBM = np.corrcoef(pred_LGBM, y_test)[0,1]
print("mae = {:.3f}".format(mae_LGBM))
print("mse = {:.3f}".format(mse_LGBM))
print("correlation = {:.3f}".format(corr_LGBM))

mae = 2.672
mse = 21.316
correlation = 0.870


In [31]:
!pip install pytorch-tabnet

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytorch-tabnet
  Downloading pytorch_tabnet-3.1.1-py3-none-any.whl (39 kB)
Installing collected packages: pytorch-tabnet
Successfully installed pytorch-tabnet-3.1.1


In [32]:
from pytorch_tabnet.tab_model import TabNetRegressor as TR
model_TR = TR(verbose=0,seed=137)
y_train = np.array(y_train).reshape(-1,1)
y_test = np.array(y_test).reshape(-1,1)
now = time.time()
model_TR.fit(X_train=x_train, y_train=y_train, eval_set=[(x_test, y_test)], patience=300, max_epochs=2000, eval_metric=['mae'])
print(time.time()-now)


Early stopping occurred at epoch 974 with best_epoch = 674 and best_val_0_mae = 2.16171
Best weights from best epoch are automatically used!
42.42603778839111


In [33]:
pred_TR = model_TR.predict(x_test).flatten()
mae_TR = mae(pred_TR, y_test.flatten())
mse_TR = mse(pred_TR, y_test.flatten())
corr_TR = np.corrcoef(pred_TR, y_test.flatten())[0,1]
print("mae = {:.3f}".format(mae_TR))
print("mse = {:.3f}".format(mse_TR))
print("correlation = {:.3f}".format(corr_TR))

mae = 2.162
mse = 11.864
correlation = 0.927


In [106]:
Regressor_Name = ["sklearn.linear_model.LinearRegression", 
                  "sklearn.ensemble.RandomForestRegressor", 
                  "sklearn.neighbors.KNeighborsRegressor", 
                  "sklearn.svm.SVR", 
                  "sklearn.linear_model.SGDRegressor", 
                  "sklearn.linear_model.BayesianRidge", 
                  "catboost.CatBoostRegressor", 
                  "xgboost.sklearn.XGBRegressor", 
                  "lightgbm.LGBMRegressor", 
                  "pytorch_tabnet.tab_model.TabNetRegressor"]

In [105]:
list_alias = ["l","RF","KNN","SVR","SGD","BR","CR","XGB","LGBM","TR"]
mae_list = [] 
mse_list = []
corr_list = []
for i in list_alias:
    mae_list.append(vars()["mae_"+str(i)])
    mse_list.append(vars()["mse_"+str(i)])
    corr_list.append(vars()["corr_"+str(i)])

In [108]:
Result_table = pd.DataFrame({"Regressor" : Regressor_Name, "MAE" : mae_list, "MSE" : mse_list, "Correlation" : corr_list}) 
Result_table.sort_values(["MAE", "Correlation"],
               axis = 0, ascending = [True,False],
               inplace = True, ignore_index=True,
               na_position = "first")
left_aligned_RT = Result_table.style.set_properties(**{'text-align': 'left'})
display(left_aligned_RT)

Unnamed: 0,Regressor,MAE,MSE,Correlation
0,catboost.CatBoostRegressor,1.962225,9.397912,0.941806
1,pytorch_tabnet.tab_model.TabNetRegressor,2.161713,11.863635,0.92702
2,xgboost.sklearn.XGBRegressor,2.402975,15.112581,0.905172
3,sklearn.ensemble.RandomForestRegressor,2.499471,19.231287,0.882384
4,lightgbm.LGBMRegressor,2.671799,21.316456,0.870443
5,sklearn.linear_model.LinearRegression,3.660153,28.192486,0.814584
6,sklearn.linear_model.SGDRegressor,3.682288,28.225564,0.814199
7,sklearn.linear_model.BayesianRidge,3.740144,29.803061,0.802493
8,sklearn.neighbors.KNeighborsRegressor,4.152549,37.429969,0.74266
9,sklearn.svm.SVR,5.561361,74.233229,0.440066


In [109]:
from tabulate import tabulate
print(tabulate(Result_table, showindex=False, headers=Result_table.columns))

Regressor                                     MAE       MSE    Correlation
----------------------------------------  -------  --------  -------------
catboost.CatBoostRegressor                1.96222   9.39791       0.941806
pytorch_tabnet.tab_model.TabNetRegressor  2.16171  11.8636        0.92702
xgboost.sklearn.XGBRegressor              2.40298  15.1126        0.905172
sklearn.ensemble.RandomForestRegressor    2.49947  19.2313        0.882384
lightgbm.LGBMRegressor                    2.6718   21.3165        0.870443
sklearn.linear_model.LinearRegression     3.66015  28.1925        0.814584
sklearn.linear_model.SGDRegressor         3.68229  28.2256        0.814199
sklearn.linear_model.BayesianRidge        3.74014  29.8031        0.802493
sklearn.neighbors.KNeighborsRegressor     4.15255  37.43          0.74266
sklearn.svm.SVR                           5.56136  74.2332        0.440066
