In [None]:
!pip install catboost
!pip install pytorch-tabnet

In [2]:
import sklearn.datasets as skdatasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os, datetime
import time
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.linear_model import LinearRegression as l
from sklearn.ensemble import RandomForestRegressor as RF
from sklearn.neighbors import KNeighborsRegressor as KNN
from sklearn.svm import SVR 
from sklearn.linear_model import SGDRegressor as SGD
from sklearn.pipeline import make_pipeline 
from sklearn.linear_model import BayesianRidge as BR
from catboost import CatBoostRegressor as CR
from xgboost.sklearn import XGBRegressor as XGB
from lightgbm import LGBMRegressor as LGBM
from pytorch_tabnet.tab_model import TabNetRegressor as TR

In [3]:
# import the dataset
data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
boston_features = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
boston_labels = raw_df.values[1::2, 2]

# Apply train_test_split
SEED = 123
x_train,x_test,y_train,y_test = train_test_split(boston_features, boston_labels, test_size=0.2, random_state=SEED)

# standardize the data
standardizer = StandardScaler()
for i in [x_train,x_test]:
    standardizer.fit(i)
    i= standardizer.transform(i)

In [4]:
from pandas.core.arrays.sparse.array import make_sparse_index
# Employ the regressors to construct model and evaluate their performances.
name_alias = ["l","RF","KNN","SVR","BR","XGB","LGBM"]
model_list = [l(), RF(), KNN(), SVR(), BR(), XGB(), LGBM()]
mae_list = [] 
mse_list = []
corr_list = []
time_list = []
for index,name in enumerate(name_alias):
    model = model_list[index]
    now = time.time()
    model.fit(x_train, y_train)
    training_time = time.time()-now
    predictions = model.predict(x_test)
    mae_i = mae(predictions, y_test)
    mse_i = mse(predictions, y_test)
    corr_i = np.corrcoef(predictions, y_test)[0,1]
    mae_list.append(mae_i)
    mse_list.append(mse_i)
    corr_list.append(corr_i)
    time_list.append(training_time)



In [5]:
model_SGD = make_pipeline(StandardScaler(), SGD(max_iter=1000, tol=1e-3, early_stopping=True, n_iter_no_change=200))
now = time.time()
model_SGD.fit(x_train, y_train)
time_SGD = time.time()-now
pred_SGD = model_SGD.predict(x_test)
mae_SGD = mae(pred_SGD, y_test)
mse_SGD = mse(pred_SGD, y_test)
corr_SGD = np.corrcoef(pred_SGD, y_test)[0,1]
time_list.append(time_SGD)
mae_list.append(mae_SGD)
mse_list.append(mse_SGD)
corr_list.append(corr_SGD)

In [None]:
model_CR = CR(learning_rate=0.032, depth=7, iterations=5000, early_stopping_rounds=500, use_best_model=True)
now = time.time()
model_CR.fit(x_train, y_train, eval_set=(x_test, y_test))
time_CR = time.time()-now
pred_CR = model_CR.predict(x_test)
mae_CR = mae(pred_CR, y_test)
mse_CR = mse(pred_CR, y_test)
corr_CR = np.corrcoef(pred_CR, y_test)[0,1]
time_list.append(time_CR)
mae_list.append(mae_CR)
mse_list.append(mse_CR)
corr_list.append(corr_CR)

In [7]:
model_TR = TR(verbose=0,seed=137)
y_train = np.array(y_train).reshape(-1,1)
y_test = np.array(y_test).reshape(-1,1)
now = time.time()
model_TR.fit(X_train=x_train, y_train=y_train, eval_set=[(x_test, y_test)], patience=500, max_epochs=5000, eval_metric=['mae'])
time_TR = time.time()-now
pred_TR = model_TR.predict(x_test).flatten()
mae_TR = mae(pred_TR, y_test.flatten())
mse_TR = mse(pred_TR, y_test.flatten())
corr_TR = np.corrcoef(pred_TR, y_test.flatten())[0,1]
time_list.append(time_TR)
mae_list.append(mae_TR)
mse_list.append(mse_TR)
corr_list.append(corr_TR)


Early stopping occurred at epoch 1259 with best_epoch = 759 and best_val_0_mae = 2.01008
Best weights from best epoch are automatically used!


In [10]:
Regressor_Name = ["sklearn.linear_model.LinearRegression", 
                  "sklearn.ensemble.RandomForestRegressor", 
                  "sklearn.neighbors.KNeighborsRegressor", 
                  "sklearn.svm.SVR", 
                  "sklearn.linear_model.BayesianRidge", 
                  "xgboost.sklearn.XGBRegressor", 
                  "lightgbm.LGBMRegressor", 
                  "sklearn.linear_model.SGDRegressor",
                  "catboost.CatBoostRegressor",  
                  "pytorch_tabnet.tab_model.TabNetRegressor"]
Result_table = pd.DataFrame({"Regressor" : Regressor_Name, "MAE" : mae_list, "MSE" : mse_list, "Correlation" : corr_list, "Time Elapsed":time_list}) 
Result_table.sort_values(["Time Elapsed","Correlation","MAE", "MSE"],
               axis = 0, ascending = [True, False, True, True],
               inplace = True, ignore_index=True,
               na_position = "first")
left_aligned_RT = Result_table.style.set_properties(**{'text-align': 'left'})
display(left_aligned_RT)

Unnamed: 0,Regressor,MAE,MSE,Correlation,Time Elapsed
0,sklearn.neighbors.KNeighborsRegressor,4.152549,37.429969,0.74266,0.000553
1,sklearn.linear_model.BayesianRidge,3.740144,29.803061,0.802493,0.001281
2,sklearn.svm.SVR,5.561361,74.233229,0.440066,0.00778
3,sklearn.linear_model.LinearRegression,3.660153,28.192486,0.814584,0.017675
4,lightgbm.LGBMRegressor,2.671799,21.316456,0.870443,0.04333
5,xgboost.sklearn.XGBRegressor,2.402975,15.112581,0.905172,0.113461
6,sklearn.linear_model.SGDRegressor,3.601442,27.774514,0.816334,0.168876
7,sklearn.ensemble.RandomForestRegressor,2.482069,18.009322,0.888759,0.215385
8,catboost.CatBoostRegressor,1.962225,9.397912,0.941806,3.306138
9,pytorch_tabnet.tab_model.TabNetRegressor,2.010076,7.272916,0.955575,31.901599
