# TASK 3 A regression example: predicting apartment prices

In [None]:
#Imports
import pandas as pd
from pandas.core.common import random_state
from pandas import Series, DataFrame
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn import metrics
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score


# Step 1. Reading the data & Splitting

In [None]:
# Read the CSV file using Pandas.
# The path is relative, so "sberbank.csv" is resolved against the current
# working directory of the notebook kernel.
alldata = pd.read_csv("sberbank.csv")

# Convert the timestamp string to an integer representing the year.
def get_year(timestamp):
    """Return the first four characters of *timestamp* parsed as an integer.

    Raises ValueError (from ``int``) when those characters are not a valid
    integer literal.
    """
    year_text = timestamp[0:4]
    return int(year_text)
# Derive the integer 'year' feature from the timestamp column.
alldata['year'] = alldata['timestamp'].apply(get_year)

# Keep the output column ('price_doc') plus the seven input columns, and
# discard every row that has a missing value in any of them.
selected_columns = [
    'price_doc',
    'year', 'full_sq', 'life_sq', 'floor', 'num_room', 'kitch_sq', 'full_all',
]
alldata = alldata[selected_columns].dropna()

# Shuffle all rows; the fixed seed keeps the order reproducible.
alldata_shuffled = alldata.sample(frac=1.0, random_state=0)

# Output: the natural log of the sales price.  Inputs: every other column.
Y = alldata_shuffled['price_doc'].apply(np.log)
X = alldata_shuffled.drop(columns='price_doc')

# Hold out 20% of the rows as the test set; the remaining 80% is for training.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X, Y, test_size=0.2, random_state=0
)

The data was split into 80% training data and 20% test data (test_size=0.2).

In [None]:
# Notebook display: show the shuffled frame (output column plus input columns).
alldata_shuffled

Unnamed: 0,price_doc,year,full_sq,life_sq,floor,num_room,kitch_sq,full_all
25252,6150880,2014,61,32.0,8.0,2.0,13.0,247469
9943,6900000,2013,43,20.0,10.0,1.0,8.0,68630
18040,9600000,2014,56,30.0,11.0,2.0,8.0,78507
8625,10300000,2013,54,32.0,10.0,2.0,9.0,26943
13495,5000000,2013,38,20.0,2.0,1.0,8.0,132349
...,...,...,...,...,...,...,...,...
20816,3888640,2014,43,43.0,2.0,1.0,0.0,13890
25722,3000000,2014,30,18.0,1.0,1.0,5.0,103746
21502,30000000,2014,116,78.0,5.0,4.0,14.0,75377
22695,8350000,2014,37,19.0,16.0,1.0,9.0,102590


In [None]:
# Notebook display: first rows of the input columns (price_doc already dropped).
X.head()

Unnamed: 0,year,full_sq,life_sq,floor,num_room,kitch_sq,full_all
25252,2014,61,32.0,8.0,2.0,13.0,247469
9943,2013,43,20.0,10.0,1.0,8.0,68630
18040,2014,56,30.0,11.0,2.0,8.0,78507
8625,2013,54,32.0,10.0,2.0,9.0,26943
13495,2013,38,20.0,2.0,1.0,8.0,132349


In [None]:
# Notebook display: (number of rows, number of input columns).
X.shape

In [None]:
# Print per-column dtypes and non-null counts to check for missing values.
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16758 entries, 25252 to 12925
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   year      16758 non-null  int64  
 1   full_sq   16758 non-null  int64  
 2   life_sq   16758 non-null  float64
 3   floor     16758 non-null  float64
 4   num_room  16758 non-null  float64
 5   kitch_sq  16758 non-null  float64
 6   full_all  16758 non-null  int64  
dtypes: float64(4), int64(3)
memory usage: 1.0 MB


After `dropna()`, none of the columns contains a null value, so no further handling of missing values is needed. Since the target we need to predict is a continuous number, this is a regression problem.

# Step 2. Cross Validation On Baseline Regressor

Explanation of The Metrics

Negative mean squared error (NMSE): the mean squared error between the predicted and actual target values, multiplied by -1. The negation is needed because scikit-learn scorers follow a "greater is better" convention, whereas a raw mean squared error is better when it is smaller. After negation, larger values are better, so the metric can be compared in the same direction as other scores (for example, -0.38 is better than -0.39). The closer the value is to 0, the better the model performs.

Negative mean absolute error (NMAE): the mean of the absolute differences between each prediction and the corresponding true value, multiplied by -1. It is interpreted in the same way as NMSE: larger values are better, and the closer the value is to 0, the better the model performs.

R^2 (the coefficient of determination) measures how well the model's predictions approximate the actual data. We normally expect its value to lie between 0 and 1; the closer it is to 1, the better the model fits. R^2 can also be negative: this happens when the model fits the evaluated data worse than a constant prediction equal to the mean of the targets, for example when a linear model is fitted without an intercept or when a model generalizes very badly.

The cross_validate function returns one value per fold, i.e. for 5 different train/validation splits of the training data. To make the results easier to compare and interpret, we report the mean of each metric over the folds.

fit_time: the time taken to train the model on the training part of each fold.

score_time: the time taken to predict and score on the held-out part of each fold.

test_score: the score on the held-out part of each fold, computed with the requested metric (neg_mean_squared_error, neg_mean_absolute_error or r2).

In [None]:
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import cross_validate

# Baseline: a DummyRegressor with default settings, cross-validated on the
# training split once per metric.
m1 = DummyRegressor()
(
    dummy_regressor_cross_validate_scores_nmse,
    dummy_regressor_cross_validate_scores_nmae,
    dummy_regressor_cross_validate_scores_r2,
) = (
    cross_validate(m1, Xtrain, Ytrain, scoring=metric_name)
    for metric_name in ('neg_mean_squared_error', 'neg_mean_absolute_error', 'r2')
)

# Raw cross_validate results first (timings and per-fold scores) ...
for _label, _result in (
    ("nmse: ", dummy_regressor_cross_validate_scores_nmse),
    ("nmae: ", dummy_regressor_cross_validate_scores_nmae),
    ("r^2: ", dummy_regressor_cross_validate_scores_r2),
):
    print(_label, _result)

# ... then the mean test score over the folds for each metric.
for _label, _result in (
    ('average negative mean squared error: ', dummy_regressor_cross_validate_scores_nmse),
    ('average negative mean absolute error: ', dummy_regressor_cross_validate_scores_nmae),
    ('average r2: ', dummy_regressor_cross_validate_scores_r2),
):
    print(_label, _result['test_score'].mean())



nmse:  {'fit_time': array([0.002671  , 0.00126648, 0.00116777, 0.00114417, 0.00133491]), 'score_time': array([0.00042915, 0.00032187, 0.00029182, 0.00030732, 0.00031161]), 'test_score': array([-0.39897319, -0.37113485, -0.38083108, -0.39057156, -0.40475168])}
nmae:  {'fit_time': array([0.00109625, 0.00116801, 0.00111604, 0.00122213, 0.00120592]), 'score_time': array([0.00032806, 0.0003264 , 0.00030279, 0.00045562, 0.00032282]), 'test_score': array([-0.44105965, -0.42808124, -0.432226  , -0.43307538, -0.4459258 ])}
r^2:  {'fit_time': array([0.00112462, 0.00174022, 0.00119996, 0.00132346, 0.00111508]), 'score_time': array([0.00046396, 0.0007906 , 0.00042653, 0.00044155, 0.00036764]), 'test_score': array([-9.42135190e-04, -2.41896469e-04, -7.41744989e-05, -1.00025775e-03,
       -1.46718324e-03])}
average negative mean squared error:  -0.38925247260237555
average negative mean absolute error:  -0.43607361426140007
average r2:  -0.0007451294280989895


We ran cross-validation on the DummyRegressor model. For each model we measure three metrics: negative mean squared error, negative mean absolute error and R^2. We report NMAE and R^2 in addition to NMSE because NMSE alone does not give a comprehensive picture of model performance. The DummyRegressor fits quite badly (its mean R^2 is approximately 0), so it serves as the baseline for the other models. We also observed that fit_time and score_time vary from fold to fold, depending on the data in each split.

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary linear regression with default settings, cross-validated on the
# training split once per metric.
LinearRegressionModel = LinearRegression()
(
    linear_regression_cross_validate_scores_nmse,
    linear_regression_cross_validate_scores_nmae,
    linear_regression_cross_validate_scores_r2,
) = (
    cross_validate(LinearRegressionModel, Xtrain, Ytrain, scoring=metric_name)
    for metric_name in ('neg_mean_squared_error', 'neg_mean_absolute_error', 'r2')
)

# Raw cross_validate results first (timings and per-fold scores) ...
for _label, _result in (
    ('nmse: ', linear_regression_cross_validate_scores_nmse),
    ('nmae: ', linear_regression_cross_validate_scores_nmae),
    ('r^2: ', linear_regression_cross_validate_scores_r2),
):
    print(_label, _result)

# ... then the mean test score over the folds for each metric.
for _label, _result in (
    ('average negative mean squared error: ', linear_regression_cross_validate_scores_nmse),
    ('average negative mean absolute error: ', linear_regression_cross_validate_scores_nmae),
    ('average r2: ', linear_regression_cross_validate_scores_r2),
):
    print(_label, _result['test_score'].mean())


nmse:  {'fit_time': array([0.00909615, 0.0949924 , 0.01768684, 0.03425527, 0.01030922]), 'score_time': array([0.02350783, 0.03247857, 0.05797219, 0.05245757, 0.00476623]), 'test_score': array([-0.30222063, -0.32537384, -0.29377903, -0.29296258, -0.29265721])}
nmae:  {'fit_time': array([0.01307559, 0.02608967, 0.08335209, 0.08394885, 0.0713625 ]), 'score_time': array([0.06517386, 0.00805974, 0.02017212, 0.00722504, 0.00391388]), 'test_score': array([-0.38290928, -0.37584158, -0.37777796, -0.37699774, -0.38031612])}
r^2:  {'fit_time': array([0.02138519, 0.067698  , 0.01102662, 0.06986403, 0.08360004]), 'score_time': array([0.00500441, 0.00669146, 0.00809169, 0.01722574, 0.00519443]), 'test_score': array([0.24179026, 0.12308815, 0.22852718, 0.24916292, 0.27588541])}
average negative mean squared error:  -0.30139865887672357
average negative mean absolute error:  -0.3787685345905958
average r2:  0.22369078326219166


In [None]:
from sklearn.linear_model import Ridge

# Ridge regression with default settings, cross-validated on the training
# split once per metric.
RidgeModel = Ridge()
(
    ridge_cross_validate_scores_nmse,
    ridge_cross_validate_scores_nmae,
    ridge_cross_validate_scores_r2,
) = (
    cross_validate(RidgeModel, Xtrain, Ytrain, scoring=metric_name)
    for metric_name in ('neg_mean_squared_error', 'neg_mean_absolute_error', 'r2')
)

# Raw cross_validate results first (timings and per-fold scores) ...
for _label, _result in (
    ('nmse: ', ridge_cross_validate_scores_nmse),
    ('nmae: ', ridge_cross_validate_scores_nmae),
    ('r^2: ', ridge_cross_validate_scores_r2),
):
    print(_label, _result)

# ... then the mean test score over the folds for each metric.
for _label, _result in (
    ('average negative mean squared error: ', ridge_cross_validate_scores_nmse),
    ('average negative mean absolute error: ', ridge_cross_validate_scores_nmae),
    ('average r2: ', ridge_cross_validate_scores_r2),
):
    print(_label, _result['test_score'].mean())




nmse:  {'fit_time': array([0.01937318, 0.00935388, 0.01019073, 0.05680585, 0.05849528]), 'score_time': array([0.00391269, 0.005867  , 0.00310135, 0.01749158, 0.00745034]), 'test_score': array([-0.30222063, -0.32537046, -0.29377831, -0.29296256, -0.29265724])}
nmae:  {'fit_time': array([0.01092553, 0.00429606, 0.01199508, 0.01034379, 0.06361771]), 'score_time': array([0.00406933, 0.06920671, 0.0033021 , 0.0041976 , 0.00889254]), 'test_score': array([-0.38291068, -0.37584267, -0.37777866, -0.37699853, -0.38031718])}
r^2:  {'fit_time': array([0.00700331, 0.00677609, 0.0144639 , 0.06776524, 0.01444817]), 'score_time': array([0.0054493 , 0.06877184, 0.00849581, 0.01568365, 0.00838804]), 'test_score': array([0.24179025, 0.12309726, 0.22852907, 0.24916295, 0.27588534])}
average negative mean squared error:  -0.3013978423217972
average negative mean absolute error:  -0.3787695458292662
average r2:  0.22369297499842652


In [None]:
from sklearn import linear_model

# Lasso regression with alpha=0.1, cross-validated on the training split once
# per metric.  Output follows the same layout as the other model cells: the raw
# cross_validate results first, then each average exactly once.
LassoModel = linear_model.Lasso(alpha=0.1)
lasso_cross_validate_scores_nmse = cross_validate(LassoModel, Xtrain, Ytrain, scoring='neg_mean_squared_error')
print('nmse: ',lasso_cross_validate_scores_nmse)
lasso_cross_validate_scores_nmae = cross_validate(LassoModel, Xtrain, Ytrain, scoring='neg_mean_absolute_error')
print('nmae: ',lasso_cross_validate_scores_nmae)
lasso_cross_validate_scores_r2 = cross_validate(LassoModel, Xtrain, Ytrain, scoring='r2')
print('r^2: ',lasso_cross_validate_scores_r2)

# Mean test score over the folds for each metric.
print('average negative mean squared error: ',lasso_cross_validate_scores_nmse['test_score'].mean())
print('average negative mean absolute error: ',lasso_cross_validate_scores_nmae['test_score'].mean())
print('average r2: ',lasso_cross_validate_scores_r2['test_score'].mean())



average negative mean squared error:  -0.2981786493131544
nmse:  {'fit_time': array([0.00683522, 0.1337738 , 0.09029913, 0.0395    , 0.09774637]), 'score_time': array([0.02238798, 0.0929141 , 0.02257109, 0.06844139, 0.02230644]), 'test_score': array([-0.30610375, -0.29565704, -0.29538912, -0.29693918, -0.29680415])}
nmae:  {'fit_time': array([0.04119682, 0.09453082, 0.12699962, 0.10441208, 0.0630157 ]), 'score_time': array([0.07533383, 0.0081284 , 0.05656981, 0.01921296, 0.00952959]), 'test_score': array([-0.3891366 , -0.37791776, -0.38252097, -0.3816379 , -0.38545663])}
r^2:  {'fit_time': array([0.01539326, 0.08390021, 0.07616854, 0.01952887, 0.08644128]), 'score_time': array([0.00623775, 0.00745916, 0.00688672, 0.07271504, 0.00743866]), 'test_score': array([0.23204831, 0.20317762, 0.22429904, 0.23897122, 0.26562473])}
average negative mean squared error:  -0.2981786493131544
average negative mean absolute error:  -0.38333397164540234
average r2:  0.2328241843769893


In [None]:
from sklearn.tree import DecisionTreeRegressor

# Decision tree with default hyperparameters and a fixed seed, cross-validated
# on the training split once per metric.
DecisionTreeRegressorModel = DecisionTreeRegressor(random_state=0)
(
    decision_tree_regressor_cross_validate_scores_nmse,
    decision_tree_regressor_cross_validate_scores_nmae,
    decision_tree_regressor_cross_validate_scores_r2,
) = (
    cross_validate(DecisionTreeRegressorModel, Xtrain, Ytrain, scoring=metric_name)
    for metric_name in ('neg_mean_squared_error', 'neg_mean_absolute_error', 'r2')
)

# Raw cross_validate results first (timings and per-fold scores) ...
for _label, _result in (
    ('nmse: ', decision_tree_regressor_cross_validate_scores_nmse),
    ('nmae: ', decision_tree_regressor_cross_validate_scores_nmae),
    ('r^2: ', decision_tree_regressor_cross_validate_scores_r2),
):
    print(_label, _result)

# ... then the mean test score over the folds for each metric.
for _label, _result in (
    ('average negative mean squared error: ', decision_tree_regressor_cross_validate_scores_nmse),
    ('average negative mean absolute error: ', decision_tree_regressor_cross_validate_scores_nmae),
    ('average r2: ', decision_tree_regressor_cross_validate_scores_r2),
):
    print(_label, _result['test_score'].mean())



nmse:  {'fit_time': array([0.05243897, 0.07286644, 0.06622243, 0.06106138, 0.07107353]), 'score_time': array([0.00309587, 0.00323677, 0.00513744, 0.00500345, 0.0030601 ]), 'test_score': array([-0.54959121, -0.54311873, -0.50499217, -0.51492692, -0.54968825])}
nmae:  {'fit_time': array([0.07165408, 0.05587506, 0.05782461, 0.07629466, 0.06883788]), 'score_time': array([0.00335813, 0.0048449 , 0.00312757, 0.00425911, 0.00479865]), 'test_score': array([-0.44460109, -0.43485817, -0.42063627, -0.42352509, -0.44564287])}
r^2:  {'fit_time': array([0.0684042 , 0.06096292, 0.07080865, 0.05649304, 0.06775641]), 'score_time': array([0.00567794, 0.00309372, 0.00342631, 0.00510025, 0.00307631]), 'test_score': array([-0.37881193, -0.46375394, -0.32612505, -0.319712  , -0.36008022])}
average negative mean squared error:  -0.5324634565214887
average negative mean absolute error:  -0.43385269775307433
average r2:  -0.36969662778736906


The Decision Tree Regressor also fits quite badly (its average R^2 is negative, i.e. worse than the baseline); its hyperparameters probably need to be tuned.

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest of shallow trees (max_depth=2) with a fixed seed,
# cross-validated on the training split once per metric.
RandomForestRegressorModel = RandomForestRegressor(max_depth=2, random_state=0)
(
    random_forest_regressor_cross_validate_scores_nmse,
    random_forest_regressor_cross_validate_scores_nmae,
    random_forest_regressor_cross_validate_scores_r2,
) = (
    cross_validate(RandomForestRegressorModel, Xtrain, Ytrain, scoring=metric_name)
    for metric_name in ('neg_mean_squared_error', 'neg_mean_absolute_error', 'r2')
)

# Raw cross_validate results first (timings and per-fold scores) ...
for _label, _result in (
    ('nmse: ', random_forest_regressor_cross_validate_scores_nmse),
    ('nmae: ', random_forest_regressor_cross_validate_scores_nmae),
    ('r^2: ', random_forest_regressor_cross_validate_scores_r2),
):
    print(_label, _result)

# ... then the mean test score over the folds for each metric.
for _label, _result in (
    ('average negative mean squared error: ', random_forest_regressor_cross_validate_scores_nmse),
    ('average negative mean absolute error: ', random_forest_regressor_cross_validate_scores_nmae),
    ('average r2: ', random_forest_regressor_cross_validate_scores_r2),
):
    print(_label, _result['test_score'].mean())




nmse:  {'fit_time': array([0.55967212, 0.55358672, 0.5597744 , 0.55391002, 0.56303263]), 'score_time': array([0.02109766, 0.02379036, 0.02447414, 0.02397633, 0.01689649]), 'test_score': array([-0.30660535, -0.28378306, -0.29688942, -0.30035284, -0.29904841])}
nmae:  {'fit_time': array([0.5163734 , 0.56101441, 0.52687025, 0.45470786, 0.49899578]), 'score_time': array([0.02627301, 0.02528358, 0.01607227, 0.01680803, 0.01722169]), 'test_score': array([-0.38750283, -0.37350955, -0.38173922, -0.38334677, -0.38409094])}
r^2:  {'fit_time': array([0.47419667, 0.5809679 , 0.51166534, 0.47979355, 0.54787874]), 'score_time': array([0.02178574, 0.01892114, 0.01656461, 0.01756096, 0.01763177]), 'test_score': array([0.2307899 , 0.23517905, 0.2203592 , 0.23022233, 0.26007183])}
average negative mean squared error:  -0.2973358159115129
average negative mean absolute error:  -0.38203786260805056
average r2:  0.23532446280260508


In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting with default hyperparameters and a fixed seed,
# cross-validated on the training split once per metric.
GradientBoostingRegressorModel = GradientBoostingRegressor(random_state=0)
(
    gradiont_boosting_regressor_cross_validate_scores_nmse,
    gradiont_boosting_regressor_cross_validate_scores_nmae,
    gradiont_boosting_regressor_cross_validate_scores_r2,
) = (
    cross_validate(GradientBoostingRegressorModel, Xtrain, Ytrain, scoring=metric_name)
    for metric_name in ('neg_mean_squared_error', 'neg_mean_absolute_error', 'r2')
)

# Raw cross_validate results first (timings and per-fold scores) ...
for _label, _result in (
    ('nmse: ', gradiont_boosting_regressor_cross_validate_scores_nmse),
    ('nmae: ', gradiont_boosting_regressor_cross_validate_scores_nmae),
    ('r^2: ', gradiont_boosting_regressor_cross_validate_scores_r2),
):
    print(_label, _result)

# ... then the mean test score over the folds for each metric.
for _label, _result in (
    ('average negative mean squared error: ', gradiont_boosting_regressor_cross_validate_scores_nmse),
    ('average negative mean absolute error: ', gradiont_boosting_regressor_cross_validate_scores_nmae),
    ('average r^2 : ', gradiont_boosting_regressor_cross_validate_scores_r2),
):
    print(_label, _result['test_score'].mean())



nmse:  {'fit_time': array([0.85510373, 0.88673234, 0.78066015, 0.87919354, 0.87461519]), 'score_time': array([0.0583899 , 0.00571871, 0.00543022, 0.00531888, 0.00580645]), 'test_score': array([-0.27648513, -0.24920729, -0.2629464 , -0.27079364, -0.26331844])}
nmae:  {'fit_time': array([0.86489487, 0.93257523, 0.90562248, 0.89623022, 0.89423895]), 'score_time': array([0.00864792, 0.00722742, 0.00531673, 0.00539422, 0.00552392]), 'test_score': array([-0.33756755, -0.32426666, -0.33151341, -0.3387949 , -0.33413871])}
r^2:  {'fit_time': array([0.8967514 , 0.94391584, 0.8863306 , 0.92805886, 0.9867754 ]), 'score_time': array([0.00555754, 0.0093627 , 0.00782394, 0.0075891 , 0.00984263]), 'test_score': array([0.30635536, 0.32836389, 0.30949463, 0.30597993, 0.34847761])}
average negative mean squared error:  -0.26455018016714577
average negative mean absolute error:  -0.33325624537133863
average r^2 :  0.31973428388248176


Here the average negative mean squared error is closer to 0 than for any other model, so by this metric the Gradient Boosting Regressor is the best model. The same holds for the negative mean absolute error, and its R^2 is the closest to 1 among all models. All three metrics therefore indicate that the Gradient Boosting Regressor is the best model, and we will go forward with it.

In [None]:
from sklearn.neural_network import MLPRegressor

# NOTE(review): the inputs are passed to the network unscaled.  MLPs are
# sensitive to feature scaling, so the poor scores of this cell may say more
# about the missing preprocessing than about the model -- consider
# standardizing the features before drawing conclusions.
MLPRegressorModel = MLPRegressor(random_state=1, max_iter=500)

# Only NMSE is evaluated for the MLP: every extra metric would repeat the whole
# cross-validation, and the recorded fit times are tens of seconds per fold.
mlp_regressor_cross_validate_scores_nmse = cross_validate(MLPRegressorModel, Xtrain, Ytrain, scoring='neg_mean_squared_error')
print('nmse: ',mlp_regressor_cross_validate_scores_nmse)

# Mean test score over the folds; this is the figure the comparison with the
# other models is based on.
print('average negative mean squared error: ',mlp_regressor_cross_validate_scores_nmse['test_score'].mean())


average negative mean squared error:  -20.58979378733003
nmse:  {'fit_time': array([44.66321397, 52.71104479, 43.15420675, 26.85527182, 23.21644592]), 'score_time': array([0.00802565, 0.07691765, 0.04641843, 0.04793024, 0.00483441]), 'test_score': array([-44.93719246, -18.88982994,  -7.5506087 , -23.50678314,
        -8.0645547 ])}
nmae:  {'fit_time': array([28.50909925, 67.41957951, 53.77104712, 40.88643241, 24.52359605]), 'score_time': array([0.05278373, 0.00604677, 0.02005553, 0.00792241, 0.0124042 ]), 'test_score': array([-3.81741804, -2.45897142, -1.65565593, -3.07765448, -1.77264382])}
r^2:  {'fit_time': array([34.38376999, 17.28021145, 47.50930381, 29.39098501, 37.07565761]), 'score_time': array([0.07837439, 0.01738596, 0.00675583, 0.00897312, 0.01941657]), 'test_score': array([-111.73822391, -109.98949258,  -18.8281318 ,  -59.24579924,
        -18.95393072])}


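The MLP regressor also fits quite badly: its negative mean squared errors are far below those of every other model (about -20.6 on average). We did not even calculate the other metrics. A likely contributing factor is that the input features were not standardized before training, which neural networks are sensitive to.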

# Step 3. Hyperparameter Tuning

In [None]:
# Tune ``max_depth`` of the gradient boosting regressor by cross-validation on
# the training split, tracking the best depth separately for each metric.
#
# All three scorers are "greater is better" (the two error metrics are
# negated), so a candidate wins when its mean score is strictly higher than the
# best seen so far.  The trackers start at -inf instead of at a fixed
# threshold, so a best depth is always selected, even if every candidate has an
# absolute error >= 1 or an R^2 <= 0.


def _single_metric_view(cv_result, metric):
    """Return *cv_result* in the shape of a single-metric cross_validate result.

    The multi-metric result stores scores under ``'test_<metric>'``; the view
    exposes them under ``'test_score'`` next to the shared timing arrays.
    """
    return {
        'fit_time': cv_result['fit_time'],
        'score_time': cv_result['score_time'],
        'test_score': cv_result['test_' + metric],
    }


maximum_nmse_check = float('-inf')
maximum_nmae_check = float('-inf')
maximum_r2_check = float('-inf')

optimum_depth_check_for_nmse = 0
optimum_depth_check_for_nmae = 0
optimum_depth_check_for_r2 = 0


for max_depth_index in range(1, 20):

    # Fixed seed: makes the search reproducible and matches the seed used for
    # the other gradient boosting models in this notebook.
    gbrModel = GradientBoostingRegressor(max_depth=max_depth_index, random_state=0)

    # One cross-validation pass scores all three metrics on the same fitted
    # models, instead of refitting every fold once per metric.
    gbr_cross_validate_all = cross_validate(
        gbrModel, Xtrain, Ytrain,
        scoring=('neg_mean_squared_error', 'neg_mean_absolute_error', 'r2'),
    )
    gbr_cross_validate_nmse = _single_metric_view(gbr_cross_validate_all, 'neg_mean_squared_error')
    gbr_cross_validate_nmae = _single_metric_view(gbr_cross_validate_all, 'neg_mean_absolute_error')
    gbr_cross_validate_r2 = _single_metric_view(gbr_cross_validate_all, 'r2')

    mean_nmse = gbr_cross_validate_nmse['test_score'].mean()
    mean_nmae = gbr_cross_validate_nmae['test_score'].mean()
    mean_r2 = gbr_cross_validate_r2['test_score'].mean()

    print('average negative mean squared error: ', mean_nmse)
    print('average negative mean absolute error: ', mean_nmae)
    print('average r^2 : ', mean_r2)
    print("max_depth_index: ", max_depth_index)

    if mean_nmse > maximum_nmse_check:
        maximum_nmse_check = mean_nmse
        optimum_depth_check_for_nmse = max_depth_index

    if mean_nmae > maximum_nmae_check:
        maximum_nmae_check = mean_nmae
        optimum_depth_check_for_nmae = max_depth_index

    if mean_r2 > maximum_r2_check:
        maximum_r2_check = mean_r2
        optimum_depth_check_for_r2 = max_depth_index

print("max nmse is : ", maximum_nmse_check," with max_depth: ", optimum_depth_check_for_nmse, "by observing cross validation ")
print("max nmae is : ", maximum_nmae_check," with max_depth: ", optimum_depth_check_for_nmae, "by observing cross validation ")
print("max r2 is : ", maximum_r2_check," with max_depth: ", optimum_depth_check_for_r2, "by observing cross validation ")



average negative mean squared error:  -0.27984569212049737
average negative mean absolute error:  -0.353895802874919
average r^2 :  0.2803660479027418
max_depth_index:  1
average negative mean squared error:  -0.26912190652133694
average negative mean absolute error:  -0.3390265162000158
average r^2 :  0.30794503295211734
max_depth_index:  2
average negative mean squared error:  -0.26453832789557213
average negative mean absolute error:  -0.3332495529240975
average r^2 :  0.31978238256746033
max_depth_index:  3
average negative mean squared error:  -0.2600996631124464
average negative mean absolute error:  -0.3275430034382622
average r^2 :  0.3311736759614753
max_depth_index:  4
average negative mean squared error:  -0.2584407480627212
average negative mean absolute error:  -0.32395692039811486
average r^2 :  0.33554025723412567
max_depth_index:  5
average negative mean squared error:  -0.25921883485218966
average negative mean absolute error:  -0.32308939166965917
average r^2 :  0.334

The model has several hyperparameters, such as "loss", "learning_rate", "max_depth" and "max_features", among others. Here only "max_depth" was tuned, over the values 1 to 19. Both R^2 and NMSE improved until max_depth = 5, whereas NMAE continued to improve until max_depth = 6 and then started to get worse. Based on these observations, we concluded that the best value for "max_depth" is 5.

# Step 4. Fit The Model

In [None]:
from sklearn.metrics import mean_squared_error

# Final model: gradient boosting with max_depth=5, trained on the whole
# training split (``fit`` returns the fitted estimator itself).
bestDepthGBRModel = GradientBoostingRegressor(random_state=0, max_depth=5).fit(Xtrain, Ytrain)

# Predictions for the held-out test split, on the same log-price scale as Y.
y_prediction = bestDepthGBRModel.predict(Xtest)


# Step 5. Evaluate The Model

In [None]:

# R^2 of the tuned model on the held-out test split.
print(bestDepthGBRModel.score(Xtest, Ytest))
# Mean squared error on the test split, measured on the log-price scale.
# Left as the last bare expression so the notebook displays its value.
mean_squared_error(Ytest, y_prediction)


0.3444232932088789


0.2640549225110702

Unlike for classifiers, the .score() method of a regressor does not return accuracy: it returns the coefficient of determination R^2 of the prediction.

# Step 6. Proof

In [None]:
# Fit linear regression on the whole training split and report its R^2 on the
# held-out test split.
MyLinearRegressionModel = LinearRegression().fit(Xtrain, Ytrain)
y_predictionLinearRegression = MyLinearRegressionModel.predict(Xtest)
print(MyLinearRegressionModel.score(Xtest, Ytest))


0.21647806684053728


In [None]:
# Fit ridge regression on the whole training split and report its R^2 on the
# held-out test split.
MyRidgeModel = Ridge().fit(Xtrain, Ytrain)
y_predictionMyRidgeModel = MyRidgeModel.predict(Xtest)
print(MyRidgeModel.score(Xtest, Ytest))



0.21647509809866516


In [None]:
# Fit Lasso (alpha=0.1) on the whole training split and report its R^2 on the
# held-out test split.
MyLassoModel = linear_model.Lasso(alpha=0.1).fit(Xtrain, Ytrain)
y_predictionMyLassoModel = MyLassoModel.predict(Xtest)
print(MyLassoModel.score(Xtest, Ytest))



0.19050878984811803


In [None]:
# Fit the decision tree on the whole training split and report its R^2 on the
# held-out test split.
MyDecisionTreeRegressorModel = DecisionTreeRegressor(random_state=0).fit(Xtrain, Ytrain)
y_predictionMyDecisionTreeRegressorModel = MyDecisionTreeRegressorModel.predict(Xtest)
print(MyDecisionTreeRegressorModel.score(Xtest, Ytest))


-0.38353226784689554


In [None]:
# Fit the random forest (max_depth=2) on the whole training split and report
# its R^2 on the held-out test split.
MyRandomForestRegressorModel = RandomForestRegressor(random_state=0, max_depth=2).fit(Xtrain, Ytrain)
y_predictionMyRandomForestRegressorModel = MyRandomForestRegressorModel.predict(Xtest)
print(MyRandomForestRegressorModel.score(Xtest, Ytest))


0.23787694255535197


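Here we also fit the other models on the full training set and score them on the test set. Gradient boosting again obtains the highest R^2 (0.344, versus at most 0.238 for the others), which confirms that it is the best model.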

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=82bb48ff-b539-4abb-b8e9-17b011c6b53b' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>