In [59]:
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold

import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import GradientBoostingRegressor

california_housing = fetch_california_housing(as_frame=True)

In [2]:
california_housing.frame

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [3]:
def visualize_metrics(y_test, y_pred):
    print("R2 : ", r2_score(y_test, y_pred),'\n'
     "MSE: ", mean_squared_error(y_test, y_pred),"\n",
     "RMSE: ", mean_squared_error(y_test, y_pred, squared = False))

### Without preprocessing

In [18]:
X = california_housing.frame.drop(columns = ['MedHouseVal'])
Y = california_housing.frame['MedHouseVal']

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state=45)

#### Decision Tree

In [7]:
tree_regressor = DecisionTreeRegressor(max_depth = 5).fit(x_train, y_train)

print("Training score : ", tree_regressor.score(x_train, y_train))

y_pred = tree_regressor.predict(x_test)
visualize_metrics(y_test, y_pred)



Training score :  0.6287954685947139
R2 :  0.6183213705485122 
MSE:  0.506460585172765 
 RMSE:  0.7116604423267918


In [6]:
# hyper-Parameters Tuning

tree_regressor_parameter = {'max_depth': [2,3,4,5,6,7,8,9,10]}
tree_regressor_grid_search = GridSearchCV(DecisionTreeRegressor(), tree_regressor_parameter, cv = 2)
tree_regressor_grid_search.fit(X, Y)
print('The best score',tree_regressor_grid_search.best_score_)
print('The best parameters',tree_regressor_grid_search.best_params_)

The best score 0.4812874665130862
The best parameters {'max_depth': 5}


#### Random Forests

In [13]:
clf = RandomForestRegressor(n_estimators=75)
clf.fit(x_train, y_train)

y_pred=clf.predict(x_test)

visualize_metrics(y_test, y_pred)

R2 :  0.816710576402612 
MSE:  0.24321212027122593 
 RMSE:  0.4931654086320592


In [11]:
random_forest_parameter = {'n_estimators': [10,25,50,75,100]}
random_forest_grid_search = GridSearchCV(RandomForestRegressor(), random_forest_parameter, cv = 2)
random_forest_grid_search.fit(X, Y)
print('The best score',random_forest_grid_search.best_score_)
print('The best parameters',random_forest_grid_search.best_params_)

The best score 0.601726179076523
The best parameters {'n_estimators': 75}


#### Gradient Boosting

In [60]:
gb = GradientBoostingRegressor(learning_rate = 0.1, max_depth = 9, n_estimators = 100, subsample = 0.7, random_state=0)
gb.fit(x_train, y_train)

y_pred = gb.predict(x_test)
visualize_metrics(y_test, y_pred)

R2 :  0.8436412311966435 
MSE:  0.2074770433410031 
 RMSE:  0.4554964800533623


In [61]:
model = GradientBoostingRegressor()
grid = dict()
grid['n_estimators'] = [10, 50, 100]
grid['learning_rate'] = [0.0001, 0.001, 0.01, 0.1, 1.0]
grid['subsample'] = [0.5, 0.7, 1.0]
grid['max_depth'] = [3, 7, 9]
# define the evaluation procedure
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# define the grid search procedure
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv)
# execute the grid search
grid_result = grid_search.fit(X, Y)
# summarize the best score and configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_)) # summarize all scores that were evaluated
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.840177 using {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 100, 'subsample': 0.7}
0.000604 (0.000481) with: {'learning_rate': 0.0001, 'max_depth': 3, 'n_estimators': 10, 'subsample': 0.5}
0.000604 (0.000480) with: {'learning_rate': 0.0001, 'max_depth': 3, 'n_estimators': 10, 'subsample': 0.7}
0.000604 (0.000481) with: {'learning_rate': 0.0001, 'max_depth': 3, 'n_estimators': 10, 'subsample': 1.0}
0.004824 (0.000472) with: {'learning_rate': 0.0001, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.5}
0.004819 (0.000473) with: {'learning_rate': 0.0001, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.7}
0.004814 (0.000474) with: {'learning_rate': 0.0001, 'max_depth': 3, 'n_estimators': 50, 'subsample': 1.0}
0.010050 (0.000492) with: {'learning_rate': 0.0001, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.5}
0.010039 (0.000490) with: {'learning_rate': 0.0001, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.7}
0.010032 (0.000495) with: {'learning_rate': 0.0001,

#### Linear Regression

In [12]:
linear_regressor = LinearRegression(fit_intercept = True, copy_X = True).fit(x_train, y_train)

print("Training score : ", linear_regressor.score(x_train, y_train))

y_pred = linear_regressor.predict(x_test)
visualize_metrics(y_test, y_pred)

Training score :  0.6058389654865454
R2 :  0.6074054352929974 
MSE:  0.5209452603172996 
 RMSE:  0.7217653776105498


In [9]:
linear_regressor_parameter = {'fit_intercept': [True, False]}
linear_regressor_grid_search = GridSearchCV(LinearRegression(), linear_regressor_parameter, cv = 2)
linear_regressor_grid_search.fit(X, Y)
print('The best score',linear_regressor_grid_search.best_score_)
print('The best parameters',linear_regressor_grid_search.best_params_)

The best score 0.5720433116500588
The best parameters {'fit_intercept': True}


#### LASSO

In [17]:
lasso_regressor = Lasso(alpha = 0.2, fit_intercept= False, max_iter = 20000).fit(x_train, y_train)

print("Training score : ", lasso_regressor.score(x_train, y_train))

y_pred = lasso_regressor.predict(x_test)
visualize_metrics(y_test, y_pred)

Training score :  0.5008409584159668
R2 :  0.510383190381585 
MSE:  0.6496869271044228 
 RMSE:  0.8060315918774045


In [16]:
lasso_regressor_parameter = {'alpha': [0.2,0.4,0.6,0.8,1], 'fit_intercept': [True, False]}
lasso_regressor_grid_search = GridSearchCV(Lasso(max_iter = 400000), lasso_regressor_parameter, cv = 2)
lasso_regressor_grid_search.fit(X, Y)
print('The best score',lasso_regressor_grid_search.best_score_)
print('The best parameters',lasso_regressor_grid_search.best_params_)

The best score 0.4983501284200035
The best parameters {'alpha': 0.2, 'fit_intercept': False}


#### Ridge Regression

In [20]:
ridge_regressor = Ridge(alpha = 0.2, fit_intercept= True, max_iter = 20000).fit(x_train, y_train)

print("Training score : ", ridge_regressor.score(x_train, y_train))

y_pred = ridge_regressor.predict(x_test)
visualize_metrics(y_test, y_pred)

Training score :  0.605838964000722
R2 :  0.6074043035287395 
MSE:  0.5209467620885389 
 RMSE:  0.7217664179556561


In [13]:
ridge_regressor_parameter = {'alpha': [0.2,0.4,0.6,0.8,1], 'fit_intercept': [True, False]}
ridge_regressor_grid_search = GridSearchCV(Ridge(max_iter = 400000), ridge_regressor_parameter, cv = 2)
ridge_regressor_grid_search.fit(X, Y)
print('The best score',ridge_regressor_grid_search.best_score_)
print('The best parameters',ridge_regressor_grid_search.best_params_)

The best score 0.5720383415046448
The best parameters {'alpha': 0.2, 'fit_intercept': True}


### Standartization

In [31]:
scaler = preprocessing.StandardScaler()
d = scaler.fit_transform(california_housing.frame)
scaled_df = pd.DataFrame(d, columns=california_housing.frame.columns)

In [54]:
X = scaled_df.drop(columns = ['MedHouseVal']) # Features
#Y = scaled_df['MedHouseVal'] # Target
Y = california_housing.frame['MedHouseVal']
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state=45)

#### Linear Regression

In [55]:
from sklearn.linear_model import LinearRegression

linear_regressor = LinearRegression(fit_intercept = True, copy_X = True).fit(x_train, y_train)

print("Training score : ", linear_regressor.score(x_train, y_train))

y_pred = linear_regressor.predict(x_test)
visualize_metrics(y_test, y_pred)

Training score :  0.6058389654865454
R2 :  0.6074054352929976 
MSE:  0.5209452603172995 
 RMSE:  0.7217653776105497


In [56]:
linear_regressor_parameter = {'fit_intercept': [True, False]}
linear_regressor_grid_search = GridSearchCV(LinearRegression(), linear_regressor_parameter, cv = 2)
linear_regressor_grid_search.fit(X, Y)
print('The best score',linear_regressor_grid_search.best_score_)
print('The best parameters',linear_regressor_grid_search.best_params_)

The best score 0.5720433116500585
The best parameters {'fit_intercept': True}


#### LASSO

In [57]:
lasso_regressor = Lasso(alpha = 0.2, fit_intercept= False, max_iter = 20000).fit(x_train, y_train)

print("Training score : ", lasso_regressor.score(x_train, y_train))

y_pred = lasso_regressor.predict(x_test)
visualize_metrics(y_test, y_pred)

Training score :  -2.768823389299004
R2 :  -2.7891290361209307 
MSE:  5.0279066231368255 
 RMSE:  2.2422994053285628


In [50]:
lasso_regressor_parameter = {'alpha': [0.2,0.4,0.6,0.8,1], 'fit_intercept': [True, False]}
lasso_regressor_grid_search = GridSearchCV(Lasso(max_iter = 400000), lasso_regressor_parameter, cv = 2)
lasso_regressor_grid_search.fit(X, Y)
print('The best score',lasso_regressor_grid_search.best_score_)
print('The best parameters',lasso_regressor_grid_search.best_params_)

The best score 0.44478151701980545
The best parameters {'alpha': 0.2, 'fit_intercept': True}


#### Ridge Regression

In [47]:
ridge_regressor = Ridge(alpha = 0.2, fit_intercept= True, max_iter = 20000).fit(x_train, y_train)

print("Training score : ", ridge_regressor.score(x_train, y_train))

y_pred = ridge_regressor.predict(x_test)
visualize_metrics(y_test, y_pred)

Training score :  0.6043726934259184
R2 :  0.6048561892912513 
MSE:  0.5243279297207429 
 RMSE:  0.7241049162384847


In [46]:
ridge_regressor_parameter = {'alpha': [0.2,0.4,0.6,0.8,1], 'fit_intercept': [True, False]}
ridge_regressor_grid_search = GridSearchCV(Ridge(max_iter = 400000), ridge_regressor_parameter, cv = 2)
ridge_regressor_grid_search.fit(X, Y)
print('The best score',ridge_regressor_grid_search.best_score_)
print('The best parameters',ridge_regressor_grid_search.best_params_)

The best score 0.5668651843380306
The best parameters {'alpha': 0.2, 'fit_intercept': True}


### Normalization

In [39]:
normalizer = preprocessing.MinMaxScaler()
d = normalizer.fit_transform(california_housing.frame)
normalized_df = pd.DataFrame(d, columns=california_housing.frame.columns)
#print(normalized_df)

In [52]:
X = normalized_df.drop(columns = ['MedHouseVal']) # Features
Y = california_housing.frame['MedHouseVal'] # Target

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state=45)

#### Linear Regression

In [41]:
linear_regressor = LinearRegression(fit_intercept = True, copy_X = True).fit(x_train, y_train)

print("Training score : ", linear_regressor.score(x_train, y_train))

y_pred = linear_regressor.predict(x_test)
visualize_metrics(y_test, y_pred)

Training score :  0.6058389654865455
R2 :  0.6074054352929978 
MSE:  0.022146496498591562 
 RMSE:  0.14881698995273207


In [None]:
linear_regressor_parameter = {'fit_intercept': [True, False]}
linear_regressor_grid_search = GridSearchCV(LinearRegression(), linear_regressor_parameter, cv = 2)
linear_regressor_grid_search.fit(X, Y)
print('The best score',linear_regressor_grid_search.best_score_)
print('The best parameters',linear_regressor_grid_search.best_params_)

The best score 0.572043311650059
The best parameters {'fit_intercept': True}


#### LASSO

In [30]:
lasso_regressor = Lasso(alpha = 0.2, fit_intercept= True, max_iter = 20000).fit(x_train, y_train)

print("Training score : ", lasso_regressor.score(x_train, y_train))

y_pred = lasso_regressor.predict(x_test)
visualize_metrics(y_test, y_pred)

Training score :  0.0
R2 :  -0.00012376056125318335 
MSE:  0.056417585347770284 
 RMSE:  0.23752386269124684


In [58]:
lasso_regressor_parameter = {'alpha': [0.2,0.4,0.6,0.8,1], 'fit_intercept': [True, False]}
lasso_regressor_grid_search = GridSearchCV(Lasso(max_iter = 400000), lasso_regressor_parameter, cv = 2)
lasso_regressor_grid_search.fit(X, Y)
print('The best score',lasso_regressor_grid_search.best_score_)
print('The best parameters',lasso_regressor_grid_search.best_params_)

The best score -0.0007528673641564732
The best parameters {'alpha': 0.2, 'fit_intercept': True}


#### Ridge Regression

In [59]:
ridge_regressor = Ridge(alpha = 0.2, fit_intercept= True, max_iter = 20000).fit(x_train, y_train)

print("Training score : ", ridge_regressor.score(x_train, y_train))

y_pred = ridge_regressor.predict(x_test)
visualize_metrics(y_test, y_pred)

Training score :  0.6043726934259184
R2 :  0.6048561892912513 
MSE:  0.5243279297207429 
 RMSE:  0.7241049162384847


In [60]:
ridge_regressor_parameter = {'alpha': [0.2,0.4,0.6,0.8,1], 'fit_intercept': [True, False]}
ridge_regressor_grid_search = GridSearchCV(Ridge(max_iter = 400000), ridge_regressor_parameter, cv = 2)
ridge_regressor_grid_search.fit(X, Y)
print('The best score',ridge_regressor_grid_search.best_score_)
print('The best parameters',ridge_regressor_grid_search.best_params_)

The best score 0.5668651843380306
The best parameters {'alpha': 0.2, 'fit_intercept': True}


### Neural Network approach

In [53]:
nn_regressor = MLPRegressor(activation = 'relu', hidden_layer_sizes = (32,64,128,64,8), solver= 'lbfgs', max_iter= 20000).fit(x_train, y_train)

print("Training score : ", nn_regressor.score(x_train, y_train))

y_pred = nn_regressor.predict(x_test)
visualize_metrics(y_test, y_pred)

Training score :  0.8247275796352957
R2 :  0.807983087159884 
MSE:  0.2547928821161143 
 RMSE:  0.5047701279950254


STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
