In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = load_dataset("gvlassis/california_housing")

In [3]:
# merge the train, validation and test data
df_train = pd.DataFrame(df['train'])
df_validation = pd.DataFrame(df['validation'])
df_test = pd.DataFrame(df['test'])

df = pd.concat([df_train, df_validation, df_test])

In [4]:
df.shape

(20640, 9)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20640 entries, 0 to 1999
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MedInc       20640 non-null  float64
 1   HouseAge     20640 non-null  float64
 2   AveRooms     20640 non-null  float64
 3   AveBedrms    20640 non-null  float64
 4   Population   20640 non-null  float64
 5   AveOccup     20640 non-null  float64
 6   Latitude     20640 non-null  float64
 7   Longitude    20640 non-null  float64
 8   MedHouseVal  20640 non-null  float64
dtypes: float64(9)
memory usage: 1.6 MB


In [6]:
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [7]:
df.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704,2.068558
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532,1.153956
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35,0.14999
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8,1.196
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49,1.797
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01,2.64725
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31,5.00001


In [9]:
X = df.drop(columns='MedHouseVal')
y = df['MedHouseVal']

In [10]:
X.shape, y.shape

((20640, 8), (20640,))

In [11]:
X

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
1995,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
1996,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
1997,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
1998,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [12]:
y

0       4.526
1       3.585
2       3.521
3       3.413
4       3.422
        ...  
1995    0.781
1996    0.771
1997    0.923
1998    0.847
1999    0.894
Name: MedHouseVal, Length: 20640, dtype: float64

In [14]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((16512, 8), (4128, 8), (16512,), (4128,))

In [18]:
scalar = StandardScaler()
X_train = scalar.fit_transform(X_train)
X_test = scalar.transform(X_test)

In [19]:
X_train

array([[-0.326196  ,  0.34849025, -0.17491646, ...,  0.05137609,
        -1.3728112 ,  1.27258656],
       [-0.03584338,  1.61811813, -0.40283542, ..., -0.11736222,
        -0.87669601,  0.70916212],
       [ 0.14470145, -1.95271028,  0.08821601, ..., -0.03227969,
        -0.46014647, -0.44760309],
       ...,
       [-0.49697313,  0.58654547, -0.60675918, ...,  0.02030568,
        -0.75500738,  0.59946887],
       [ 0.96545045, -1.07984112,  0.40217517, ...,  0.00707608,
         0.90651045, -1.18553953],
       [-0.68544764,  1.85617335, -0.85144571, ..., -0.08535429,
         0.99543676, -1.41489815]], shape=(16512, 8))

In [20]:
X_test

array([[-1.15508475, -0.28632369, -0.52068576, ...,  0.06740798,
         0.1951    ,  0.28534728],
       [-0.70865905,  0.11043502, -0.16581537, ..., -0.03602975,
        -0.23549054,  0.06097472],
       [-0.21040155,  1.85617335, -0.61076476, ..., -0.14998876,
         1.00947776, -1.42487026],
       ...,
       [ 2.80902421, -0.28632369,  0.75501156, ..., -0.02646898,
         0.78014149, -1.23041404],
       [-0.57542978,  0.58654547, -0.06124296, ..., -0.04390537,
         0.52740357, -0.08860699],
       [-0.17259111, -0.92113763, -0.6058703 , ...,  0.05466644,
        -0.66608108,  0.60445493]], shape=(4128, 8))

In [21]:
y_train

14196    1.030
8267     3.821
805      1.726
14265    0.934
2271     0.965
         ...  
11284    2.292
11964    0.978
5390     2.221
860      2.835
15795    3.250
Name: MedHouseVal, Length: 16512, dtype: float64

In [22]:
y_test

1406     0.47700
3024     0.45800
15663    5.00001
1844     2.18600
9814     2.78000
          ...   
15362    2.63300
16623    2.66800
1446     5.00001
2144     0.72300
3665     1.51500
Name: MedHouseVal, Length: 4128, dtype: float64

# XGBoost : Regression

In [23]:
import xgboost as xgb
from sklearn.neighbors import KNeighborsRegressor

In [24]:
xgb_regressor = xgb.XGBRegressor(objective='reg:squarederror', eval_metric='rmse')

In [25]:
xgb_regressor.fit(X_train, y_train)

In [26]:
y_pred = xgb_regressor.predict(X_test)

In [27]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [28]:
mean_absolute_error(y_test, y_pred), mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

(0.30957335413783094, 0.2225899267544737, 0.8301370561019205)

In [41]:
from sklearn.model_selection import GridSearchCV

In [42]:
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.5],
    'n_estimators': [50, 100, 200],
    'gamma': [0, 0.1, 0.2],
    'min_child_weight': [1, 2, 3]
}

In [43]:
grid_search = GridSearchCV(xgb_regressor, param_grid, cv=3, n_jobs=-1, verbose=1)

In [44]:
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 243 candidates, totalling 729 fits


In [45]:
grid_search.best_params_

{'gamma': 0,
 'learning_rate': 0.1,
 'max_depth': 7,
 'min_child_weight': 1,
 'n_estimators': 200}

In [46]:
grid_search.best_score_

np.float64(0.8356425335118843)

In [48]:
xgb_regressor_best_params = xgb.XGBRegressor(**grid_search.best_params_, objective='reg:squarederror', eval_metric='rmse')

In [49]:
xgb_regressor_best_params.fit(X_train, y_train)

In [50]:
y_pred_best_params = xgb_regressor_best_params.predict(X_test)

In [51]:
mean_absolute_error(y_test, y_pred_best_params), mean_squared_error(y_test, y_pred_best_params), r2_score(y_test, y_pred_best_params)

(0.2965878856061526, 0.20846232033009232, 0.8409181226688989)

# KNN : Regression

In [52]:
knn_regressor = KNeighborsRegressor(n_neighbors=5)

In [53]:
knn_regressor.fit(X_train, y_train)

In [54]:
y_pred_knn = knn_regressor.predict(X_test)

In [55]:
mean_absolute_error(y_test, y_pred_knn), mean_squared_error(y_test, y_pred_knn), r2_score(y_test, y_pred_knn)

(0.4461535271317829, 0.4324216146043236, 0.6700101862970989)

In [61]:
param_grid_knn = {
    'n_neighbors': range(1, 21, 2),
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': range(10, 51, 10),
    'p': [1, 2, 3]
}

In [62]:
grid_search_knn = GridSearchCV(knn_regressor, param_grid_knn, cv=3, n_jobs=-1, verbose=1)

In [63]:
grid_search_knn.fit(X_train, y_train)

Fitting 3 folds for each of 1200 candidates, totalling 3600 fits


In [64]:
grid_search_knn.best_params_

{'algorithm': 'auto',
 'leaf_size': 10,
 'n_neighbors': 9,
 'p': 1,
 'weights': 'distance'}

In [65]:
grid_search_knn.best_score_

np.float64(0.723161972581649)

In [66]:
grid_search_knn.best_params_

{'algorithm': 'auto',
 'leaf_size': 10,
 'n_neighbors': 9,
 'p': 1,
 'weights': 'distance'}

In [67]:
knn_regressor_best_params = KNeighborsRegressor(**grid_search_knn.best_params_, n_jobs=-1)

In [68]:
knn_regressor_best_params.fit(X_train, y_train)

In [69]:
y_pred_knn_best_params = knn_regressor_best_params.predict(X_test)

In [70]:
mean_absolute_error(y_test, y_pred_knn_best_params), mean_squared_error(y_test, y_pred_knn_best_params), r2_score(y_test, y_pred_knn_best_params)

(0.40580195325466495, 0.3641506481894662, 0.72210916268423)

In [71]:
import joblib

In [72]:
data_to_save = {
    'scalar': scalar,
    'xgb_regressor': xgb_regressor,
    'xgb_regressor_best_params': xgb_regressor_best_params,
    'knn_regressor': knn_regressor,
    'knn_regressor_best_params': knn_regressor_best_params
}

In [73]:
joblib.dump(data_to_save, 'xgboost regressor, knn regressor and scalar.pkl')

['xgboost regressor, knn regressor and scalar.pkl']