In [28]:
from sklearn import datasets
from sklearn.preprocessing import PolynomialFeatures
import joblib 

In [3]:
housing = datasets.fetch_california_housing()

In [3]:
print(housing.feature_names)

['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']


In [16]:
x = housing.data
y = housing.target
print(x[0])
print(y[0])

print(x.shape)
poly = PolynomialFeatures()
x = poly.fit_transform(x)
print(x.shape)



[   8.3252       41.            6.98412698    1.02380952  322.
    2.55555556   37.88       -122.23      ]
4.526
(20640, 8)
(20640, 45)


In [6]:
from sklearn.model_selection import train_test_split

In [18]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=432)
#the random state is used to ensure that the split is always the same
#it shuffles the data in the same way every time, so that if we wanted to run the same simulation again, we would get the same results

In [19]:
print(x_train[0])
print(y_train[0])

[ 1.00000000e+00  2.14420000e+00  5.20000000e+01  3.94886364e+00
  1.03693182e+00  9.21000000e+02  2.61647727e+00  3.73400000e+01
 -1.21880000e+02  4.59759364e+00  1.11498400e+02  8.46715341e+00
  2.22338920e+00  1.97480820e+03  5.61025057e+00  8.00644280e+01
 -2.61335096e+02  2.70400000e+03  2.05340909e+02  5.39204545e+01
  4.78920000e+04  1.36056818e+02  1.94168000e+03 -6.33776000e+03
  1.55935240e+01  4.09470235e+00  3.63690341e+03  1.03321120e+01
  1.47450568e+02 -4.81287500e+02  1.07522760e+00  9.55014205e+02
  2.71310854e+00  3.87190341e+01 -1.26381250e+02  8.48241000e+05
  2.40977557e+03  3.43901400e+04 -1.12251480e+05  6.84595332e+00
  9.76992614e+01 -3.18896250e+02  1.39427560e+03 -4.55099920e+03
  1.48547344e+04]
1.889


In [23]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor

LR = LinearRegression()
GBR = HistGradientBoostingRegressor()
RFR = RandomForestRegressor(
    n_jobs = -1
)
for i in [LR, GBR, RFR]:
    
    i.fit(x_train, y_train)
    y_pred = i.predict(x_test)
    r2 = r2_score(y_test, y_pred)
    print(i,r2)
## this would give us the r2 score of our model, which is a measure of how well our model is performing. The r2 score ranges from 0 to 1, with 1 being a perfect fit.
## this is the baseline which we will try to improve upon using more complex models


LinearRegression() 0.6610240212874665
HistGradientBoostingRegressor() 0.8371572645475587
RandomForestRegressor(n_jobs=-1) 0.8048732112227579


 ##NOW FOR OPTIMISATON, through polynomial features, expanding current features into higher degree features

In [33]:

model = HistGradientBoostingRegressor(
    max_iter = 300,
    learning_rate= 0.05
)


model.fit(x_train, y_train)
joblib.dump(model, 'Housing_Prediction_model.joblib')
y_pred = model.predict(x_test)
r2 = r2_score(y_test, y_pred)
print(r2)

0.8448530464258948


In [34]:
local_model = joblib.load("Housing_Prediction_model.joblib")
y_pred = local_model.predict(x_test)
r2 = r2_score(y_test, y_pred)
print(r2)



0.8448530464258948
