# Training Set

In [1]:
import pandas as pd
import numpy as np

In [2]:
#import data

housing = pd.read_csv("housing.csv")

In [3]:
#Split data into train and test sets, 80% train 20% test

from sklearn.model_selection import train_test_split

train, test = train_test_split(housing, test_size=0.2, random_state=42)

In [4]:
test.shape, train.shape

((4128, 10), (16512, 10))

In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16512 entries, 14196 to 15795
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           16512 non-null  float64
 1   latitude            16512 non-null  float64
 2   housing_median_age  16512 non-null  float64
 3   total_rooms         16512 non-null  float64
 4   total_bedrooms      16512 non-null  float64
 5   population          16512 non-null  float64
 6   households          16512 non-null  float64
 7   median_income       16512 non-null  float64
 8   median_house_value  16512 non-null  float64
 9   ocean_proximity     16512 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.4+ MB


In [6]:
#replace each attribute’s missing values with the median of that attribute

from sklearn.impute import SimpleImputer

imputer=SimpleImputer(strategy="median")

In [7]:
#exclude text attribute and fit to training data

train_num=train.drop("ocean_proximity", axis=1)
imputer.fit(train_num)

#result stored in statistics. print statistics
imputer.statistics_
train_num.median().values

array([-1.1851e+02,  3.4260e+01,  2.9000e+01,  2.1290e+03,  4.3700e+02,
        1.1670e+03,  4.1000e+02,  3.5458e+00,  1.7985e+05])

In [8]:
#transformed training set in a dataframe

X=imputer.transform(train_num)
train_tr=pd.DataFrame(X, columns=train_num.columns)

In [9]:
#one-hot encoding of categorical data to remove missing values

train_cat=train[["ocean_proximity"]]
train_cat.head(10)

Unnamed: 0,ocean_proximity
14196,NEAR OCEAN
8267,NEAR OCEAN
17445,NEAR OCEAN
14265,NEAR OCEAN
2271,INLAND
17848,<1H OCEAN
6252,<1H OCEAN
9389,NEAR BAY
6113,<1H OCEAN
6061,<1H OCEAN


In [10]:
#using One Hot Encoder

from sklearn.preprocessing import OneHotEncoder

cat_encoder=OneHotEncoder()
train_cat_1hot=cat_encoder.fit_transform(train_cat)
train_cat_1hot


<16512x5 sparse matrix of type '<class 'numpy.float64'>'
	with 16512 stored elements in Compressed Sparse Row format>

In [11]:
# convert normal 2D array to a (dense) NumPy array, use toarray() method

train_cat_1hot.toarray()

array([[0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       ...,
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0.]])

In [12]:
#encoded categories

cat_encoder.categories_

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]

# Feature engineering

In [13]:
#exploration t0 gain insight
train["rooms_per_household"] = train["total_rooms"]/train["households"]
train["bedrooms_per_room"] = train["total_bedrooms"]/train["total_rooms"]
train["population_per_household"] = train["population"]/train["households"]

corr_matrix = train.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

median_house_value          1.000000
median_income               0.690647
rooms_per_household         0.158485
total_rooms                 0.133989
housing_median_age          0.103706
households                  0.063714
total_bedrooms              0.047980
population_per_household   -0.022030
population                 -0.026032
longitude                  -0.046349
latitude                   -0.142983
bedrooms_per_room          -0.257419
Name: median_house_value, dtype: float64

# Feature scaling

In [14]:
#standadization by scikit learn
train_labels=train['median_house_value'].copy()

# drop the labels from the train set via the pandas drop function
train=train.drop('median_house_value', axis=1)
# axis=1 means that median_house_value should be dropped column wise

train_num=train.drop("ocean_proximity", axis=1)
#  re-do since the additional columns; rooms_per_household, bedrooms_per_room and population_per_household were added

train_labels.head()

14196    103000.0
8267     382100.0
17445    172600.0
14265     93400.0
2271      96500.0
Name: median_house_value, dtype: float64

In [15]:
train.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household
14196,-117.03,32.71,33.0,3126.0,627.0,2300.0,623.0,3.2596,NEAR OCEAN,5.017657,0.200576,3.691814
8267,-118.16,33.77,49.0,3382.0,787.0,1314.0,756.0,3.8125,NEAR OCEAN,4.473545,0.232703,1.738095
17445,-120.48,34.66,4.0,1897.0,331.0,915.0,336.0,4.1563,NEAR OCEAN,5.645833,0.174486,2.723214
14265,-117.11,32.69,36.0,1421.0,367.0,1418.0,355.0,1.9425,NEAR OCEAN,4.002817,0.258269,3.994366
2271,-119.8,36.78,43.0,2382.0,431.0,874.0,380.0,3.5542,INLAND,6.268421,0.18094,2.3


In [16]:
#pipeline constructor for transformation

from sklearn.pipeline import Pipeline 
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([('imputer', SimpleImputer(strategy="median")),('std_scaler', StandardScaler())])
train_num_tr = num_pipeline.fit_transform(train_num)

#column transformer for both numerical and categorical attributes

from sklearn.compose import ColumnTransformer

num_attribs = list(train_num)
cat_attribs = ["ocean_proximity"]
full_pipeline = ColumnTransformer([("num", num_pipeline, num_attribs), ("cat", OneHotEncoder(), cat_attribs) ])
train_prepared = full_pipeline.fit_transform(train)

In [17]:
# Stochastic gradient model

from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline

rng = np.random.RandomState(0)
sgd_model = make_pipeline(StandardScaler(),SGDRegressor(max_iter=1000, tol=1e-3)) 
sgd_model.fit(train_prepared, train_labels)


Pipeline(steps=[('standardscaler', StandardScaler()),
                ('sgdregressor', SGDRegressor())])

In [18]:
# Predictions
from sklearn.metrics import mean_squared_error

train_predictions = sgd_model.predict(train_prepared)
sgd_mse = mean_squared_error(train_labels, train_predictions)
sgd_rmse = np.sqrt(sgd_mse)
sgd_rmse

16276815819.616947

In [19]:
#cross validation

from sklearn.model_selection import cross_val_score

scores = cross_val_score(sgd_model, train_prepared, train_labels, scoring="neg_mean_squared_error", cv=10)
sgd_rmse_scores = np.sqrt(-scores)

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())
display_scores(sgd_rmse_scores)

Scores: [2.64333156e+08 2.59558981e+10 1.85994034e+07 1.06202748e+08
 4.41458843e+10 4.49538317e+07 2.26594736e+10 2.02283811e+08
 2.84577842e+10 7.79256167e+07]
Mean: 12193333877.919527
Standard deviation: 15681905405.876303


In [20]:
#fine tuning the model (use hyperparameters)

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid

param_grid = {'sgdregressor__max_iter': [30, 1000, 2000],'sgdregressor__alpha': [0.001],'sgdregressor__random_state': [0, 2**32 - 1], 'sgdregressor__shuffle': [True], 'sgdregressor__average': [True], 'sgdregressor__validation_fraction': [0.05] }
grid_search = GridSearchCV(sgd_model, param_grid, cv=5,scoring='neg_mean_squared_error',return_train_score=True)
grid_search.fit(train_prepared, train_labels)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('sgdregressor', SGDRegressor())]),
             param_grid={'sgdregressor__alpha': [0.001],
                         'sgdregressor__average': [True],
                         'sgdregressor__max_iter': [30, 1000, 2000],
                         'sgdregressor__random_state': [0, 4294967295],
                         'sgdregressor__shuffle': [True],
                         'sgdregressor__validation_fraction': [0.05]},
             return_train_score=True, scoring='neg_mean_squared_error')

In [21]:
sgd_model.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'standardscaler', 'sgdregressor', 'standardscaler__copy', 'standardscaler__with_mean', 'standardscaler__with_std', 'sgdregressor__alpha', 'sgdregressor__average', 'sgdregressor__early_stopping', 'sgdregressor__epsilon', 'sgdregressor__eta0', 'sgdregressor__fit_intercept', 'sgdregressor__l1_ratio', 'sgdregressor__learning_rate', 'sgdregressor__loss', 'sgdregressor__max_iter', 'sgdregressor__n_iter_no_change', 'sgdregressor__penalty', 'sgdregressor__power_t', 'sgdregressor__random_state', 'sgdregressor__shuffle', 'sgdregressor__tol', 'sgdregressor__validation_fraction', 'sgdregressor__verbose', 'sgdregressor__warm_start'])

In [22]:
grid_search.best_estimator_

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('sgdregressor',
                 SGDRegressor(alpha=0.001, average=True, max_iter=30,
                              random_state=4294967295,
                              validation_fraction=0.05))])

In [23]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

1428744200.4486032 {'sgdregressor__alpha': 0.001, 'sgdregressor__average': True, 'sgdregressor__max_iter': 30, 'sgdregressor__random_state': 0, 'sgdregressor__shuffle': True, 'sgdregressor__validation_fraction': 0.05}
604977638.0833852 {'sgdregressor__alpha': 0.001, 'sgdregressor__average': True, 'sgdregressor__max_iter': 30, 'sgdregressor__random_state': 4294967295, 'sgdregressor__shuffle': True, 'sgdregressor__validation_fraction': 0.05}
1428744200.4486032 {'sgdregressor__alpha': 0.001, 'sgdregressor__average': True, 'sgdregressor__max_iter': 1000, 'sgdregressor__random_state': 0, 'sgdregressor__shuffle': True, 'sgdregressor__validation_fraction': 0.05}
604977638.0833852 {'sgdregressor__alpha': 0.001, 'sgdregressor__average': True, 'sgdregressor__max_iter': 1000, 'sgdregressor__random_state': 4294967295, 'sgdregressor__shuffle': True, 'sgdregressor__validation_fraction': 0.05}
1428744200.4486032 {'sgdregressor__alpha': 0.001, 'sgdregressor__average': True, 'sgdregressor__max_iter': 2

In [24]:

import pickle 


filename = 'SDG_housing_model.pkl'
pickle.dump(grid_search.best_estimator_,open(filename,'wb'))

In [25]:
#Reload model of test data

import pickle

with open('SDG_housing_model.pkl', 'rb') as a:
   model = pickle.load(a)

# Training test data

In [26]:
#exclude text attribute and fit to training data

test_num=test.drop("ocean_proximity", axis=1)
imputer.fit(test_num)

SimpleImputer(strategy='median')

In [27]:
imputer.statistics_
test_num.median().values

X=imputer.transform(test_num)
test_tr=pd.DataFrame(X, columns=test_num.columns)

In [28]:
#one-hot encoding of categorical data to remove missing values

test_cat=test[["ocean_proximity"]]
test_cat.head(10)

Unnamed: 0,ocean_proximity
20046,INLAND
3024,INLAND
15663,NEAR BAY
20484,<1H OCEAN
9814,NEAR OCEAN
13311,INLAND
7113,<1H OCEAN
7668,<1H OCEAN
18246,NEAR BAY
5723,<1H OCEAN


In [29]:
#using One Hot Encoder

from sklearn.preprocessing import OneHotEncoder

cat_encoder=OneHotEncoder()
test_cat_1hot=cat_encoder.fit_transform(test_cat)
test_cat_1hot


<4128x5 sparse matrix of type '<class 'numpy.float64'>'
	with 4128 stored elements in Compressed Sparse Row format>

In [30]:
# convert normal 2D array to a (dense) NumPy array, use toarray() method

train_cat_1hot.toarray()

#encoded categories

cat_encoder.categories_

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]

# Feature engineering

In [31]:
#exploration t0 gain insight
test["rooms_per_household"] = test["total_rooms"]/test["households"]
test["bedrooms_per_room"] = test["total_bedrooms"]/test["total_rooms"]
test["population_per_household"] = test["population"]/test["households"]

corr_matrix = test.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

median_house_value          1.000000
median_income               0.677502
total_rooms                 0.134697
rooms_per_household         0.130928
housing_median_age          0.113585
households                  0.074249
total_bedrooms              0.056667
population                 -0.019003
longitude                  -0.044062
population_per_household   -0.121853
latitude                   -0.149295
bedrooms_per_room          -0.249196
Name: median_house_value, dtype: float64

# Feature scaling

In [32]:
#standadization by scikit learn
test_labels=test['median_house_value'].copy()

# drop the labels from the train set via the pandas drop function
test = test.drop('median_house_value', axis=1)
# axis=1 means that median_house_value should be dropped column wise

test_num=test.drop("ocean_proximity", axis=1)
#  re-do since the additional columns; rooms_per_household, bedrooms_per_room and population_per_household were added

test_labels.head()

20046     47700.0
3024      45800.0
15663    500001.0
20484    218600.0
9814     278000.0
Name: median_house_value, dtype: float64

In [33]:
train.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household
14196,-117.03,32.71,33.0,3126.0,627.0,2300.0,623.0,3.2596,NEAR OCEAN,5.017657,0.200576,3.691814
8267,-118.16,33.77,49.0,3382.0,787.0,1314.0,756.0,3.8125,NEAR OCEAN,4.473545,0.232703,1.738095
17445,-120.48,34.66,4.0,1897.0,331.0,915.0,336.0,4.1563,NEAR OCEAN,5.645833,0.174486,2.723214
14265,-117.11,32.69,36.0,1421.0,367.0,1418.0,355.0,1.9425,NEAR OCEAN,4.002817,0.258269,3.994366
2271,-119.8,36.78,43.0,2382.0,431.0,874.0,380.0,3.5542,INLAND,6.268421,0.18094,2.3


In [34]:
#pipeline constructor for transformation

from sklearn.pipeline import Pipeline 
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([('imputer', SimpleImputer(strategy="median")),('std_scaler', StandardScaler())])
test_num_tr = num_pipeline.fit_transform(test_num)

#column transformer for both numerical and categorical attributes

from sklearn.compose import ColumnTransformer

num_attribs = list(test_num)
cat_attribs = ["ocean_proximity"]
full_pipeline = ColumnTransformer([("num", num_pipeline, num_attribs), ("cat", OneHotEncoder(), cat_attribs) ])
test_prepared = full_pipeline.fit_transform(test)

In [36]:
# Stochastic gradient model
#Reload train model of test data

import pickle

with open('SDG_housing_model.pkl', 'rb') as a:
   model = pickle.load(a)

test_predict = model.predict(test_prepared)
sgd_mse = mean_squared_error(test_labels, test_predict)
sgd_rmse = np.sqrt(sgd_mse)
sgd_rmse


4184840843.425965

In [37]:
#cross validation

from sklearn.model_selection import cross_val_score

scores = cross_val_score(sgd_model, train_prepared, train_labels, scoring="neg_mean_squared_error", cv=10)
sgd_rmse_scores = np.sqrt(-scores)

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())
display_scores(sgd_rmse_scores)

Scores: [3.24817719e+07 3.67984321e+10 2.29043085e+07 6.21576525e+06
 3.22578068e+10 1.95463053e+07 2.74887139e+10 2.80065807e+08
 2.56498952e+10 2.62429160e+08]
Mean: 12281849106.118147
Standard deviation: 15165580593.826283


In [38]:
#fine tuning the model (use hyperparameters)

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid

param_grid = {'sgdregressor__max_iter': [30, 1000, 2000],'sgdregressor__alpha': [0.001],'sgdregressor__random_state': [0, 2**32 - 1], 'sgdregressor__shuffle': [True], 'sgdregressor__average': [True], 'sgdregressor__validation_fraction': [0.05] }
grid_search = GridSearchCV(sgd_model, param_grid, cv=5,scoring='neg_mean_squared_error',return_train_score=True)
grid_search.fit(test_prepared, test_labels)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('sgdregressor', SGDRegressor())]),
             param_grid={'sgdregressor__alpha': [0.001],
                         'sgdregressor__average': [True],
                         'sgdregressor__max_iter': [30, 1000, 2000],
                         'sgdregressor__random_state': [0, 4294967295],
                         'sgdregressor__shuffle': [True],
                         'sgdregressor__validation_fraction': [0.05]},
             return_train_score=True, scoring='neg_mean_squared_error')

In [39]:
grid_search.best_estimator_

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('sgdregressor',
                 SGDRegressor(alpha=0.001, average=True, max_iter=30,
                              random_state=4294967295,
                              validation_fraction=0.05))])

In [40]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

71461.78766723892 {'sgdregressor__alpha': 0.001, 'sgdregressor__average': True, 'sgdregressor__max_iter': 30, 'sgdregressor__random_state': 0, 'sgdregressor__shuffle': True, 'sgdregressor__validation_fraction': 0.05}
70955.48822156455 {'sgdregressor__alpha': 0.001, 'sgdregressor__average': True, 'sgdregressor__max_iter': 30, 'sgdregressor__random_state': 4294967295, 'sgdregressor__shuffle': True, 'sgdregressor__validation_fraction': 0.05}
71461.78766723892 {'sgdregressor__alpha': 0.001, 'sgdregressor__average': True, 'sgdregressor__max_iter': 1000, 'sgdregressor__random_state': 0, 'sgdregressor__shuffle': True, 'sgdregressor__validation_fraction': 0.05}
70955.48822156455 {'sgdregressor__alpha': 0.001, 'sgdregressor__average': True, 'sgdregressor__max_iter': 1000, 'sgdregressor__random_state': 4294967295, 'sgdregressor__shuffle': True, 'sgdregressor__validation_fraction': 0.05}
71461.78766723892 {'sgdregressor__alpha': 0.001, 'sgdregressor__average': True, 'sgdregressor__max_iter': 2000

In [None]:
# I think the random forest model is better fitted to the data than the stochastic gradient model.
#Actual results form the SDG model for train and test data was overfitting. The test data had a higher rmse score than the train data
# After validation, the test data reduced significantly.

# Howerver, for the random forest model both train and test were similar for actual and after cross validation.