**Fetch Data**

In [1]:
import os
import tarfile
from six.moves import urllib

# Root of the remote repository that hosts the dataset archive.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
# Local directory that receives the downloaded archive and its extracted files.
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL of the ``.tgz`` archive to download.
    housing_path : str
        Directory that receives ``housing.tgz`` and the extracted members.
        It is created (including parents) when it does not exist yet.

    Returns
    -------
    None
    """
    # exist_ok avoids the check-then-create race of a separate isdir() test.
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even when extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        if hasattr(tarfile, "data_filter"):
            # The archive was just downloaded, so treat it as untrusted and
            # use the safe extraction filter where the interpreter offers it.
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            housing_tgz.extractall(path=housing_path)

# Skip the download when the extracted CSV is already on disk, so that
# re-running the notebook does not fetch the archive again.
if not os.path.isfile(os.path.join(HOUSING_PATH, "housing.csv")):
    fetch_housing_data()

**Load CSV**

In [2]:
import pandas as pd
import numpy as np

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` located in ``housing_path`` into a DataFrame.

    Parameters
    ----------
    housing_path : str
        Directory that contains ``housing.csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.
    """
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

housing = load_housing_data()
# Preview the first rows of the loaded frame.
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


**Split Data**

In [3]:
from sklearn.model_selection import StratifiedShuffleSplit

# Bucket median income into five categories; they serve as the strata below.
housing["income_category"] = pd.cut(
    housing["median_income"],
    bins=[0, 1.5, 3, 4.5, 6, np.inf],
    labels=[1, 2, 3, 4, 5],
)

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) index pair.
train_index, test_index = next(split.split(housing, housing["income_category"]))

# split() yields positional row indices, so select with .iloc; .loc would only
# be correct while the frame still carries its default 0..n-1 index.
# drop() without inplace returns new frames, so the helper column is removed
# without mutating a selection of `housing`.
strat_train_set = housing.iloc[train_index].drop("income_category", axis=1)
strat_test_set = housing.iloc[test_index].drop("income_category", axis=1)


**Separate Labels and Features**

In [4]:
# Working copy of the full training set (label column still included).
housing_split = strat_train_set.copy()

# Target values, copied so later changes do not touch the training set.
housing_label = strat_train_set["median_house_value"].copy()

# Predictors only, and the same predictors without the categorical column.
housing_features = strat_train_set.drop(columns=["median_house_value"])
housing_num = housing_features.drop(columns=["ocean_proximity"])

**Add Custom Transformer to remove and add columns**

In [5]:
from sklearn.base import TransformerMixin , BaseEstimator

# Column positions, in the array handed to the transformer, of the four raw
# count columns that are combined into ratios and then removed.
rooms_index , bedrooms_index , population_index , household_index = 3 , 4 , 5 , 6

class CombinedAttributesAdder(BaseEstimator , TransformerMixin):
    """Replace four raw count columns with three ratio columns.

    The columns at ``rooms_index``, ``bedrooms_index``, ``population_index``
    and ``household_index`` are removed from the input, and three derived
    columns are appended on the right, in this order:
    rooms / households, population / households, bedrooms / rooms.
    """

    def __init__(self):
        """The transformer has no hyperparameters."""

    def fit(self , X , y = None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self , X , y=None):
        """Return ``X`` without the four count columns, plus the three ratios.

        Parameters
        ----------
        X : array-like, 2-D
            Input rows; must contain the four indexed columns.
        y : ignored

        Returns
        -------
        numpy.ndarray
            Array with the same number of rows as ``X`` and one column fewer.
        """
        # Accept any 2-D array-like, not only ndarrays, for positional slicing.
        X = np.asarray(X)

        rooms_per_household = X[:, rooms_index] / X[:, household_index]
        population_per_household = X[:, population_index] / X[:, household_index]
        # This ratio is bedrooms / rooms. Code elsewhere in the notebook labels
        # the resulting column "bedrooms_per_household".
        bedrooms_per_room = X[:, bedrooms_index] / X[:, rooms_index]

        # Remove exactly the columns that were consumed above, by name rather
        # than by a second hard-coded copy of their positions.
        consumed = [rooms_index, bedrooms_index, population_index, household_index]
        remaining = np.delete(X, consumed, axis=1)

        return np.c_[remaining, rooms_per_household, population_per_household, bedrooms_per_room]

# Labels for the transformed array: the input columns minus the four positions
# the transformer removes, followed by the three ratio columns it appends.
# NOTE(review): the third ratio is computed as bedrooms / rooms in
# CombinedAttributesAdder.transform, so "bedrooms_per_household" looks like a
# misnomer; the label is kept unchanged here.
kept_labels = np.delete(housing_split.columns.values, [3, 4, 5, 6])
new_labels = np.append(
    kept_labels,
    ["rooms_per_household", "population_per_household", "bedrooms_per_household"],
)

Attribute_Adder = CombinedAttributesAdder()
housing_extra_attributes = Attribute_Adder.transform(housing_split.values)

# Show the transformed values as a labelled frame.
pd.DataFrame(housing_extra_attributes, columns=new_labels)

Unnamed: 0,longitude,latitude,housing_median_age,median_income,median_house_value,ocean_proximity,rooms_per_household,population_per_household,bedrooms_per_household
0,-121.46,38.52,29.0,2.1736,72100.0,INLAND,5.485836,3.168555,0.205784
1,-117.23,33.09,7.0,6.3373,279600.0,NEAR OCEAN,6.927083,2.623698,0.160714
2,-119.04,35.37,44.0,2.875,82700.0,INLAND,5.393333,2.223333,0.191595
3,-117.13,32.75,24.0,2.2264,112500.0,NEAR OCEAN,3.886128,1.859213,0.276505
4,-118.7,34.28,27.0,4.4964,238300.0,<1H OCEAN,6.096552,3.167241,0.182692
...,...,...,...,...,...,...,...,...,...
16507,-117.07,33.03,14.0,5.09,268500.0,<1H OCEAN,6.658342,2.023976,0.184696
16508,-121.42,38.51,15.0,2.8139,90400.0,INLAND,5.571932,3.363188,0.179977
16509,-122.72,38.44,48.0,3.1797,140400.0,<1H OCEAN,4.110465,2.662791,0.234795
16510,-122.7,38.31,14.0,4.1964,258100.0,<1H OCEAN,6.297405,2.411178,0.183835


**Create Pipeline For Numerical Data**

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Numeric view of the training data: every column except the categorical one.
housing_split_num = housing_split.drop(columns=["ocean_proximity"])

# Steps run in order: fill missing values with the column median, swap the raw
# count columns for ratio columns, then standardise every column.
num_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("attributes_adder", CombinedAttributesAdder()),
    ("std_scaler", StandardScaler()),
]
num_pipeline = Pipeline(num_steps)


**Create the Full Pipeline (Numerical + Categorical) and Fit It**

In [7]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Remove the label column from both frames without mutating them in place.
# errors="ignore" keeps the cell safe to re-run once the column is gone.
housing_split = housing_split.drop(columns=["median_house_value"], errors="ignore")
housing_split_num = housing_split_num.drop(columns=["median_house_value"], errors="ignore")

# Column names routed to each branch of the ColumnTransformer.
numerical_labels = list(housing_split_num)
categorical_labels = ["ocean_proximity"]

full_pipeline = ColumnTransformer([
    ("num", num_pipeline, numerical_labels),
    ("cat", OneHotEncoder(), categorical_labels),
])

# Fit the preprocessing on the training data and keep the transformed matrix.
housing_prepared = full_pipeline.fit_transform(housing_split)
housing_prepared


array([[-0.94135046,  1.34743822,  0.02756357, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.17178212, -1.19243966, -1.72201763, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.26758118, -0.1259716 ,  1.22045984, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-1.5707942 ,  1.31001828,  1.53856552, ...,  0.        ,
         0.        ,  0.        ],
       [-1.56080303,  1.2492109 , -1.1653327 , ...,  0.        ,
         0.        ,  0.        ],
       [-1.28105026,  2.02567448, -0.13148926, ...,  0.        ,
         0.        ,  0.        ]])

**Experiment with SVM because why not**

In [8]:
from sklearn import svm
from sklearn.metrics import mean_squared_error

# Seed the estimator so repeated runs give the same model; 42 matches the seed
# already used for the stratified split.
linearSVR = svm.LinearSVR(random_state=42)
linearSVR.fit(housing_prepared , housing_label)

# Spot-check on the first five training rows. These rows were seen during
# fitting, so this is not an estimate of generalisation error.
some_data = housing_split.iloc[:5]
some_labels = housing_label.iloc[:5]

# transform() only: the pipeline was already fitted on the training data.
some_data_prepared = full_pipeline.transform(some_data)
housing_predictions = linearSVR.predict(some_data_prepared)

linearSVR_mse = mean_squared_error(some_labels , housing_predictions)
print("RMSE :" , np.sqrt(linearSVR_mse))


RMSE : 160725.52098226594


In [9]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the linear SVR. The scorer reports negated MSE,
# so the sign is flipped before taking the square root.
scores = cross_val_score(
    linearSVR,
    housing_prepared,
    housing_label,
    cv=10,
    scoring="neg_mean_squared_error",
)
rmse_scores = np.sqrt(-scores)
rmse_scores.mean()

220171.1907964204

**Experiment with different types of SVM for regression**

In [10]:
# from sklearn import svm

# svr = svm.SVR()
# svr.fit(housing_prepared , housing_label)

# scores = cross_val_score(svr , housing_prepared , housing_label , scoring = "neg_mean_squared_error" , cv=10)
# rmse_scores = np.sqrt(-scores)
# rmse_scores.mean()

In [12]:
from sklearn import svm

# Fit a NuSVR with default settings on the prepared training data.
svr = svm.NuSVR()
svr.fit(housing_prepared, housing_label)

# 10-fold cross-validation. The scorer reports negated MSE, so flip the sign
# before the square root. This overwrites the earlier `scores`/`rmse_scores`.
scores = cross_val_score(
    svr,
    housing_prepared,
    housing_label,
    cv=10,
    scoring="neg_mean_squared_error",
)
rmse_scores = np.sqrt(-scores)
rmse_scores.mean()

116350.47558851689

116350.47558851689

In [18]:
from sklearn.ensemble import RandomForestRegressor

# Seed the forest so repeated runs build the same trees; 42 matches the seed
# already used for the stratified split.
random_forest = RandomForestRegressor(random_state=42)
random_forest.fit(housing_prepared , housing_label)

In [20]:
# 10-fold cross-validation of the random forest. The scorer reports negated
# MSE, so the sign is flipped before taking the square root.
random_forest_scores = cross_val_score(
    random_forest,
    housing_prepared,
    housing_label,
    cv=10,
    scoring="neg_mean_squared_error",
)
random_forest_rmse_scores = np.sqrt(-random_forest_scores)

In [22]:
# Average of the per-fold RMSE values from the random forest cross-validation.
random_forest_rmse_scores.mean()

49959.11284169149

In [23]:
from sklearn.model_selection import GridSearchCV

# Parameter grid: a single candidate (30 trees, 8 features per split).
param_grid = {
    'n_estimators': [30],
    'max_features': [8]
}

# Seed the forest so the search result is reproducible; 42 matches the seed
# already used for the stratified split.
grid_search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    return_train_score=True,
)

# Run the 5-fold search on the prepared training data.
grid_search.fit(housing_prepared, housing_label)

In [26]:
cv_result = grid_search.cv_results_

# Mean test scores are negated MSE; convert them all to RMSE in one step, then
# print each value next to the parameter combination that produced it.
candidate_rmse = np.sqrt(-cv_result["mean_test_score"])
for rmse, candidate_params in zip(candidate_rmse, cv_result["params"]):
    print(rmse, candidate_params)

49848.52284357839 {'max_features': 8, 'n_estimators': 30}


In [27]:
# Split the held-out set into targets (copied) and predictors.
Y_test = strat_test_set["median_house_value"].copy()
X_test = strat_test_set.drop(columns=["median_house_value"])

In [29]:
# Best estimator found by the grid search.
final_model = grid_search.best_estimator_

# transform() only: the preprocessing was fitted on the training data and is
# applied unchanged to the test features.
X_test_set_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_set_prepared)

# Root of the mean squared error between true and predicted test values.
final_mse = mean_squared_error(Y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
print("rmse : " , final_rmse)

# First ten (prediction, actual) pairs for a quick visual comparison.
first_predictions = final_predictions[:10]
first_actuals = Y_test[:10]
list(zip(first_predictions, first_actuals))


rmse :  47602.14362221277


[(487897.43333333335, 500001.0),
 (224073.33333333334, 162500.0),
 (202416.66666666666, 204600.0),
 (169086.66666666666, 159700.0),
 (242363.33333333334, 184000.0),
 (160453.33333333334, 151900.0),
 (117500.0, 104900.0),
 (413460.1666666667, 500001.0),
 (229876.66666666666, 367400.0),
 (334293.5, 346500.0)]