# Project 1

## Chinagorom Mbaraonye

First set the directory of the data file and import all packages:

In [None]:
# Root folder of the local handson-ml checkout; the read cell joins
# "datasets/housing/housing.csv" onto it.
# NOTE(review): absolute, machine-specific path, and the name shadows the
# built-in dir() — consider a configurable, differently named constant.
dir = "C:/Users/Chinagorom Mbaraonye/Desktop/handson-ml-master/"

import os
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import datetime


Read in the data:

In [3]:
# Resolve housing.csv below the project folder and echo the path for reference
csv_path = os.path.join(dir, *("datasets", "housing", "housing.csv"))
print(csv_path)

# Load the raw table and report how many rows were read
housing = pd.read_csv(csv_path)
print(housing.shape[0])

C:/Users/Chinagorom Mbaraonye/Desktop/handson-ml-master/datasets\housing\housing.csv
20640


Split the data into test data and train data using stratified sampling:

In [4]:
# Bucket median income into five strata so the train/test split keeps the
# income distribution of the full data set.
housing["income_cat"] = pd.cut(housing["median_income"],
                               bins=[0, 1.5, 3, 4.5, 6, np.inf],
                               labels=[1, 2, 3, 4, 5])

# A single stratified shuffle split holding out 20% of the rows;
# random_state makes the shuffle repeatable.
split = StratifiedShuffleSplit(n_splits=1, test_size=.2, random_state=42)

# split() yields *positional* row indices. Select with .iloc: .loc is
# label-based and would only match while the frame has its default RangeIndex.
train_index, test_index = next(split.split(housing, housing["income_cat"]))

# Drop the helper column while building each set rather than mutating slices
# of `housing` in place, which raises pandas' SettingWithCopyWarning.
strat_train_set = housing.iloc[train_index].drop(columns="income_cat")
strat_test_set = housing.iloc[test_index].drop(columns="income_cat")

Make a copy of the train set and separate the attributes from the labels:

In [5]:
# Training features: every column except the target. drop() returns a new
# DataFrame, so strat_train_set is left untouched and no separate .copy() of
# the whole set is needed first.
housing = strat_train_set.drop("median_house_value", axis=1)

# Training labels, copied so they are independent of strat_train_set
housing_labels = strat_train_set["median_house_value"].copy()

Separate the features into categorical and numeric:

In [6]:
# Categorical part: ocean_proximity kept as a one-column DataFrame
housing_cat = housing[["ocean_proximity"]]

# Quantitative part: every remaining column
housing_num = housing.drop(columns=["ocean_proximity"])

Make a function to create the combinations of the variables:

In [7]:
# Column positions (not row numbers) of the attributes combined into ratio features.
# NOTE(review): assumes the numeric matrix keeps the column order of housing_num,
# with the rooms/bedrooms/population/households columns at 3..6 — confirm against the CSV.
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features to a 2-D numeric feature matrix.

    Adds ``rooms / households`` and ``population / households`` as new trailing
    columns and, when ``add_bedrooms_per_room`` is true, ``bedrooms / rooms``
    as a third one. The source columns are located through the module-level
    ``*_ix`` positions.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default True
        Whether to append the bedrooms-per-room column.
    """

    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` with the ratio columns appended on the right.

        ``X`` may be a NumPy array or anything ``np.asarray`` can convert
        (for example a DataFrame); the result is always a NumPy array.
        """
        # X[:, i] is NumPy-style positional indexing and fails on a DataFrame,
        # so coerce first. For an ndarray this is a no-op.
        X = np.asarray(X)

        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]

        extra_columns = [rooms_per_household, population_per_household]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / X[:, rooms_ix])

        return np.c_[(X, *extra_columns)]


Create a pipeline that prepares the data for the algorithm:

In [8]:
# Column groups routed to each branch of the ColumnTransformer
cat_attribs = ["ocean_proximity"]
num_attribs = housing_num.columns.tolist()

# Numeric branch: median-impute, append the ratio features, then standardize
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

# Full preprocessing: numeric branch plus one-hot encoding of the categorical column
full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

# Fit on the training features and produce the model-ready matrix
housing_prepared = full_pipeline.fit_transform(housing)

Running SVR with multiple parameters

In [1]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

# NOTE: `datetime` is imported in the notebook's first cell; run that cell
# before this one, otherwise the timing lines raise NameError.

# Two sub-grids: 8 linear candidates (C only) and 7 * 6 = 42 RBF candidates
# (every C paired with every gamma).
param_grid = [
        {'kernel': ['linear'], 'C': [10., 30., 100., 300., 1000., 3000., 10000., 30000.0]},
        {'kernel': ['rbf'], 'C': [1.0, 3.0, 10., 30., 100., 300., 1000.0],
         'gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0]},
    ]

begin_time = datetime.datetime.now()
svm_reg = SVR()
# 5-fold cross-validation over 8 + 42 = 50 candidates, i.e. 50 * 5 = 250 fits
# (plus the final refit of the best candidate, GridSearchCV's default).
grid_search = GridSearchCV(svm_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True, verbose=1)

grid_search.fit(housing_prepared, housing_labels)
end_time = datetime.datetime.now()

NameError: name 'datetime' is not defined

In [13]:
# Report the wall-clock duration of the grid search (a datetime.timedelta)
print(
    "It took :",
    end_time - begin_time,
    ".First time was 30 minutes less than that but I was using my computer during "
    "this iteration",
)

It took : 1:40:00.649017 .First time was 30 minutes less than that but I was using my computer during this iteration


The best parameter values found by the grid search:

In [14]:
# Parameter combination from param_grid that scored best in the grid search
grid_search.best_params_

{'C': 30000.0, 'kernel': 'linear'}

RMSE on the test data set

In [15]:
# Evaluate the best estimator found by the grid search on the held-out test set
final_model = grid_search.best_estimator_

# Split the test set into target and features
y_test = strat_test_set["median_house_value"].copy()
X_test = strat_test_set.drop(columns=["median_house_value"])

# transform only (no fitting): reuse the preprocessing fitted on the training data
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)

# Root of the mean squared error between true and predicted house values
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
print("RSME is: ", final_rmse)

RSME is:  68139.43891327262
