In [None]:
# load the data

import pandas as pd

def load_data(path):
    """Read the CSV at ``path`` with ``pd.read_csv`` and return the resulting DataFrame."""
    frame = pd.read_csv(path)
    return frame
# Raw housing table; the relative path is resolved against the current working directory.
housingData = load_data(path="../datasets/housing/housing.csv")

In [None]:
# add a new attribute for stratified split
import numpy as np

# Bucket median_income into bins of width 1.5 (rounded up), then merge every
# bin that is not below 5 into a single category 5.0.
housingData["income_cat"] = np.ceil(housingData["median_income"]  / 1.5)
# Assign the capped values back explicitly. Calling where(..., inplace=True) on
# housingData["income_cat"] is a chained in-place update of a column selection,
# which does not write through to the DataFrame when pandas copy-on-write is on.
housingData["income_cat"] = housingData["income_cat"].where(housingData["income_cat"] < 5, 5.0)



In [None]:
# split the data into training and test, stratified split based on income_cat attribute
# that we inserted earlier

from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 42)
# split() yields positional row indices, so rows are selected with iloc
# (position-based) rather than loc (label-based). The two only agree while
# housingData carries the default 0..n-1 index.
for train_index, test_index in split.split(housingData, housingData["income_cat"]):
    strat_train_set = housingData.iloc[train_index]
    strat_test_set = housingData.iloc[test_index]

housing_labels = strat_train_set["median_house_value"].copy() # our labels

In [None]:
# remove the attribute

# Rebind each split to a new frame without the helper column. This avoids an
# in-place drop on frames that were selected out of housingData, and avoids a
# loop variable named `set`, which would shadow the built-in.
# errors="ignore" keeps the cell re-runnable once the column is already gone.
strat_test_set = strat_test_set.drop(["income_cat"], axis = 1, errors = "ignore")
strat_train_set = strat_train_set.drop(["income_cat"], axis = 1, errors = "ignore")

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline

try:
    from sklearn.preprocessing import StandardScaler, Imputer, LabelBinarizer
except ImportError:
    # scikit-learn 0.22 removed preprocessing.Imputer; impute.SimpleImputer is its
    # replacement and accepts the same `strategy` keyword, so it is bound to the
    # old name to keep the rest of the notebook unchanged.
    from sklearn.impute import SimpleImputer as Imputer
    from sklearn.preprocessing import StandardScaler, LabelBinarizer

# Features only: the target column is removed because it is already held in
# housing_labels. Keeping it would put median_house_value into num_attribs and
# feed the label to the models as an input feature (target leakage).
# drop() returns a new DataFrame, so strat_train_set itself is left untouched.
housing = strat_train_set.drop("median_house_value", axis=1)

# select a subset of DataFrame columns (numerical or text) and return them as a NumPy array
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that picks the configured columns out of a DataFrame.

    ``attribute_names`` is stored as given and used to index ``X`` in
    ``transform``, which returns the ``.values`` of that selection.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        # Nothing is learned from the data; fit just hands back the instance.
        return self

    def transform(self, X):
        selected = X[self.attribute_names]
        return selected.values

# Column name lists consumed by the selector steps of the two pipelines.
cat_attribs = ["ocean_proximity"]
housing_num = housing.drop(cat_attribs, axis=1)  # every column except the text one
num_attribs = housing_num.columns.tolist()

# add new attributes combined from existing, 1 hyperparameter: add_bedrooms_per_room
# Column positions read by CombinedAttributesAdder.transform.
rooms_ix = 3
bedrooms_ix = 4
population_ix = 5
household_ix = 6
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio columns derived from existing columns of a 2-D array.

    ``transform`` always appends rooms-per-household and
    population-per-household, and additionally bedrooms-per-room when
    ``add_bedrooms_per_room`` is true. Source columns are located through the
    module-level ``rooms_ix``, ``bedrooms_ix``, ``population_ix`` and
    ``household_ix`` positions.
    """

    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        # Stateless transformer: there is nothing to estimate.
        return self

    def transform(self, X, y=None):
        rooms = X[:, rooms_ix]
        households = X[:, household_ix]
        extra_columns = [rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / rooms)
        # np.c_[a, b, ...] receives its operands as a tuple, so indexing with an
        # explicit tuple concatenates X and the new columns in the same way.
        return np.c_[tuple([X] + extra_columns)]


class PipelineLabelBinarizer(BaseEstimator, TransformerMixin):
    """LabelBinarizer wrapper whose fit/transform accept the (X, y) pair a Pipeline passes.

    Pipeline calls ``fit_transform(X, y)`` on its steps, while
    ``LabelBinarizer.fit_transform`` takes a single argument. Using
    LabelBinarizer directly as a step therefore raises TypeError on
    scikit-learn 0.19 and later. This wrapper ignores ``y`` and forwards ``X``.
    """

    def fit(self, X, y=None):
        # The fitted binarizer is created here, not in __init__, so the
        # estimator has no constructor-time state.
        self.binarizer_ = LabelBinarizer().fit(X)
        return self

    def transform(self, X, y=None):
        return self.binarizer_.transform(X)


# Numerical branch: select columns -> fill missing values with the median ->
# append the ratio attributes -> standardize.
num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_attribs)),
        ('imputer', Imputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

# Categorical branch: select the text column -> binarize its labels.
cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs)),
        ('label_binarizer', PipelineLabelBinarizer()),
    ])

# Run both branches on the same input and concatenate their outputs.
preparation_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])

In [None]:
# run the pipelines above

# Fit every pipeline step on the training frame and transform it in the same call.
housing_prepared = preparation_pipeline.fit_transform(housing)
# Bare last expression: the notebook displays the prepared array as the cell output.
housing_prepared

In [None]:
# machine learning using linear regression model

from sklearn.linear_model import LinearRegression

# Ordinary least-squares model with default settings.
lin_reg = LinearRegression()
# Train on the prepared features against the labels; as the last expression,
# the return value of fit() is what the cell displays.
lin_reg.fit(housing_prepared, housing_labels)

In [None]:
# let's try the full pipeline on a few training instances
# Take the first five rows and their labels, then push the rows through the
# already-fitted pipeline (transform only, no refitting).
some_data = housing.head(5)
some_labels = housing_labels.head(5)
some_data_prepared = preparation_pipeline.transform(some_data)

# Print the model output next to the true values for a quick visual check.
print("Predictions:\t", lin_reg.predict(some_data_prepared))
print("Labels:\t\t", list(some_labels))

In [None]:
# calculate the error for linear regression

from sklearn.metrics import mean_squared_error

# Predict on the same data the model was trained on, so this is the training error.
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(y_true=housing_labels, y_pred=housing_predictions)
lin_rmse = np.sqrt(lin_mse)  # root of the MSE, in the same units as the labels
round(lin_rmse)

In [None]:
# machine learning with decision tree

from sklearn.tree import DecisionTreeRegressor

# A fixed seed makes the fitted tree repeatable across runs, along with every
# score later computed from tree_reg. 42 matches the seed used for the
# stratified split.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(housing_prepared, housing_labels)
# Error is measured on the training data itself, so this is the training RMSE.
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

In [None]:
# cross validation to fine tune the model

def display_scores(scores):
    """Print the scores, their rounded mean and their standard deviation, one per line."""
    summary = (
        ("Scores:", scores),
        ("Mean:", round(scores.mean())),
        ("Standard deviation:", scores.std()),
    )
    for label, value in summary:
        print(label, value)
    
from sklearn.model_selection import cross_val_score

# decision tree with cross validation
tree_scores = cross_val_score(
    tree_reg,
    housing_prepared,
    housing_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
# The scorer reports negated MSE, so flip the sign before taking the root.
tree_rmse_scores = np.sqrt(np.negative(tree_scores))
display_scores(tree_rmse_scores)

In [None]:
# linear regression scores with cross-validation

lin_scores = cross_val_score(
    lin_reg,
    housing_prepared,
    housing_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
# The scorer reports negated MSE, so flip the sign before taking the root.
lin_rmse_scores = np.sqrt(np.negative(lin_scores))
display_scores(lin_rmse_scores)