In [3]:
# Prepare the data set for machine learning by fixing missing values and adding combination variables

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

In [4]:
# import the trainging set and check that it's read in properly
housing = pd.read_csv("../data/interim/housing_train_set.csv")
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-121.89,37.29,38.0,1568.0,351.0,710.0,339.0,2.7042,286600.0,<1H OCEAN
1,-121.93,37.05,14.0,679.0,108.0,306.0,113.0,6.4214,340600.0,<1H OCEAN
2,-117.2,32.77,31.0,1952.0,471.0,936.0,462.0,2.8621,196900.0,NEAR OCEAN
3,-119.61,36.31,25.0,1847.0,371.0,1460.0,353.0,1.8839,46300.0,INLAND
4,-118.59,34.23,17.0,6592.0,1525.0,4459.0,1463.0,3.0347,254500.0,<1H OCEAN


In [5]:
# Separate the predictors and target values
housing_labels = housing.median_house_value
housing.drop("median_house_value", axis=1)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,-121.89,37.29,38.0,1568.0,351.0,710.0,339.0,2.7042,<1H OCEAN
1,-121.93,37.05,14.0,679.0,108.0,306.0,113.0,6.4214,<1H OCEAN
2,-117.20,32.77,31.0,1952.0,471.0,936.0,462.0,2.8621,NEAR OCEAN
3,-119.61,36.31,25.0,1847.0,371.0,1460.0,353.0,1.8839,INLAND
4,-118.59,34.23,17.0,6592.0,1525.0,4459.0,1463.0,3.0347,<1H OCEAN
...,...,...,...,...,...,...,...,...,...
16507,-118.13,34.20,46.0,1271.0,236.0,573.0,210.0,4.9312,INLAND
16508,-117.56,33.88,40.0,1196.0,294.0,1052.0,258.0,2.0682,INLAND
16509,-116.40,34.09,9.0,4855.0,872.0,2098.0,765.0,3.2723,INLAND
16510,-118.01,33.82,31.0,1960.0,380.0,1356.0,356.0,4.0625,<1H OCEAN


In [9]:
# prepare to impute the missing numeric values, separate ocean_proximity b/c categorical variables dont work with the simple imputer
housing_num = housing.drop("ocean_proximity", axis=1)
# initialize the imputer object
imputer = SimpleImputer(strategy="median")
# fit the munerical data in the imputer
imputer.fit(housing_num)

SimpleImputer(strategy='median')

In [14]:
print(imputer.statistics_)
print(housing_num.median().values)
print(imputer.statistics_ == housing_num.median().values)

[-1.1851e+02  3.4260e+01  2.9000e+01  2.1195e+03  4.3300e+02  1.1640e+03
  4.0800e+02  3.5409e+00  1.7950e+05]
[-1.1851e+02  3.4260e+01  2.9000e+01  2.1195e+03  4.3300e+02  1.1640e+03
  4.0800e+02  3.5409e+00  1.7950e+05]
[ True  True  True  True  True  True  True  True  True]


In [15]:
# replace the missing numerical variables with the median
x = imputer.transform(housing_num)

In [16]:
housing_tr = pd.DataFrame(x, columns=housing_num.columns, index=housing_num.index)

In [22]:
# now to deal with the categorical variable
housing_cat = housing[['ocean_proximity']]
housing_cat.head(10)

Unnamed: 0,ocean_proximity
0,<1H OCEAN
1,<1H OCEAN
2,NEAR OCEAN
3,INLAND
4,<1H OCEAN
5,INLAND
6,<1H OCEAN
7,INLAND
8,<1H OCEAN
9,<1H OCEAN


In [23]:
# one-hot encode to avoide the problems associated with ordinal encoding
from sklearn.preprocessing import OneHotEncoder
cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)

In [24]:
housing_cat_1hot

<16512x5 sparse matrix of type '<class 'numpy.float64'>'
	with 16512 stored elements in Compressed Sparse Row format>

In [25]:
housing_cat_1hot.toarray()

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       ...,
       [0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0.]])

In [26]:
cat_encoder.categories_

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]

In [27]:
from sklearn.base import BaseEstimator, TransformerMixin

In [28]:
rooms_ix, bedrooms_ix, population_ix, households_ix = 3,4,5,6

In [32]:
class CombinedAttributesadder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]            

In [35]:
attr_adder = CombinedAttributesadder(add_bedrooms_per_room=-False)
housing_extra_attribs = attr_adder.transform(housing.values)