In [72]:
from __future__ import print_function, division, unicode_literals
import pandas as pd
import numpy as np
import os

In [73]:
#Load CSV 
def load_housing_data():
    csv_path= os.path.join("housing.csv")
    return pd.read_csv(csv_path)

In [74]:
housing= load_housing_data()
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,NEAR BAY
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,358500,NEAR BAY
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,352100,NEAR BAY
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,341300,NEAR BAY
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,342200,NEAR BAY


In [75]:
#Divide by 1.5 to limit the number of income categories
housing["income_cat"]= np.ceil(housing["median_income"]/1.5)
housing["income_cat"].where(housing["income_cat"]<5,5.0, inplace=True)
housing["income_cat"].value_counts()


from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=123)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set= housing.loc[train_index]
    strat_test_set= housing.loc[test_index]


print(strat_test_set["income_cat"].value_counts()/len(strat_test_set))
print(strat_train_set["income_cat"].value_counts()/len(strat_train_set))
print(housing["income_cat"].value_counts()/len(housing))

#Ratio of split looks good enough to proceed


#Dropping Income_cat
for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis=1, inplace=True)
    
strat_train_label= strat_train_set["median_house_value"]
strat_train_set.drop("median_house_value", axis=1, inplace=True)

strat_train_label.head()
strat_train_set.head()

3.0    0.350533
2.0    0.318798
4.0    0.176357
5.0    0.114583
1.0    0.039729
Name: income_cat, dtype: float64
3.0    0.350594
2.0    0.318859
4.0    0.176296
5.0    0.114402
1.0    0.039850
Name: income_cat, dtype: float64
3.0    0.350581
2.0    0.318847
4.0    0.176308
5.0    0.114438
1.0    0.039826
Name: income_cat, dtype: float64


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
10888,-117.9,33.7,12,4695,1110.0,2153,989,4.6483,<1H OCEAN
2092,-119.78,36.75,43,2070,512.0,1925,444,1.4635,INLAND
18810,-121.67,40.89,17,2548,537.0,1118,461,2.267,INLAND
6905,-118.11,34.03,36,1493,316.0,989,293,3.5272,<1H OCEAN
10102,-117.97,33.93,35,1887,328.0,989,351,4.1321,<1H OCEAN


In [76]:
housing= strat_train_set.copy()
housing_num =housing.drop("ocean_proximity", axis=1)
housing_num.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
10888,-117.9,33.7,12,4695,1110.0,2153,989,4.6483
2092,-119.78,36.75,43,2070,512.0,1925,444,1.4635
18810,-121.67,40.89,17,2548,537.0,1118,461,2.267
6905,-118.11,34.03,36,1493,316.0,989,293,3.5272
10102,-117.97,33.93,35,1887,328.0,989,351,4.1321


In [77]:
#Custom Transformer

from sklearn.base import BaseEstimator, TransformerMixin

rooms_ix, bedrooms_ix, population_ix, household_ix= 3,4,5,6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room= add_bedrooms_per_room
    def fit(self, X, y=None):
        return self
    def transform(self, X, Y=None):
        rooms_per_household= X[:,rooms_ix]/X[:,household_ix]
        population_per_household=X[:,population_ix]/X[:,household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room= X[:, bedrooms_ix]/X[:,rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

In [78]:
attr_adder= CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs= attr_adder.transform(housing.values)

In [79]:
#Transformation Pipeline

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, Imputer, LabelBinarizer

num_pipeline= Pipeline([
    ('imputer', Imputer(strategy='median')),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler())
])



In [80]:
housing_num_tr=num_pipeline.fit_transform(housing_num)
housing_num_tr=pd.DataFrame(
    housing_extra_attribs,
    columns=list(housing.columns)+["rooms_per_household", "population_per_household"])
housing_num_tr.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,rooms_per_household,population_per_household
0,-117.9,33.7,12,4695,1110,2153,989,4.6483,<1H OCEAN,4.74722,2.17695
1,-119.78,36.75,43,2070,512,1925,444,1.4635,INLAND,4.66216,4.33559
2,-121.67,40.89,17,2548,537,1118,461,2.267,INLAND,5.52711,2.42516
3,-118.11,34.03,36,1493,316,989,293,3.5272,<1H OCEAN,5.09556,3.37543
4,-117.97,33.93,35,1887,328,989,351,4.1321,<1H OCEAN,5.37607,2.81766


In [81]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names= attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

In [82]:
num_attribs= list(housing_num)
print(list(housing_num))
cat_attribs= ["ocean_proximity"]

num_pipeline= Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', Imputer(strategy='median')),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler())
])

#cat_pipeline= Pipeline([
#    ('selector', DataFrameSelector(cat_attribs)),
#    ('label_binarizer', LabelBinarizer()),
#])

from sklearn_pandas import DataFrameMapper
cat_pipeline = Pipeline([
    ('label_binarizer', DataFrameMapper([(cat_attribs, LabelBinarizer())])),
])

['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income']




In [83]:
from sklearn.pipeline import FeatureUnion

full_pipeline= FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline)
])

In [84]:
housing_prepared=full_pipeline.fit_transform(housing)
housing_prepared

array([[ 0.83777132, -0.90910198, -1.31880165, ...,  0.        ,
         0.        ,  0.        ],
       [-0.0982453 ,  0.5143055 ,  1.14388773, ...,  0.        ,
         0.        ,  0.        ],
       [-1.03924072,  2.44640613, -0.92159368, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 1.26097031, -1.11444601, -1.15991846, ...,  0.        ,
         0.        ,  0.        ],
       [-0.96953735,  1.36834998,  0.3494718 , ...,  0.        ,
         0.        ,  0.        ],
       [-1.11392289,  1.09766921,  0.11114703, ...,  0.        ,
         0.        ,  0.        ]])

In [85]:
housing_prepared=pd.DataFrame(housing_prepared,
                              columns=list(housing_num.columns)+["rooms_per_household", "population_per_household", "bedrooms_per_room","OP_Bin1", 
                                           "OP_Bin2", "OP_Bin3", "OP_Bin4", "OP_Bin5"])
housing_prepared.head()
housing_prepared.shape

(16512, 16)

In [86]:
#Implement Linear Regression

from sklearn.linear_model import LinearRegression

housing_labels= strat_train_label.copy()

lin_reg= LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [87]:
#Trying out on few instances
some_data= housing_prepared.iloc[:5]
some_labels= housing_labels.iloc[:5]
#some_prepared_data= full_pipeline.fit_transform(some_data)
print("Predictions:", lin_reg.predict(some_data))

Predictions: [274369.41779856  55315.37709485  32410.55033648 204224.47020141
 227822.7362163 ]


In [88]:
print(list(some_labels))

[190800, 46600, 57800, 213700, 198100]


In [89]:
#Check quality of model

from sklearn.metrics import mean_squared_error

housing_predictions= lin_reg.predict(housing_prepared)
lin_mse= mean_squared_error(housing_labels, housing_predictions)
lin_rmse= np.sqrt(lin_mse)
lin_rmse

#Underfitting model

68163.84275048292

In [90]:
#Using Decision Tree Regressor

from sklearn.tree import DecisionTreeRegressor

tree_reg= DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [91]:
housing_predictions= tree_reg.predict(housing_prepared)
tree_rmse= mean_squared_error(housing_labels, housing_predictions)
tree_rmse= np.sqrt(tree_rmse)
tree_rmse

# Now it is overfitting

0.0

In [92]:
#Using cross validation

from sklearn.model_selection import cross_val_score
scores= cross_val_score(tree_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores= np.sqrt(-scores)
tree_rmse_scores

array([73174.53273733, 72187.44945632, 69281.18554342, 70575.76371638,
       70959.44891347, 68995.18910744, 69043.63135237, 69712.53610748,
       69959.5069046 , 67779.51667785])

In [93]:
#Check RMSE
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard Deviation:", scores.std())

In [94]:
#RMSE of Decision Tree

display_scores(tree_rmse_scores)

Scores: [73174.53273733 72187.44945632 69281.18554342 70575.76371638
 70959.44891347 68995.18910744 69043.63135237 69712.53610748
 69959.5069046  67779.51667785]
Mean: 70166.87605166594
Standard Deviation: 1526.1705225828052


In [95]:
#RMSE of Linear Regression

lin_scores= cross_val_score(lin_reg, housing_prepared, housing_labels, scoring= "neg_mean_squared_error", cv=10)

lin_rmse_scores= np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

Scores: [69632.10182642 69118.84646769 66254.64334091 70558.40593379
 68047.48394193 67849.54843674 69175.15807561 69398.05276184
 67851.48398075 67127.03528939]
Mean: 68501.27600550668
Standard Deviation: 1230.4093246429848


In [96]:
from sklearn.ensemble import RandomForestRegressor
forest_reg= RandomForestRegressor()
forest_reg.fit(housing_prepared, housing_labels)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [97]:
forest_scores= cross_val_score(forest_reg, housing_prepared, housing_labels, scoring= "neg_mean_squared_error", cv=10)

forest_rmse_scores= np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

Scores: [52019.85602519 53613.44298948 52405.98338873 50513.67816639
 51950.82624436 53148.04752666 53995.27369331 53144.7096185
 53415.70079291 52205.01613527]
Mean: 52641.253458081934
Standard Deviation: 975.8922290083785


In [98]:
#Fine Tune Model

In [99]:
#Grid Search
from sklearn.model_selection import GridSearchCV
param_grid=[
    {'n_estimators':[3,10,30], 'max_features': [2,4,6,8]},
    {'bootstrap': [False], 'n_estimators':[3,10], 'max_features':[2,3,4]}
]

forest_reg= RandomForestRegressor()
grid_search= GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(housing_prepared, housing_labels)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]}, {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [100]:
grid_search.best_params_

{'max_features': 8, 'n_estimators': 30}

In [101]:
grid_search.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=8, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=30, n_jobs=None, oob_score=False,
           random_state=None, verbose=0, warm_start=False)

In [102]:
cvres= grid_search.cv_results_

In [103]:
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

63073.40949470818 {'max_features': 2, 'n_estimators': 3}
55630.854511913865 {'max_features': 2, 'n_estimators': 10}
52293.45485147453 {'max_features': 2, 'n_estimators': 30}
59876.002347967755 {'max_features': 4, 'n_estimators': 3}
52465.03217381455 {'max_features': 4, 'n_estimators': 10}
49905.51693796976 {'max_features': 4, 'n_estimators': 30}
59375.212351689945 {'max_features': 6, 'n_estimators': 3}
51876.61703077165 {'max_features': 6, 'n_estimators': 10}
49648.46245520243 {'max_features': 6, 'n_estimators': 30}
58757.98492518824 {'max_features': 8, 'n_estimators': 3}
51664.45616314324 {'max_features': 8, 'n_estimators': 10}
49647.54555841576 {'max_features': 8, 'n_estimators': 30}
61413.57639868497 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
53843.50416600472 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
60010.268268884014 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
52207.937029278684 {'bootstrap': False, 'max_features': 3, 'n_estimators

In [104]:
#Ensemble Methods

In [105]:
#Analyze best models and their errors

feature_importances= grid_search.best_estimator_.feature_importances_
feature_importances

array([6.89602357e-02, 6.24299180e-02, 4.29051896e-02, 1.56521447e-02,
       1.45280811e-02, 1.48268240e-02, 1.33133558e-02, 3.71592725e-01,
       5.33076100e-02, 1.10838216e-01, 6.28655716e-02, 9.23403522e-03,
       1.53709272e-01, 2.28980184e-04, 1.59470974e-03, 4.01313185e-03])

In [111]:
housing_prepared.columns
sorted(zip( feature_importances,housing_prepared.columns), reverse=False)

[(0.00022898018382116225, 'OP_Bin3'),
 (0.0015947097403856392, 'OP_Bin4'),
 (0.0040131318465227534, 'OP_Bin5'),
 (0.009234035223602602, 'OP_Bin1'),
 (0.013313355786428232, 'households'),
 (0.014528081050887051, 'total_bedrooms'),
 (0.01482682395767529, 'population'),
 (0.01565214467989933, 'total_rooms'),
 (0.042905189564878404, 'housing_median_age'),
 (0.05330761004286834, 'rooms_per_household'),
 (0.06242991795833051, 'latitude'),
 (0.06286557156208766, 'bedrooms_per_room'),
 (0.06896023573523592, 'longitude'),
 (0.11083821612095995, 'population_per_household'),
 (0.15370927189785988, 'OP_Bin2'),
 (0.3715927246485572, 'median_income')]

In [112]:
#We can drop OP_Bin3, OP_Bin4, OP_Bin5, OP_Bin1

In [114]:
#Evaluate model on test set
final_model= grid_search.best_estimator_

x_test= strat_test_set.drop("median_house_value", axis=1)
y_test= strat_test_set["median_house_value"].copy()

In [116]:
X_test_prepared= full_pipeline.transform(x_test)


final_predictions= final_model.predict(X_test_prepared)

In [117]:
final_rmse= mean_squared_error(y_test, final_predictions)
final_rmse= np.sqrt(final_rmse)

In [119]:
final_rmse

49474.04627856251