In [25]:
import pandas as pd
import numpy as np

In [39]:
# Load the dataset and discard the columns that are not used as model inputs.
housing = (
    pd.read_csv('coastal_distance.csv')
    .drop(columns=['ind', 'geometry', 'ocean_proximity'])
)

# Bucket median income into five ordered categories; this helper column is
# what the train/test split is stratified on.
housing['income_cat'] = pd.cut(
    housing['median_income'],
    bins=[0, 1.5, 3, 4.5, 6, np.inf],
    labels=[1, 2, 3, 4, 5],
)

housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,distance_coastline,income_cat
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,0.25503,5
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,0.261432,5
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,0.240331,5
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,0.23041,4
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,0.23041,3


In [40]:
from sklearn.model_selection import StratifiedShuffleSplit

# Stratified 80/20 train/test split, sampled on the income category so that
# both partitions follow (approximately) the income distribution of the
# full dataset.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=43)

# split() yields *positional* row indices, so rows are selected with .iloc;
# .loc would only pick the same rows while the frame keeps its default
# RangeIndex.
train_idx, test_idx = next(split.split(housing, housing['income_cat']))

# The stratification helper column is not a model feature. drop() without
# inplace returns a new frame instead of mutating a selection of `housing`.
strat_train_set = housing.iloc[train_idx].drop(columns='income_cat')
strat_test_set = housing.iloc[test_idx].drop(columns='income_cat')

# Separate the regression target from the predictors.
strat_train_labels = strat_train_set['median_house_value']
strat_test_labels = strat_test_set['median_house_value']
strat_train_set = strat_train_set.drop(columns='median_house_value')
strat_test_set = strat_test_set.drop(columns='median_house_value')

# From here on `housing` refers to the training predictors only.
# NOTE: because of this rebinding the cell cannot be re-run on its own; the
# loading cell has to be re-run first to restore the 'income_cat' column.
housing = strat_train_set.copy()
# strat_train_set.head()

# housing.


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,distance_coastline
8049,-118.17,33.83,45.0,2019.0,363.0,880.0,339.0,4.1023,0.066654
19922,-119.35,36.32,10.0,3817.0,719.0,1686.0,714.0,3.8235,1.75107
2827,-119.09,35.43,28.0,254.0,35.0,118.0,37.0,4.8571,1.119211
3905,-118.5,34.2,18.0,4249.0,933.0,2047.0,909.0,4.1304,0.167543
16992,-122.27,37.56,17.0,3211.0,847.0,1553.0,812.0,4.9434,0.198015


In [68]:
# Column names of the training predictors, in order; used further down to
# label the original attributes in the feature-importance listing.
housing_og_attr = housing.columns.tolist()
housing_og_attr

['longitude',
 'latitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'distance_coastline']

In [41]:
from sklearn.base import BaseEstimator, TransformerMixin

# Column positions, within the matrix handed to transform(), of the
# attributes the derived ratios are computed from.
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6


class combined(BaseEstimator, TransformerMixin):
    """Append ratio columns derived from existing columns of a 2-D matrix.

    Two columns are always appended, rooms / households and
    population / households; a third, bedrooms / rooms, is optional.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default=True
        Whether the bedrooms / rooms column is appended as well.
    """

    def __init__(self, add_bedrooms_per_room=True):
        # scikit-learn's get_params()/set_params()/clone() look a
        # hyper-parameter up by the exact name of the __init__ argument, so
        # it must be stored under that name.
        self.add_bedrooms_per_room = add_bedrooms_per_room

    @property
    def add_bpr(self):
        """Backward-compatible alias for ``add_bedrooms_per_room``."""
        return self.add_bedrooms_per_room

    @add_bpr.setter
    def add_bpr(self, value):
        self.add_bedrooms_per_room = value

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the derived ratio columns appended on the right.

        ``X`` is converted with ``np.asarray`` first, so a DataFrame is
        accepted in addition to a NumPy array.
        """
        X = np.asarray(X)
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        return np.c_[X, rooms_per_household, population_per_household]
        

In [42]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Numeric preprocessing, applied in this order:
#   1. fill missing values with each column's median,
#   2. append the derived ratio columns,
#   3. standardize the columns.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('attr_adder', combined()),
    ('std_scaler', StandardScaler()),
])

In [43]:
# Fit the preprocessing pipeline on the training predictors and transform
# them in the same pass, then show the first four transformed rows.
housing_transform = num_pipeline.fit_transform(X=housing)
housing_transform[0:4]

array([[ 0.6972303 , -0.84147342,  1.30687973, -0.28574813, -0.41721612,
        -0.4877143 , -0.42220982,  0.12344712, -0.75756943,  0.20876467,
        -0.04272092, -0.51128686],
       [ 0.10806565,  0.32439472, -1.47762532,  0.54628976,  0.43357729,
         0.23306646,  0.56382045, -0.02363553,  2.06742783, -0.03486635,
        -0.06319597, -0.38293619],
       [ 0.23788159, -0.0923212 , -0.04559415, -1.10251501, -1.2010932 ,
        -1.16914723, -1.21629286,  0.5216465 ,  1.00771339,  0.57197067,
         0.00907826, -1.13983134],
       [ 0.53246392, -0.66823197, -0.84116702,  0.74620097,  0.94500928,
         0.55589754,  1.0765562 ,  0.13827145, -0.58836451, -0.30317038,
        -0.07274872,  0.08421755]])

In [44]:
from sklearn.linear_model import LinearRegression

# Baseline model: ordinary least squares on the preprocessed training data.
# fit() returns the estimator itself, so it can be bound directly.
lin_reg = LinearRegression().fit(housing_transform, strat_train_labels)
lin_reg

LinearRegression()

In [46]:
from sklearn.metrics import mean_squared_error
# RMSE of the linear model, measured on the same data it was fitted to.
housing_predictions = lin_reg.predict(housing_transform)
lin_mse = mean_squared_error(
    y_true=strat_train_labels,
    y_pred=housing_predictions,
)
lin_rmse = np.sqrt(lin_mse)
lin_rmse


68583.7753994035

In [47]:
from sklearn.tree import DecisionTreeRegressor

# Decision tree regressor. The estimator uses randomness while searching for
# splits, so a fixed seed (same value as the train/test split) keeps the
# fitted tree, and every score computed from it, reproducible across re-runs.
tree_reg = DecisionTreeRegressor(random_state=43)
tree_reg.fit(housing_transform, strat_train_labels)


DecisionTreeRegressor()

In [49]:
# RMSE of the decision tree on its own training data. The results get their
# own names so the linear model's lin_mse / lin_rmse are not overwritten.
# A value at or near zero here means the tree reproduces the training labels,
# not that it generalises; the cross-validated scores are the fair estimate.
housing_predictions = tree_reg.predict(housing_transform)
tree_mse = mean_squared_error(strat_train_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.0

In [53]:
from sklearn.ensemble import RandomForestRegressor

# Random forest regressor. Bootstrapping and feature sub-sampling are random,
# so a fixed seed (same value as the train/test split) keeps the fitted
# model and its scores reproducible across re-runs.
forest_reg = RandomForestRegressor(random_state=43)
forest_reg.fit(housing_transform, strat_train_labels)


RandomForestRegressor()

In [54]:
# RMSE of the random forest on its own training data.
housing_predictions = forest_reg.predict(housing_transform)
# Arguments follow the documented (y_true, y_pred) order, matching the other
# evaluation cells. MSE is symmetric, so the value itself is unaffected.
forest_mse = mean_squared_error(strat_train_labels, housing_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

17683.59291626164

In [55]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the three models on the training data.
# The scorer returns negated MSE (higher is better), hence the sign flip
# before the square root that turns each fold's score into an RMSE.
scores = cross_val_score(
    tree_reg, housing_transform, strat_train_labels,
    scoring='neg_mean_squared_error', cv=10,
)
tree_rmse_scores = np.sqrt(-scores)

lin_scores = cross_val_score(
    lin_reg, housing_transform, strat_train_labels,
    scoring='neg_mean_squared_error', cv=10,
)
lin_rmse_scores = np.sqrt(-lin_scores)

forest_scores = cross_val_score(
    forest_reg, housing_transform, strat_train_labels,
    scoring='neg_mean_squared_error', cv=10,
)
forest_rmse_scores = np.sqrt(-forest_scores)

# Report mean and standard deviation of the per-fold RMSE for each model.
for heading, fold_rmse in (
    ('decison tree:', tree_rmse_scores),
    ('Linear:', lin_rmse_scores),
    ('Forest:', forest_rmse_scores),
):
    print(heading)
    print(fold_rmse.mean(), fold_rmse.std())

decison tree:
66824.61226022638 2054.308292371747
Linear:
68971.11398789023 2990.6542553119075
Forest:
47465.567445277746 997.8033164748739


In [56]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grids for the random forest; every combination inside each
# dict is evaluated.
# NOTE(review): the two grids overlap at max_features=4, n_estimators=10 (for
# both bootstrap settings), so those candidates are fitted twice.
param_grid = [
    {'bootstrap': [True, False], 'n_estimators': [10, 30, 40], 'max_features': [4, 8, 12]},
    {'bootstrap': [False, True], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]

# Fixed seed (same value as the train/test split) so that candidates are
# compared on equal footing and the selected parameters are reproducible.
forest_reg = RandomForestRegressor(random_state=43)

# 5-fold cross-validated search scored by negated MSE; train scores are kept
# alongside the validation scores in cv_results_.
grid_search = GridSearchCV(
    forest_reg,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    return_train_score=True,
)
grid_search.fit(housing_transform, strat_train_labels)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid=[{'bootstrap': [True, False],
                          'max_features': [4, 8, 12],
                          'n_estimators': [10, 30, 40]},
                         {'bootstrap': [False, True], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [57]:
# Cross-validated RMSE of every hyper-parameter combination that was tried.
# mean_test_score holds negated MSE, so negate before taking the square root.
cvres = grid_search.cv_results_
candidate_rmse = np.sqrt(-cvres["mean_test_score"])
for rmse, params in zip(candidate_rmse, cvres["params"]):
    print(rmse, params)

49440.889774473 {'bootstrap': True, 'max_features': 4, 'n_estimators': 10}
47448.564014748066 {'bootstrap': True, 'max_features': 4, 'n_estimators': 30}
47184.05850041942 {'bootstrap': True, 'max_features': 4, 'n_estimators': 40}
49940.269678685996 {'bootstrap': True, 'max_features': 8, 'n_estimators': 10}
48014.65979722412 {'bootstrap': True, 'max_features': 8, 'n_estimators': 30}
47701.32511387763 {'bootstrap': True, 'max_features': 8, 'n_estimators': 40}
50361.091013533834 {'bootstrap': True, 'max_features': 12, 'n_estimators': 10}
48415.546809857646 {'bootstrap': True, 'max_features': 12, 'n_estimators': 30}
48246.199118869 {'bootstrap': True, 'max_features': 12, 'n_estimators': 40}
48843.93283873239 {'bootstrap': False, 'max_features': 4, 'n_estimators': 10}
46991.6266789231 {'bootstrap': False, 'max_features': 4, 'n_estimators': 30}
46440.98260487715 {'bootstrap': False, 'max_features': 4, 'n_estimators': 40}
49732.702195469785 {'bootstrap': False, 'max_features': 8, 'n_estimator

In [58]:
# Hyper-parameter combination with the best mean cross-validated score.
grid_search.best_params_

{'bootstrap': False, 'max_features': 4, 'n_estimators': 40}

In [59]:
# Feature importances of the best forest found by the grid search, one value
# per column of the transformed training matrix.
best_forest = grid_search.best_estimator_
feature_imp = best_forest.feature_importances_
feature_imp

array([0.06499645, 0.07392797, 0.03453396, 0.01450216, 0.01348235,
       0.01434695, 0.01339134, 0.30979873, 0.23783828, 0.06184771,
       0.09348268, 0.06785143])

In [70]:
# Pair each importance with its column name (the original columns first,
# then the three ratio columns appended by the pipeline) and list the most
# important features first.
extra_attr = ['rooms/h', 'pop/h', 'bed/r']
attributes = [*housing_og_attr, *extra_attr]
importance_by_attribute = zip(feature_imp, attributes)
sorted(importance_by_attribute, reverse=True)

[(0.30979872921404467, 'median_income'),
 (0.2378382828027252, 'distance_coastline'),
 (0.09348268129681697, 'pop/h'),
 (0.07392796567241072, 'latitude'),
 (0.06785143386634919, 'bed/r'),
 (0.06499645236419596, 'longitude'),
 (0.061847705053817974, 'rooms/h'),
 (0.03453395628801239, 'housing_median_age'),
 (0.014502156854686664, 'total_rooms'),
 (0.01434694831942117, 'population'),
 (0.013482345857573054, 'total_bedrooms'),
 (0.013391342409946163, 'households')]

In [74]:
# Evaluate the tuned model once on the held-out test set. The pipeline was
# already fitted on the training data, so only transform() is applied here.
final_model = grid_search.best_estimator_
strat_test_transform = num_pipeline.transform(strat_test_set)
final_pred = final_model.predict(strat_test_transform)
final_mse = mean_squared_error(
    y_true=strat_test_labels,
    y_pred=final_pred,
)
final_rmse = np.sqrt(final_mse)
final_rmse

45752.63141892831

In [75]:
from scipy import stats
# 95% t-interval for the mean squared error on the test set, converted to
# RMSE units by taking the square root of both interval bounds.
confidence = 0.95
squared_errors = (final_pred - strat_test_labels) ** 2
mse_interval = stats.t.interval(
    confidence,
    len(squared_errors) - 1,
    loc=squared_errors.mean(),
    scale=stats.sem(squared_errors),
)
np.sqrt(mse_interval)


array([43487.06567596, 47911.18535799])