In [5]:
import pandas as pd
import os 
import numpy as np

HOUSING_PATH = os.path.join("datasets", "housing")
def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)


housing = load_housing_data()
print(housing.head())

housing["income_cat"] = pd.cut(housing["median_income"],
bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
labels=[1, 2, 3, 4, 5])

from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]


housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

# Data Cleaning
housing.dropna(subset=["total_bedrooms"]) # option 1
housing.drop("total_bedrooms", axis=1) # option 2
median = housing["total_bedrooms"].median() # option 3
housing["total_bedrooms"].fillna(median, inplace=True)


from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="median")
housing_num = housing.drop("ocean_proximity",axis=1) 
imputer.fit(housing_num)

print("\n----------------------------------\n----------------------------------\n")
print(imputer.statistics_)
print("\n----------------------------------\n----------------------------------\n")

print(housing_num.median().values)
print("\n----------------------------------\n----------------------------------\n")
X= imputer.transform(housing_num)

housing_tr  = pd.DataFrame(X,columns=housing_num.columns)

# Handling Text and Categorical Attributes
housing_cat = housing[["ocean_proximity"]]
print("\n----------------------------------\n")
print(housing_cat.head(10))

from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder()

housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
print("\n----------------------------------\n")
print(housing_cat_encoded[:10])
print("\n----------------------------------\n")
print(ordinal_encoder.categories_)
print("\n----------------------------------\n")

from sklearn.preprocessing import OneHotEncoder
cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot.toarray()
print(housing_cat_1hot.toarray())
print("\n----------------------------------\n")
print(cat_encoder.categories_)
print("\n----------------------------------\n")


# Customs Transformers
from sklearn.base import BaseEstimator, TransformerMixin
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self # nothing else to do
    def transform(self, X, y=None):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
            bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

# Transformation Pipelines

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])
housing_num_tr = num_pipeline.fit_transform(housing_num)

from sklearn.compose import ColumnTransformer

num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])
housing_prepared = full_pipeline.fit_transform(housing)


# Training and Evaluating on the Training Set

from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
print("Predictions:", lin_reg.predict(some_data_prepared))
print("\n----------------------------------\n")
print("Labels:", list(some_labels))
print("\n----------------------------------\n")

from sklearn.metrics import mean_squared_error
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)
print("\n----------------------------------\n")








   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income  median_house_value ocean_proximity  
0       322.0       126.0         8.3252            452600.0        NEAR BAY  
1      2401.0      1138.0         8.3014            358500.0        NEAR BAY  
2       496.0       177.0         7.2574            352100.0        NEAR BAY  
3       558.0       219.0         5.6431            341300.0        NEAR BAY  
4       565.0       259.0         3.8462            342200.0        NEAR BAY  

----------------------------------
----------------------------------

[-1

In [6]:
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)

print(tree_rmse)
print("\n----------------------------------\n")


0.0

----------------------------------



In [12]:
# Cross-Validation

from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg, housing_prepared, housing_labels,
                        scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

print("Decision tree")
print(display_scores(tree_rmse_scores))

print("\n----------------------------------\n")


lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,
                            scoring="neg_mean_squared_error", cv=10)
...
lin_rmse_scores = np.sqrt(-lin_scores)
print(display_scores(lin_rmse_scores))
print("\n----------------------------------\n")



Decision tree
Scores: [69154.13209457 67178.20949022 70322.95890415 68924.04566905
 71539.45338223 75070.21765793 68476.43341184 71769.9995391
 76470.90054371 70796.896179  ]
Mean: 70970.32468717915
Standard deviation: 2767.01010268427
None

----------------------------------

Scores: [66877.52325028 66608.120256   70575.91118868 74179.94799352
 67683.32205678 71103.16843468 64782.65896552 67711.29940352
 71080.40484136 67687.6384546 ]
Mean: 68828.99948449331
Standard deviation: 2662.761570610344
None

----------------------------------



In [14]:
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(housing_prepared, housing_labels)
forest_predictions = forest_reg.predict(housing_prepared)
forest_mse= mean_squared_error(housing_labels,forest_predictions)
forest_rmse = np.sqrt(forest_mse)



'''
scores_forets = cross_val_score(forest_reg,housing_prepared,housing_labels,scoring="neg_mean_squared_error",cv=10)
forest_rmse_scores =np.sqrt(-scores)

print(forest_rmse)
print(display_scores(forest_rmse_scores))
print("\n----------------------------------\n")
'''

'\nscores_forets = cross_val_score(forest_reg,housing_prepared,housing_labels,scoring="neg_mean_squared_error",cv=10)\nforest_rmse_scores =np.sqrt(-scores)\n\nprint(forest_rmse)\nprint(display_scores(forest_rmse_scores))\nprint("\n----------------------------------\n")\n'

In [16]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(forest_reg, housing_prepared, housing_labels,
                        scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-scores)
print("Random Forest")
print(forest_rmse)

Random Forest
18803.732038907292


In [17]:
print(display_scores(forest_rmse_scores))
print("\n----------------------------------\n")

Scores: [49604.85319566 47358.67962284 49748.99900586 52315.66315942
 49672.07583274 53432.66401929 49054.90259011 48030.15459772
 52952.41748561 49957.46034966]
Mean: 50212.78698588959
Standard deviation: 1936.1824260156664
None

----------------------------------



In [18]:
from sklearn.model_selection import GridSearchCV

param_grid= [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]

forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
              scoring='neg_mean_squared_error',
              return_train_score=True)
grid_search.fit(housing_prepared, housing_labels)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_jo

In [19]:
grid_search.best_params_

{'max_features': 8, 'n_estimators': 30}

In [20]:
grid_search.best_estimator_

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features=8, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=30, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [22]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score),params)

65019.870548129664 {'max_features': 2, 'n_estimators': 3}
55639.60302593755 {'max_features': 2, 'n_estimators': 10}
53536.99354911229 {'max_features': 2, 'n_estimators': 30}
61593.38235833468 {'max_features': 4, 'n_estimators': 3}
53579.144360744445 {'max_features': 4, 'n_estimators': 10}
51212.95692875872 {'max_features': 4, 'n_estimators': 30}
60316.21816797359 {'max_features': 6, 'n_estimators': 3}
53215.343089210706 {'max_features': 6, 'n_estimators': 10}
50802.1482471517 {'max_features': 6, 'n_estimators': 30}
59577.922972735236 {'max_features': 8, 'n_estimators': 3}
52440.57185092238 {'max_features': 8, 'n_estimators': 10}
50685.027075180806 {'max_features': 8, 'n_estimators': 30}
63270.02698398569 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
55125.19727679271 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
60107.943993405985 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
53241.537620630974 {'bootstrap': False, 'max_features': 3, 'n_estimator

In [23]:
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([6.10043959e-02, 5.94623289e-02, 4.46799195e-02, 1.61450458e-02,
       1.54521599e-02, 1.61037141e-02, 1.45105590e-02, 3.83618332e-01,
       7.37341442e-02, 3.61729284e-02, 1.08415877e-01, 4.36717687e-02,
       6.57014981e-03, 1.14120582e-01, 1.05927593e-04, 2.54130467e-03,
       3.69086205e-03])

In [24]:
extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
cat_encoder = full_pipeline.named_transformers_["cat"]
cat_one_hot_attribs = list(cat_encoder.categories_[0])
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
sorted(zip(feature_importances, attributes), reverse=True)

[(0.38361833190706396, 'median_income'),
 (0.11412058228767799, 'INLAND'),
 (0.1084158772157383, 'pop_per_hhold'),
 (0.0737341441561028, 'income_cat'),
 (0.06100439587888486, 'longitude'),
 (0.05946232893190359, 'latitude'),
 (0.044679919500945554, 'housing_median_age'),
 (0.043671768741071974, 'bedrooms_per_room'),
 (0.0361729284383818, 'rooms_per_hhold'),
 (0.016145045770954807, 'total_rooms'),
 (0.01610371408252557, 'population'),
 (0.015452159923265851, 'total_bedrooms'),
 (0.01451055904098792, 'households'),
 (0.006570149814731985, '<1H OCEAN'),
 (0.0036908620477178024, 'NEAR OCEAN'),
 (0.002541304669369197, 'NEAR BAY'),
 (0.00010592759267604055, 'ISLAND')]

In [25]:
final_model = grid_search.best_estimator_
X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

In [26]:
from scipy import stats
>>> confidence = 0.95
>>> squared_errors = (final_predictions - y_test) ** 2
>>> np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,
                loc=squared_errors.mean(),
                scale=stats.sem(squared_errors)))

array([46500.86049695, 50422.67152933])