In [1]:
import os
import tarfile
from six.moves import urllib
import hashlib
import numpy as np

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = "datasets/housing"
HOUSING_URL = DOWNLOAD_ROOT + HOUSING_PATH +"/housing.tgz"

def fetch_housing_data(housing_url = HOUSING_URL, housing_path = HOUSING_PATH):
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path = housing_path)
    housing_tgz.close()
    
fetch_housing_data(HOUSING_URL, HOUSING_PATH)
    
import pandas as pd
def load_housing_data(housing_path = HOUSING_PATH):
    csv_path = os.path.join (housing_path, "housing.csv")
    return pd.read_csv(csv_path)

housing = load_housing_data()



In [2]:
#making median income in categories for simplification from continuous variable and putting all the maximum ones in 5
#This category is made just to do stratified sampling on the basis of this category
housing["income_cat"] = np.ceil(housing["median_income"]/1.5)
housing["income_cat"].where(housing["income_cat"] < 5, 5.0, inplace = True)

#Doing stratified sampling to make the training testing set
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 42)

for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set= housing.loc[test_index]

In [3]:
#now that we have divided the group, we drop the column income_cat from the database

for set in (strat_train_set, strat_test_set):
    set.drop(["income_cat"], axis = 1, inplace = True)
 

In [4]:
#making a copy of training data 
#Now that we have visualized the data, we want to go back to the original data, make its copy and separate the data from target values (median housing valu
housing = strat_train_set.copy()
housing = strat_train_set.drop("median_house_value", axis = 1)
housing_labels = strat_train_set["median_house_value"].copy()

X_test = strat_test_set.drop('median_house_value', axis = 1)
y_test = strat_test_set["median_house_value"].copy()


In [5]:
housing_num = housing.drop("ocean_proximity", axis = 1)
housing_cat = housing["ocean_proximity"]


In [6]:
#making our own transforms. adding two extra attribs like before. population_per_household amd rooms_per_household
from sklearn.base import BaseEstimator, TransformerMixin

rooms_ix , bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator,TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
        
    def fit(self, X, y = None):
        return self
    
    def transform(self, X, y = None):
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix]/ X[:,household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix]/ X[:, rooms_ix]
            return np.c_[X,rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]



In [7]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer


cats = housing.ocean_proximity.unique()
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]


class MyLabelBinarizer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, x, y=None):
        return self
    def transform(self, x, y=None):
        x = x.astype(pd.api.types.CategoricalDtype(categories = cats))
        dummies = pd.get_dummies(x)
        return dummies


num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy = "median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler())
    ])


cat_pipeline = Pipeline([
        ('label_binarizer', MyLabelBinarizer())
    ])

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", cat_pipeline, cat_attribs),
    ])


housing_prepared = full_pipeline.fit_transform(housing)
#print(housing_prepared)



In [8]:

from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint



forest_reg = RandomForestRegressor(random_state=42)
params = {
    'n_estimators': randint(1,200),
     'max_features': randint(1, 8)
}
 
newgridsearch = RandomizedSearchCV(forest_reg, param_distributions = params, n_iter=10, cv = 5, scoring='neg_mean_squared_error', n_jobs = 5)
r_grid_search = newgridsearch.fit(housing_prepared, housing_labels)
r_cv = r_grid_search.cv_results_

for mean_score, params in zip(r_cv["mean_test_score"], r_cv["params"]):
    print(np.sqrt(-mean_score), params)


49246.42822568018 {'max_features': 6, 'n_estimators': 90}
52167.259787109186 {'max_features': 2, 'n_estimators': 93}
49340.47893825622 {'max_features': 5, 'n_estimators': 149}
50389.99280724093 {'max_features': 7, 'n_estimators': 26}
51863.746022514715 {'max_features': 2, 'n_estimators': 185}
55270.59014543636 {'max_features': 1, 'n_estimators': 31}
50452.23100730043 {'max_features': 3, 'n_estimators': 104}
51963.54184952304 {'max_features': 2, 'n_estimators': 123}
49392.11245890409 {'max_features': 6, 'n_estimators': 56}
49383.34860447154 {'max_features': 7, 'n_estimators': 92}


In [9]:
feature_importances = r_grid_search.best_estimator_.feature_importances_

extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
cats = list(cats)
attributes = num_attribs + extra_attribs + cats 
f_importance = sorted(zip(feature_importances, attributes), reverse = True)
#print(f_importance)

class feature_selector(BaseEstimator, TransformerMixin):
    def __init__(self , number, f_imp = None):
        self.number = number
        self.f_imp = f_imp
        
    def fit(self,  y = None, f_imp =  None):
        return self
    
    def transform(self, x, y = None):
        selected = self.f_imp[0:self.number]
        list_att = [x[1] for x in selected]
        list_items = []
        for item in list_att:
            list_items.append(attributes.index(item))
        x = x[:, list_items]
        return x

ls = feature_selector(5, f_importance)
final = ls.transform( housing_prepared)
print((final))

final_model = r_grid_search.best_estimator_

[[-0.61493744  0.         -0.08649871  0.15531753 -1.15604281]
 [ 1.33645936  0.         -0.03353391 -0.83628902 -1.17602483]
 [-0.5320456   0.         -0.09240499  0.4222004   1.18684903]
 ...
 [-0.3167053   1.         -0.03055414 -0.52177644  1.58648943]
 [ 0.09812139  0.          0.06150916 -0.30340741  0.78221312]
 [-0.15779865  0.         -0.09586294  0.10180567 -1.43579109]]


In [10]:
final_pipeline = Pipeline ([
    ("preparation_data", full_pipeline),
    ("feature_selection", feature_selector(10)),
    ("prediction", final_model)   
])


final_pipeline.set_params(feature_selection__f_imp = f_importance)
#print(final_pipeline.get_params())
training1 = final_pipeline.fit( housing, housing_labels )

pred_train = training1.predict(housing)
final_mse = mean_squared_error(housing_labels, pred_train)
final_rmse = np.sqrt(final_mse)
print("training data error: " , final_rmse)


test_prepared = final_pipeline.fit(X_test, y_test)
test_predictions = test_prepared.predict(X_test)
final_mse = mean_squared_error(y_test, test_predictions)
final_rmse = np.sqrt(final_mse)
print("testing data error: ", final_rmse)

#how does the pipeline know where the input data is going


training data error:  18303.166820165683
testing data error:  19574.290580869783


In [11]:
#Q3
#with rbf, SVR, we can not get feature importances as the features are modified and taken to a high dimensional space
# b_est_q3 = r_grid_searchQ2.best_estimator_



#Q3    our best estimator was coming as 'rbf' but RFE doesnt work with it so we tried it with SVR linear
from sklearn.feature_selection import RFE

b_est_q3 = SVR(kernel = 'linear')


class feature_selector_SVM(BaseEstimator, TransformerMixin):
    def __init__ (self, number, h_l = None):
        self.number = number
        self.h_l = h_l
        
    def fit(self, x, h_l = None):
        return self
    
    def transform(self, x):
        #print("inside transfrom of feature selector svm value of h_l is ", h_l)
        #self.h_l = h_l
        selector = RFE(b_est_q3, self.number, step=1)
        selector = selector.fit(x, self.h_l)
        result = [a for a, b in zip(attributes, selector.support_) if b == True]
        list_items = []
        for item in result:
            list_items.append(attributes.index(item))
        x = x[:, list_items]
        return x

inter = feature_selector_SVM(10, housing_labels)
f = inter.transform(housing_prepared)


NameError: name 'SVR' is not defined

In [None]:


final_pipeline_SVM = Pipeline ([
    ("preparation_data", full_pipeline),
    ("feature_selection", feature_selector_SVM(10)),
    ("prediction", b_est_q3)   
])


final_pipeline_SVM.set_params(feature_selection__h_l = housing_labels)
#print(final_pipeline_SVM.get_params())
training1 = final_pipeline_SVM.fit( housing, housing_labels )
pred_train = training1.predict(housing)
final_mse = mean_squared_error(housing_labels, pred_train)
final_rmse = np.sqrt(final_mse)
print("training data error: " , final_rmse)



final_pipeline_SVM.set_params(feature_selection__h_l = y_test)
test_prepared = final_pipeline_SVM.fit(X_test, y_test)
test_predictions = test_prepared.predict(X_test)
final_mse = mean_squared_error(y_test, test_predictions)
final_rmse = np.sqrt(final_mse)
print("testing data error: ", final_rmse)

#how does the pipeline know where the input data is going



In [None]:
print(housing_labels.shape)

In [None]:
#checking through a pipeline the best strategy for better results
from sklearn.model_selection import GridSearchCV

param_grid = [
        {'preparation_data__num__imputer__strategy': ['mean', 'median', 'most_frequent']}
]

grid_search_prep = GridSearchCV(final_pipeline, param_grid, cv=5,
                                scoring='neg_mean_squared_error', verbose=2, n_jobs=4)
grid_search_prep.fit(housing, housing_labels)
grid_search_prep.best_params_