In [1]:
import os
import tarfile
from six.moves import urllib
import hashlib
import numpy as np

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = "datasets/housing"
HOUSING_URL = DOWNLOAD_ROOT + HOUSING_PATH +"/housing.tgz"

def fetch_housing_data(housing_url = HOUSING_URL, housing_path = HOUSING_PATH):
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path = housing_path)
    housing_tgz.close()
    
fetch_housing_data(HOUSING_URL, HOUSING_PATH)
    
import pandas as pd
def load_housing_data(housing_path = HOUSING_PATH):
    csv_path = os.path.join (housing_path, "housing.csv")
    return pd.read_csv(csv_path)

housing = load_housing_data()

#plotting the stuff
%matplotlib inline
import matplotlib.pyplot as plt
#housing.hist(bins = 50, figsize = (20,15))
#plt.show()

In [2]:
#making median income in categories for simplification from continuous variable and putting all the maximum ones in 5
#This category is made just to do stratified sampling on the basis of this category
housing["income_cat"] = np.ceil(housing["median_income"]/1.5)
housing["income_cat"].where(housing["income_cat"] < 5, 5.0, inplace = True)
# housing.head()

In [3]:
#Doing stratified sampling to make the training testing set
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 42)

for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set= housing.loc[test_index]

In [4]:
#now that we have divided the group, we drop the column income_cat from the database

for set in (strat_train_set, strat_test_set):
    set.drop(["income_cat"], axis = 1, inplace = True)
 

In [5]:
#making a copy of training data 
#Now that we have visualized the data, we want to go back to the original data, make its copy and separate the data from target values (median housing valu
housing = strat_train_set.copy()
housing = strat_train_set.drop("median_house_value", axis = 1)
housing_labels = strat_train_set["median_house_value"].copy()

In [6]:
housing_num = housing.drop("ocean_proximity", axis = 1)

housing_cat = housing["ocean_proximity"]


In [7]:
#making our own transforms. adding two extra attribs like before. population_per_household amd rooms_per_household
from sklearn.base import BaseEstimator, TransformerMixin

rooms_ix , bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6
#print(housing_tr.head())
#print(housing_tr.info())

class CombinedAttributesAdder(BaseEstimator,TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
        
    def fit(self, X, y = None):
        return self
    
    def transform(self, X, y = None):
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix]/ X[:,household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix]/ X[:, rooms_ix]
            return np.c_[X,rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]



In [9]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer


cats = housing.ocean_proximity.unique()
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

class MyLabelBinarizer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, x, y=None):
        return self
    def transform(self, x, y=None):
        x = x.astype(pd.api.types.CategoricalDtype(categories = cats))
        dummies = pd.get_dummies(x)
        return dummies


num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy = "median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler())
    ])


cat_pipeline = Pipeline([
        ('label_binarizer', MyLabelBinarizer())
    ])

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", cat_pipeline, cat_attribs),
    ])


housing_prepared = full_pipeline.fit_transform(housing)
#print(housing_prepared)



<class 'numpy.ndarray'>


In [61]:
#training the model
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]

some_data_prepared = full_pipeline.fit_transform(some_data)
#print(type(some_data_prepared))

print("Predictions:", lin_reg.predict(some_data_prepared))
print("Labels: ", list(some_labels))


Predictions: [215615.06923226 346732.1858831  228727.47961606  62616.76604913
 206972.49365904]
Labels:  [286600.0, 340600.0, 196900.0, 46300.0, 254500.0]


In [62]:
from sklearn.metrics import mean_squared_error

housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

from sklearn.metrics import mean_absolute_error

lin_mae = mean_absolute_error(housing_labels, housing_predictions)
lin_mae

49439.89599001896

In [63]:
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(housing_prepared, housing_labels)
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.0

In [64]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(tree_reg, housing_prepared, housing_labels,
                         scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

display_scores(tree_rmse_scores)


Scores: [69284.28804141 65740.32352991 71288.17037922 68907.78126244
 69504.81236153 73478.01533148 71562.31873201 70755.05481841
 76559.53211881 69884.65768198]
Mean: 70696.49542571935
Standard deviation: 2739.5595935141732


In [65]:
#calculating cross validation results for linear regression
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring = "neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(- lin_scores )
display_scores(lin_rmse_scores)

Scores: [66813.86698918 66960.118071   70350.90184102 74739.57052552
 68021.57333985 71193.84183426 64969.63056405 68278.21328013
 71553.62690045 67665.09903423]
Mean: 69054.64423796839
Standard deviation: 2729.7642208915054


In [66]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(housing_prepared, housing_labels)

housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(housing_labels, housing_predictions)
forest_rmse = np.sqrt(forest_mse)

from sklearn.model_selection import cross_val_score

forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,
                                scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)



Scores: [51410.52555694 49408.08775711 53596.25669043 54820.9795026
 50903.87982382 56281.96153024 51936.85494851 49904.99068523
 55337.76136373 53587.06536149]
Mean: 52718.83632200981
Standard deviation: 2235.9191865475136


In [67]:
from sklearn.model_selection import GridSearchCV


param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]

forest_reg = RandomForestRegressor(random_state=42)
# train across 5 folds, that's a total of (12+6)*5=90 rounds of training 
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(housing_prepared, housing_labels)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]}, {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=0)

In [68]:
grid_search.best_params_
grid_search.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=8, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=30, n_jobs=None, oob_score=False, random_state=42,
           verbose=0, warm_start=False)

In [69]:
cvres = grid_search.cv_results_
#cvres['params']
#for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
#    print(np.sqrt(-mean_score), params)

#making dataframe of the above data
#pd.DataFrame(grid_search.cv_results_)

In [70]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
params = {
    'n_estimators': randint(1,200),
     'max_features': randint(1, 8)
}
 
newgridsearch = RandomizedSearchCV(forest_reg, param_distributions = params, n_iter=10, cv = 5, scoring='neg_mean_squared_error')
r_grid_search = newgridsearch.fit(housing_prepared, housing_labels)
r_cv = r_grid_search.cv_results_

for mean_score, params in zip(r_cv["mean_test_score"], r_cv["params"]):
    print(np.sqrt(-mean_score), params)

49246.42822568018 {'max_features': 6, 'n_estimators': 90}
52969.42883506573 {'max_features': 2, 'n_estimators': 42}
51886.544388884795 {'max_features': 2, 'n_estimators': 176}
49286.4259913675 {'max_features': 5, 'n_estimators': 180}
49102.173020869595 {'max_features': 6, 'n_estimators': 171}
49557.10922515899 {'max_features': 6, 'n_estimators': 49}
49734.69106453142 {'max_features': 7, 'n_estimators': 55}
54445.874110411816 {'max_features': 1, 'n_estimators': 146}
52047.322188874816 {'max_features': 2, 'n_estimators': 110}
49294.707904934825 {'max_features': 7, 'n_estimators': 142}


In [52]:
feature_importances = r_grid_search.best_estimator_.feature_importances_

extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
cats = list(cats)
attributes = num_attribs + extra_attribs + cats 
#print(attributes)
f_imp = sorted(zip(feature_importances, attributes), reverse = True)
#print(f_imp)
#print(*f_imp,sep='\n')




class feature_selector(BaseEstimator, TransformerMixin):
    def __init__(self , number):
        self.number = number
    def fit(self, x, y=None):
        return self
    def transform(self, x, y=None):
        selected = f_imp[0:self.number]
        list_att = [x[1] for x in selected]
        list_items = []
        for item in list_att:
            list_items.append( attributes.index(item))
        return list_items

ls = feature_selector(5)
final = ls.transform(housing_prepared)
print(final)
#print(attributes.index('pop_per_hhold'))

[7, 13, 9, 0, 10]


In [72]:
final_model = grid_search.best_estimator_

X_test = strat_test_set.drop('median_house_value', axis = 1)
y_test = strat_test_set["median_house_value"].copy()

X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
print(final_rmse)

83981.45225603662


In [73]:
# Question number 1 
#the more I increase the value for C, the better the rbf kernel performs. if values are till 1000, kernel linear perfomrs better
from sklearn.svm import SVR
param_grid_svm =[
    {'kernel': ['linear'], 'C':[3000]},
    {'kernel': ['rbf'] , 'C': [3000], 'gamma' :['scale']}
]

Support_vector = SVR()

# train across 5 folds, that's a total of (12+6)*5=90 rounds of training 
grid_search = GridSearchCV(Support_vector, param_grid_svm, cv=5,
                           scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(housing_prepared, housing_labels)
grid_search.best_estimator_

negative_mse = grid_search.best_score_
rmse = np.sqrt(-negative_mse)
rmse

grid_search.best_params_



{'C': 3000, 'gamma': 'scale', 'kernel': 'rbf'}

In [74]:
#Q2


Support_vectorRS = SVR()

param_grid_svmQ2 = {
    'kernel': ['linear','rbf'],
    'C': randint(100, 5000),
    'gamma' :['scale']
     }

Q2gridsearch = RandomizedSearchCV(Support_vectorRS, param_distributions = param_grid_svmQ2, n_jobs = 5,
                                   n_iter=10, cv = 5, scoring='neg_mean_squared_error')

r_grid_searchQ2 = Q2gridsearch.fit(housing_prepared, housing_labels)
r_cvQ2 = r_grid_searchQ2.cv_results_

#print(r_cv)
for mean_score, params in zip(r_cvQ2["mean_test_score"], r_cvQ2["params"]):
    print(np.sqrt(-mean_score), params)

70328.85544522104 {'C': 1108, 'gamma': 'scale', 'kernel': 'rbf'}
64595.94677893514 {'C': 3682, 'gamma': 'scale', 'kernel': 'rbf'}
64217.26083662952 {'C': 4114, 'gamma': 'scale', 'kernel': 'rbf'}
70373.97421774412 {'C': 4839, 'gamma': 'scale', 'kernel': 'linear'}
70411.53432459633 {'C': 1857, 'gamma': 'scale', 'kernel': 'linear'}
64685.56923963392 {'C': 3585, 'gamma': 'scale', 'kernel': 'rbf'}
64050.97920782328 {'C': 4332, 'gamma': 'scale', 'kernel': 'rbf'}
70385.15087347322 {'C': 3746, 'gamma': 'scale', 'kernel': 'linear'}
70416.66592489646 {'C': 1631, 'gamma': 'scale', 'kernel': 'linear'}
70391.38110114516 {'C': 3317, 'gamma': 'scale', 'kernel': 'linear'}


In [75]:
#Q3
#with rbf, SVR, we can not get feature importances as the features are modified and taken to a high dimensional space
# b_est_q3 = r_grid_searchQ2.best_estimator_
# so we try it with randomforestregression




In [76]:
#Q3    our best estimator was coming as 'rbf' but RFE doesnt work with it so we tried it with SVR linear
from sklearn.feature_selection import RFE

b_est_q3 = SVR(kernel = 'linear')
selector = RFE(b_est_q3, 10, step=1)
selector = selector.fit(housing_prepared, housing_labels)
# selector.support_ gives the mask of true false for the features given

result = [x for x, y in zip(attributes, selector.support_) if y == True]
print(result)


['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'households', 'median_income', 'rooms_per_hhold', 'bedrooms_per_room', '<1H OCEAN', 'INLAND']


NameError: name 'prepare_select_and_predict_pipeline' is not defined