# Download the Data
As we can see, we first need to download the data from an online source. This can be done manually or by writing a small Python script that downloads the tgz file and extracts the CSV file from it. Writing a Python script is preferred because the data changes from time to time, and a script can be automated so that new data is fetched regularly, which lets us keep
track of the model's accuracy.

In [None]:
import os
import tarfile
from six.moves import urllib

DOWNLOAD_ROOT="https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH=os.path.join("datasets","housing")#local directory (datasets/housing) where the data is stored
HOUSING_URL=DOWNLOAD_ROOT + "datasets/housing/housing.tgz"#URL of the archive to download

#function to fetch data from online source
def fetch_housing_data(housing_url=HOUSING_URL,housing_path=HOUSING_PATH):
    """Download the housing archive and extract it into ``housing_path``.

    Args:
        housing_url: URL of the ``.tgz`` archive to download.
        housing_path: Directory that receives ``housing.tgz`` and the
            extracted files; it is created (with parents) when missing.
    """
    # exist_ok=True replaces the isdir() check and avoids the
    # check-then-create race between the test and the creation
    os.makedirs(housing_path,exist_ok=True)
    tgz_path=os.path.join(housing_path,"housing.tgz")#the archive is saved inside housing_path
    urllib.request.urlretrieve(housing_url,tgz_path)#download the archive to tgz_path
    # the context manager closes the archive even if extraction raises
    # NOTE(review): extractall() trusts the member paths stored in the
    # archive - only use this with a source you trust
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)


In [None]:
fetch_housing_data()#calling the function

import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` located in ``housing_path`` into a DataFrame."""
    return pd.read_csv(os.path.join(housing_path,"housing.csv"))

In [None]:
housing=load_housing_data()
housing.head()

In [None]:
housing.info()

As we can see in the output above, the dataset contains 20640 instances. If we look carefully, we can also see that the total_bedrooms feature has only 20433 non-null values, which means some values are missing, so we will have to work on this problem. Also, all features have the same datatype except ocean_proximity, so we will check it out too.

In [None]:
housing.ocean_proximity.value_counts()

In [None]:
housing.describe()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
housing.hist(bins=50,figsize=(20,15))#bins is the number of intervals (bars) each histogram is divided into
plt.show()

# Creating a test set

In [None]:
import numpy as np

def split_train_test(data,test_ratio,random_state=None):
    """Randomly split ``data`` into a training set and a test set.

    Args:
        data: DataFrame (anything supporting ``len()`` and ``.iloc``).
        test_ratio: Fraction of the rows, between 0 and 1, that goes into
            the test set.
        random_state: Optional seed. When given, the same seed always
            produces the same split; when ``None`` the global NumPy random
            state is used, as before.

    Returns:
        A ``(train_set, test_set)`` tuple of row subsets of ``data``.

    Raises:
        ValueError: If ``test_ratio`` is outside ``[0, 1]``.
    """
    if not 0<=test_ratio<=1:
        raise ValueError("test_ratio must be between 0 and 1, got %r"%(test_ratio,))
    # a private RandomState keeps a seeded split from disturbing the global generator
    rng=np.random if random_state is None else np.random.RandomState(random_state)
    shuffled_indices=rng.permutation(len(data))#shuffled row positions 0..len(data)-1
    test_set_size=int(len(data)*test_ratio)
    test_indices=shuffled_indices[:test_set_size]
    train_indices=shuffled_indices[test_set_size:]
    return data.iloc[train_indices],data.iloc[test_indices]

In [None]:
train_set,test_set=split_train_test(housing,0.2)
# a bare expression that is not the last statement is never displayed,
# so print both sizes explicitly
print(len(train_set),"train +",len(test_set),"test")

In [None]:
#pd.cut function is used to create a income category attribute with 5 categories
housing["income_cat"]=pd.cut(housing["median_income"],
                             bins=[0.,1.5,3.0,4.5,6.,np.inf],
                            labels=[1,2,3,4,5])
housing["income_cat"].hist()

In [None]:
# Stratified Sampling
from sklearn.model_selection import StratifiedShuffleSplit
split=StratifiedShuffleSplit(n_splits=1,test_size=0.2,random_state=42)
# n_splits=1, so the generator yields exactly one (train, test) pair
train_index,test_index=next(split.split(housing,housing["income_cat"]))
# split() returns row positions, so select with iloc; loc would only be right
# while housing keeps its default 0..n-1 index.
# copy() makes the subsets independent frames that can be modified in place later
strat_train_set=housing.iloc[train_index].copy()
strat_test_set=housing.iloc[test_index].copy()

In [None]:
strat_test_set["income_cat"].value_counts()/len(strat_test_set)

In [None]:
for set_ in (strat_train_set,strat_test_set):
    set_.drop("income_cat",axis=1,inplace=True)

# Data Visualization

In [None]:
# creating a copy of train set
housing=strat_train_set.copy()
housing.plot(kind="scatter",x="longitude",y="latitude")

In [None]:
# Above graph is hard to visualize so we will set alpha=0.1 to see more denser area clearly
housing.plot(kind="scatter",x="longitude",y="latitude",alpha=0.1)

In [None]:
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.5,
 s=housing["population"]/100, label="population", figsize=(10,7),
 c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True,
)
plt.legend()
# s represents districts population
# c color represents the price
# cmap we used predefined color map called jet

# Looking for correlation

In [None]:
#using correlation coefficient
#correlation is only defined for numbers, so leave out the text column ocean_proximity
#(pandas >= 2.0 raises an error on it instead of silently skipping it)
corr_matrix=housing.select_dtypes(include="number").corr()
corr_matrix.median_house_value.sort_values(ascending=False)
#we only need the correlation of every feature with median_house_value, which is our label

In [None]:
#using the Pandas scatter_matrix function to look for correlations
#as there are 11 numerical features we would get 121 plots, which is not feasible
#so we will work only with the features that are really important

from pandas.plotting import scatter_matrix
attributes=["median_house_value","median_income","total_rooms",
           "housing_median_age"]
scatter_matrix(housing[attributes],figsize=(12,8))

In [None]:
#median_house_value vs median_income is very important plot so lets take a closer look
housing.plot(kind="scatter",x="median_income",y="median_house_value",alpha=0.1)

# Experimenting with Attribute Combinations

In [None]:
housing["rooms_per_household"]=housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"]=housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"]=housing["population"]/housing["households"]

In [None]:
#correlation is only defined for numbers, so leave out the text column ocean_proximity
#(pandas >= 2.0 raises an error on it instead of silently skipping it)
corr_matrix=housing.select_dtypes(include="number").corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

# Preparing the data for ML Algorithms

In [None]:
housing=strat_train_set.drop("median_house_value",axis=1)
housing_labels=strat_train_set.median_house_value.copy()

In [None]:
#as we know total_bedrooms has some null (missing) values, so we must fill them before applying an ML algorithm
median=housing["total_bedrooms"].median()
#assign the result back to the column: fillna(inplace=True) on a column taken
#from the frame can act on a temporary object and leave housing unchanged
#(this is the case with pandas Copy-on-Write)
housing["total_bedrooms"]=housing["total_bedrooms"].fillna(median)
#housing.info()

In [None]:
#scikit_learn provides a handy class to deal with missing values
from sklearn.impute import SimpleImputer
imputer=SimpleImputer(strategy="median")

In [None]:
# as median only works in numerical values we will be removing feature ocean proximity
housing_num=housing.drop("ocean_proximity",axis=1)

In [None]:
imputer.fit(housing_num)
imputer.statistics_

In [None]:
housing_num.median().values

In [None]:
# Now we can use this trained imputer to transform the training set by replacing missing values with the learned medians
X=imputer.transform(housing_num)

In [None]:
# X is a NumPy array containing the transformed features.
# we can put it back into a DataFrame; passing the original index keeps the
# row labels, so housing_tr stays aligned with housing and housing_labels
housing_tr=pd.DataFrame(X,columns=housing_num.columns,index=housing_num.index)

# Handling Text and Categorial Attributes

In [None]:
housing_cat=housing[["ocean_proximity"]]
housing_cat.head(10)

In [None]:
# converting these text to numbers
from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder()

In [None]:
housing_cat_encoded=ordinal_encoder.fit_transform(housing_cat)
housing_cat_encoded[:10]

In [None]:
#categories_ holds the list of categories found for each encoded feature
ordinal_encoder.categories_

In [None]:
from sklearn.preprocessing import OneHotEncoder
cat_encoder = OneHotEncoder()
housing_cat_1hot =cat_encoder.fit_transform(housing_cat)
housing_cat_1hot

In [None]:
housing_cat_1hot.toarray()

In [None]:
cat_encoder.categories_

We want our transformer to work seamlessly with Scikit-Learn functionalities (such as pipelines), and since Scikit-Learn relies on duck typing (not inheritance), all we need is to create a class and implement three methods: fit() (returning self), transform(), and fit_transform(). We can get the last one for free by simply adding TransformerMixin as a base class. Also, if we add BaseEstimator as a base class (and avoid *args and **kwargs in the constructor), we get two extra methods (get_params() and set_params()) that will be useful for automatic hyperparameter tuning.

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
# positions of the columns used below inside the array passed to transform()
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features as extra columns to a 2-D array.

    Adds rooms per household and population per household, and, when
    ``add_bedrooms_per_room`` is true, bedrooms per room as a last column.
    """

    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the ratio columns appended on the right."""
        households = X[:, household_ix]
        rooms = X[:, rooms_ix]
        new_columns = [rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            new_columns.append(X[:, bedrooms_ix] / rooms)
        # np.c_ stacks X and the 1-D ratio arrays side by side as columns
        return np.c_[(X, *new_columns)]
    
attr_adder=CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs=attr_adder.transform(housing.values)

# Transforming Pipelines

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
num_pipeline=Pipeline([('imputer',SimpleImputer(strategy="median")),
                      ('attribs_adder',CombinedAttributesAdder()),
                      ('std_scaler',StandardScaler())])
housing_num_tr=num_pipeline.fit_transform(housing_num)

In [None]:
housing.values[:,bedrooms_ix]

In [None]:
from sklearn.compose import ColumnTransformer
num_attribs=list(housing_num)
cat_attribs=["ocean_proximity"]
full_pipeline=ColumnTransformer([
    ("num",num_pipeline,num_attribs),
    ("cat",OneHotEncoder(),cat_attribs)
])
housing_prepared=full_pipeline.fit_transform(housing)

# Select and train a model

In [None]:
from sklearn.linear_model import LinearRegression
lin_reg=LinearRegression()
lin_reg.fit(housing_prepared,housing_labels)

In [None]:
# try the trained model on a few training instances
some_data=housing.iloc[:5] # this takes the first five rows (positions 0 to 4)
some_labels=housing_labels.iloc[:5]
some_data_prepared=full_pipeline.transform(some_data)
print("Predictions:",lin_reg.predict(some_data_prepared))

In [None]:
print("Labels:",list(some_labels))

Now we will be measuring linear regression model's RMSE on the whole training set.

In [None]:
from sklearn.metrics import mean_squared_error
housing_predicions=lin_reg.predict(housing_prepared)
lin_mse=mean_squared_error(housing_labels,housing_predicions)
lin_rmse=np.sqrt(lin_mse)
lin_rmse

The results are not satisfactory: most districts' median_housing values range
between $120,000 and $265,000, so a typical prediction error of $68,628 is not very satisfactory. This is an example of model underfitting.

In [None]:
# Now we will train a more powerful model to address the underfitting problem
from sklearn.tree import DecisionTreeRegressor
tree_reg=DecisionTreeRegressor()
tree_reg.fit(housing_prepared,housing_labels)

In [None]:
from sklearn.metrics import mean_squared_error
housing_predictions=tree_reg.predict(housing_prepared)
tree_mse=mean_squared_error(housing_labels,housing_predictions)
tree_rmse=np.sqrt(tree_mse)
tree_rmse

As we can see, the RMSE is 0, which means the model may have badly overfit the data. So we will perform cross-validation to check whether this is true.

In [None]:
# cross_val_score splits the training set into 10 distinct subsets called folds,
# then trains and evaluates the model 10 times (cv=10)
from sklearn.model_selection import cross_val_score
scores=cross_val_score(tree_reg,housing_prepared,housing_labels,
                      scoring="neg_mean_squared_error",cv=10)
tree_rmse_scores=np.sqrt(-scores)#we use -scores because cross_val_score
# expects a utility function (greater is better) rather than a cost function, so the scores are negative MSE values

In [None]:
def display_score(scores):
    """Print the scores followed by their mean and standard deviation."""
    summary=(
        ("Scores:",scores),
        ("Mean:",scores.mean()),
        ("Standard deviation:",scores.std()),
    )
    for label,value in summary:
        print(label,value)
display_score(tree_rmse_scores)

As we can see, after cross-validation the results of the decision tree model are
worse than those of the linear regressor. So the model was overfitting.

In [None]:
# Now we will try cross_validation on linear regressor model
lin_scores=cross_val_score(lin_reg,housing_prepared,housing_labels,
                          scoring="neg_mean_squared_error",cv=10)
lin_rmse_scores=np.sqrt(-lin_scores)
display_score(lin_rmse_scores)

The decision tree also does not work very well, so we will now apply a Random Forest Regressor.

In [None]:
from sklearn.ensemble import RandomForestRegressor
forest_reg=RandomForestRegressor()
forest_reg.fit(housing_prepared,housing_labels)

In [None]:
housing_predictions=forest_reg.predict(housing_prepared)
forest_mse=mean_squared_error(housing_labels,housing_predictions)
forest_rmse=np.sqrt(forest_mse)
forest_rmse

In [None]:
forest_scores=cross_val_score(forest_reg,housing_prepared,housing_labels,
                             scoring="neg_mean_squared_error",cv=10)
forest_rmse_score=np.sqrt(-forest_scores)
display_score(forest_rmse_score)

# Fine Tuning the model

In [None]:
from sklearn.model_selection import GridSearchCV
param_grid=[
    {'n_estimators':[3, 10, 30], 'max_features':[2, 4, 6, 8]},
    {'bootstrap':[False],'n_estimators':[3, 10],'max_features':[2,3,4]},
]
forest_reg=RandomForestRegressor()
grid_search=GridSearchCV(forest_reg,param_grid,cv=5,
                        scoring='neg_mean_squared_error',
                        return_train_score=True)
grid_search.fit(housing_prepared,housing_labels)

In [None]:
grid_search.best_params_

In [None]:
grid_search.best_estimator_

In [None]:
cvres=grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"],cvres["params"]):
    print(np.sqrt(-mean_score),params)

In [None]:
feature_importances=grid_search.best_estimator_.feature_importances_
feature_importances

In [None]:
extra_attribs=["rooms_per_hhold","pop_per_hhold","bedrooms_per_room"]
cat_encoder=full_pipeline.named_transformers_["cat"]
cat_one_hot_attribs=list(cat_encoder.categories_[0])
attributes=num_attribs+extra_attribs+cat_one_hot_attribs
sorted(zip(feature_importances,attributes),reverse=True)

# Evaluating our System on the test set

In [None]:
final_model=grid_search.best_estimator_
X_test=strat_test_set.drop("median_house_value",axis=1)
y_test=strat_test_set.median_house_value.copy()
X_test_prepared=full_pipeline.transform(X_test)
final_predictions=final_model.predict(X_test_prepared)
final_mse=mean_squared_error(y_test,final_predictions)
final_rmse=np.sqrt(final_mse)
print(final_rmse)

In [None]:
from scipy import stats
confidence=0.95
squared_errors=(final_predictions-y_test)**2
# t-interval around the mean squared error, using the standard error of the
# mean from stats.sem; the square root turns its bounds into bounds on the RMSE
np.sqrt(stats.t.interval(confidence,len(squared_errors)-1,
                        loc=squared_errors.mean(),
                        scale=stats.sem(squared_errors)))