![](logo1.jpg)

# **shAI Training 2021 | Level 1**

## Task #6 (End-to-End ML Project {part_2})

## Welcome to the exercises reviewing the second part of the end-to-end ML project.
**Make sure that you have read and understood ch. 2 of the Hands-On ML book (page 72 to the end of the chapter) before starting this notebook.**

**If you get stuck on anything, reread that part of the book, and feel free to ask about anything in the messenger group as you go along.**

 ## Good Luck : )

## First, run the following cells from the first part of the project so that you can continue your work

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline 
import seaborn as sns
from sklearn.model_selection import train_test_split
from pandas.plotting import scatter_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

In [None]:
import os
import tarfile
import urllib
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Args:
        housing_url: URL of the ``.tgz`` archive to retrieve.
        housing_path: Directory that receives both the downloaded
            ``housing.tgz`` and its extracted members; it is created
            (including parents) when missing.

    Returns:
        None. The function only writes files to disk.
    """
    # A bare ``import urllib`` does not guarantee that the ``request``
    # submodule is loaded, so import it explicitly where it is used.
    import urllib.request

    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even when extraction fails.
    # NOTE(review): members are extracted without any filtering, so only
    # point this function at archives from a trusted source.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)
    
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` into a pandas DataFrame.

    Args:
        housing_path: Directory that contains ``housing.csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for that file.
    """
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))
   
# Download the archive, then read the extracted CSV into memory.
fetch_housing_data()
housing = load_housing_data()

# Positional indices of the four columns that are later combined into
# ratio features; looked up by name so they follow the CSV column order.
_column_names = list(housing.columns)
rooms_ix, bedrooms_ix, population_ix, household_ix = (
    _column_names.index(name)
    for name in ("total_rooms", "total_bedrooms", "population", "households")
)

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features to a 2-D array of housing attributes.

    The columns are addressed through the module-level indices
    ``rooms_ix``, ``bedrooms_ix``, ``population_ix`` and ``household_ix``.
    """

    def __init__(self, add_bedrooms_per_room = True):
        # When true, ``transform`` also appends bedrooms-per-room.
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Return ``self`` unchanged; there is nothing to learn."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the derived ratio columns appended on the right.

        Rooms-per-household and population-per-household are always
        added; bedrooms-per-room follows them only when
        ``add_bedrooms_per_room`` is true.
        """
        households = X[:, household_ix]
        rooms = X[:, rooms_ix]
        rooms_per_household = rooms / households
        population_per_household = X[:, population_ix] / households
        if not self.add_bedrooms_per_room:
            return np.c_[X, rooms_per_household, population_per_household]
        bedrooms_per_room = X[:, bedrooms_ix] / rooms
        return np.c_[X, rooms_per_household, population_per_household,
                     bedrooms_per_room]
        
# Hold out 20% of the rows as a test set; the fixed seed keeps the split stable.
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

# Separate the target column from the predictors of the training set.
housing_labels = train_set["median_house_value"].copy()
housing = train_set.drop(columns="median_house_value")

# Every column except the categorical one goes through the numeric pipeline.
cat_attribs = ["ocean_proximity"]
housing_num = housing.drop(columns=cat_attribs)
num_attribs = list(housing_num.columns)

# Numeric columns: median imputation -> ratio features -> standardisation.
num_steps = [
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
]
num_pipeline = Pipeline(num_steps)

# Route numeric and categorical columns to their respective transformers.
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

housing_prepared = full_pipeline.fit_transform(housing)

# 1- Select and Train a Model

# Let’s first train a LinearRegression model 

In [None]:
# CODE HERE
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the prepared training data.
lin_reg = LinearRegression()
lin_reg.fit(X=housing_prepared, y=housing_labels)

LinearRegression()

# First try it out on a few instances from the training set:


In [None]:
# First five training rows and their matching labels, for a quick sanity check.
some_data, some_labels = housing.iloc[:5], housing_labels.iloc[:5]

In [None]:
# CODE HERE
# Run the sample rows through the already-fitted preprocessing pipeline.
some_data_prepared = full_pipeline.transform(some_data)

# Actually try the model: show its predictions next to the true labels.
print("Predictions:", lin_reg.predict(some_data_prepared))
print("Labels:", list(some_labels))

# Measure this regression model’s RMSE on the whole training set
* using Scikit-Learn’s mean_squared_error() function:

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# CODE HERE

# Training-set error of the linear model: MSE first, then its square root.
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(y_true=housing_labels, y_pred=housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

67593.20745775253

# Judge the RMSE result for this model
Write down your answer.

your answer goes here

# Let’s train a Decision Tree Regressor model 
## more powerful model

In [None]:
from sklearn.tree import DecisionTreeRegressor 

In [None]:
# CODE HERE

# Fit a decision tree regressor on the prepared training data.
DS = DecisionTreeRegressor()
DS.fit(X=housing_prepared, y=housing_labels)

DecisionTreeRegressor()

# Now evaluate the model on the training set 
* using Scikit-Learn’s mean_squared_error() function:

In [None]:
# CODE HERE
# Training-set MSE of the decision tree (predictions on the data it was fit on).
housing_predictions = DS.predict(housing_prepared)
DS_mse = mean_squared_error(y_true=housing_labels, y_pred=housing_predictions)
DS_mse

0.0

# Explain this result
Write down your answer.

your answer goes here

# Evaluation Using Cross-Validation

1-split the training set into 10 distinct subsets then train and evaluate the Decision Tree model

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# CODE HERE
# 10-fold cross-validation of the decision tree. The scorer returns negated
# MSE values, so flip the sign before taking the square root to obtain RMSE.
scores = cross_val_score(
    estimator=DS,
    X=housing_prepared,
    y=housing_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
tree_rmse_scores = np.sqrt(-scores)

2- display the resulting scores and calculate their mean and standard deviation

In [None]:
# CODE HERE

def display_scores(scores):
    """Print the given scores followed by their mean and standard deviation.

    Args:
        scores: Object exposing ``mean()`` and ``std()`` methods, such as
            a NumPy array of cross-validation scores.
    """
    summary = (
        ("Scores:", scores),
        ("Mean:", scores.mean()),
        ("Standard deviation:", scores.std()),
    )
    for label, value in summary:
        print(label, value)

# Summarise the cross-validated RMSE scores of the decision tree.
display_scores(tree_rmse_scores)

Scores: [64775.16541945 70808.90045197 68693.53305273 70427.10596377
 72011.82342126 65229.59672726 66695.62991731 69700.29891242
 65312.61333286 69201.73916699]
Mean: 68285.6406366015
Standard deviation: 2467.2634400924962


3-repeat the same steps to compute the same scores for the Linear Regression model

*notice the difference between the results of the two models*

In [None]:
# CODE HERE

# Cross-validate the *linear regression* model. The estimator passed here
# must be ``lin_reg``; passing the decision tree would only repeat the
# previous experiment under a different variable name.
# NOTE: re-run this cell after the fix; scores recorded from an earlier run
# that used the decision tree do not describe the linear model.
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,
                             scoring="neg_mean_squared_error", cv=10)
# The scorer returns negated MSE values, hence the sign flip before sqrt.
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

Scores: [65447.21789952 70628.84400592 68425.06248308 72148.38323922
 73294.93492188 67700.76224925 66844.4061757  68382.63906176
 66475.0309557  70211.39657671]
Mean: 68955.8677568736
Standard deviation: 2418.7588013269037


## Let’s train one last model the RandomForestRegressor.

In [None]:
# CODE HERE
from sklearn.ensemble import RandomForestRegressor

# Fit a 100-tree random forest; the fixed seed makes the run repeatable.
RS = RandomForestRegressor(n_estimators=100, random_state=42)
RS.fit(X=housing_prepared, y=housing_labels)

RandomForestRegressor(random_state=42)

# Repeat the same steps to compute the same scores, with their mean and standard deviation, for the Random Forest model

In [None]:
# CODE HERE

# 10-fold cross-validated RMSE of the random forest, reported with mean and
# standard deviation like the other two models. This fits the forest ten
# times, so it can take a while.
forest_scores = cross_val_score(RS, housing_prepared, housing_labels,
                                scoring="neg_mean_squared_error", cv=10)
# The scorer returns negated MSE values, hence the sign flip before sqrt.
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

# Training-set RMSE, kept for comparison with the cross-validated scores.
housing_predictions = RS.predict(housing_prepared)
RS_mse = mean_squared_error(housing_labels, housing_predictions)
RS_rmse = np.sqrt(RS_mse)
RS_rmse

18527.322990316152

# Save every model you experiment with 
*using the joblib library*

In [None]:
# CODE HERE

import joblib

# Persist each fitted model to its own file in the working directory.
joblib.dump(value=lin_reg, filename='Linear_reg.pkl')
joblib.dump(value=DS, filename='D_Tree.pkl')
joblib.dump(value=RS, filename='randomforist.pkl')


['randomforist.pkl']

## now you have a shortlist of promising models. You now need to
## fine-tune them!
# Fine-Tune Your Model

## 1- Grid Search
## evaluate all the possible combinations of hyperparameter values for the RandomForestRegressor 
*It may take a long time*

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# CODE HERE

# Two sub-grids: the first keeps the estimator's default bootstrap setting
# (3 x 4 combinations), the second disables bootstrapping (2 x 3 combinations).
default_bootstrap_grid = {
    'n_estimators': [3, 10, 30],
    'max_features': [2, 4, 6, 8],
}
no_bootstrap_grid = {
    'bootstrap': [False],
    'n_estimators': [3, 10],
    'max_features': [2, 3, 4],
}
param_grid = [default_bootstrap_grid, no_bootstrap_grid]

forest_reg = RandomForestRegressor(random_state=42)

# Each combination is scored with 5-fold cross-validation on negated MSE;
# the training scores are kept as well.
grid_search = GridSearchCV(
    forest_reg,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    return_train_score=True,
)
grid_search.fit(housing_prepared, housing_labels)

GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

with the evaluation scores

In [None]:
# CODE HERE
# Best estimator found by the grid search.
final_model = grid_search.best_estimator_

# Split the held-out test set into labels and predictors.
y_test = test_set["median_house_value"].copy()
X_test = test_set.drop(columns="median_house_value")

# Only transform here: the pipeline was already fitted on the training data.
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)

# Test-set RMSE of the selected model.
final_mse = mean_squared_error(y_true=y_test, y_pred=final_predictions)
final_rmse = np.sqrt(final_mse)

final_rmse

49198.020631676336

# Analyze the Best Models and Their Errors
1-indicate the relative importance of each attribute

In [None]:
# CODE HERE

2-display these importance scores next to their corresponding attribute names:

In [None]:
# CODE HERE

## Now is the time to evaluate the final model on the test set.
# Evaluate Your System on the Test Set

1-get the predictors and the labels from your test set

In [None]:
# CODE HERE

2-run your full_pipeline to transform the data

In [None]:
# CODE HERE

3-evaluate the final model on the test set

In [None]:
# CODE HERE

# compute a 95% confidence interval for the generalization error 
*using scipy.stats.t.interval():*

In [None]:
from scipy import stats

In [None]:
# CODE HERE

# Great Job!
# #shAI_Club