![](logo1.jpg)

# **shAI Training 2023 | Level 1**

## Task #8 (End-to-End ML Project {part_2})

## Welcome to the exercises for reviewing second part of end to end ML project.
**Make sure that you read and understand ch2 from the hands-on ML book (page 72 to the end of the chapter ) before start with this notebook.**

**If you stuck with anything reread that part from the book and feel free to ask about anything in the messenger group as you go along.**

 ## Good Luck : )

## first run the following cell for the first part of the project to continue your work 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline 
import seaborn as sns
from sklearn.model_selection import train_test_split
from pandas.plotting import scatter_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

In [2]:
import os
import tarfile
import urllib
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()
    
def load_housing_data(housing_path=HOUSING_PATH):
   csv_path = os.path.join(housing_path, "housing.csv")
   return pd.read_csv(csv_path)
   
fetch_housing_data()
housing = load_housing_data()

rooms_ix, bedrooms_ix, population_ix, household_ix = [
    list(housing.columns).index(col)
    for col in ("total_rooms", "total_bedrooms", "population", "households")]

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self  # nothing else to do
    def transform(self, X, y=None):
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]
        
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
housing = train_set.drop("median_house_value", axis=1)
housing_labels = train_set["median_house_value"].copy()

housing_num = housing.drop("ocean_proximity", axis=1)
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

num_pipeline = Pipeline([
 ('imputer', SimpleImputer(strategy="median")),
 ('attribs_adder', CombinedAttributesAdder()),
 ('std_scaler', StandardScaler())])

full_pipeline = ColumnTransformer([
 ("num", num_pipeline, num_attribs),
 ("cat", OneHotEncoder(), cat_attribs)])

housing_prepared = full_pipeline.fit_transform(housing)

# 1- Select and Train a Model

# Let’s first train a LinearRegression model 

In [4]:
# CODE HERE
from sklearn.linear_model import LinearRegression

lin_model = LinearRegression()

lin_model.fit(housing_prepared , housing_labels)

# First try it out on a few instances from the training set:


In [6]:
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]

In [15]:
# CODE HERE
some_data_pre = full_pipeline.transform(some_data)
result = lin_model.predict(some_data_pre)
print("predicted values :" , result)

print("true values : " , list(some_labels))

predicted values : [181746.54359616 290558.74973505 244957.50017771 146498.51061398
 163230.42393939]
true values :  [103000.0, 382100.0, 172600.0, 93400.0, 96500.0]


# measure this regression model’s RMSE on the whole training set 
* sing Scikit-Learn’s mean_squared_error() function:

In [16]:
from sklearn.metrics import mean_squared_error

In [19]:
# CODE HERE
train_pred = lin_model.predict(housing_prepared)
train_mse = mean_squared_error(housing_labels , train_pred)
rmse = np.sqrt(train_mse)

print("Training RMSE :",rmse)

Training RMSE : 67593.20745775253


# judge on the RMSE result for this model 
write down your answar 

The model is not acceptable. The error rate is large, almost half the price of the houses.

# Let’s train a Decision Tree Regressor model 
## more powerful model

In [20]:
from sklearn.tree import DecisionTreeRegressor 

In [21]:
# CODE HERE

tree_model = DecisionTreeRegressor(random_state = 42)
tree_model.fit(housing_prepared , housing_labels)

# Now evaluate the model on the training set 
* using Scikit-Learn’s mean_squared_error() function:

In [24]:
# CODE HERE
train_tree_pred = tree_model.predict(housing_prepared)
train_tree_mse = mean_squared_error(housing_labels , train_tree_pred)
tree_rmse = np.sqrt(train_tree_mse)

print("Tree Training RMSE :",tree_rmse)

Tree Training RMSE : 0.0


# Explaine this result 
write down your answar

There is clearly overfitting in this model

# Evaluation Using Cross-Validation

1-split the training set into 10 distinct subsets then train and evaluate the Decision Tree model

In [26]:
from sklearn.model_selection import cross_val_score

In [27]:
# CODE HERE
cross_mse = cross_val_score(
    tree_model,housing_prepared ,
    housing_labels,
    scoring ="neg_mean_squared_error",
    cv = 10
)
tree_rmse_cross = np.sqrt(-cross_mse)

2- display the resultant scores and calculate its Mean and Standard deviation

In [28]:
# CODE HERE
print("Scores: ", tree_rmse_cross)
print("Mean: ", tree_rmse_cross.mean())
print("Standard Deviation: ", tree_rmse_cross.std())

Scores:  [65312.86044031 70581.69865676 67849.75809965 71460.33789358
 74035.29744574 65562.42978503 67964.10942543 69102.89388457
 66876.66473025 69735.84760006]
Mean:  68848.18979613911
Standard Deviation:  2579.6785558576307


3-repaet the same steps to compute the same scores for the Linear Regression  model 

*notice the difference between the results of the two models*

In [29]:
# CODE HERE
lin_cross_mse = cross_val_score(
    lin_model,housing_prepared ,
    housing_labels,
    scoring ="neg_mean_squared_error",
    cv = 10
)
lin_rmse_cross = np.sqrt(-lin_cross_mse)

print("Scores: ", lin_rmse_cross)
print("Mean: ", lin_rmse_cross.mean())
print("Standard Deviation: ", lin_rmse_cross.std())

Scores:  [65000.67382615 70960.56056304 67122.63935124 66089.63153865
 68402.54686442 65266.34735288 65218.78174481 68525.46981754
 72739.87555996 68957.34111906]
Mean:  67828.38677377408
Standard Deviation:  2468.0913950652284


## Let’s train one last model the RandomForestRegressor.

In [32]:
# CODE HERE
from sklearn.ensemble import RandomForestRegressor
forest_model = RandomForestRegressor()
forest_mse = cross_val_score(
    forest_model,housing_prepared ,
    housing_labels,
    scoring ="neg_mean_squared_error",
    cv = 10
)
forest_rmse = np.sqrt(-forest_mse)

# repeat the same steps to compute the same scores its Mean and Standard deviation for the Random Forest model

In [33]:
# CODE HERE
print("Scores: ", forest_rmse)
print("Mean: ", forest_rmse.mean())
print("Standard Deviation: ", forest_rmse.std())

Scores:  [46957.26483128 51716.08058589 49956.45084248 52162.72434529
 52218.07209162 47223.27979837 47403.60088878 50966.70920298
 49098.79291512 50007.52961157]
Mean:  49771.05051133959
Standard Deviation:  1935.8803975968135


# Save every model you experiment with 
*using the joblib library*

In [34]:
# CODE HERE
import joblib
joblib.dump(lin_model, 'linearregression_model.joblib')
joblib.dump(lin_model, 'decisiontree_model.joblib')
joblib.dump(lin_model, 'randomforest_model.joblib')

['randomforest_model.joblib']

## now you have a shortlist of promising models. You now need to
## fine-tune them!
# Fine-Tune Your Model

## 1- Grid Search
## evaluate all the possible combinations of hyperparameter values for the RandomForestRegressor 
*It may take a long time*

In [35]:
from sklearn.model_selection import GridSearchCV

In [36]:
# CODE HERE
param_grid = [
              {'n_estimators': [2, 4, 8, 16, 32], 'max_features':[2, 4, 6, 8, 10]}
]
fine_tuned_forest = RandomForestRegressor(random_state = 42)

grid_search = GridSearchCV(fine_tuned_forest,
                           param_grid,
                           cv = 5,
                           scoring = 'neg_mean_squared_error',
                           return_train_score = True)

grid_search.fit(housing_prepared, housing_labels)

with the evaluation scores

In [37]:
# CODE HERE
cvres = grid_search.cv_results_

for mean_score, params in zip(cvres["mean_test_score"],cvres["params"]):
  print(np.sqrt(-mean_score), params)

69101.74344177994 {'max_features': 2, 'n_estimators': 2}
61607.818107132465 {'max_features': 2, 'n_estimators': 4}
56425.72346034952 {'max_features': 2, 'n_estimators': 8}
53818.7818288565 {'max_features': 2, 'n_estimators': 16}
52671.16855757806 {'max_features': 2, 'n_estimators': 32}
63977.64586391227 {'max_features': 4, 'n_estimators': 2}
56378.709569716775 {'max_features': 4, 'n_estimators': 4}
52549.057598385196 {'max_features': 4, 'n_estimators': 8}
50922.58165107678 {'max_features': 4, 'n_estimators': 16}
49727.12037146433 {'max_features': 4, 'n_estimators': 32}
62615.724015666594 {'max_features': 6, 'n_estimators': 2}
56759.09500589323 {'max_features': 6, 'n_estimators': 4}
52131.51962922895 {'max_features': 6, 'n_estimators': 8}
50769.78394207507 {'max_features': 6, 'n_estimators': 16}
49894.86094945852 {'max_features': 6, 'n_estimators': 32}
62122.447572471094 {'max_features': 8, 'n_estimators': 2}
56291.08501344931 {'max_features': 8, 'n_estimators': 4}
52278.58340393876 {'m

# Analyze the Best Models and Their Errors
1-indicate the relative importance of each attribute

In [44]:
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([6.75806838e-02, 6.40639821e-02, 4.21969420e-02, 1.46119419e-02,
       1.38533502e-02, 1.43268099e-02, 1.30684016e-02, 3.74957342e-01,
       4.87114286e-02, 1.09619232e-01, 5.98196377e-02, 7.10035057e-03,
       1.65297490e-01, 2.46626633e-04, 1.78399596e-03, 2.76178532e-03])

2-display these importance scores next to their corresponding attribute names:

In [52]:
# CODE HERE
extra_attribs =['rooms_per_household', 'population_per_household','bedrooms_per_room']
num_attribs = list(train_set.drop(["median_house_value"],axis = 1).columns)
cat_one_hot_attribs = list(full_pipeline.named_transformers_['cat'].categories_)
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
sorted(zip(feature_importances,attributes),reverse = True)
# len(num_attribs)

[(0.3749573418372042, 'median_income'),
 (0.1652974902823448,
  array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
        dtype=object)),
 (0.1096192316437479, 'rooms_per_household'),
 (0.06758068376737504, 'longitude'),
 (0.06406398205514632, 'latitude'),
 (0.059819637747804105, 'population_per_household'),
 (0.04871142859577332, 'ocean_proximity'),
 (0.04219694201015764, 'housing_median_age'),
 (0.014611941860488205, 'total_rooms'),
 (0.01432680986023915, 'population'),
 (0.013853350229669147, 'total_bedrooms'),
 (0.013068401633034571, 'households'),
 (0.007100350568617614, 'bedrooms_per_room')]

## Now is the time to evaluate the final model on the test set.
# Evaluate Your System on the Test Set

1-get the predictors and the labels from your test set

In [53]:
# CODE HERE
X_test = test_set.drop(["median_house_value"],axis = 1)
Y_test = test_set["median_house_value"].copy()

2-run your full_pipeline to transform the data

In [54]:
# CODE HERE
test = full_pipeline.transform(X_test)

3-evaluate the final model on the test set

In [55]:
# CODE HERE
final_model = grid_search.best_estimator_
test_pred = final_model.predict(test)
test_mse = mean_squared_error(Y_test , test_pred)
test_rmse = np.sqrt(test_mse)

print("Test RMSE :",test_rmse)

Test RMSE : 49254.86975027394


# compute a 95% confidence interval for the generalization error 
*using scipy.stats.t.interval():*

In [56]:
from scipy import stats

In [59]:
# CODE HERE
lower_bound, upper_bound = stats.t.interval(0.95 , X_test.shape[0] - 1 , loc = np.mean(test_pred))
print(lower_bound ,upper_bound )

206644.26799179785 206648.18906973317


# Great Job!
# #shAI_Club