![](logo1.jpg)

# **shAI Training 2023 | Level 1**

## Task #8 (End-to-End ML Project {part_2})

## Welcome to the exercises for reviewing the second part of the end-to-end ML project.
**Make sure that you read and understand ch2 of the Hands-On ML book (page 72 to the end of the chapter) before starting this notebook.**

**If you get stuck on anything, reread that part of the book, and feel free to ask about anything in the messenger group as you go along.**

 ## Good Luck : )

## First, run the following cells from the first part of the project so you can continue your work

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from pandas.plotting import scatter_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

In [None]:
import os
import tarfile
import urllib
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing tarball and unpack it into ``housing_path``.

    Args:
        housing_url: URL of the ``housing.tgz`` archive.
        housing_path: Directory that receives the downloaded archive and its
            extracted contents; created if it does not exist.
    """
    # ``import urllib`` alone does not load the ``request`` submodule, so
    # import it explicitly instead of relying on another package doing it.
    import urllib.request

    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction fails.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` into a pandas DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

# Download the dataset, then load it into a DataFrame.
fetch_housing_data()
housing = load_housing_data()

# Positional indices of the columns that CombinedAttributesAdder divides.
rooms_ix, bedrooms_ix, population_ix, household_ix = map(
    list(housing.columns).index,
    ("total_rooms", "total_bedrooms", "population", "households"),
)

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features computed from the room/household columns.

    Column positions are read from the module-level ``rooms_ix``,
    ``bedrooms_ix``, ``population_ix`` and ``household_ix`` indices.
    """

    def __init__(self, add_bedrooms_per_room=True):
        # Hyperparameter: whether to also append bedrooms_per_room.
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Return ``self`` unchanged; this transformer learns nothing."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the ratio columns appended on the right.

        The appended order is rooms_per_household, population_per_household
        and, when enabled, bedrooms_per_room.
        """
        households = X[:, household_ix]
        rooms = X[:, rooms_ix]
        columns = [X, rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            columns.append(X[:, bedrooms_ix] / rooms)
        return np.c_[tuple(columns)]

# Hold out 20% of the rows as a test set (fixed seed for reproducibility).
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

# Separate the target from the predictors on the training split.
housing_labels = train_set["median_house_value"].copy()
housing = train_set.drop(columns="median_house_value")

# Column groups: one categorical column, everything else numeric.
cat_attribs = ["ocean_proximity"]
housing_num = housing.drop(columns="ocean_proximity")
num_attribs = housing_num.columns.tolist()

# Numeric branch: fill gaps with the median, add ratio features, standardize.
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder()),
    ("std_scaler", StandardScaler()),
])

# Route numeric columns through num_pipeline and one-hot encode the rest.
full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

housing_prepared = full_pipeline.fit_transform(housing)

# 1- Select and Train a Model

# Let’s first train a LinearRegression model

In [None]:
test_set.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [None]:
# Remove, in place, every test row that contains a missing value.
# NOTE(review): training rows with missing values are imputed by the
# SimpleImputer in num_pipeline, while test rows are dropped here - confirm
# that this asymmetry is intended.
test_set.dropna(axis = 0, inplace = True)

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the only categorical column of the training predictors.
housing_cat = housing[['ocean_proximity']]
ohe = OneHotEncoder()
ohe_housing = ohe.fit(housing_cat).transform(housing_cat)

In [None]:
ohe_housing.shape

(16512, 5)

In [None]:
ohe_housing.toarray()

array([[0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       ...,
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0.]])

In [None]:
ohe.categories_

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]

In [None]:
train_set_cat = train_set[['ocean_proximity']]
test_set_cat = test_set[['ocean_proximity']]

# Learn the category -> column mapping from the training data only and reuse
# it for the test data. Fitting a second encoder on the test set makes its
# columns depend on which categories happen to occur there, so they could be
# misaligned with the columns the models are trained on.
ohe_train = OneHotEncoder()
ohe_train_cat = ohe_train.fit_transform(train_set_cat)
ohe_test = ohe_train  # alias kept so the existing name still resolves
ohe_test_cat = ohe_test.transform(test_set_cat)

In [None]:
print(ohe_train_cat.shape)
print(ohe_test_cat.shape)

(16512, 5)
(3921, 5)


In [None]:
# Densify both sparse one-hot matrices.
arr_ohe_train_cat, arr_ohe_test_cat = (
    sparse_cat.toarray() for sparse_cat in (ohe_train_cat, ohe_test_cat)
)

In [None]:
# Wrap the dense arrays in DataFrames. No column names are passed, so the
# one-hot columns get pandas' default integer labels.
df_ohe_train_cat = pd.DataFrame(arr_ohe_train_cat)
df_ohe_test_cat = pd.DataFrame(arr_ohe_test_cat)

In [None]:
(df_ohe_train_cat.head())

Unnamed: 0,0,1,2,3,4
0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,1.0
4,0.0,1.0,0.0,0.0,0.0


In [None]:
(df_ohe_test_cat.head())

Unnamed: 0,0,1,2,3,4
0,0.0,1.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0


In [None]:
# Remove the raw categorical column from both splits, in place.
train_set.drop(columns='ocean_proximity', inplace=True)
test_set.drop(columns='ocean_proximity', inplace=True)

In [None]:
# Renumber both frames 0..n-1 so they share row labels with the one-hot
# frames, which use the default 0..n-1 index. The length is taken from the
# data instead of hard-coded row counts, which break as soon as the split or
# the dropped rows change.
train_set.index = np.arange(len(train_set))
test_set.index = np.arange(len(test_set))

In [None]:
# Append the one-hot columns to each split; join() matches rows by index label.
train_set_full = train_set.join(df_ohe_train_cat)
test_set_full = test_set.join(df_ohe_test_cat)

In [None]:
(train_set_full.head())

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,0,1,2,3,4
0,-117.03,32.71,33.0,3126.0,627.0,2300.0,623.0,3.2596,103000.0,0.0,0.0,0.0,0.0,1.0
1,-118.16,33.77,49.0,3382.0,787.0,1314.0,756.0,3.8125,382100.0,0.0,0.0,0.0,0.0,1.0
2,-120.48,34.66,4.0,1897.0,331.0,915.0,336.0,4.1563,172600.0,0.0,0.0,0.0,0.0,1.0
3,-117.11,32.69,36.0,1421.0,367.0,1418.0,355.0,1.9425,93400.0,0.0,0.0,0.0,0.0,1.0
4,-119.8,36.78,43.0,2382.0,431.0,874.0,380.0,3.5542,96500.0,0.0,1.0,0.0,0.0,0.0


In [None]:
(test_set_full.head())

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,0,1,2,3,4
0,-120.67,37.37,18.0,164.0,30.0,104.0,32.0,1.6607,87500.0,0.0,1.0,0.0,0.0,0.0
1,-118.01,33.89,34.0,1653.0,292.0,1003.0,310.0,4.6,203400.0,1.0,0.0,0.0,0.0,0.0
2,-121.97,36.97,15.0,2849.0,668.0,1546.0,582.0,2.7587,228600.0,0.0,0.0,0.0,0.0,1.0
3,-118.25,33.98,47.0,617.0,162.0,754.0,144.0,2.2969,116700.0,1.0,0.0,0.0,0.0,0.0
4,-121.81,37.38,29.0,570.0,76.0,244.0,72.0,12.3292,416700.0,1.0,0.0,0.0,0.0,0.0


In [None]:
# Remove the target column so train_set_full holds predictors only.
train_set_full.drop(columns=["median_house_value"], inplace=True)

In [None]:
# CODE HERE
from sklearn.linear_model import LinearRegression

# Baseline: linear regression on the hand-assembled training matrix
# (remaining train_set columns followed by the one-hot columns).
lin_reg = LinearRegression()
lin_reg.fit(train_set_full.to_numpy(), housing_labels)

# First try it out on a few instances from the training set:


In [None]:
# First five training rows and their labels, for a quick sanity check.
some_data = housing.head(5)
some_labels = housing_labels.head(5)

In [None]:
from sklearn.pipeline import make_pipeline
# num_pipeline = make_pipeline(SimpleImputer(strategy="median"),StandardScaler())

In [None]:
housing_num_prepared = num_pipeline.fit_transform(housing_num)

In [None]:
housing_num_prepared[:2].round(2)

array([[ 1.27, -1.37,  0.35,  0.22,  0.21,  0.77,  0.32, -0.33, -0.17,
         0.05, -0.21],
       [ 0.71, -0.88,  1.62,  0.34,  0.59, -0.1 ,  0.67, -0.04, -0.4 ,
        -0.12,  0.34]])

In [None]:
# from sklearn.preprocessing import MinMaxScaler
# min_max_scaler = MinMaxScaler(feature_range=(-1, 1))
# housing_num_min_max_scaled = min_max_scaler.fit_transform(housing_num)

In [None]:
# from sklearn.linear_model import LinearRegression
# target_scaler = StandardScaler()
# scaled_labels = target_scaler.fit_transform(housing_labels.to_frame())
# model = LinearRegression()
# model.fit(housing, scaled_labels)
# some_new_data = housing.iloc[:5]
# scaled_predictions = model.predict(some_new_data)
# predictions = target_scaler.inverse_transform(scaled_predictions)

In [None]:
# from sklearn.preprocessing import FunctionTransformer
# log_transformer = FunctionTransformer(np.log,inverse_func=np.exp)
# log_pop = log_transformer.transform(housing[["population"]])

In [None]:
#df_housing_num_prepared = pd.DataFrame(housing_num_prepared,columns=num_pipeline.get_feature_names_out(),index=housing_num.index)

In [None]:
from sklearn.compose import ColumnTransformer

cat_attribs = ["ocean_proximity"]
num_attribs = [
    "longitude", "latitude", "housing_median_age", "total_rooms",
    "total_bedrooms", "population", "households", "median_income",
]

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode, ignoring categories that were not present when fitting.
cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"),
)

# Select the numeric and categorical columns by name.
preprocessing = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", cat_pipeline, cat_attribs),
])


In [None]:
from sklearn.compose import make_column_selector,make_column_transformer

# Select columns by dtype rather than by name: numeric columns go through
# num_pipeline, object columns through cat_pipeline.
numeric_columns = make_column_selector(dtype_include=np.number)
object_columns = make_column_selector(dtype_include=object)
preprocessing = make_column_transformer(
    (num_pipeline, numeric_columns),
    (cat_pipeline, object_columns),
)

In [None]:
housing_prepared = preprocessing.fit_transform(housing)

In [None]:
from sklearn.linear_model import LinearRegression

# Chain preprocessing and the regressor so the raw housing frame can be
# passed straight to fit() and predict().
lin_reg = make_pipeline(preprocessing, LinearRegression())
lin_reg.fit(housing, housing_labels)

In [None]:
housing_predictions = lin_reg.predict(housing)

In [None]:
housing_predictions[:5].round(-2)

array([181700., 290600., 245000., 146500., 163200.])

In [None]:
housing_labels.iloc[:5].values

array([103000., 382100., 172600.,  93400.,  96500.])

In [None]:
from sklearn.metrics import mean_squared_error

# Take the root explicitly: the ``squared=False`` shortcut is deprecated and
# no longer accepted by newer scikit-learn releases, whereas sqrt(MSE) works
# on every version.
lin_rmse = np.sqrt(mean_squared_error(housing_labels, housing_predictions))
lin_rmse

67593.20745775253

# Measure this regression model’s RMSE on the whole training set
* using Scikit-Learn’s mean_squared_error() function:

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# CODE HERE
# RMSE of the linear model on the full training set.
y_predictions = lin_reg.predict(housing)
lin_mse = mean_squared_error(housing_labels, y_predictions)
lin_rmse = np.sqrt(lin_mse)
# The printed value is the square root of the MSE, so label it as RMSE.
print("The RMSE is :", lin_rmse)

The mse is : 67593.20745775253


In [None]:
# RMSE of the linear model on the five sample rows only.
y_predictions_some_data = lin_reg.predict(some_data)
lin_mse_some_data = mean_squared_error(some_labels, y_predictions_some_data)
lin_rmse_some_data = np.sqrt(lin_mse_some_data)
# The printed value is the square root of the MSE, so label it as RMSE.
print("The RMSE is :", lin_rmse_some_data)

The mse is : 73605.4723086992


# Judge the RMSE result for this model
write down your answer

your answer goes here  
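The RMSE of the linear regression model on the whole training set is about 67,593, and on the five sample instances it is about 73,605. Both errors are very large compared with the values being predicted (the five sample labels range from 93,400 to 382,100), so the model does not predict the prices accurately.

Neither result is close to zero. The same model was evaluated both times, so one is not "better" than the other: the model is underfitting the training data, and a more powerful model or better features are needed.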

# Let’s train a Decision Tree Regressor model
## more powerful model

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# CODE HERE
# Decision tree with a fixed seed, fitted on the hand-assembled training matrix.
decision_regressor = DecisionTreeRegressor(random_state=42)
decision_regressor.fit(train_set_full.to_numpy(), housing_labels)

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Same preprocessing as the linear pipeline, with a decision tree on top.
tree_reg = make_pipeline(
    preprocessing,
    DecisionTreeRegressor(random_state=42),
)
tree_reg.fit(housing, housing_labels)

In [None]:
housing_predictions = tree_reg.predict(housing)
# Take the root explicitly: the ``squared=False`` shortcut is deprecated and
# no longer accepted by newer scikit-learn releases.
tree_rmse = np.sqrt(mean_squared_error(housing_labels, housing_predictions))
tree_rmse

0.0

# Now evaluate the model on the training set
* using Scikit-Learn’s mean_squared_error() function:

In [None]:
# CODE HERE
from sklearn.metrics import mean_squared_error

# Training-set RMSE of the decision tree: it predicts the rows it was fitted on.
y_predict = decision_regressor.predict(train_set_full.to_numpy())
decision_regressor_mse = mean_squared_error(y_true=housing_labels, y_pred=y_predict)
decision_regressor_rmse = np.sqrt(decision_regressor_mse)
print(decision_regressor_rmse)

0.0


# Explain this result
write down your answer

your answer goes here
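An RMSE of 0 means the decision tree reproduces every training label exactly. This does not show that the model is accurate: it was evaluated on the same data it was trained on, so it has most likely overfit badly. The cross-validation in the next section confirms this, giving an RMSE of about 68,000 on the held-out folds.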

# Evaluation Using Cross-Validation

1-split the training set into 10 distinct subsets then train and evaluate the Decision Tree model

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# CODE HERE
# 10-fold cross-validation. The scorer returns negated MSE values, so flip
# the sign before taking the root to obtain one RMSE per fold.
scores = cross_val_score(
    decision_regressor,
    train_set_full.to_numpy(),
    housing_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
decision_scores = np.sqrt(-scores)
print(decision_scores)

[66901.80689562 69717.16806528 66772.74727375 70189.43109608
 67525.30971243 67528.03614663 62573.29539395 71201.14995545
 69987.59602774 68282.9174425 ]


2- display the resulting scores and calculate their mean and standard deviation

In [None]:
# CODE HERE
import statistics
Mean_scores_decision = statistics.mean(decision_scores)
print("The Mean is :", Mean_scores_decision)
# Store the value itself: print() returns None, so assigning its result left
# Std_scores_decision empty.
Std_scores_decision = np.std(decision_scores)
print("The Std is :", Std_scores_decision)

The Mean is : 68067.94580094457
The Std is : 2336.3097105092297


3-repeat the same steps to compute the same scores for the Linear Regression model

*notice the difference between the results of the two models*

In [None]:
# CODE HERE
# 10-fold cross-validation of the linear pipeline on the raw housing frame;
# negated MSE values are converted to one RMSE per fold.
scores_lin_reg = cross_val_score(
    lin_reg,
    housing,
    housing_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
linear_regression_scores = np.sqrt(-scores_lin_reg)
print(linear_regression_scores)

[65000.67382615 70960.56056304 67122.63935124 66089.63153865
 68402.54686442 65266.34735288 65218.78174481 68525.46981754
 72739.87555996 68957.34111906]


In [None]:
Mean_scores_lin_reg = statistics.mean(linear_regression_scores)
print("The Mean is :", Mean_scores_lin_reg)
# Store the value itself: print() returns None, so assigning its result left
# Std_scores_lin_reg empty.
Std_scores_lin_reg = np.std(linear_regression_scores)
print("The Std is :", Std_scores_lin_reg)

The Mean is : 67828.38677377408
The Std is : 2468.091395065225


## Let’s train one last model the RandomForestRegressor.

In [None]:
# CODE HERE
from sklearn.ensemble import RandomForestRegressor

# Forest of 100 trees with a fixed seed, fitted on the hand-assembled
# training matrix.
regressor_forest = RandomForestRegressor(random_state=42, n_estimators=100)
regressor_forest.fit(train_set_full.to_numpy(), housing_labels)

# repeat the same steps to compute the same scores its Mean and Standard deviation for the Random Forest model

In [None]:
# CODE HERE
# 10-fold cross-validation of the random forest. The name first holds the
# negated MSE values and is then rebound to the per-fold RMSE.
random_forest_scores = cross_val_score(
    regressor_forest,
    train_set_full.to_numpy(),
    housing_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
random_forest_scores = np.sqrt(-random_forest_scores)
print(random_forest_scores)

[47421.81091771 50511.52852061 47554.7207197  49977.74446628
 50329.8676495  46696.05790071 45923.281973   50915.57398837
 49448.92446376 49740.31950019]


In [None]:
Mean_scores_random_forest = statistics.mean(random_forest_scores)
print("The Mean is :", Mean_scores_random_forest)
# Store the value itself: print() returns None, so assigning its result left
# Std_scores_random_forest empty.
Std_scores_random_forest = np.std(random_forest_scores)
print("The Std is :", Std_scores_random_forest)

The Mean is : 48851.98300998268
The Std is : 1690.060537830506


# Save every model you experiment with
*using the joblib library*

In [None]:
# CODE HERE
import joblib

# Persist each fitted model. The decision tree is bound to
# ``decision_regressor``; the previous name ``regressor`` was never defined
# and raised NameError.
joblib.dump(lin_reg, 'my_model_lin_reg.pkl')
joblib.dump(decision_regressor, 'my_model_Decision_Tree_Regressor.pkl')
joblib.dump(regressor_forest, 'my_model_random_forest.pkl')

# Reload the models to check that the saved files are usable.
my_lin_reg_model_load = joblib.load("my_model_lin_reg.pkl")
my_desicion_tree_regressor_model_load = joblib.load("my_model_Decision_Tree_Regressor.pkl")
my_random_forest_regreesor_model_load = joblib.load("my_model_random_forest.pkl")

## now you have a shortlist of promising models. You now need to
## fine-tune them!
# Fine-Tune Your Model

## 1- Grid Search
## evaluate all the possible combinations of hyperparameter values for the RandomForestRegressor
*It may take a long time*

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

# Two sub-grids: 3 x 4 combinations with the default bootstrap setting, then
# 3 x 2 combinations with bootstrap switched off.
param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'max_features': [2, 3, 4], 'n_estimators': [3, 10]},
]
forest_reg = RandomForestRegressor(random_state=42)

# 5-fold cross-validation per combination, scored by negated MSE.
grid_search = GridSearchCV(
    forest_reg,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    return_train_score=True,
)
grid_search.fit(train_set_full.to_numpy(), housing_labels)

with the evaluation scores

In [None]:
# CODE HERE
grid_search.best_params_

{'max_features': 4, 'n_estimators': 30}

In [None]:
grid_search.best_estimator_

In [None]:
# CODE HERE
cvres = grid_search.cv_results_

# Convert every negated mean MSE to an RMSE, then print it next to the
# hyperparameters that produced it.
rmse_per_candidate = np.sqrt(-cvres["mean_test_score"])
for rmse, params in zip(rmse_per_candidate, cvres["params"]):
    print(rmse, params)

62324.32593649314 {'max_features': 2, 'n_estimators': 3}
54002.67134386552 {'max_features': 2, 'n_estimators': 10}
51676.4484577732 {'max_features': 2, 'n_estimators': 30}
58377.067425604095 {'max_features': 4, 'n_estimators': 3}
51507.88447553556 {'max_features': 4, 'n_estimators': 10}
49832.40307626678 {'max_features': 4, 'n_estimators': 30}
58475.42036497952 {'max_features': 6, 'n_estimators': 3}
52017.21393020737 {'max_features': 6, 'n_estimators': 10}
49874.93597521022 {'max_features': 6, 'n_estimators': 30}
58019.6508478355 {'max_features': 8, 'n_estimators': 3}
52021.08208724326 {'max_features': 8, 'n_estimators': 10}
50228.446457026716 {'max_features': 8, 'n_estimators': 30}
61073.83449339679 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
53389.19826110037 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
57434.95310340888 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
51310.75102463433 {'bootstrap': False, 'max_features': 3, 'n_estimators': 10

# Analyze the Best Models and Their Errors
1-indicate the relative importance of each attribute

In [None]:
# CODE HERE
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV

# Pass distributions instead of arrays pre-drawn with the unseeded global
# NumPy generator. The search then samples the candidates itself using its
# own ``random_state``, so reruns explore the same candidates. The integer
# ranges are unchanged: 1..199 and 1..7.
param_distribs = {
    'n_estimators': randint(low=1, high=200),
    'max_features': randint(low=1, high=8),
}

forest_reg = RandomForestRegressor(random_state=42)
rnd_search = RandomizedSearchCV(
    forest_reg,
    param_distributions=param_distribs,
    n_iter=10,
    cv=5,
    scoring='neg_mean_squared_error',
    random_state=42,
)
rnd_search.fit(train_set_full.values, housing_labels)

2-display these importance scores next to their corresponding attribute names:

In [None]:
# CODE HERE
cvres = rnd_search.cv_results_

# Print the RMSE of each sampled candidate next to its hyperparameters.
rmse_per_candidate = np.sqrt(-cvres["mean_test_score"])
for rmse, params in zip(rmse_per_candidate, cvres["params"]):
    print(rmse, params)

49513.39076157026 {'n_estimators': 97, 'max_features': 4}
48914.61012689054 {'n_estimators': 97, 'max_features': 7}
51922.77370582913 {'n_estimators': 13, 'max_features': 4}
49925.35016406526 {'n_estimators': 123, 'max_features': 3}
50262.10718523624 {'n_estimators': 49, 'max_features': 3}
53278.44782526415 {'n_estimators': 157, 'max_features': 1}
48747.84710719617 {'n_estimators': 177, 'max_features': 6}
51922.77370582913 {'n_estimators': 13, 'max_features': 4}
50718.28007952207 {'n_estimators': 13, 'max_features': 7}
50718.28007952207 {'n_estimators': 13, 'max_features': 7}


In [None]:
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([1.17401936e-01, 1.09699158e-01, 4.66992681e-02, 3.50589065e-02,
       3.00663629e-02, 4.33801187e-02, 2.70461593e-02, 4.13047792e-01,
       1.77452978e-02, 1.47785084e-01, 2.65278105e-04, 4.31343448e-03,
       7.49120362e-03])

In [None]:
feature_importances = rnd_search.best_estimator_.feature_importances_
feature_importances

array([1.16750932e-01, 1.07691396e-01, 4.73884846e-02, 3.54565393e-02,
       3.03915455e-02, 4.28356950e-02, 2.72187259e-02, 4.22034230e-01,
       1.50529018e-02, 1.44645483e-01, 2.65403807e-04, 3.55222453e-03,
       6.71643764e-03])

In [None]:
# The searched forests were fitted on train_set_full: the remaining train_set
# columns (minus the target) followed by one column per ocean_proximity
# category. No engineered ratio features were added to that frame, so none
# are listed here.
num_attribs = list(train_set.drop(["median_house_value"], axis=1).columns)
# categories_ holds one array per encoded column; take the single array so
# that every category becomes its own name.
cat_one_hot_attribs = list(ohe_train.categories_[0])
attributes = num_attribs + cat_one_hot_attribs
# zip() silently truncates to the shorter input, so fail loudly on a mismatch.
if len(attributes) != len(feature_importances):
    raise ValueError(
        f"{len(attributes)} attribute names for "
        f"{len(feature_importances)} feature importances"
    )
sorted(zip(feature_importances, attributes), reverse=True)

[(0.422034230479333, 'median_income'),
 (0.14464548333143373, 'bedroom_per_room'),
 (0.11675093177101392, 'longitude'),
 (0.1076913964248516, 'latitude'),
 (0.04738848457872672, 'housing_median_age'),
 (0.0428356949626655, 'population'),
 (0.03545653931030466, 'total_rooms'),
 (0.030391545469609794, 'total_bedrooms'),
 (0.027218725903687097, 'households'),
 (0.01505290178996172, 'population_per_household'),
 (0.0035522245316068177,
  array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
        dtype=object)),
 (0.00026540380713059234, 'rooms_per_household')]

In [None]:
final_model = rnd_search.best_estimator_

## Now is the time to evaluate the final model on the test set.
# Evaluate Your System on the Test Set

1-get the predictors and the labels from your test set

In [None]:
# CODE HERE
# Split the assembled test frame into the target and the predictors.
Y_test = test_set_full["median_house_value"].copy()
X_test = test_set_full.drop(columns=["median_house_value"])

2-run your full_pipeline to transform the data

In [None]:
# CODE HERE
final_model = rnd_search.best_estimator_

3-evaluate the final model on the test set

In [None]:
# The model was fitted on a plain array (train_set_full.values), so predict on
# an array as well. X_test carries a mix of string and integer column labels,
# which recent scikit-learn versions reject as feature names.
final_predictions = final_model.predict(np.asarray(X_test))
# Take the root explicitly: the ``squared=False`` shortcut is deprecated and
# no longer accepted by newer scikit-learn releases.
final_rmse = np.sqrt(mean_squared_error(Y_test, final_predictions))
print(final_rmse)



48981.02979411738


# compute a 95% confidence interval for the generalization error
*using scipy.stats.t.interval():*

In [None]:
from scipy import stats

In [None]:
from scipy import stats

confidence = 0.95
squared_errors = (final_predictions - Y_test) ** 2

# Student-t interval around the mean squared error, converted back to the
# RMSE scale by taking the square root of both bounds.
degrees_of_freedom = len(squared_errors) - 1
mse_interval = stats.t.interval(
    confidence,
    degrees_of_freedom,
    loc=squared_errors.mean(),
    scale=stats.sem(squared_errors),
)
np.sqrt(mse_interval)

array([46644.04747801, 51211.47717312])

# Great Job!
# #shAI_Club