In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from pandas.plotting import scatter_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

In [25]:
import os
import tarfile
import urllib
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()

def load_housing_data(housing_path=HOUSING_PATH):
   csv_path = os.path.join(housing_path, "housing.csv")
   return pd.read_csv(csv_path)

fetch_housing_data()
housing = load_housing_data()

rooms_ix, bedrooms_ix, population_ix, household_ix = [
    list(housing.columns).index(col)
    for col in ("total_rooms", "total_bedrooms", "population", "households")]

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self  # nothing else to do
    def transform(self, X, y=None):
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
housing = train_set.drop("median_house_value", axis=1)
housing_labels = train_set["median_house_value"].copy()

housing_num = housing.drop("ocean_proximity", axis=1)
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

num_pipeline = Pipeline([
 ('imputer', SimpleImputer(strategy="median")),
 ('attribs_adder', CombinedAttributesAdder()),
 ('std_scaler', StandardScaler())])

full_pipeline = ColumnTransformer([
 ("num", num_pipeline, num_attribs),
 ("cat", OneHotEncoder(), cat_attribs)])

housing_prepared = full_pipeline.fit_transform(housing)

In [17]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()

lin_reg.fit(housing_prepared, housing_labels)


In [26]:
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]

# قم بتحويل البيانات باستخدام الخطوط السابقة
some_data_prepared = full_pipeline.transform(some_data)

# قم بتوقع القيم المتوقعة باستخدام النموذج
predictions = lin_reg.predict(some_data_prepared)
predictions

array([181746.54359616, 290558.74973505, 244957.50017771, 146498.51061398,
       163230.42393939])

In [27]:

from sklearn.metrics import mean_squared_error

# قم بتوقع قيم السكن الوسيطة لكامل مجموعة التدريب
train_predictions = lin_reg.predict(housing_prepared)

# استخدم دالة mean_squared_error لحساب متوسط ​​مربعات الخطأ
mse = mean_squared_error(housing_labels, train_predictions)

# قم بحساب جذر متوسط ​​مربعات الخطأ (RMSE)
rmse = np.sqrt(mse)
rmse

67593.20745775253

In [21]:
housing["median_house_value"].mean()


206855.81690891474

**Given that the RMSE value of 67593.20745775253 is significantly lower than the average housing value of 206855.81690891474, it suggests that the linear regression model is providing relatively accurate predictions. However, while the RMSE value indicates the average error in the model's predictions, it's important to consider other factors such as the specific context of the problem and the acceptable level of error. In general, a lower RMSE value implies better performance, but it's essential to interpret the result in the context of the application and its requirements.**

In [28]:

from sklearn.tree import DecisionTreeRegressor

# Instantiate Decision Tree Regressor model
tree_reg = DecisionTreeRegressor()

# Fit the model to the prepared data
tree_reg.fit(housing_prepared, housing_labels)

In [29]:

from sklearn.metrics import mean_squared_error

# Make predictions on the training set
train_predictions_tree = tree_reg.predict(housing_prepared)

# Calculate mean squared error
mse_tree = mean_squared_error(housing_labels, train_predictions_tree)

# Calculate RMSE
rmse_tree = np.sqrt(mse_tree)
rmse_tree

0.0

**If the RMSE value equals 0.0, it indicates that the model's predictions perfectly match the actual values in the training set, suggesting flawless performance. However, such a result is highly uncommon and may warrant further investigation. While a zero RMSE implies ideal accuracy, it's prudent to verify the data, model, and evaluation process thoroughly to ensure the integrity of the results. Additionally, double-checking each step of the analysis can help identify any potential issues or errors that may have led to this unexpected outcome.**

In [30]:

from sklearn.model_selection import cross_val_score

# Define the Decision Tree Regressor model
tree_reg = DecisionTreeRegressor()

# Perform cross-validation with 10 folds
scores = cross_val_score(tree_reg, housing_prepared, housing_labels,
                         scoring="neg_mean_squared_error", cv=10)

# Calculate RMSE scores (convert neg_mean_squared_error to positive)
rmse_scores = np.sqrt(-scores)

# Display the RMSE scores
print("RMSE scores:", rmse_scores)


RMSE scores: [63200.2610048  71329.65300028 67645.47514514 70807.66034442
 72061.72909329 66599.30658407 65818.48566622 67658.81582416
 66319.25955135 70222.25774999]


In [31]:
# Display the RMSE scores
print("RMSE scores:", rmse_scores)

# Calculate mean and standard deviation of the RMSE scores
mean_rmse = rmse_scores.mean()
std_rmse = rmse_scores.std()

# Display the mean and standard deviation
print("Mean RMSE:", mean_rmse)
print("Standard Deviation of RMSE:", std_rmse)


RMSE scores: [63200.2610048  71329.65300028 67645.47514514 70807.66034442
 72061.72909329 66599.30658407 65818.48566622 67658.81582416
 66319.25955135 70222.25774999]
Mean RMSE: 68166.29039637379
Standard Deviation of RMSE: 2701.0093986068664


In [32]:
from sklearn.linear_model import LinearRegression

# Define the Linear Regression model
lin_reg = LinearRegression()

# Perform cross-validation with 10 folds
scores_lin_reg = cross_val_score(lin_reg, housing_prepared, housing_labels,
                                 scoring="neg_mean_squared_error", cv=10)

# Calculate RMSE scores (convert neg_mean_squared_error to positive)
rmse_scores_lin_reg = np.sqrt(-scores_lin_reg)

# Display the RMSE scores
print("RMSE scores for Linear Regression:", rmse_scores_lin_reg)

# Calculate mean and standard deviation of the RMSE scores for Linear Regression
mean_rmse_lin_reg = rmse_scores_lin_reg.mean()
std_rmse_lin_reg = rmse_scores_lin_reg.std()

# Display the mean and standard deviation for Linear Regression
print("Mean RMSE for Linear Regression:", mean_rmse_lin_reg)
print("Standard Deviation of RMSE for Linear Regression:", std_rmse_lin_reg)


RMSE scores for Linear Regression: [65000.67382615 70960.56056304 67122.63935124 66089.63153865
 68402.54686442 65266.34735288 65218.78174481 68525.46981754
 72739.87555996 68957.34111906]
Mean RMSE for Linear Regression: 67828.38677377408
Standard Deviation of RMSE for Linear Regression: 2468.0913950652275


In [33]:
from sklearn.ensemble import RandomForestRegressor

# Define the RandomForestRegressor model
forest_reg = RandomForestRegressor()

# Train the model on the training data
forest_reg.fit(housing_prepared, housing_labels)


In [34]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

# Define the RandomForestRegressor model
forest_reg = RandomForestRegressor()

# 1. Perform cross-validation with 10 folds for RandomForestRegressor
scores_forest_reg = cross_val_score(forest_reg, housing_prepared, housing_labels,
                                    scoring="neg_mean_squared_error", cv=10)

# Calculate RMSE scores (convert neg_mean_squared_error to positive)
rmse_scores_forest_reg = np.sqrt(-scores_forest_reg)

# 2. Display the RMSE scores and calculate mean and standard deviation for RandomForestRegressor
print("RMSE scores for RandomForestRegressor:", rmse_scores_forest_reg)
print("Mean RMSE for RandomForestRegressor:", rmse_scores_forest_reg.mean())
print("Standard Deviation of RMSE for RandomForestRegressor:", rmse_scores_forest_reg.std())

# 3. Repeat the same steps for Linear Regression model
from sklearn.linear_model import LinearRegression

# Define the Linear Regression model
lin_reg = LinearRegression()

# Perform cross-validation with 10 folds for Linear Regression
scores_lin_reg = cross_val_score(lin_reg, housing_prepared, housing_labels,
                                 scoring="neg_mean_squared_error", cv=10)

# Calculate RMSE scores (convert neg_mean_squared_error to positive)
rmse_scores_lin_reg = np.sqrt(-scores_lin_reg)

# Display the RMSE scores and calculate mean and standard deviation for Linear Regression
print("RMSE scores for Linear Regression:", rmse_scores_lin_reg)
print("Mean RMSE for Linear Regression:", rmse_scores_lin_reg.mean())
print("Standard Deviation of RMSE for Linear Regression:", rmse_scores_lin_reg.std())


RMSE scores for RandomForestRegressor: [47075.95549254 51431.02867988 49651.45008711 51709.33785848
 52760.67430209 47048.15664721 47549.84829958 50611.87740324
 49324.31028659 50230.31773479]
Mean RMSE for RandomForestRegressor: 49739.295679150215
Standard Deviation of RMSE for RandomForestRegressor: 1902.1954902799812
RMSE scores for Linear Regression: [65000.67382615 70960.56056304 67122.63935124 66089.63153865
 68402.54686442 65266.34735288 65218.78174481 68525.46981754
 72739.87555996 68957.34111906]
Mean RMSE for Linear Regression: 67828.38677377408
Standard Deviation of RMSE for Linear Regression: 2468.0913950652275


In [35]:
from joblib import dump

# Save the Decision Tree model
dump(tree_reg, 'decision_tree_model.joblib')

# Save the RandomForestRegressor model
dump(forest_reg, 'random_forest_model.joblib')

# Save the Linear Regression model
dump(lin_reg, 'linear_regression_model.joblib')


['linear_regression_model.joblib']

In [38]:
from sklearn.model_selection import GridSearchCV

# تعريف الهايبربارامترات المراد تحديدها وقيمها المحتملة
param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]

# تعريف نموذج RandomForestRegressor
forest_reg = RandomForestRegressor()

# تحديد Grid Search مع نموذج RandomForestRegressor والهايبربارامترات المحددة
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)

# تدريب Grid Search
grid_search.fit(housing_prepared, housing_labels)


In [39]:
# Display the best results
print("Best Grid Search results:")
print(grid_search.best_params_)
print(grid_search.best_estimator_)

# Display results for all possible combinations
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)


Best Grid Search results:
{'max_features': 6, 'n_estimators': 30}
RandomForestRegressor(max_features=6, n_estimators=30)
63514.12655087004 {'max_features': 2, 'n_estimators': 3}
54615.846553148185 {'max_features': 2, 'n_estimators': 10}
52583.068040877515 {'max_features': 2, 'n_estimators': 30}
60508.17533475931 {'max_features': 4, 'n_estimators': 3}
52409.04236502985 {'max_features': 4, 'n_estimators': 10}
50499.66569721664 {'max_features': 4, 'n_estimators': 30}
58887.93175717394 {'max_features': 6, 'n_estimators': 3}
52155.515807540716 {'max_features': 6, 'n_estimators': 10}
49962.540661016996 {'max_features': 6, 'n_estimators': 30}
58436.72810006563 {'max_features': 8, 'n_estimators': 3}
52032.907947576445 {'max_features': 8, 'n_estimators': 10}
50126.85950264489 {'max_features': 8, 'n_estimators': 30}
61691.81208431093 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
53730.10978875347 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
59426.29814922754 {'bootstr

In [40]:
# Get the best model obtained from Grid Search
best_model = grid_search.best_estimator_

# Get the feature importances
feature_importances = best_model.feature_importances_

# Create a list of feature names
feature_names = list(housing.columns)

# Pair feature names with their importance scores
feature_importance_pairs = list(zip(feature_names, feature_importances))

# Sort the pairs based on importance scores (descending order)
feature_importance_pairs.sort(key=lambda x: x[1], reverse=True)

# Print the sorted feature importance pairs
for feature, importance in feature_importance_pairs:
    print(f"{feature}: {importance}")


median_income: 0.3698695501294777
longitude: 0.06932547800449126
latitude: 0.06399241102596748
ocean_proximity: 0.051226702856485534
housing_median_age: 0.04161014869280183
total_rooms: 0.01702475920631695
population: 0.01662698308555356
total_bedrooms: 0.016125888283794982
households: 0.01597237724987934


In [42]:
# Get the predictors from the test set (excluding the target variable)
X_test = test_set.drop("median_house_value", axis=1)

# Get the labels from the test set
y_test = test_set["median_house_value"].copy()


In [43]:
# Run the full pipeline to transform the test data
X_test_prepared = full_pipeline.transform(X_test)


In [44]:
# Use the best model obtained from Grid Search to make predictions on the prepared test data
final_model = grid_search.best_estimator_
final_predictions = final_model.predict(X_test_prepared)

# Calculate RMSE to evaluate the final model
final_rmse = np.sqrt(mean_squared_error(y_test, final_predictions))
print("Final RMSE on the test set:", final_rmse)


Final RMSE on the test set: 49628.07981369372


In [45]:
from scipy import stats

# Define the confidence level
confidence_level = 0.95

# Calculate the degrees of freedom
degrees_of_freedom = len(y_test) - 1

# Calculate the mean and standard error of the generalization error (RMSE)
mean_error = final_rmse
standard_error = final_rmse / np.sqrt(len(y_test))

# Compute the confidence interval
ci_lower, ci_upper = stats.t.interval(confidence_level, degrees_of_freedom, loc=mean_error, scale=standard_error)

# Print the confidence interval
print("95% Confidence Interval for Generalization Error:")
print("Lower bound:", ci_lower)
print("Upper bound:", ci_upper)


95% Confidence Interval for Generalization Error:
Lower bound: 48113.70594707431
Upper bound: 51142.453680313134
