In [7]:
# Importing Dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn


In [8]:
housing = pd.read_csv("/content/housing.csv")

In [9]:
# arranging data in proper order
# Keep label at the last column of the table
medv= housing.pop('MEDV')
housing['MEDV']= medv


In [10]:
housing.head()


Unnamed: 0,RM,LSTAT,PTRATIO,CHAS,MEDV
0,6.575,4.98,15.3,0.0,504000
1,6.421,9.14,17.8,1.0,453600
2,7.185,4.03,17.8,1.0,728700
3,6.998,2.94,18.7,1.0,701400
4,7.147,5.33,18.7,1.0,760200


In [11]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 489 entries, 0 to 488
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   RM       489 non-null    float64
 1   LSTAT    489 non-null    float64
 2   PTRATIO  489 non-null    float64
 3   CHAS     435 non-null    float64
 4   MEDV     489 non-null    int64  
dtypes: float64(4), int64(1)
memory usage: 19.2 KB


In [12]:
# # Data Visualization
# %matplotlib inline
# import matplotlib.pyplot as plot
# housing.hist(bins=50, figsize=(20,15))
# plot.show()



In [13]:
from numpy.ma import median
# Partially Handling the nan for CHAS feature , it wll go through Stratified Sampling
# Since we split along the CHAS attribute for K Fold so lets fill nan by mdian
median = housing['CHAS'].median()
housing["CHAS"] =housing['CHAS'].fillna(median)


In [14]:
# Train_Test Split
# def split_train_test(data, test_ratio):
#   np.random.seed(42)
#   shuffled_indices = np.random.permutation(len(data))
#   test_set_size = int(len(data) * test_ratio)
#   test_indices = shuffled_indices[:test_set_size]
#   train_indices =shuffled_indices[test_set_size:]

#   return data.iloc[train_indices], data.iloc[test_indices]

# train_set, test_set = split_train_test(housing, 0.2)

# Direct Method
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split (housing , test_size = 0.2, random_state = 42)

print(f"Rows in train_set:{len(train_set)}")
print("_______"*50)
print(f"\nRows in test_set:{len(test_set)}")
print("_______"*50)

train_set.info()

print("_______"*50)
test_set.info()


Rows in train_set:391
______________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________

Rows in test_set:98
______________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________
<class 'pandas.core.frame.DataFrame'>
Index: 391 entries, 325 to 102
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   RM       391 non-null    float64
 1   LSTAT    391 non-null    float64
 2

In [15]:
# Stratified Sampling
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

for train_index, test_index in split.split (housing , housing["CHAS"]):
    stratified_train_set = housing.loc[train_index]
    stratified_test_set = housing.loc[test_index]

In [16]:
# copy the train set
housing_train =stratified_train_set.copy()

In [17]:
stratified_train_set["CHAS"].value_counts()



Unnamed: 0_level_0,count
CHAS,Unnamed: 1_level_1
1.0,281
0.0,110


In [18]:
stratified_test_set["CHAS"].value_counts()

Unnamed: 0_level_0,count
CHAS,Unnamed: 1_level_1
1.0,71
0.0,27


In [19]:
stratified_test_set.info()

<class 'pandas.core.frame.DataFrame'>
Index: 98 entries, 378 to 470
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   RM       98 non-null     float64
 1   LSTAT    98 non-null     float64
 2   PTRATIO  98 non-null     float64
 3   CHAS     98 non-null     float64
 4   MEDV     98 non-null     int64  
dtypes: float64(4), int64(1)
memory usage: 4.6 KB


In [20]:
# # Data visualization
# from pandas.plotting import scatter_matrix
# attributes = ["MEDV", "RM", "CHAS", "LSTAT"]

# scatter_matrix(housing_train[attributes], figsize=(20,15))


In [21]:
# housing_train.plot(kind="scatter", x="RM", y="MEDV", alpha=0.8)
# plt.ticklabel_format(style='plain')




In [22]:
# housing_train.plot(kind="scatter", x="LSTAT", y="MEDV", alpha=0.8)

In [23]:
# Separating features and label
train_features = stratified_train_set.drop("MEDV", axis=1)
train_labels = stratified_train_set["MEDV"].copy()

test_features = stratified_test_set.drop("MEDV", axis=1)
test_labels = stratified_test_set["MEDV"].copy()

In [24]:
# # Since there may be missing values in train set
# # So to handle this problem lets use Imputer Class

# from sklearn.impute import SimpleImputer
# imputer = SimpleImputer(strategy="median")
# imputer.fit(housing_train)

# imputer.statistics_

# x = imputer.transform(housing_train)
# housing_tr = pd.DataFrame(x, columns=housing_train.columns)
# housing_tr.describe()


In [25]:
# Feature Scaling
# we will do by 2 methods:
  # 1. minmaxScalar
  # 2. StandardScalar
# Thus, lets create pipeline for above prepocessing including standardization



In [26]:
# Pipeline (imputer + scaler)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

my_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler())
])


In [27]:
train_prepared = my_pipeline.fit_transform(train_features)

In [28]:
train_prepared

array([[-0.58499574, -0.70444268,  0.21510462,  0.6256669 ],
       [-0.43288454,  0.87067978,  0.82729766,  0.6256669 ],
       [-0.26214748, -0.30308242, -0.49127197,  0.6256669 ],
       ...,
       [-0.37700696, -0.07930242, -1.52729096,  0.6256669 ],
       [ 0.96095072, -1.20830862, -0.39708842, -1.59829455],
       [ 0.01724043, -0.73331752, -0.02035424,  0.6256669 ]])

In [29]:
# #Selecting Desigred Model

#  # Linear Regrssion

# from sklearn.linear_model import LinearRegression
# model = LinearRegression()
# model.fit(train_prepared, train_labels)



In [30]:

# # # DecisionTreeRegressior
# from sklearn.tree import DecisionTreeRegressor
# model = DecisionTreeRegressor()
# model.fit(train_features, train_labels)


In [31]:

# RandomForestRegesssor
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
model.fit(train_features, train_labels)

In [32]:
some_data= train_features.iloc[:5]
some_labels = train_labels.iloc[:5]

In [33]:
prepared_data = my_pipeline.transform(some_data)
model.predict(prepared_data)



array([552699., 552699., 552699., 552699., 552699.])

In [34]:
list(some_labels)

[462000, 266700, 504000, 632100, 117600]

In [35]:
#Evaluationg the model
from sklearn.metrics import mean_squared_error
housing_predictions = model.predict(train_prepared)
linear_mse = mean_squared_error(train_labels, housing_predictions)
linear_rmse = np.sqrt(linear_mse)

print(f"Mean Squre Error MSC:", linear_rmse)

print("___________"*5)
# Relative error in %
avg_price = train_labels.mean()
error_percent = (linear_rmse / avg_price) * 100

print(f"Average House Price: {avg_price:.2f}")
print("___________"*5)
print(f"Error Rate: {error_percent:.2f}%")

Mean Squre Error MSC: 194598.52794816653
_______________________________________________________
Average House Price: 455017.90
_______________________________________________________
Error Rate: 42.77%




In [41]:
# Using k fold Cross validation in 10 groups
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, train_prepared, train_labels, scoring="neg_mean_squared_error", cv=10)
rmse_scores = np.sqrt(-scores)


In [37]:
import pandas as pd
from tabulate import tabulate

def print_scores(scores, labels):
    avg_price = labels.mean()   # baseline: mean of actual house prices

    data = {
        "Metric": ["Mean", "Standard Deviation"],
        "Absolute (RMSE)": [scores.mean(), scores.std()],
        "Relative to Avg Price (%)": [
            (scores.mean() / avg_price) * 100,
            (scores.std() / avg_price) * 100
        ]
    }

    results_table = pd.DataFrame(data)
    print("Scores:", scores)
    print("\nEvaluation Results:\n")
    table = tabulate(results_table, headers="keys", tablefmt="grid", showindex=False)
    print(table)

# Example usage
print_scores(rmse_scores, train_labels)


Scores: [62590.61977845 53265.34284707 69122.69578243 78837.42648592
 75121.83538554 73695.48741638 74285.35999364 69750.8843087
 55751.1376469  72406.01775352]

Evaluation Results:

+--------------------+-------------------+-----------------------------+
| Metric             |   Absolute (RMSE) |   Relative to Avg Price (%) |
| Mean               |          68482.7  |                    15.0505  |
+--------------------+-------------------+-----------------------------+
| Standard Deviation |           8111.29 |                     1.78263 |
+--------------------+-------------------+-----------------------------+


In [38]:
import os
import numpy as np

def save_model_output(model_name, scores, labels, filename="Model_Outputs.txt"):
    avg_price = labels.mean()
    mean_rmse = scores.mean()
    std_rmse = scores.std()
    mean_percent = (mean_rmse / avg_price) * 100
    std_percent = (std_rmse / avg_price) * 100

    # Prepare formatted block
    output_block = (
        f"Model: {model_name}\n"
        f"Mean RMSE: {mean_rmse:.2f}\n"
        f"Std RMSE: {std_rmse:.2f}\n"
        f"Mean (% of avg price): {mean_percent:.2f}%\n"
        f"Std (% of avg price): {std_percent:.2f}%\n"
        f"{'='*60}\n"
    )

    # Check if file exists and read content
    if os.path.exists(filename):
        with open(filename, "r") as f:
            content = f.read()
        # Avoid duplicate entries
        if f"Model: {model_name}" in content:
            print(f" Skipping: {model_name} already exists in {filename}")
            return

    # Append new model output
    with open(filename, "a") as f:
        f.write(output_block)

    # Print to console as well
    print(output_block)
    print(f" Saved results for {model_name} into {filename}")


In [39]:
save_model_output("Random Forest Regressor", rmse_scores, train_labels)

Model: Random Forest Regressor
Mean RMSE: 68482.68
Std RMSE: 8111.29
Mean (% of avg price): 15.05%
Std (% of avg price): 1.78%

 Saved results for Random Forest Regressor into Model_Outputs.txt


In [40]:
# Here the model and its behaviour for this prject is save into anoter .txt file

# Saving the model into dragon.joblib
from joblib import dump
dump(model, "Dragin.joblib")

['Dragin.joblib']