In [63]:
# Importing Dependencies
import pandas as pd
import numpy as np
import sklearn


In [64]:

from joblib import load
model = load("/content/Dragin.joblib")


In [65]:
housing = pd.read_csv ("/content/housing.csv")

In [66]:
from numpy.ma import median
# Partially Handling the nan for CHAS feature , it wll go through Stratified Sampling
# Since we split along the CHAS attribute for K Fold so lets fill nan by mdian
median = housing['CHAS'].median()
housing["CHAS"] =housing['CHAS'].fillna(median)


In [67]:
# Stratified Sampling
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

for train_index, test_index in split.split (housing , housing["CHAS"]):
    # stratified_train_set = housing.loc[train_index]
    stratified_test_set = housing.loc[test_index]

test_features = stratified_test_set.drop("MEDV", axis=1)
test_labels = stratified_test_set["MEDV"].copy()


In [68]:
# Pipeline (imputer + scaler)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

my_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler())
])

x_test_prepared = my_pipeline.fit_transform(test_features)

In [69]:
from sklearn.metrics import mean_squared_error

final_predictions = model.predict(x_test_prepared)
final_mse = mean_squared_error(test_labels, final_predictions)
final_rmse = np.sqrt(final_mse)

final_rmse



np.float64(179637.97773160329)

In [70]:
# # Using k fold Cross validation in 10 groups
# from sklearn.model_selection import cross_val_score
# scores = cross_val_score(model, x_test_prepared, test_labels, scoring="neg_mean_squared_error", cv=10)
# rmse_scores = np.sqrt(-scores)

In [71]:

from tabulate import tabulate

def print_scores(scores, labels):
    avg_price = labels.mean()   # baseline: mean of actual house prices

    data = {
        "Metric": ["Mean", "Standard Deviation"],
        "Absolute (RMSE)": [scores.mean(), scores.std()],
        "Relative to Avg Price (%)": [
            (scores.mean() / avg_price) * 100,
            (scores.std() / avg_price) * 100
        ]
    }

    results_table = pd.DataFrame(data)
    print("Scores:", scores)
    print("\nEvaluation Results:\n")
    table = tabulate(results_table, headers="keys", tablefmt="grid", showindex=False)
    print(table)

# Example usage
print_scores(final_rmse, test_labels)

Scores: 179637.97773160329

Evaluation Results:

+--------------------+-------------------+-----------------------------+
| Metric             |   Absolute (RMSE) |   Relative to Avg Price (%) |
| Mean               |            179638 |                     39.7737 |
+--------------------+-------------------+-----------------------------+
| Standard Deviation |                 0 |                      0      |
+--------------------+-------------------+-----------------------------+


## Interpretation

Due to the limited amount of training data available, the model achieved a training error of approximately **42%**, while the error on the test set was around **39.77%**. This indicates that the model is **underfitting**, as both training and test errors are relatively high and close in value.


In [72]:
from scipy.sparse import data
# demo: new data
x_test_prepared[0]


array([0.45541225, 0.56819041, 0.68704475, 0.61666984])