In [1]:
from google.colab import drive

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score


In [2]:
df = pd.read_csv('/content/drive/MyDrive/data/HousingData.csv')

In [3]:
df.isnull().sum()

Unnamed: 0,0
CRIM,20
ZN,20
INDUS,20
CHAS,20
NOX,0
RM,0
AGE,20
DIS,0
RAD,0
TAX,0


In [4]:
df=df.dropna()

In [5]:
df.isnull().sum()

Unnamed: 0,0
CRIM,0
ZN,0
INDUS,0
CHAS,0
NOX,0
RM,0
AGE,0
DIS,0
RAD,0
TAX,0


In [6]:
df.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,394.0,394.0,394.0,394.0,394.0,394.0,394.0,394.0,394.0,394.0,394.0,394.0,394.0,394.0
mean,3.690136,11.46066,11.000863,0.068528,0.553215,6.280015,68.932741,3.805268,9.403553,406.431472,18.537563,358.490939,12.769112,22.359645
std,9.202423,23.954082,6.908364,0.252971,0.113112,0.697985,27.888705,2.098571,8.633451,168.312419,2.16646,89.283295,7.30843,9.142979
min,0.00632,0.0,0.46,0.0,0.389,3.561,2.9,1.1296,1.0,187.0,12.6,2.6,1.73,5.0
25%,0.081955,0.0,5.13,0.0,0.453,5.87925,45.475,2.1101,4.0,280.25,17.4,376.7075,7.125,16.8
50%,0.26888,0.0,8.56,0.0,0.538,6.2015,77.7,3.1992,5.0,330.0,19.1,392.19,11.3,21.05
75%,3.435973,12.5,18.1,0.0,0.624,6.6055,94.25,5.1167,24.0,666.0,20.2,396.9,17.1175,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


In [7]:
df_normalized = (df - df.min()) / (df.max() - df.min())

In [8]:
X = df_normalized.drop('MEDV', axis=1)
y = df_normalized['MEDV']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
#This code is for Liner Regression
class LinearRegressionFromScratch:
    def fit(self, X, y):
        X_ = np.c_[np.ones(X.shape[0]), X]  # Add bias term (column of ones)
        self.coefficients = np.linalg.inv(X_.T.dot(X_)).dot(X_.T).dot(y)

    def predict(self, X):
        X_ = np.c_[np.ones(X.shape[0]), X]  # Add bias term
        return X_.dot(self.coefficients)


linear_model = LinearRegressionFromScratch()
linear_model.fit(X_train, y_train)
y_pred_linear = linear_model.predict(X_test)


rmse_linear = np.sqrt(mean_squared_error(y_test, y_pred_linear))
r2_linear = r2_score(y_test, y_pred_linear)

print(f"Linear Regression RMSE: {rmse_linear}")
print(f"Linear Regression R²: {r2_linear}")


Linear Regression RMSE: 0.12463090725987784
Linear Regression R²: 0.6270849941673172


In [11]:

# Decision Tree Class
class DecisionTree:
    def __init__(self, max_depth=5):
        self.max_depth = max_depth
        self.tree = None

    def fit(self, X, y):
        self.tree = self._build_tree(X, y)

    def _build_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape
        # Stop if max depth reached or pure node
        if depth >= self.max_depth or len(set(y)) == 1:
            return np.mean(y)

        # Find best split
        best_split = self._find_best_split(X, y)
        left_mask = X[:, best_split['feature']] <= best_split['value']
        right_mask = ~left_mask

        left_tree = self._build_tree(X[left_mask], y[left_mask], depth + 1)
        right_tree = self._build_tree(X[right_mask], y[right_mask], depth + 1)

        return {"split": best_split, "left": left_tree, "right": right_tree}

    def _find_best_split(self, X, y):
        best_split = {}
        best_score = float("inf")

        n_samples, n_features = X.shape
        for feature in range(n_features):
            unique_values = np.unique(X[:, feature])
            for value in unique_values:
                left_mask = X[:, feature] <= value
                right_mask = ~left_mask

                left_score = np.var(y[left_mask]) * len(y[left_mask])
                right_score = np.var(y[right_mask]) * len(y[right_mask])
                score = left_score + right_score

                if score < best_score:
                    best_score = score
                    best_split = {"feature": feature, "value": value}

        return best_split

    def predict(self, X):
        return np.array([self._predict_single(x, self.tree) for x in X])

    def _predict_single(self, x, tree):
        if isinstance(tree, dict):
            feature = tree['split']['feature']
            value = tree['split']['value']
            if x[feature] <= value:
                return self._predict_single(x, tree['left'])
            else:
                return self._predict_single(x, tree['right'])
        else:
            return tree

# Random Forest Class
class RandomForest:
    def __init__(self, n_trees=10, max_depth=5):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.trees = []

    def fit(self, X, y):
        for _ in range(self.n_trees):
            # Bootstrap sampling
            indices = np.random.choice(len(X), size=len(X), replace=True)
            X_sample, y_sample = X[indices], y[indices]

            # Train a decision tree on the bootstrap sample
            tree = DecisionTree(max_depth=self.max_depth)
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)

    def predict(self, X):
        # Make predictions using each tree and return the average
        predictions = np.array([tree.predict(X) for tree in self.trees])
        return np.mean(predictions, axis=0)

rf_model = RandomForest(n_trees=10, max_depth=5)
rf_model.fit(X_train.values, y_train.values)
y_pred_rf = rf_model.predict(X_test.values)


rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
r2_rf = r2_score(y_test, y_pred_rf)

print(f"Random Forest RMSE: {rmse_rf}")
print(f"Random Forest R²: {r2_rf}")



  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


Random Forest RMSE: 0.12272082800534444
Random Forest R²: 0.6384279096352332


In [12]:

class XGBoost:
    def __init__(self, n_estimators=100, learning_rate=0.1):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.trees = []

    def fit(self, X, y):
        y_pred = np.zeros_like(y)

        for _ in range(self.n_estimators):
            residuals = y - y_pred
            tree = DecisionTree(max_depth=5)
            tree.fit(X, residuals)  # Fit a tree to the residuals
            self.trees.append(tree)
            y_pred += self.learning_rate * tree.predict(X)

    def predict(self, X):
        y_pred = np.zeros(X.shape[0])
        for tree in self.trees:
            y_pred += self.learning_rate * tree.predict(X)
        return y_pred

xgb_model = XGBoost(n_estimators=100, learning_rate=0.1)
xgb_model.fit(X_train.values, y_train.values)
y_pred_xgb = xgb_model.predict(X_test.values)


rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))
r2_xgb = r2_score(y_test, y_pred_xgb)

print(f"XGBoost RMSE: {rmse_xgb}")
print(f"XGBoost R²: {r2_xgb}")


  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


XGBoost RMSE: 0.1150244005247266
XGBoost R²: 0.6823577209124634


In [13]:
# Performance Comparison
print(f"Linear Regression RMSE: {rmse_linear}, R²: {r2_linear}")
print(f"Random Forest RMSE: {rmse_rf}, R²: {r2_rf}")
print(f"XGBoost RMSE: {rmse_xgb}, R²: {r2_xgb}")


Linear Regression RMSE: 0.12463090725987784, R²: 0.6270849941673172
Random Forest RMSE: 0.12272082800534444, R²: 0.6384279096352332
XGBoost RMSE: 0.1150244005247266, R²: 0.6823577209124634
