# In [None]:
# ======================================
# STEP 1: Mount Google Drive
# ======================================
# Mounts Google Drive at /content/drive so that the dataset path read in
# STEP 3 (under /content/drive/MyDrive) resolves.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime — confirm before running this script anywhere else.
from google.colab import drive
drive.mount('/content/drive')

# ======================================
# STEP 2: Import Libraries
# ======================================
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score,
    confusion_matrix,
    roc_curve,
    auc
)

# ======================================
# STEP 3: Load Dataset
# ======================================
# The last CSV column becomes the target vector y; every other column is
# kept, in order, as the feature matrix X.
df = pd.read_csv('/content/drive/MyDrive/dataset.csv')

y = df.iloc[:, -1].values
X = df.iloc[:, :-1].values

# ======================================
# STEP 4: Train-Test Split (Manual)
# ======================================
# Ordered split with no shuffling: the first 80% of the rows are used for
# training and the remaining rows for testing.
split = int(len(X) * 0.8)
X_train, y_train = X[:split], y[:split]
X_test, y_test = X[split:], y[split:]

# ======================================
# STEP 5: Decision Tree Node Definition
# ======================================
class TreeNode:
    """One node of a binary decision tree.

    A node stores a split description (``feature`` index and ``threshold``),
    references to its ``left`` and ``right`` children, and a ``value``.
    Every field defaults to ``None``; the tree code in this file treats a
    node whose ``value`` is not ``None`` as a leaf.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        # Split rule stored on the node.
        self.feature, self.threshold = feature, threshold
        # Child sub-trees.
        self.left, self.right = left, right
        # Prediction stored on the node.
        self.value = value

# ======================================
# STEP 6: Decision Tree Regressor (Scratch)
# ======================================
class DecisionTreeRegressorScratch:
    """Regression tree grown by greedy variance reduction, using NumPy only.

    Every internal node tests ``x[feature] <= threshold``. The split chosen
    at each node is the (feature, threshold) pair that minimises the
    size-weighted variance of the two children, searched exhaustively over
    every unique value of every feature. A leaf predicts the mean target of
    the training samples that reached it.

    Parameters
    ----------
    max_depth : int or None, default=5
        Maximum number of split levels below the root. ``None`` keeps
        splitting until a node is pure or no valid split exists.
    """

    def __init__(self, max_depth=5):
        self.max_depth = max_depth
        # Populated by fit(); None marks a model that has not been fitted.
        self.root = None

    def mse(self, y):
        """Return the variance of ``y``, i.e. its MSE around its own mean."""
        return np.var(y)

    def best_split(self, X, y):
        """Find the split with the lowest size-weighted child variance.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of length n_samples

        Returns
        -------
        (feature, threshold)
            Column index and threshold of the best split, or ``(None, None)``
            when no threshold separates the samples into two non-empty
            groups.
        """
        X = np.asarray(X)
        y = np.asarray(y)

        best_feature, best_threshold = None, None
        best_mse = float("inf")
        n_samples = len(y)

        for feature in range(X.shape[1]):
            # Slice the column once instead of once per candidate threshold.
            column = X[:, feature]
            for t in np.unique(column):
                go_left = column <= t
                n_left = np.count_nonzero(go_left)
                n_right = n_samples - n_left

                if n_left == 0 or n_right == 0:
                    continue

                # The right child is the complement of the left one, so each
                # sample lands in exactly one child. A NaN feature value
                # fails ``<=`` and therefore goes right, which is the same
                # route predict_one() takes.
                mse = (n_left * self.mse(y[go_left]) +
                       n_right * self.mse(y[~go_left])) / n_samples

                if mse < best_mse:
                    best_mse = mse
                    best_feature = feature
                    best_threshold = t

        return best_feature, best_threshold

    def build_tree(self, X, y, depth):
        """Recursively grow the sub-tree for the samples ``(X, y)``.

        ``depth`` is the depth of the node being built (the root is 0).
        Returns a leaf ``TreeNode`` holding ``mean(y)`` when the depth limit
        is reached, all targets are identical, or no valid split exists;
        otherwise returns an internal ``TreeNode`` with two children.
        """
        X = np.asarray(X)
        y = np.asarray(y)

        # ``>=`` rather than ``==`` so a negative or fractional max_depth
        # still terminates; None disables the limit.
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if depth_exhausted or len(np.unique(y)) == 1:
            return TreeNode(value=np.mean(y))

        feature, threshold = self.best_split(X, y)
        if feature is None:
            return TreeNode(value=np.mean(y))

        # Same partition rule as best_split(): complement goes right.
        go_left = X[:, feature] <= threshold

        left = self.build_tree(X[go_left], y[go_left], depth + 1)
        right = self.build_tree(X[~go_left], y[~go_left], depth + 1)

        return TreeNode(feature, threshold, left, right)

    def fit(self, X, y):
        """Grow the tree from training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like with n_samples entries along its first axis

        Returns
        -------
        self
            The fitted estimator, so calls can be chained.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, ``X`` and ``y`` disagree on the number of
            samples, or there are no samples at all.
        """
        X = np.asarray(X)
        y = np.asarray(y)

        if X.ndim != 2:
            raise ValueError(
                f"X must be 2-D (n_samples, n_features); got {X.ndim} dimension(s)"
            )
        if y.ndim == 0 or X.shape[0] != y.shape[0]:
            raise ValueError("X and y must contain the same number of samples")
        if X.shape[0] == 0:
            # Without this check the root would be a leaf holding mean([]) = NaN.
            raise ValueError("cannot fit a tree on an empty training set")

        self.root = self.build_tree(X, y, 0)
        return self

    def predict_one(self, x, node):
        """Return the prediction for one sample ``x``, starting at ``node``.

        Descends left while ``x[feature] <= threshold`` holds and right
        otherwise, until it reaches a node whose ``value`` is not ``None``.
        """
        while node.value is None:
            if x[node.feature] <= node.threshold:
                node = node.left
            else:
                node = node.right
        return node.value

    def predict(self, X):
        """Predict a target for every row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        numpy.ndarray
            One prediction per row of ``X``.

        Raises
        ------
        AttributeError
            If the model has not been fitted. The exception type is kept the
            same as the error an unfitted model raised before this check
            existed.
        """
        root = getattr(self, "root", None)
        if root is None:
            raise AttributeError(
                "DecisionTreeRegressorScratch is not fitted yet; call fit() before predict()"
            )
        # asarray makes lists and DataFrames iterate row by row.
        return np.array([self.predict_one(x, root) for x in np.asarray(X)])

# ======================================
# STEP 7: Train Model
# ======================================
# Fit the from-scratch tree on the training rows, then predict the held-out
# test rows.
model = DecisionTreeRegressorScratch(max_depth=5)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# ======================================
# STEP 8: Regression Metrics
# ======================================
print("MSE:", mean_squared_error(y_test, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred))
# The superscript two is written as an escape so the label survives a
# re-encoding of this file (it had previously been corrupted to mojibake).
print("R\u00b2 :", r2_score(y_test, y_pred))

# ======================================
# STEP 9: Visualizations
# ======================================
sns.scatterplot(x=y_test, y=y_pred)
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.title("Decision Tree Scratch: Actual vs Predicted")
plt.show()

# ======================================
# STEP 10: Threshold-based ROC (Educational)
# ======================================
# Turn the regression targets into a binary label ("at or above the test
# median") and use the raw predictions as the ranking score.
threshold = np.median(y_test)
y_test_bin = (y_test >= threshold).astype(int)

if np.unique(y_test_bin).size < 2:
    # Happens when every test target is >= the median (e.g. many ties at the
    # minimum); a ROC curve is undefined with a single class.
    print("ROC skipped: binarised test targets contain a single class.")
else:
    fpr, tpr, _ = roc_curve(y_test_bin, y_pred)
    plt.plot(fpr, tpr, label=f"AUC={auc(fpr,tpr):.2f}")
    plt.plot([0,1],[0,1],'--')
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.legend()
    plt.show()
