In [1]:
# ============================
# STEP 1: Mount Google Drive
# ============================
# Colab-only step: attach Google Drive under /content/drive so that files
# stored in Drive can be read through ordinary filesystem paths.
from google.colab import drive
drive.mount(mountpoint='/content/drive')

# ============================
# STEP 2: Import Libraries
# ============================
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score
)

from sklearn.metrics import confusion_matrix, roc_curve, auc

# ============================
# STEP 3: Load External Dataset
# ============================
# Replace with your dataset path
data_path = '/content/drive/MyDrive/dataset.csv'
df = pd.read_csv(data_path)

# Preview the first few rows. A bare `df.head()` is only rendered when it is
# the last expression of a notebook cell (and never in a plain script), so
# print it explicitly to make sure the preview actually appears.
print(df.head())

# ============================
# STEP 4: Separate Features & Target
# ============================
# Convention used here: the last column of the frame is the value to
# predict, and every column before it is a feature.
y = df.iloc[:, -1]
X = df.iloc[:, :-1]

# ============================
# STEP 5: Train-Test Split
# ============================
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# ============================
# STEP 6: Train Decision Tree Regressor
# ============================
# Build a tree capped at depth 5 with a fixed seed, and fit it on the
# training split in the same expression (`fit` returns the estimator).
model = (
    DecisionTreeRegressor(max_depth=5, random_state=42)
    .fit(X_train, y_train)
)

# ============================
# STEP 7: Predictions
# ============================
# Predict the target for the held-out rows.
y_pred = model.predict(X_test)

# ============================
# STEP 8: Regression Evaluation Metrics
# ============================
# Compare the held-out targets with the model's predictions.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)  # square root of MSE, i.e. error in the target's own units
r2 = r2_score(y_test, y_pred)

print("MSE :", mse)
print("MAE :", mae)
print("RMSE:", rmse)
# The label previously contained mojibake ("RÂ²"); use the intended "R²".
print("R²  :", r2)

# ============================
# STEP 9: Visualization (Seaborn)
# ============================

# Actual vs Predicted: one point per held-out row. Labels and title are set
# on the Axes object that seaborn returns.
sns.scatterplot(x=y_test, y=y_pred).set(
    xlabel="Actual Values",
    ylabel="Predicted Values",
    title="Actual vs Predicted Values",
)
plt.show()

# Residual Distribution: histogram (with KDE overlay) of actual minus
# predicted values.
residuals = y_test - y_pred
sns.histplot(residuals, kde=True).set_title("Residual Distribution")
plt.show()

# ============================
# STEP 10: OPTIONAL Classification Metrics (Educational Only)
# ============================
# Convert regression output to binary using median threshold:
# values at or above the median of the held-out targets become class 1,
# everything else class 0.
threshold = y_test.median()

y_test_binary = (y_test >= threshold).astype(int)
y_pred_binary = (y_pred >= threshold).astype(int)

# Confusion Matrix
# labels=[0, 1] pins the matrix to 2x2 even when one class never occurs
# (e.g. more than half of the targets equal the minimum, so every row is
# >= the median and lands in class 1).
cm = confusion_matrix(y_test_binary, y_pred_binary, labels=[0, 1])
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.title("Confusion Matrix (Artificially Binarized)")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# ROC Curve & AUC
# The continuous predictions serve as the ranking score. A ROC curve is only
# defined when both classes are present in the binarized target, so skip the
# plot (with a message) in the degenerate single-class case.
if y_test_binary.nunique() == 2:
    fpr, tpr, _ = roc_curve(y_test_binary, y_pred)
    roc_auc = auc(fpr, tpr)

    plt.plot(fpr, tpr, label=f"AUC = {roc_auc:.2f}")
    plt.plot([0, 1], [0, 1], linestyle="--")  # chance-level diagonal
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curve (Artificially Binarized)")
    plt.legend()
    plt.show()
else:
    print("ROC curve skipped: the binarized target contains only one class.")

MessageError: Error: credential propagation was unsuccessful