# In [None]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive

# Step 2: Mount Google Drive
# The drive is mounted at /content/drive so the dataset can be read through
# the ordinary filesystem path defined below.
drive.mount('/content/drive')

# Step 3: Load external CSV dataset
# DATA_PATH is the single place to edit when the dataset lives elsewhere.
DATA_PATH = '/content/drive/MyDrive/dataset/regression_data.csv'
df = pd.read_csv(DATA_PATH)

# Step 4: Explore dataset briefly
print("First 5 rows:")
print(df.head())
print("\nDataset Info:")
# DataFrame.info() writes its summary to stdout itself and returns None, so
# it is called directly; wrapping it in print() would emit a stray "None".
df.info()

# Step 5: Separate features and target
# Every column except the last is a feature; the last column is the target.
# Both are extracted as NumPy arrays.
X, y = df.iloc[:, :-1].values, df.iloc[:, -1].values

# Step 6: Split dataset into train/test
# 20% of the rows are held out for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 7: Feature scaling
# The scaler is fitted on the training rows only and then applied unchanged
# to both splits, so the test rows never influence the scaling statistics.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Step 8: Euclidean distance function
def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between two points.

    Args:
        x1: First point; any numeric array-like (list, tuple, ndarray).
        x2: Second point; must be broadcastable against ``x1``.

    Returns:
        The distance as a NumPy float64 scalar.
    """
    # Promote to float64 before subtracting: plain sequences have no
    # element-wise ``-``, and unsigned or narrow integer arrays would
    # otherwise wrap around (uint8 ``0 - 3`` is 253, not -3).
    diff = np.asarray(x1, dtype=np.float64) - np.asarray(x2, dtype=np.float64)
    return np.sqrt(np.sum(diff ** 2))

# Step 9: KNN Regressor from scratch
class KNNRegressor:
    """k-nearest-neighbours regressor using Euclidean distance.

    A prediction is the unweighted mean of the targets of the ``k`` training
    samples closest to the query point.
    """

    def __init__(self, k=5):
        """Store the neighbour count.

        Args:
            k: Number of neighbours to average; must be a positive integer.

        Raises:
            ValueError: If ``k`` is not a positive integer.
        """
        # Reject bad values up front: k == 0 would average an empty set
        # (NaN) and a negative k would silently drop neighbours via slicing.
        if not isinstance(k, (int, np.integer)) or k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        self.k = k

    def fit(self, X, y):
        """Memorise the training data; KNN defers all work to prediction.

        Args:
            X: Training features, array-like of shape (n_samples, n_features).
            y: Training targets, array-like with one entry per row of ``X``.

        Returns:
            This regressor, so calls can be chained.

        Raises:
            ValueError: If ``X`` is not 2-D, if ``X`` and ``y`` disagree on
                the number of samples, or if fewer than ``k`` samples are
                supplied.
        """
        # Coerce to ndarrays so later lookups are positional; a pandas
        # Series would otherwise be indexed by label.
        X = np.asarray(X, dtype=np.float64)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(
                f"X must be 2-D (n_samples, n_features), got shape {X.shape}"
            )
        if y.ndim == 0 or len(y) != len(X):
            raise ValueError(
                f"X and y must have the same number of samples, "
                f"got {len(X)} and {y.shape[0] if y.ndim else 'a scalar'}"
            )
        if len(X) < self.k:
            raise ValueError(
                f"k={self.k} neighbours requested but only {len(X)} "
                f"training samples were given"
            )
        self.X_train = X
        self.y_train = y
        return self

    def predict(self, X):
        """Predict a target for every row of ``X``.

        Args:
            X: Query features, array-like of shape (n_queries, n_features).
                An empty sequence yields an empty result.

        Returns:
            1-D ndarray holding one prediction per query row.

        Raises:
            AttributeError: If called before ``fit``.
            ValueError: If ``X`` is not 2-D or its feature count differs
                from the training data.
        """
        if not hasattr(self, "X_train"):
            # Same exception type that touching the missing attribute would
            # raise, but with a message that says what to do about it.
            raise AttributeError(
                "KNNRegressor is not fitted yet; call fit() first"
            )
        X = np.asarray(X, dtype=np.float64)
        if X.ndim == 1 and X.size == 0:
            # No queries, no predictions.
            return np.empty(0)
        n_features = self.X_train.shape[1]
        if X.ndim != 2 or X.shape[1] != n_features:
            raise ValueError(
                f"X must have shape (n_queries, {n_features}), "
                f"got shape {X.shape}"
            )
        return np.array([self._predict_single(x) for x in X])

    def _predict_single(self, x):
        """Return the mean target of the ``k`` training rows nearest ``x``."""
        # One vectorised pass over all training rows instead of a
        # Python-level loop that computes each distance separately.
        distances = np.sqrt(np.sum((self.X_train - x) ** 2, axis=1))
        k_idx = np.argsort(distances)[:self.k]
        return np.mean(self.y_train[k_idx])

# Step 10: Train model
# k is the number of neighbours averaged for each prediction.
k = 5
model = KNNRegressor(k=k)
model.fit(X_train, y_train)

# Step 11: Predict
y_pred = model.predict(X_test)

# Step 12: Evaluation metrics
# Each scorer is called as (true, predicted) on the held-out test split.
mse, mae, r2 = (
    scorer(y_test, y_pred)
    for scorer in (mean_squared_error, mean_absolute_error, r2_score)
)

print(f"MSE: {mse:.4f}")
print(f"MAE: {mae:.4f}")
print(f"R2 Score: {r2:.4f}")

# Step 13: Plot true vs predicted
plt.figure(figsize=(8, 6))
sns.scatterplot(x=y_test, y=y_pred)
# Dashed red reference diagonal spanning the range of the true values; a
# point lying on it is a prediction equal to the actual value.
diagonal = [y_test.min(), y_test.max()]
plt.plot(diagonal, diagonal, 'r--')
plt.title("Actual vs Predicted Values")
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.show()
