# In [None]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
    roc_curve,
    auc
)
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from google.colab import drive

# Step 2: Make Google Drive visible inside the Colab runtime
drive.mount('/content/drive')

# Step 3: Read the CSV stored on Drive into a DataFrame
DATA_PATH = '/content/drive/MyDrive/dataset/classification_data.csv'
df = pd.read_csv(DATA_PATH)

# Step 4: Quick look at the data (the last column is used as the label)
print("First 5 rows:", df.head(), sep="\n")
print("\nClass distribution:", df.iloc[:, -1].value_counts(), sep="\n")

# Step 5: Every column but the last is a feature; the last one is the target
X, y = df.iloc[:, :-1].values, df.iloc[:, -1].values

# Step 6: Hold out 20% of the rows for testing, keeping class proportions
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Step 7: Standardise features using statistics learned from the training
# split only, then apply that same transform to both splits
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Step 8: Euclidean distance function
def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between two points.

    Args:
        x1: First point; a number or array-like of numbers.
        x2: Second point; must be broadcast-compatible with ``x1``.

    Returns:
        The distance as a NumPy float.
    """
    # Convert to float before subtracting: in an unsigned or narrow integer
    # dtype the difference and its square wrap around silently, and plain
    # Python lists do not support ``-`` at all.
    delta = np.asarray(x1, dtype=float) - np.asarray(x2, dtype=float)
    return np.sqrt(np.sum(delta ** 2))

# Step 9: Weighted KNN classifier
class WeightedKNN:
    """k-nearest-neighbours classifier with inverse-distance vote weighting.

    Each of the ``k`` training points closest to a query (Euclidean
    distance) votes for its own label with weight ``1 / (distance + 1e-5)``;
    the label with the largest total weight is the prediction.
    """

    def __init__(self, k=5):
        """Store the neighbourhood size.

        Args:
            k: Number of neighbours that vote; must be a positive integer.

        Raises:
            ValueError: If ``k`` is not a positive integer.
        """
        # Reject bad values up front: k == 0 would otherwise fail late with an
        # unhelpful "max() arg is an empty sequence", and a negative k would
        # silently slice the wrong set of neighbours.
        if isinstance(k, bool) or not isinstance(k, (int, np.integer)) or k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        self.k = k

    def fit(self, X, y):
        """Memorise the training data; nothing is learned beyond storing it.

        Args:
            X: Numeric feature matrix (array-like), one row per sample.
            y: Labels (array-like), one per row of ``X``.

        Raises:
            ValueError: If ``X`` is not a non-empty 2-D matrix, or if ``y``
                does not have one entry per row of ``X``.
        """
        features = np.asarray(X, dtype=float)
        # Labels must be an ndarray so they can be indexed with an index array.
        labels = np.asarray(y)
        if features.ndim != 2 or features.shape[0] == 0:
            raise ValueError("X must be a non-empty 2-D feature matrix")
        if labels.ndim == 0 or labels.shape[0] != features.shape[0]:
            raise ValueError("y must contain exactly one label per row of X")
        self.X_train = features
        self.y_train = labels

    def predict(self, X):
        """Predict a label for every row of ``X``.

        Args:
            X: Numeric feature matrix (array-like), one row per query point.

        Returns:
            NumPy array holding one predicted label per row of ``X``.
        """
        queries = np.asarray(X, dtype=float)
        return np.array([self._predict_single(x) for x in queries])

    def _predict_single(self, x):
        """Return the label with the largest inverse-distance vote for ``x``."""
        # Distance from x to every training row in one vectorised expression
        # rather than one Python-level call per training sample.
        distances = np.sqrt(np.sum((self.X_train - x) ** 2, axis=1))
        k_idx = np.argsort(distances)[:self.k]
        # The small constant keeps the weight finite when x coincides with a
        # training point (distance 0).
        weights = 1 / (distances[k_idx] + 1e-5)
        vote = {}
        for label, weight in zip(self.y_train[k_idx], weights):
            vote[label] = vote.get(label, 0) + weight
        return max(vote, key=vote.get)

# Step 10: Train model
k = 5
model = WeightedKNN(k=k)
model.fit(X_train, y_train)

# Step 11: Predict
y_pred = model.predict(X_test)

# Step 12: Evaluation metrics
# Sorted class labels taken from the full target column; used to pin the
# row/column order of the confusion matrix and to label the plot axes.
class_labels = np.unique(y)

acc = accuracy_score(y_test, y_pred)
# 'weighted' averages the per-class scores weighted by class support
f1 = f1_score(y_test, y_pred, average='weighted')
prec = precision_score(y_test, y_pred, average='weighted')
rec = recall_score(y_test, y_pred, average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {acc:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall: {rec:.4f}")
print("\nClassification Report:\n", report)

# Step 13: Confusion matrix plot
plt.figure(figsize=(8,6))
# Show the actual class labels on both axes; without explicit tick labels the
# heatmap prints positional indices, which is misleading whenever the labels
# are not 0..n-1.
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# Step 14: ROC Curve & AUC (binary only)
if len(np.unique(y)) == 2:
    # np.unique returns the labels sorted; the second one is treated as the
    # positive class and passed to roc_curve explicitly, so labels other than
    # {0, 1} / {-1, 1} work as well.
    positive_label = np.unique(y)[1]

    def positive_vote_share(x):
        """Return the share of the model's vote weight given to the positive class.

        Uses the training data and ``k`` stored on ``model`` and the same
        inverse-distance weighting as its prediction rule, so the score ranks
        test points by the classifier's own evidence.
        """
        dists = np.sqrt(np.sum((np.asarray(model.X_train) - x) ** 2, axis=1))
        nearest = np.argsort(dists)[:model.k]
        weights = 1 / (dists[nearest] + 1e-5)
        is_positive = np.asarray(model.y_train)[nearest] == positive_label
        return weights[is_positive].sum() / weights.sum()

    # The ROC score must come from the classifier being evaluated. A quantity
    # unrelated to the model (for example the distance of a point to the
    # training mean) yields a curve that says nothing about the classifier.
    y_prob = np.array([positive_vote_share(x) for x in X_test])
    fpr, tpr, thresholds = roc_curve(y_test, y_prob, pos_label=positive_label)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(8,6))
    plt.plot(fpr, tpr, label=f'ROC curve (area = {roc_auc:.2f})')
    # Diagonal = performance of a random classifier
    plt.plot([0,1],[0,1],'--', color='grey')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend()
    plt.show()
    print(f"AUC Score: {roc_auc:.4f}")
else:
    print("ROC/AUC is only for binary classification")
