# Classification Analysis: MAGIC Gamma Telescope
## Problem Statement 1 - Complete Implementation

**Student:** [Your Name]  
**Course:** Introduction to Machine Learning  
**Date:** [Current Date]

## 1. Introduction

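This notebook applies K-Nearest Neighbors (K-NN) classification to the MAGIC Gamma Telescope dataset to distinguish gamma particles (signal) from hadrons (background). A hand-written K-NN classifier is compared against scikit-learn's `KNeighborsClassifier`: the value of k is selected on a validation split, and both models are then evaluated on a held-out test split.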

## 2. Setup and Imports

In [None]:
import sys
import os
sys.path.append('../src')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

plt.style.use('default')
sns.set_palette("husl")
%matplotlib inline

print("All libraries imported successfully!")

## 3. Data Loading and Exploration

In [None]:
# Read the MAGIC data file. Column names are passed explicitly via `names=`,
# so pandas treats every line of the file as a data row.
file_path = "../data/magic_gamma_telescope/magic04.data"
column_names = [
    'fLength', 'fWidth', 'fSize', 'fConc', 'fConc1',
    'fAsym', 'fM3Long', 'fM3Trans', 'fAlpha', 'fDist',
    'class',
]
data = pd.read_csv(file_path, names=column_names)

# One print call, one item per output line: shape, then per-class row counts.
print(
    "📊 MAGIC GAMMA TELESCOPE DATASET",
    "=" * 50,
    f"Dataset shape: {data.shape}",
    "\nClass distribution:",
    data['class'].value_counts(),
    "\nFirst 5 rows:",
    sep="\n",
)
# Last expression of the cell: rendered as a rich table by the notebook.
data.head()

## 4. Data Preprocessing

In [None]:
# Balance the dataset by undersampling: both classes are reduced to the size
# of the smaller one, using a fixed seed so the selection is reproducible.
is_gamma = data['class'] == 'g'
is_hadron = data['class'] == 'h'
class_g = data[is_gamma]
class_h = data[is_hadron]

min_size = min(len(class_g), len(class_h))
class_g_balanced, class_h_balanced = (
    subset.sample(n=min_size, random_state=42) for subset in (class_g, class_h)
)

# Rows are stacked gamma-first; they keep their original index labels.
balanced_data = pd.concat([class_g_balanced, class_h_balanced])

print(
    "🔧 DATA BALANCING",
    "=" * 50,
    f"Original sizes - Gamma: {len(class_g)}, Hadron: {len(class_h)}",
    f"Balanced sizes - Gamma: {len(class_g_balanced)}, Hadron: {len(class_h_balanced)}",
    "\nBalanced class distribution:",
    balanced_data['class'].value_counts(),
    sep="\n",
)

## 5. Manual K-NN Implementation

In [None]:
class ManualKNN:
    """Brute-force k-nearest-neighbours classifier using Euclidean distance.

    The training data is simply memorised by ``fit``. Each prediction
    computes the distance from the query point to every training point,
    takes the ``k`` closest ones and returns their majority label. When
    several labels are tied for the majority, the smallest label (in sorted
    order) wins.

    Parameters
    ----------
    k : int, default=5
        Number of neighbours that vote on each prediction. Must be >= 1.
        If ``k`` exceeds the number of training samples, every training
        sample votes.
    """

    def __init__(self, k=5):
        self.k = k

    def euclidean_distance(self, x1, x2):
        """Return the Euclidean (L2) distance between two feature vectors."""
        return np.sqrt(np.sum((x1 - x2) ** 2))

    def fit(self, X, y):
        """Memorise the training set.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features (NumPy array, nested list or DataFrame).
        y : array-like of shape (n_samples,)
            Training labels; any dtype that ``np.unique`` can sort.

        Returns
        -------
        ManualKNN
            ``self``, so that calls can be chained.

        Raises
        ------
        ValueError
            If ``k`` is not a positive integer, ``X`` is not 2-D, or ``X``
            and ``y`` have different lengths.
        """
        if not isinstance(self.k, (int, np.integer)) or self.k < 1:
            raise ValueError(f"k must be a positive integer, got {self.k!r}")

        # Convert to plain arrays so that later positional indexing is not
        # affected by a pandas index on the inputs.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got shape {X.shape}")
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X and y have different lengths: {X.shape[0]} != {y.shape[0]}"
            )

        self.X_train = X
        self.y_train = y
        return self

    def predict(self, X):
        """Predict a label for every row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_queries, n_features)

        Returns
        -------
        numpy.ndarray of shape (n_queries,)
            Predicted labels, with the same dtype as the training labels.
        """
        X = np.asarray(X, dtype=float)
        predictions = [self._predict_single(x) for x in X]
        return np.array(predictions, dtype=self.y_train.dtype)

    def _predict_single(self, x):
        """Return the majority label among the ``k`` nearest training points."""
        # One vectorised pass over the whole training set instead of one
        # Python-level distance call per training row.
        distances = np.sqrt(np.sum((self.X_train - x) ** 2, axis=1))
        # A stable sort makes the choice among equidistant points
        # deterministic: the earlier training row wins.
        k_indices = np.argsort(distances, kind="stable")[:self.k]
        # np.unique returns the labels sorted, and argmax returns the first
        # maximum, so ties go to the smallest label. It also works for
        # labels that are not non-negative integers (e.g. strings).
        labels, counts = np.unique(self.y_train[k_indices], return_counts=True)
        return labels[np.argmax(counts)]

    def score(self, X, y):
        """Return the fraction of rows of ``X`` whose prediction equals ``y``."""
        predictions = self.predict(X)
        return np.mean(predictions == np.asarray(y))

print("✅ Manual K-NN class defined successfully!")

## 6. Model Training and Evaluation

In [None]:
# Prepare data: feature matrix and raw class labels as NumPy arrays
X = balanced_data.drop('class', axis=1).values
y = balanced_data['class'].values

# Encode the string labels as integers (the mapping is stored in le.classes_)
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Split data (70% train, 15% validation, 15% test).
# Both fractions are expressed relative to the FULL dataset.
TEST_FRACTION = 0.15
VAL_FRACTION = 0.15

# Step 1: carve off the test set.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y_encoded, test_size=TEST_FRACTION, random_state=42, stratify=y_encoded
)

# Step 2: the second split only sees the remaining (1 - TEST_FRACTION) of the
# data, so the validation share has to be rescaled to stay at VAL_FRACTION of
# the full dataset. Deriving it here replaces the hand-rounded 0.176.
val_share_of_remainder = VAL_FRACTION / (1.0 - TEST_FRACTION)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=val_share_of_remainder, random_state=42, stratify=y_temp
)

# Every sample must land in exactly one of the three splits.
assert len(X_train) + len(X_val) + len(X_test) == len(X)

# Standardize features. The scaler is fitted on the training split only, so no
# statistics from the validation or test data leak into the model.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

print("📊 DATA SPLITTING RESULTS")
print("=" * 50)
print(f"Training set: {X_train.shape[0]} samples")
print(f"Validation set: {X_val.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")

## 7. K Value Tuning

In [None]:
def tune_k(make_model, k_values, X_fit, y_fit, X_eval, y_eval):
    """Fit one classifier per candidate k and report its validation accuracy.

    Parameters
    ----------
    make_model : callable
        Called as ``make_model(k)``; must return an unfitted classifier that
        provides ``fit(X, y)`` and ``score(X, y)``.
    k_values : iterable of int
        Candidate neighbour counts, evaluated in the given order.
    X_fit, y_fit
        Data the classifier is fitted on.
    X_eval, y_eval
        Data the accuracy is measured on.

    Returns
    -------
    list
        One accuracy per entry of ``k_values``, in the same order.
    """
    accuracies = []
    for k in k_values:
        model = make_model(k)
        # fit() is called as a statement, not chained, so the helper also
        # works for models whose fit() returns None.
        model.fit(X_fit, y_fit)
        val_acc = model.score(X_eval, y_eval)
        accuracies.append(val_acc)
        print(f"k={k}: Validation Accuracy = {val_acc:.4f}")
    return accuracies


# Test different k values
k_values = [1, 3, 5, 7, 9, 11, 13, 15]

print("🔍 K VALUE TUNING - MANUAL K-NN")
print("=" * 40)
manual_val_accuracies = tune_k(
    lambda k: ManualKNN(k=k),
    k_values, X_train_scaled, y_train, X_val_scaled, y_val,
)

print("\n🔍 K VALUE TUNING - SCIKIT-LEARN K-NN")
print("=" * 45)
sklearn_val_accuracies = tune_k(
    lambda k: KNeighborsClassifier(n_neighbors=k),
    k_values, X_train_scaled, y_train, X_val_scaled, y_val,
)

# Find best k values. np.argmax returns the first maximum, so when several k
# share the top validation accuracy the smallest of them is selected.
best_k_manual = k_values[np.argmax(manual_val_accuracies)]
best_k_sklearn = k_values[np.argmax(sklearn_val_accuracies)]
print(f"\n✅ Best k - Manual: {best_k_manual}, Scikit-Learn: {best_k_sklearn}")

## 8. Results Visualization

In [None]:
# Validation accuracy as a function of k, one curve per implementation,
# drawn through the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(12, 6))

curves = (
    (manual_val_accuracies, 'bo-', 'Manual K-NN'),
    (sklearn_val_accuracies, 'ro-', 'Scikit-Learn K-NN'),
)
for accuracies, line_format, curve_label in curves:
    ax.plot(k_values, accuracies, line_format, label=curve_label, linewidth=2, markersize=8)

ax.set_xlabel('K Value')
ax.set_ylabel('Validation Accuracy')
ax.set_title('K-NN: Validation Accuracy vs K Value')
ax.legend()
ax.grid(True, alpha=0.3)
ax.set_xticks(k_values)  # one tick per evaluated k
fig.tight_layout()
plt.show()

## 9. Final Model Evaluation

In [None]:
# Refit each classifier on the training split with the k that scored best on
# the validation split, then predict the held-out test split.

# Hand-written implementation
final_manual_knn = ManualKNN(k=best_k_manual)
final_manual_knn.fit(X_train_scaled, y_train)
y_pred_manual = final_manual_knn.predict(X_test_scaled)

# scikit-learn reference implementation
final_sklearn_knn = KNeighborsClassifier(n_neighbors=best_k_sklearn)
final_sklearn_knn.fit(X_train_scaled, y_train)
y_pred_sklearn = final_sklearn_knn.predict(X_test_scaled)

# Evaluation metrics
def evaluate_model(y_true, y_pred, model_name):
    """Print and return classification metrics for one set of predictions.

    Precision, recall and F1 are computed with ``average='weighted'`` and
    ``zero_division=0``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.
    model_name : str
        Name shown in the printed report header.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1, confusion_matrix)``.
    """
    weighted = {'average': 'weighted', 'zero_division': 0}

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, **weighted)
    recall = recall_score(y_true, y_pred, **weighted)
    f1 = f1_score(y_true, y_pred, **weighted)
    cm = confusion_matrix(y_true, y_pred)

    print(f"\n📊 {model_name} RESULTS")
    print("-" * 30)
    # Labels are left-aligned in an 11-character column so the values line up.
    report_rows = (
        ("Accuracy:", accuracy),
        ("Precision:", precision),
        ("Recall:", recall),
        ("F1-Score:", f1),
    )
    for metric_label, metric_value in report_rows:
        print(f"{metric_label:<11}{metric_value:.4f}")
    print("\nConfusion Matrix:")
    print(cm)

    return accuracy, precision, recall, f1, cm

# Score both implementations on the same test labels.
manual_metrics = evaluate_model(y_test, y_pred_manual, "MANUAL K-NN")
sklearn_metrics = evaluate_model(y_test, y_pred_sklearn, "SCIKIT-LEARN K-NN")

## 10. Key Findings and Conclusions

In [None]:
# Test accuracy is the first element of each metrics tuple.
manual_accuracy = manual_metrics[0]
sklearn_accuracy = sklearn_metrics[0]
# The absolute accuracy gap is computed once and reused below.
difference = abs(manual_accuracy - sklearn_accuracy)

# Grade the agreement between the two implementations by accuracy gap.
if difference >= 0.05:
    validation_verdict = "   • ⚠️  ACCEPTABLE: Some difference between implementations"
elif difference >= 0.01:
    validation_verdict = "   • ✅ GOOD: Manual implementation is very close to scikit-learn!"
else:
    validation_verdict = "   • ✅ EXCELLENT: Manual implementation matches scikit-learn!"

summary_lines = [
    "🔍 KEY FINDINGS AND INSIGHTS",
    "=" * 50,
    "\n1. OPTIMAL K VALUE:",
    f"   • Manual K-NN: k={best_k_manual}",
    f"   • Scikit-Learn K-NN: k={best_k_sklearn}",
    "\n2. MODEL PERFORMANCE:",
    f"   • Best Accuracy: {max(manual_accuracy, sklearn_accuracy):.4f}",
    f"   • Manual vs Scikit-Learn difference: {difference:.6f}",
    "\n3. IMPLEMENTATION VALIDATION:",
    validation_verdict,
    "\n4. DATASET CHARACTERISTICS:",
    f"   • Balanced dataset: {len(balanced_data)} samples",
    f"   • Features: {X.shape[1]}",
    "   • Classes: Gamma vs Hadron",
    "\n✅ CLASSIFICATION ANALYSIS COMPLETED SUCCESSFULLY!",
]
print("\n".join(summary_lines))