# In [None]:
# Step 1: Load and Prepare the Dataset
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
import matplotlib.pyplot as plt

# Load the Wine dataset bundle; X is the feature matrix, y the class labels
wine = load_wine()
X, y = wine.data, wine.target

# Tabular view (feature columns plus a 'target' column) for exploration
df = pd.DataFrame(X, columns=wine.feature_names).assign(target=y)

# Quick look at the size, a few sample rows and the per-class counts
print("Dataset shape:", df.shape)
print("\nFirst 5 rows:\n", df.head())
print("\nClass distribution:\n", df['target'].value_counts())

# Hold out 20% of the samples for testing. The split is stratified on y and
# uses a fixed seed so that it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit the scaler on the training split only, then apply that same transform
# to both splits so the test data never influences the scaling statistics
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 2: Implement K-Nearest Neighbors (KNN)
# For each neighbor count, fit a classifier on the standardized training split
# and record its accuracy on the standardized test split.
k_values = [1, 5, 11, 15, 21]
knn_accuracies = []

for n_neighbors in k_values:
    classifier = KNeighborsClassifier(n_neighbors=n_neighbors)
    classifier = classifier.fit(X_train_scaled, y_train)
    test_predictions = classifier.predict(X_test_scaled)
    test_accuracy = accuracy_score(y_test, test_predictions)
    print(f"KNN - K = {n_neighbors}, Accuracy = {test_accuracy:.4f}")
    knn_accuracies.append(test_accuracy)

# Step 3: Implement Radius Neighbors (RNN)
# For each radius, fit a radius-neighbors classifier and record its accuracy
# on the test split.
#
# NOTE: the radii below are in the hundreds. StandardScaler rescales every
# feature to unit variance, so on the scaled data these radii exceed any
# distance between two samples: every training point becomes a neighbor of
# every test point and the classifier returns the training majority class for
# all radii. The sweep is therefore run on the original, unscaled features
# (X_train / X_test), the only feature space in which radii of this size can
# select a subset of neighbors.
radius_values = [350, 400, 450, 500, 550, 600]
rnn_accuracies = []

for r in radius_values:
    # Test points with no training sample within r are labelled -1
    rnn = RadiusNeighborsClassifier(radius=r, outlier_label=-1)
    rnn.fit(X_train, y_train)
    y_pred = rnn.predict(X_test)
    # Exclude outliers (-1) from accuracy calculation: the score only covers
    # test points that had at least one neighbor, and is 0 if none did
    valid_idx = y_pred != -1
    acc = accuracy_score(y_test[valid_idx], y_pred[valid_idx]) if valid_idx.any() else 0
    rnn_accuracies.append(acc)
    print(f"RNN - Radius = {r}, Accuracy = {acc:.4f}")

# Step 4: Visualize and Compare Results
# One figure with two side-by-side panels: KNN accuracy against k on the
# left, RNN accuracy against radius on the right.
fig, (knn_ax, rnn_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: KNN accuracy per neighbor count
knn_ax.plot(k_values, knn_accuracies, marker='o')
knn_ax.set_title('KNN Accuracy vs. K Value')
knn_ax.set_xlabel('Number of Neighbors (k)')
knn_ax.set_ylabel('Accuracy')
knn_ax.grid(True)

# Right panel: RNN accuracy per radius, drawn in orange to tell it apart
rnn_ax.plot(radius_values, rnn_accuracies, marker='o', color='orange')
rnn_ax.set_title('RNN Accuracy vs. Radius Value')
rnn_ax.set_xlabel('Radius')
rnn_ax.set_ylabel('Accuracy')
rnn_ax.grid(True)

fig.tight_layout()
plt.show()

# Step 5: Performance Summary Table
# The KNN and RNN sweeps do not have the same number of settings, and
# pd.DataFrame raises ValueError when given plain lists of unequal length.
# Wrapping each column in a Series lets pandas align the columns on the row
# index and pad the shorter ones with missing values instead.
summary_df = pd.DataFrame({
    # Nullable integer dtype keeps the settings displayed as integers even
    # when a column has to be padded with missing values
    'KNN_k': pd.Series(k_values, dtype='Int64'),
    'KNN_Accuracy': pd.Series(knn_accuracies, dtype='float64'),
    'RNN_Radius': pd.Series(radius_values, dtype='Int64'),
    'RNN_Accuracy': pd.Series(rnn_accuracies, dtype='float64'),
})

print("\nPerformance Summary Table:")
print(summary_df)

# Step 6: Discussion (Markdown cell recommended in notebook)
# The write-up is assembled from one literal per output line; the text begins
# with a blank line and ends with a trailing newline.
discussion = (
    "\n"
    "Observations:\n"
    "- KNN accuracy stabilizes as k increases. Small k may overfit, large k can generalize better.\n"
    "- RNN accuracy depends heavily on radius. Too small: no neighbors; too large: noise impacts accuracy.\n"
    "- KNN generally provides more stable results; RNN is sensitive to radius selection.\n"
    "\n"
    "When to Use:\n"
    "- KNN is preferred for evenly distributed, dense datasets.\n"
    "- RNN can be used for datasets with variable density, but requires careful radius tuning.\n"
)
print(discussion)