In [8]:
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [9]:
# Step 1: Organize and Load the Datasets
from google.colab import drive
drive.mount('/content/drive')

# Directory containing the datasets
small_data_dir = '/content/drive/My Drive/CS205_small_Data'
large_data_dir = '/content/drive/My Drive/CS205_large_Data'

# List of all small and large dataset files
small_files = [os.path.join(small_data_dir, f) for f in os.listdir(small_data_dir) if f.endswith('.txt')]
large_files = [os.path.join(large_data_dir, f) for f in os.listdir(large_data_dir) if f.endswith('.txt')]

# Function to load and normalize a dataset
def load_and_normalize_dataset(file_path):
    data = np.loadtxt(file_path)
    X = data[:, 1:]
    y = data[:, 0].astype(int)
    X = (X - np.mean(X, axis=0)) / np.std(X, axis=0)  # Normalize features
    return X, y

# Load all small datasets
small_datasets = []
for file in small_files:
    X, y = load_and_normalize_dataset(file)
    small_datasets.append((X, y))

# Load all large datasets
large_datasets = []
for file in large_files:
    X, y = load_and_normalize_dataset(file)
    large_datasets.append((X, y))

print(f'Loaded {len(small_datasets)} small datasets and {len(large_datasets)} large datasets.')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loaded 50 small datasets and 50 large datasets.


In [10]:
def normalize_features(X):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return X_scaled

In [11]:
# Step 2: Implement the Nearest Neighbor Classifier
def nearest_neighbor(X_train, y_train, X_test):
    predictions = []
    for test_instance in X_test:
        distances = np.linalg.norm(X_train - test_instance, axis=1)
        nearest_index = np.argmin(distances)
        predictions.append(y_train[nearest_index])
    return np.array(predictions)

In [12]:
# Step 3: Feature Selection Methods
def forward_selection(X, y):
    n_features = X.shape[1]
    selected_features = []
    best_accuracy = 0

    for _ in range(n_features):
        best_feature = None
        for feature in range(n_features):
            if feature in selected_features:
                continue
            features_to_test = selected_features + [feature]
            accuracy = cross_val_accuracy(X[:, features_to_test], y)
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_feature = feature
        if best_feature is not None:
            selected_features.append(best_feature)
    return selected_features

def backward_elimination(X, y):
    n_features = X.shape[1]
    selected_features = list(range(n_features))
    best_accuracy = cross_val_accuracy(X, y)

    for _ in range(n_features):
        worst_feature = None
        for feature in selected_features:
            features_to_test = [f for f in selected_features if f != feature]
            accuracy = cross_val_accuracy(X[:, features_to_test], y)
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                worst_feature = feature
        if worst_feature is not None:
            selected_features.remove(worst_feature)
    return selected_features

In [13]:
# Step 4: Evaluate Performance
def cross_val_accuracy(X, y, k=5):
    accuracies = []
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    y_pred = nearest_neighbor(X_train, y_train, X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)
    return np.mean(accuracies)

In [None]:
# Evaluate Nearest Neighbor Classifier on Original Data
print("Evaluating Nearest Neighbor Classifier on Original Data")
for i, (X, y) in enumerate(small_datasets):
    accuracy = cross_val_accuracy(X, y)
    print(f'Small Dataset {i+1} Original Accuracy: {accuracy:.4f}')

for i, (X, y) in enumerate(large_datasets):
    accuracy = cross_val_accuracy(X, y)
    print(f'Large Dataset {i+1} Original Accuracy: {accuracy:.4f}')

# Perform Feature Selection and Evaluate
print("\nPerforming Feature Selection and Evaluating")

# For Small Datasets
for i, (X, y) in enumerate(small_datasets):
    print(f'\nSmall Dataset {i+1}:')

    # Forward Selection
    selected_features_forward = forward_selection(X, y)
    accuracy_forward = cross_val_accuracy(X[:, selected_features_forward], y)
    print(f'Forward Selection: Features: {selected_features_forward}, Accuracy: {accuracy_forward:.4f}')

    # Backward Elimination
    selected_features_backward = backward_elimination(X, y)
    accuracy_backward = cross_val_accuracy(X[:, selected_features_backward], y)
    print(f'Backward Elimination: Features: {selected_features_backward}, Accuracy: {accuracy_backward:.4f}')

# For Large Datasets
for i, (X, y) in enumerate(large_datasets):
    print(f'\nLarge Dataset {i+1}:')

    # Forward Selection
    selected_features_forward = forward_selection(X, y)
    accuracy_forward = cross_val_accuracy(X[:, selected_features_forward], y)
    print(f'Forward Selection: Features: {selected_features_forward}, Accuracy: {accuracy_forward:.4f}')

    # Backward Elimination
    selected_features_backward = backward_elimination(X, y)
    accuracy_backward = cross_val_accuracy(X[:, selected_features_backward], y)
    print(f'Backward Elimination: Features: {selected_features_backward}, Accuracy: {accuracy_backward:.4f}')


Evaluating Nearest Neighbor Classifier on Original Data
Small Dataset 1 Original Accuracy: 0.7400
Small Dataset 2 Original Accuracy: 0.6600
Small Dataset 3 Original Accuracy: 0.7000
Small Dataset 4 Original Accuracy: 0.8000
Small Dataset 5 Original Accuracy: 0.7200
Small Dataset 6 Original Accuracy: 0.7600
Small Dataset 7 Original Accuracy: 0.7200
Small Dataset 8 Original Accuracy: 0.6500
Small Dataset 9 Original Accuracy: 0.7400
Small Dataset 10 Original Accuracy: 0.6000
Small Dataset 11 Original Accuracy: 0.6900
Small Dataset 12 Original Accuracy: 0.7000
Small Dataset 13 Original Accuracy: 0.7100
Small Dataset 14 Original Accuracy: 0.6800
Small Dataset 15 Original Accuracy: 0.5800
Small Dataset 16 Original Accuracy: 0.7600
Small Dataset 17 Original Accuracy: 0.7400
Small Dataset 18 Original Accuracy: 0.8100
Small Dataset 19 Original Accuracy: 0.7300
Small Dataset 20 Original Accuracy: 0.7200
Small Dataset 21 Original Accuracy: 0.7600
Small Dataset 22 Original Accuracy: 0.7100
Small D