In [1]:
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
small_data_dir = '/content/drive/My Drive/CS205_small_Data'
large_data_dir = '/content/drive/My Drive/CS205_large_Data'

In [4]:
def load_and_normalize_dataset(file_path):
    data = np.loadtxt(file_path)
    X = data[:, 1:]
    y = data[:, 0].astype(int)
    X = (X - np.mean(X, axis=0)) / np.std(X, axis=0)  # Normalize features
    return X, y

# Function to normalize features
def normalize_features(X):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return X_scaled

In [5]:
#Implement the Nearest Neighbor Classifier
def nearest_neighbor(X_train, y_train, X_test):
    predictions = []
    for test_instance in X_test:
        distances = np.linalg.norm(X_train - test_instance, axis=1)
        nearest_index = np.argmin(distances)
        predictions.append(y_train[nearest_index])
    return np.array(predictions)

In [6]:
#  Feature Selection Methods
def forward_selection(X, y):
    n_features = X.shape[1]
    selected_features = []
    best_accuracy = 0

    for _ in range(n_features):
        best_feature = None
        for feature in range(n_features):
            if feature in selected_features:
                continue
            features_to_test = selected_features + [feature]
            accuracy = cross_val_accuracy(X[:, features_to_test], y)
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_feature = feature
        if best_feature is not None:
            selected_features.append(best_feature)
    return selected_features

def backward_elimination(X, y):
    n_features = X.shape[1]
    selected_features = list(range(n_features))
    best_accuracy = cross_val_accuracy(X, y)

    for _ in range(n_features):
        worst_feature = None
        for feature in selected_features:
            features_to_test = [f for f in selected_features if f != feature]
            accuracy = cross_val_accuracy(X[:, features_to_test], y)
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                worst_feature = feature
        if worst_feature is not None:
            selected_features.remove(worst_feature)
    return selected_features

In [7]:
#  Evaluate Performance
def cross_val_accuracy(X, y, k=5):
    accuracies = []
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    y_pred = nearest_neighbor(X_train, y_train, X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)
    return np.mean(accuracies)

In [11]:
file_name = input("Type in the file name to test: ")

file_path = None
small_file_path = os.path.join(small_data_dir, file_name)
large_file_path = os.path.join(large_data_dir, file_name)

if os.path.exists(small_file_path):
    file_path = small_file_path
elif os.path.exists(large_file_path):
    file_path = large_file_path
else:
    print("File not found in the specified directories.")
    exit()


# Load and normalize the dataset
if file_path:
    X, y = load_and_normalize_dataset(file_path)

    print("Evaluating Nearest Neighbor Classifier on Original Data")
    original_accuracy = cross_val_accuracy(X, y)
    print(f'Original Accuracy: {original_accuracy:.4f}')

    # Perform Feature Selection and Evaluate
    print("\nPerforming Feature Selection and Evaluating")

Type in the file name to test: CS205_small_Data__19.txt
Evaluating Nearest Neighbor Classifier on Original Data
Original Accuracy: 0.7500

Performing Feature Selection and Evaluating


In [12]:
# Forward Selection
selected_features_forward = forward_selection(X, y)
accuracy_forward = cross_val_accuracy(X[:, selected_features_forward], y)
print(f'Forward Selection: Features: {selected_features_forward}, Accuracy: {accuracy_forward:.4f}')

# Backward Elimination
selected_features_backward = backward_elimination(X, y)
accuracy_backward = cross_val_accuracy(X[:, selected_features_backward], y)
print(f'Backward Elimination: Features: {selected_features_backward}, Accuracy: {accuracy_backward:.4f}')

Forward Selection: Features: [5, 8, 0], Accuracy: 0.9100
Backward Elimination: Features: [0, 1, 2, 4, 5, 6, 8, 11], Accuracy: 0.7400
