In [1]:
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [2]:
# Step 1: Organize and Load the Datasets
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Directories containing the datasets
small_data_dir = '/content/drive/My Drive/CS205_small_Data'
large_data_dir = '/content/drive/My Drive/CS205_large_Data'

In [4]:
# Function to load and normalize a dataset
def load_and_normalize_dataset(file_path):
    data = np.loadtxt(file_path)
    X = data[:, 1:]
    y = data[:, 0].astype(int)
    X = (X - np.mean(X, axis=0)) / np.std(X, axis=0)  # Normalize features
    return X, y

# Function to normalize features
def normalize_features(X):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return X_scaled

In [5]:
# Implement the Nearest Neighbor Classifier
def nearest_neighbor(X_train, y_train, X_test):
    predictions = []
    for test_instance in X_test:
        distances = np.linalg.norm(X_train - test_instance, axis=1)
        nearest_index = np.argmin(distances)
        predictions.append(y_train[nearest_index])
    return np.array(predictions)

In [6]:
# Feature Selection Methods
def forward_selection(X, y):
    n_features = X.shape[1]
    selected_features = []
    best_accuracy = 0

    print("Beginning search.\n")

    for _ in range(n_features):
        best_feature = None
        for feature in range(n_features):
            if feature in selected_features:
                continue
            features_to_test = selected_features + [feature]
            accuracy = cross_val_accuracy(X[:, features_to_test], y)
            print(f"Using feature(s) {features_to_test} accuracy is {accuracy * 100:.1f}%")
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_feature = feature
        if best_feature is not None:
            selected_features.append(best_feature)
            print(f"Feature set {selected_features} was best, accuracy is {best_accuracy * 100:.1f}%\n")

    return selected_features

def backward_elimination(X, y):
    n_features = X.shape[1]
    selected_features = list(range(n_features))
    best_accuracy = cross_val_accuracy(X, y)

    for _ in range(n_features):
        worst_feature = None
        for feature in selected_features:
            features_to_test = [f for f in selected_features if f != feature]
            accuracy = cross_val_accuracy(X[:, features_to_test], y)
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                worst_feature = feature
        if worst_feature is not None:
            selected_features.remove(worst_feature)
    return selected_features

In [7]:
# Evaluate Performance
def cross_val_accuracy(X, y, k=5):
    accuracies = []
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    y_pred = nearest_neighbor(X_train, y_train, X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)
    return np.mean(accuracies)

In [10]:
# Prompt the user to input the file name
file_name = input("Type in the file name to test: ")

# Check if the file exists in the specified directories
small_data_dir = '/content/drive/My Drive/CS205_small_Data'
large_data_dir = '/content/drive/My Drive/CS205_large_Data'

file_path = None
small_file_path = os.path.join(small_data_dir, file_name)
large_file_path = os.path.join(large_data_dir, file_name)

if os.path.exists(small_file_path):
    file_path = small_file_path
elif os.path.exists(large_file_path):
    file_path = large_file_path
else:
    print("File not found in the specified directories.")
    exit()

# Load and normalize the dataset
if file_path:
    X, y = load_and_normalize_dataset(file_path)

    print(f"This dataset has {X.shape[1]} features (not including the class attribute), with {X.shape[0]} instances.\n")
    print(f"Running nearest neighbor with all {X.shape[1]} features, using 'leaving-one-out' evaluation, I get an accuracy of {cross_val_accuracy(X, y) * 100:.1f}%\n")

    # Perform Feature Selection and Evaluate
    print("Beginning search.\n")

    # Forward Selection
    print("Forward Selection")
    selected_features_forward = forward_selection(X, y)
    accuracy_forward = cross_val_accuracy(X[:, selected_features_forward], y)
    print(f"Finished search!! The best feature subset is {selected_features_forward}, which has an accuracy of {accuracy_forward * 100:.1f}%\n")

    # Backward Elimination
    print("Backward Elimination")
    selected_features_backward = backward_elimination(X, y)
    accuracy_backward = cross_val_accuracy(X[:, selected_features_backward], y)
    print(f"Finished search!! The best feature subset is {selected_features_backward}, which has an accuracy of {accuracy_backward * 100:.1f}%\n")

Type in the file name to test: CS205_small_Data__5.txt
This dataset has 12 features (not including the class attribute), with 500 instances.

Running nearest neighbor with all 12 features, using 'leaving-one-out' evaluation, I get an accuracy of 70.0%

Beginning search.

Forward Selection
Beginning search.

Using feature(s) [0] accuracy is 68.0%
Using feature(s) [1] accuracy is 73.0%
Using feature(s) [2] accuracy is 64.0%
Using feature(s) [3] accuracy is 69.0%
Using feature(s) [4] accuracy is 57.0%
Using feature(s) [5] accuracy is 74.0%
Using feature(s) [6] accuracy is 87.0%
Using feature(s) [7] accuracy is 70.0%
Using feature(s) [8] accuracy is 56.0%
Using feature(s) [9] accuracy is 59.0%
Using feature(s) [10] accuracy is 64.0%
Using feature(s) [11] accuracy is 64.0%
Feature set [6] was best, accuracy is 87.0%

Using feature(s) [6, 0] accuracy is 81.0%
Using feature(s) [6, 1] accuracy is 73.0%
Using feature(s) [6, 2] accuracy is 82.0%
Using feature(s) [6, 3] accuracy is 77.0%
Using fe