# Name: Aparna Vivek Sarawadekar
# AI Project Phase 2: Feature Selection with Nearest Neighbor

In [1]:
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [2]:
# Step 1: We will upload the dataset from google drive
from google.colab import drive
drive.mount('/content/drive')

# Directories containing the datasets
small_data_dir = '/content/drive/My Drive/CS205_small_Data'
large_data_dir = '/content/drive/My Drive/CS205_large_Data'

Mounted at /content/drive


In [3]:
# Step 2: we will load and normalize a dataset
def load_and_normalize_dataset(file_path):
    data = np.loadtxt(file_path)
    # Separating features (X) and target labels (y)
    X = data[:, 1:]
    y = data[:, 0].astype(int)
    X = (X - np.mean(X, axis=0)) / np.std(X, axis=0)  # Normalizinf features by subtracting mean and dividing by standard deviation
    return X, y

# Function to normalize features
def normalize_features(X):
    scaler = StandardScaler()  #initializing standard scalar
    X_scaled = scaler.fit_transform(X)  #fit and transform
    return X_scaled

In [4]:
# Step 3: Nearest Neighbor Classifier
def nearest_neighbor(X_train, y_train, X_test):
    predictions = []  #stores predicted labels for test cases
    for test_instance in X_test:
        distances = np.linalg.norm(X_train - test_instance, axis=1)   # Calculating Euclidean distance between test cases and training cases
        nearest_index = np.argmin(distances) #finding distance of closest training case
        predictions.append(y_train[nearest_index]) #add the label of the above test case to the prediction list created
    return np.array(predictions)

In [5]:
# Step 4: Feature Selection Method 1: Forward Selection
def forward_selection(X, y):
    n_features = X.shape[1]  #no. of features
    selected_features = []
    best_accuracy = 0

    print("Beginning search.\n")

    for _ in range(n_features):
        best_feature = None
        for feature in range(n_features):
            if (feature + 1) in selected_features: # Skip the feature if it is already selected
                continue
            features_to_test = selected_features + [feature + 1] # Creating a list of features to test, including current feature
            accuracy = cross_val_accuracy(X[:, [f - 1 for f in features_to_test]], y)
            print(f"Using feature(s) {features_to_test} accuracy is {accuracy * 100:.1f}%")
            #updating best accuracy
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_feature = feature + 1
        if best_feature is not None:
            selected_features.append(best_feature)
            print(f"Feature set {selected_features} was best, accuracy is {best_accuracy * 100:.1f}%\n")

    return selected_features

In [6]:
# Step 5: Feature Selection Method 2: Backward Elimination
def backward_elimination(X, y):
    n_features = X.shape[1]
    selected_features = list(range(1, n_features + 1))
    best_accuracy = cross_val_accuracy(X, y)

    for _ in range(n_features):
        worst_feature = None
        for feature in selected_features:
            features_to_test = [f for f in selected_features if f != feature]   #list of features to test excluding current feature
            accuracy = cross_val_accuracy(X[:, [f - 1 for f in features_to_test]], y)
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                worst_feature = feature
        if worst_feature is not None:
            selected_features.remove(worst_feature)
    return selected_features

In [7]:
# Step 6: Evaluate Performance
def cross_val_accuracy(X, y, k=5):
    accuracies = []
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    y_pred = nearest_neighbor(X_train, y_train, X_test)  #predicting labels using NN classifiers
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)
    return np.mean(accuracies)

In [None]:
file_name = input("Type in the file name to test: ")
small_file_path = os.path.join(small_data_dir, file_name)
large_file_path = os.path.join(large_data_dir, file_name)

if os.path.exists(small_file_path):
    file_path = small_file_path
elif os.path.exists(large_file_path):
    file_path = large_file_path
else:
    print("File not found in the specified directories.")
    exit()

# Load and normalize the dataset
if file_path:
    X, y = load_and_normalize_dataset(file_path)

    print(f"This dataset has {X.shape[1]} features (not including the class attribute), with {X.shape[0]} instances.\n")
    print(f"Running nearest neighbor with all {X.shape[1]} features, using 'leaving-one-out' evaluation, I get an accuracy of {cross_val_accuracy(X, y) * 100:.1f}%\n")
    print("Beginning search.\n")

    # Forward Selection
    print("Forward Selection")
    selected_features_forward = forward_selection(X, y)
    accuracy_forward = cross_val_accuracy(X[:, [f - 1 for f in selected_features_forward]], y)
    print(f"Finished search!! The best feature subset is {selected_features_forward}, which has an accuracy of {accuracy_forward * 100:.1f}%\n")

    # Backward Elimination
    print("Backward Elimination")
    selected_features_backward = backward_elimination(X, y)
    accuracy_backward = cross_val_accuracy(X[:, [f - 1 for f in selected_features_backward]], y)
    print(f"Finished search!! The best feature subset is {selected_features_backward}, which has an accuracy of {accuracy_backward * 100:.1f}%\n")

Type in the file name to test: CS205_large_Data__27.txt
This dataset has 50 features (not including the class attribute), with 5000 instances.

Running nearest neighbor with all 50 features, using 'leaving-one-out' evaluation, I get an accuracy of 67.8%

Beginning search.

Forward Selection
Beginning search.

Using feature(s) [1] accuracy is 69.8%
Using feature(s) [2] accuracy is 69.2%
Using feature(s) [3] accuracy is 73.8%
Using feature(s) [4] accuracy is 69.9%
Using feature(s) [5] accuracy is 72.5%
Using feature(s) [6] accuracy is 68.8%
Using feature(s) [7] accuracy is 68.9%
Using feature(s) [8] accuracy is 69.9%
Using feature(s) [9] accuracy is 70.8%
Using feature(s) [10] accuracy is 70.1%
Using feature(s) [11] accuracy is 70.5%
Using feature(s) [12] accuracy is 70.4%
Using feature(s) [13] accuracy is 71.0%
Using feature(s) [14] accuracy is 70.7%
Using feature(s) [15] accuracy is 70.5%
Using feature(s) [16] accuracy is 70.9%
Using feature(s) [17] accuracy is 69.8%
Using feature(s) [