In [51]:
!pip install ipywidgets
import os
import numpy as np
from google.colab import drive
from IPython.display import display
import ipywidgets as widgets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score



In [52]:
print("Welcome to Feature Selection Algorithm")

# text input and button widgets
text_input = widgets.Text(
    value='',
    placeholder='File Name',
    description='Type File Name:',
    layout=widgets.Layout(width='50%')
)

submit_button = widgets.Button(
    description='Submit',
    disabled=False,
    button_style='success',
    icon='check'
)

# Function to handle button click
def on_button_click(b):
    file_name = text_input.value
    print(f"File to test: {file_name}")
    mount_and_load_file(file_name)

submit_button.on_click(on_button_click)
display(text_input, submit_button)

def mount_and_load_file(file_name):

    # Step 1: Organize and Load the Datasets
    drive.mount('/content/drive')
    small_data_dir = '/content/drive/My Drive/CS205_small_Data'
    large_data_dir = '/content/drive/My Drive/CS205_large_Data'
    small_file_path = os.path.join(small_data_dir, file_name)
    large_file_path = os.path.join(large_data_dir, file_name)

    # if file exists in the small or large data directory
    if os.path.isfile(small_file_path):
        file_path = small_file_path
    elif os.path.isfile(large_file_path):
        file_path = large_file_path
    else:
        print(f"File '{file_name}' not found in the specified directories.")
        return

    # load and normalize a dataset
    def load_and_normalize_dataset(file_path):
        data = np.loadtxt(file_path)
        X = data[:, 1:]
        y = data[:, 0].astype(int)
        X = (X - np.mean(X, axis=0)) / np.std(X, axis=0)
        return X, y

    X, y = load_and_normalize_dataset(file_path)
    print(f'Loaded dataset from {file_path}.')

    # Perform evaluations and feature selection
    evaluate_and_select_features(X, y)


Welcome to Feature Selection Algorithm


Text(value='', description='Type File Name:', layout=Layout(width='50%'), placeholder='File Name')

Button(button_style='success', description='Submit', icon='check', style=ButtonStyle())

File to test: CS205_large_Data__6.txt
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loaded dataset from /content/drive/My Drive/CS205_large_Data/CS205_large_Data__6.txt.
Evaluating Nearest Neighbor Classifier on Original Data
Original Accuracy: 0.6600

Performing Feature Selection and Evaluating
Forward Selection Process:
Using feature(s) {1}, accuracy is 0.7390
Using feature(s) {2}, accuracy is 0.6780
Using feature(s) {3}, accuracy is 0.6750
Using feature(s) {4}, accuracy is 0.6950
Using feature(s) {5}, accuracy is 0.6970
Using feature(s) {6}, accuracy is 0.6890
Using feature(s) {7}, accuracy is 0.6640
Using feature(s) {8}, accuracy is 0.6840
Using feature(s) {9}, accuracy is 0.6790
Using feature(s) {10}, accuracy is 0.7110
Using feature(s) {11}, accuracy is 0.7070
Using feature(s) {12}, accuracy is 0.7070
Using feature(s) {13}, accuracy is 0.6840
Using feature(s) {14}, accuracy is 0.6840
Using feature(

In [53]:
# Step 2: Implement the Nearest Neighbor Classifier
def nearest_neighbor(X_train, y_train, X_test):
    predictions = []
    for test_instance in X_test:
        distances = np.linalg.norm(X_train - test_instance, axis=1)
        nearest_index = np.argmin(distances)
        predictions.append(y_train[nearest_index])
    return np.array(predictions)

In [54]:
# Step 3: Feature Selection Methods
def forward_selection(X, y):
    n_features = X.shape[1]
    selected_features = []
    best_accuracy = 0
    feature_selection_process = []

    for _ in range(n_features):
        best_feature = None
        for feature in range(n_features):
            if feature in selected_features:
                continue
            features_to_test = selected_features + [feature]
            accuracy = cross_val_accuracy(X[:, features_to_test], y)
            feature_selection_process.append((features_to_test, accuracy))
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_feature = feature
        if best_feature is not None:
            selected_features.append(best_feature)
    return selected_features, feature_selection_process

def backward_elimination(X, y):
    n_features = X.shape[1]
    selected_features = list(range(n_features))
    best_accuracy = cross_val_accuracy(X, y)
    feature_selection_process = [(selected_features.copy(), best_accuracy)]

    for _ in range(n_features):
        worst_feature = None
        for feature in selected_features:
            features_to_test = [f for f in selected_features if f != feature]
            accuracy = cross_val_accuracy(X[:, features_to_test], y)
            feature_selection_process.append((features_to_test, accuracy))
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                worst_feature = feature
        if worst_feature is not None:
            selected_features.remove(worst_feature)
    return selected_features, feature_selection_process


In [55]:
# Step 4: Evaluate Performance
def cross_val_accuracy(X, y, k=5):
    accuracies = []
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    y_pred = nearest_neighbor(X_train, y_train, X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)
    return np.mean(accuracies)

def print_feature_selection_process(process):
    for step, (features, accuracy) in enumerate(process):
        features_str = ", ".join(str(f+1) for f in features)
        print(f"Using feature(s) {{{features_str}}}, accuracy is {accuracy:.4f}")

def evaluate_and_select_features(X, y):
    print("Evaluating Nearest Neighbor Classifier on Original Data")
    accuracy = cross_val_accuracy(X, y)
    print(f'Original Accuracy: {accuracy:.4f}')

    print("\nPerforming Feature Selection and Evaluating")

    # Forward Selection
    selected_features_forward, forward_process = forward_selection(X, y)
    accuracy_forward = cross_val_accuracy(X[:, selected_features_forward], y)
    print("Forward Selection Process:")
    print_feature_selection_process(forward_process)
    print(f'Forward Selection: Best Features: {selected_features_forward}, Accuracy: {accuracy_forward:.4f}')

    # Backward Elimination
    selected_features_backward, backward_process = backward_elimination(X, y)
    accuracy_backward = cross_val_accuracy(X[:, selected_features_backward], y)
    print("Backward Elimination Process:")
    print_feature_selection_process(backward_process)
    print(f'Backward Elimination: Best Features: {selected_features_backward}, Accuracy: {accuracy_backward:.4f}')
