In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def load_data(filepath):
    """
    Read a header-less, five-column CSV and split it into features and labels.

    The columns are named, in order, "variance", "skewness", "curtosis",
    "entropy" and "class".

    Parameters
    ----------
    filepath : path or buffer accepted by ``pandas.read_csv``

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is a 2-D array holding every column except
        "class" and ``y`` is a 1-D array holding the "class" column.
    """
    frame = pd.read_csv(filepath, header=None)
    frame.columns = ["variance", "skewness", "curtosis", "entropy", "class"]
    # "class" is the last column, so dropping it leaves the four features.
    features = frame.drop(columns="class").to_numpy()
    labels = frame["class"].to_numpy()
    return features, labels

def preprocess_data(X, y, test_size=0.2, random_state=42):
    """
    Make a stratified train/test split and standardize the features.

    Parameters
    ----------
    X : feature matrix passed to ``train_test_split``
    y : labels; also used as the stratification target
    test_size : forwarded to ``train_test_split`` (default 0.2)
    random_state : forwarded to ``train_test_split`` (default 42)

    Returns
    -------
    tuple
        ``(X_train_scaled, X_test_scaled, y_train, y_test)``.
    """
    split = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y,
    )
    train_features, test_features, train_labels, test_labels = split

    # The scaler is fitted on the training split only; the test split is
    # transformed with those same training statistics.
    standardizer = StandardScaler().fit(train_features)
    train_scaled = standardizer.transform(train_features)
    test_scaled = standardizer.transform(test_features)
    return train_scaled, test_scaled, train_labels, test_labels

def compute_class_means(X_train, y_train):
    """
    Return the per-feature mean of the rows labelled 0 and of the rows labelled 1.

    Parameters
    ----------
    X_train : array indexed row-wise by a boolean mask built from ``y_train``
    y_train : array of labels compared against 0 and 1

    Returns
    -------
    tuple
        ``(mean_0, mean_1)``, each averaged along axis 0.
    """
    centroids = tuple(
        np.mean(X_train[y_train == label], axis=0) for label in (0, 1)
    )
    return centroids

def compute_within_class_scatter(X_train, y_train, mean_0, mean_1):
    """
    Accumulate the within-class scatter matrix over classes 0 and 1.

    For each class the rows of ``X_train`` with that label are centred on the
    supplied class mean, and ``centred.T @ centred`` is added to the total.

    Parameters
    ----------
    X_train : 2-D array (its second dimension sets the size of the result)
    y_train : array of labels compared against 0 and 1
    mean_0, mean_1 : mean vectors subtracted from the class-0 / class-1 rows

    Returns
    -------
    numpy.ndarray
        Square matrix with side ``X_train.shape[1]``.
    """
    n_features = X_train.shape[1]
    scatter = np.zeros((n_features, n_features))
    for label, centroid in ((0, mean_0), (1, mean_1)):
        centred = X_train[y_train == label] - centroid
        scatter += centred.T @ centred
    return scatter

def compute_between_class_scatter(mean_0, mean_1):
    """
    Build the between-class scatter matrix from the two class means.

    Parameters
    ----------
    mean_0, mean_1 : mean vectors of class 0 and class 1

    Returns
    -------
    numpy.ndarray
        Outer product of ``mean_0 - mean_1`` with itself.
    """
    mean_gap = mean_0 - mean_1
    return np.outer(mean_gap, mean_gap)

def compute_lda_projection(X_train, SW, SB):
    """
    Compute the LDA projection vector and project the training data onto it.

    The direction is the eigenvector of ``SW^{-1} SB`` associated with the
    largest eigenvalue.

    Parameters
    ----------
    X_train : 2-D array of samples to project (rows are samples)
    SW : square within-class scatter matrix
    SB : square between-class scatter matrix, same size as ``SW``

    Returns
    -------
    tuple
        ``(X_train @ w, w)`` where ``w`` is a real column vector with one
        row per feature. The sign of ``w`` is whatever the eigen-solver
        returns.

    Raises
    ------
    numpy.linalg.LinAlgError
        If ``SW`` is singular.
    """
    # Solve SW @ M = SB instead of forming inv(SW) explicitly: same matrix
    # mathematically, but better conditioned numerically.
    scatter_ratio = np.linalg.solve(SW, SB)
    eigen_values, eigen_vectors = np.linalg.eig(scatter_ratio)
    # eig may return a complex array when round-off produces tiny imaginary
    # parts; rank eigenvalues by their real part only.
    dominant = np.argmax(eigen_values.real)
    w = eigen_vectors[:, dominant].real.reshape(-1, 1)
    return X_train @ w, w

def apply_lda(X_train, y_train):
    """
    Run the full LDA pipeline on the training data.

    Chains the class means, the within- and between-class scatter matrices
    and the projection step.

    Parameters
    ----------
    X_train : training feature matrix
    y_train : training labels (classes 0 and 1)

    Returns
    -------
    tuple
        ``(projected_training_data, w)`` exactly as returned by
        ``compute_lda_projection``.
    """
    centroid_0, centroid_1 = compute_class_means(X_train, y_train)
    within = compute_within_class_scatter(
        X_train, y_train, centroid_0, centroid_1
    )
    between = compute_between_class_scatter(centroid_0, centroid_1)
    projected, direction = compute_lda_projection(X_train, within, between)
    return projected, direction

def nearest_centroid_classifier(X_lda, m_0, m_1):
    """
    Label each projected value with the class whose centroid is closer.

    Parameters
    ----------
    X_lda : array of projected values
    m_0 : centroid of class 0 in the projected space
    m_1 : centroid of class 1 in the projected space

    Returns
    -------
    numpy.ndarray
        Array shaped like ``X_lda`` holding 0 where the value is strictly
        closer to ``m_0`` and 1 otherwise (ties go to class 1).
    """
    dist_to_0 = np.abs(X_lda - m_0)
    dist_to_1 = np.abs(X_lda - m_1)
    closer_to_0 = dist_to_0 < dist_to_1
    return np.where(closer_to_0, 0, 1)

# Main Execution
X, y = load_data("../Data/data_banknote_authentication.txt")
X_train_scaled, X_test_scaled, y_train, y_test = preprocess_data(X, y)

# Learn the discriminant direction from the training split.
X_lda_train, w = apply_lda(X_train_scaled, y_train)

# Class centroids of the projected training samples; np.mean without an
# axis reduces each selection to a single scalar.
m_0_lda = np.mean(X_lda_train[y_train == 0])
m_1_lda = np.mean(X_lda_train[y_train == 1])

# Project the test split with the same direction, then label each sample by
# its nearest centroid.
X_lda_test = X_test_scaled @ w
y_pred = nearest_centroid_classifier(X_lda_test, m_0_lda, m_1_lda)

# accuracy_score returns a fraction; scale it to a percentage for display.
accuracy = 100 * accuracy_score(y_test, y_pred)
print(f"LDA Accuracy on test set: {accuracy:.2f}%")


LDA Accuracy on test set: 96.73%
