Importing Libraries

In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KernelDensity
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

Loading the data

In [16]:

# Load the data (replace with your file paths)
train_data = pd.read_csv('train_shuttle.csv', header=None)
test_data = pd.read_csv('test_shuttle.csv', header=None)



Data Preprocessing and Processing

In [17]:
# Separate features and labels
X_train = train_data.iloc[:, :-1]
y_train = train_data.iloc[:, -1]
X_test = test_data.iloc[:, :-1]
y_test = test_data.iloc[:, -1]

# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)



LDA Training and Accuracy

In [19]:
# Separate the majority class (class 1) and the rest
X_1s = []
y_1s = []
X_rest = []
y_rest = []

for i, label in enumerate(y_train):
    if label == 1:
        X_1s.append(X_train[i])
        y_1s.append(1)
    else:
        X_rest.append(X_train[i])
        y_rest.append(label)

# Undersample the majority class
X_train_1s, _, y_train_1s, _ = train_test_split(X_1s, y_1s, test_size=0.5, random_state=42)

# Merge the undersampled class 1 data with the rest of the data
X_train = X_train_1s + X_rest
y_train = y_train_1s + y_rest

# Create LDA model
lda = LDA()
lda.fit(X_train, y_train)

# Make predictions on the test set
y_pred = lda.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"LDA Accuracy: {accuracy:.4f}")

LDA Accuracy: 0.9692


So we see a LDA accuracy of 96.92%.

QDA Training and Accuracy

In [22]:
# Separate the majority class (class 1) and the rest
X_1s = []
y_1s = []
X_rest = []
y_rest = []

for i, label in enumerate(y_train):
    if label == 1:
        X_1s.append(X_train[i])
        y_1s.append(1)
    else:
        X_rest.append(X_train[i])
        y_rest.append(label)

# Undersample the majority class
X_train_1s, _, y_train_1s, _ = train_test_split(X_1s, y_1s, test_size=0.5, random_state=42)

# Merge the undersampled class 1 data with the rest of the data
X_train = X_train_1s + X_rest
y_train = y_train_1s + y_rest

# Hyperparameter tuning for QDA
qda = QDA()
param_grid_qda = {'reg_param': [0.01, 0.1, 0.5, 1.0, 2.0]}
grid_search_qda = GridSearchCV(qda, param_grid_qda, cv=5)
grid_search_qda.fit(X_train, y_train)
best_qda = grid_search_qda.best_estimator_
best_qda.fit(X_train, y_train)

# Make predictions on the test set
y_pred = best_qda.predict(X_test)



QDA Accuracy: 0.9666


5 fits failed out of a total of 25.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\aryam\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\aryam\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\discriminant_analysis.py", line 889, in fit
    self._validate_params()
  File "c:\Users\aryam\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\aryam\AppData\Local\Programs\Python\Python311

In [23]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"QDA Accuracy: {accuracy:.4f}")

QDA Accuracy: 0.9666


We see a QDA accuracy of 96.6%~96.7%

BDR Training and Accuracy

In [25]:
# Load the datasets (replace with your file paths)
train_data = pd.read_csv('train_shuttle.csv', header=None)
test_data = pd.read_csv('test_shuttle.csv', header=None)

# Separate features and labels
X_train = train_data.iloc[:, :-1]
y_train = train_data.iloc[:, -1]
X_test = test_data.iloc[:, :-1]
y_test = test_data.iloc[:, -1]

# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define the grid size for feature space (adjust as needed)
grid_size = 0.2  # You can change this value as per your experimentation

# Create a list of class labels
class_labels = np.unique(y_train)

# Initialize an empty list to store predicted labels
y_pred = []

In [27]:
# Load the datasets (replace with your file paths)
train_data = pd.read_csv('train_shuttle.csv', header=None)
test_data = pd.read_csv('test_shuttle.csv', header=None)

# Separate features and labels
X_train = train_data.iloc[:, :-1]
y_train = train_data.iloc[:, -1]
X_test = test_data.iloc[:, :-1]
y_test = test_data.iloc[:, -1]

# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define the grid size for feature space (adjust as needed)
grid_size = 0.2  # You can change this value as per your experimentation

# Create a list of class labels
class_labels = np.unique(y_train)

# Initialize an empty list to store predicted labels
y_pred = []

# Fit KDE models for each class
kde_models = {}
for label in class_labels:
    # Select samples belonging to the current class
    class_samples = X_train[y_train == label]

    # Create and fit a KDE model for the class
    kde = KernelDensity(bandwidth=0.5)  # Adjust bandwidth as needed
    kde.fit(class_samples)

    # Store the KDE model for this class
    kde_models[label] = kde

# Convert X_test back to a Pandas DataFrame
X_test = pd.DataFrame(X_test)

# Iterate through each test sample
for i in range(len(X_test)):
    # Calculate the grid element where the test sample is located
    grid_element = np.floor(X_test.iloc[i] / grid_size)

    # Calculate the class with the highest prior probability in the grid element
    max_posterior_prob = -1
    predicted_label = None

    for label in class_labels:
        # Calculate prior probability for the class
        prior_prob = np.sum(y_train[y_train == label]) / len(y_train)

        # Calculate the density using the KDE model for the class
        log_density = kde_models[label].score_samples([X_test.iloc[i]])

        # Calculate the posterior probability
        posterior_prob = prior_prob * np.exp(log_density)

        # Check if this class has a higher posterior probability
        if posterior_prob > max_posterior_prob:
            max_posterior_prob = posterior_prob
            predicted_label = label

    y_pred.append(predicted_label)

In [28]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"BDR Accuracy: {accuracy:.4f}")

BDR Accuracy: 0.9700


So we see BDR Accuracy of 97%.