In [1]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier

from BoVW import BoVW
from utils import read_processed_data

In [2]:
# Load the preprocessed samples and their labels from disk.
data, labels = read_processed_data('../Preprocessed Dataset')

# First split: hold out 20% of all samples as the test portion.
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

# The full dataset and the test portion are not used in this notebook. Remove
# the names with `del` (instead of rebinding them to a dummy integer) so the
# objects can be reclaimed and any accidental later use fails loudly with a
# NameError rather than silently operating on the number 1.
del data, labels, X_test, y_test

# Second split: 25% of the remaining 80% becomes the validation set,
# i.e. 20% of the full dataset.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

# Check the size of the training and validation sets
print(f"Training set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")

Training set size: 2397
Validation set size: 800


In [3]:
# Read the cached training feature table (comma-separated, one sample per row).
# NOTE(review): this CSV is produced by the np.savetxt cell further down in this
# notebook, so this cell only works if the file already exists on disk.
data = np.genfromtxt("BoVW_features_training.csv", delimiter=",")

# Every column except the last holds features; the last column is the label.
X_train_loaded, y_train_loaded = data[:, :-1], data[:, -1]

In [4]:
# Read the cached validation feature table (comma-separated, one sample per row).
# NOTE(review): this CSV is produced by the np.savetxt cell further down in this
# notebook, so this cell only works if the file already exists on disk.
data = np.genfromtxt("BoVW_features_validation.csv", delimiter=",")

# Every column except the last holds features; the last column is the label.
X_val_loaded, y_val_loaded = data[:, :-1], data[:, -1]

In [5]:
# Sanity check: the labels from the fresh split must match the labels stored in
# the cached training CSV (i.e. the split is reproducible). array_equal returns
# False on a shape mismatch instead of broadcasting or raising.
np.array_equal(y_train, y_train_loaded)

True

In [6]:
# Sanity check: the labels from the fresh split must match the labels stored in
# the cached validation CSV (i.e. the split is reproducible). array_equal returns
# False on a shape mismatch instead of broadcasting or raising.
np.array_equal(y_val, y_val_loaded)

True

In [7]:
# Convert the training samples into BoVW feature vectors.
# extract_BoVW hands back the feature matrix plus a second object that is kept
# as `kmeans` and passed back in when the validation samples are encoded
# (presumably the fitted clustering model / visual vocabulary - confirm in BoVW.py).
# NOTE(review): X_train is overwritten by its own features, so re-running this
# cell on its own would feed the features back into the extractor.
BoVW_extractor = BoVW()
bovw_train_output = BoVW_extractor.extract_BoVW(X_train)
X_train, kmeans = bovw_train_output
X_train.shape

(2397, 112)

In [8]:
# Candidate classifiers, keyed by the display name used in the printed reports.
classifiers = {
    "Logistic Regression": LogisticRegression(),
    "LDA": LinearDiscriminantAnalysis(),
    "QDA": QuadraticDiscriminantAnalysis(),
    "SVM": svm.SVC(kernel='linear'),
    # Seeded like the data splits: the tree permutes features at each split, so
    # without a fixed random_state its results can differ between runs.
    "Decision Tree": DecisionTreeClassifier(random_state=42)
}

# Fit each classifier and report how well it reproduces the training labels.
# This is a fit/sanity report only - the predictions are made on the same data
# the model was trained on, so it says nothing about generalisation.
for name, clf in classifiers.items():
    # Train the classifier
    clf.fit(X_train, y_train)

    # Predict on the training data
    y_train_pred = clf.predict(X_train)

    # Calculate and print the result statistics
    print(f"Classification Report on {name}:\n", classification_report(y_train, y_train_pred))

Classification Report on Logistic Regression:
               precision    recall  f1-score   support

           0       0.96      0.95      0.96       602
           1       0.96      0.99      0.97       589
           2       0.89      0.90      0.90       597
           3       0.92      0.89      0.90       609

    accuracy                           0.93      2397
   macro avg       0.93      0.93      0.93      2397
weighted avg       0.93      0.93      0.93      2397

Classification Report on LDA:
               precision    recall  f1-score   support

           0       0.98      0.99      0.99       602
           1       1.00      0.99      0.99       589
           2       0.96      0.98      0.97       597
           3       0.99      0.96      0.97       609

    accuracy                           0.98      2397
   macro avg       0.98      0.98      0.98      2397
weighted avg       0.98      0.98      0.98      2397

Classification Report on QDA:
               precisi



Classification Report on SVM:
               precision    recall  f1-score   support

           0       0.94      0.98      0.96       602
           1       0.99      0.96      0.98       589
           2       0.75      0.98      0.85       597
           3       0.99      0.68      0.81       609

    accuracy                           0.90      2397
   macro avg       0.92      0.90      0.90      2397
weighted avg       0.92      0.90      0.90      2397

Classification Report on Decision Tree:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       602
           1       1.00      1.00      1.00       589
           2       1.00      1.00      1.00       597
           3       1.00      1.00      1.00       609

    accuracy                           1.00      2397
   macro avg       1.00      1.00      1.00      2397
weighted avg       1.00      1.00      1.00      2397



In [9]:
# Encode the validation samples, passing in the `kmeans` object obtained from
# the training call; the second return value is discarded.
# NOTE(review): X_val is overwritten by its own features, so re-running this
# cell on its own would feed the features back into the extractor.
BoVW_extractor = BoVW()
bovw_val_output = BoVW_extractor.extract_BoVW(data=X_val, kmeans=kmeans)
X_val, _ = bovw_val_output
X_val.shape

(800, 112)

In [10]:
# Evaluate every classifier in `classifiers` on the validation features.
# Nothing is fitted in this cell; the models are used exactly as they are.
for clf_name, model in classifiers.items():
    # Predict on the validation data
    y_val_pred = model.predict(X_val)

    # Build the per-class precision/recall/F1 table, then print it under the model name
    val_report = classification_report(y_val, y_val_pred)
    print(f"Classification Report on {clf_name}:\n", val_report)

Classification Report on Logistic Regression:
               precision    recall  f1-score   support

           0       0.95      0.91      0.93       190
           1       0.93      1.00      0.96       201
           2       0.88      0.90      0.89       212
           3       0.91      0.86      0.89       197

    accuracy                           0.92       800
   macro avg       0.92      0.92      0.92       800
weighted avg       0.92      0.92      0.92       800

Classification Report on LDA:
               precision    recall  f1-score   support

           0       0.96      0.98      0.97       190
           1       1.00      1.00      1.00       201
           2       0.93      0.95      0.94       212
           3       0.96      0.92      0.94       197

    accuracy                           0.96       800
   macro avg       0.96      0.96      0.96       800
weighted avg       0.96      0.96      0.96       800

Classification Report on QDA:
               precisi

In [11]:
# Append the training labels as the last column of the feature matrix.
# column_stack turns a 1-D label vector into a column by itself (and accepts an
# (n, 1) array unchanged), so y_train does not have to be reshaped and rebound
# here and keeps its shape for every other cell.
combined_array = np.column_stack((X_train, y_train))

# Save features + label to CSV; '%f' writes fixed-point text with 6 decimals.
np.savetxt("BoVW_features_training.csv", combined_array, delimiter=",", fmt='%f')

In [12]:
# Append the validation labels as the last column of the feature matrix.
# column_stack turns a 1-D label vector into a column by itself (and accepts an
# (n, 1) array unchanged), so y_val does not have to be reshaped and rebound
# here and keeps its shape for every other cell.
combined_array = np.column_stack((X_val, y_val))

# Save features + label to CSV; '%f' writes fixed-point text with 6 decimals.
np.savetxt("BoVW_features_validation.csv", combined_array, delimiter=",", fmt='%f')

In [13]:
# Read the training feature table back from the CSV (comma-separated, one
# sample per row) to verify the round trip through disk.
data = np.genfromtxt("BoVW_features_training.csv", delimiter=",")

# Every column except the last holds features; the last column is the label.
X_train_loaded, y_train_loaded = data[:, :-1], data[:, -1]

In [14]:
# Read the validation feature table back from the CSV (comma-separated, one
# sample per row) to verify the round trip through disk.
data = np.genfromtxt("BoVW_features_validation.csv", delimiter=",")

# Every column except the last holds features; the last column is the label.
X_val_loaded, y_val_loaded = data[:, :-1], data[:, -1]

In [15]:
# Round-trip check: the labels read back from the training CSV must equal the
# in-memory labels. y_train is flattened first because it may be a column
# vector at this point; array_equal returns False on any remaining shape
# mismatch instead of broadcasting or raising.
np.array_equal(np.ravel(y_train), y_train_loaded)

True

In [16]:
# Round-trip check: the labels read back from the validation CSV must equal the
# in-memory labels. y_val is flattened first because it may be a column
# vector at this point; array_equal returns False on any remaining shape
# mismatch instead of broadcasting or raising.
np.array_equal(np.ravel(y_val), y_val_loaded)

True