In [1]:
import numpy as np
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier

from BoVW import BoVW
from utils import read_processed_data, shuffle_and_partition_data

In [2]:
data, labels = read_processed_data('../Preprocessed Dataset')
X_train, X_val, X_test, y_train, y_val, y_test = shuffle_and_partition_data(data, labels)

In [3]:
# Load the data from the file, assuming no missing values but using genfromtxt for its flexibility
data = np.genfromtxt("BoVW_features_training.csv", delimiter=",")

# Splitting into features and target variable
X_train_loaded = data[:, :-1]
y_train_loaded = data[:, -1]

In [4]:
# Load the data from the file, assuming no missing values but using genfromtxt for its flexibility
data = np.genfromtxt("BoVW_features_validation.csv", delimiter=",")

# Splitting into features and target variable
X_val_loaded = data[:, :-1]
y_val_loaded = data[:, -1]

In [5]:
np.all(y_train == y_train_loaded)

True

In [6]:
np.all(y_val == y_val_loaded)

True

In [3]:
BoVW_extractor = BoVW()
X_train, kmeans = BoVW_extractor.extract_BoVW(X_train)
y_train = np.array(y_train)
X_train.shape

(2397, 112)

In [14]:
# Define the classifiers
classifiers = {
    "Logistic Regression": LogisticRegression(),
    "LDA": LinearDiscriminantAnalysis(),
    "QDA": QuadraticDiscriminantAnalysis(),
    "SVM": svm.SVC(gamma=0.0005, C=0.0007, kernel='linear'),
    "Decision Tree": DecisionTreeClassifier()
}

# Train and test each classifier
for name, clf in classifiers.items():
    # Train the classifier
    clf.fit(X_train, y_train)
    
    # Predict on the training data
    y_train_pred = clf.predict(X_train)
    
    # Calculate and print the result statistics
    print(f"Classification Report on {name}:\n", classification_report(y_train, y_train_pred))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Classification Report on Logistic Regression:
               precision    recall  f1-score   support

           0       0.98      0.97      0.98       603
           1       0.98      1.00      0.99       571
           2       0.90      0.97      0.94       596
           3       0.98      0.91      0.94       627

    accuracy                           0.96      2397
   macro avg       0.96      0.96      0.96      2397
weighted avg       0.96      0.96      0.96      2397

Classification Report on LDA:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       603
           1       1.00      1.00      1.00       571
           2       0.99      1.00      0.99       596
           3       1.00      0.99      0.99       627

    accuracy                           1.00      2397
   macro avg       1.00      1.00      1.00      2397
weighted avg       1.00      1.00      1.00      2397

Classification Report on QDA:
               precisi

  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification Report on SVM:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       603
           1       0.00      0.00      0.00       571
           2       0.00      0.00      0.00       596
           3       0.26      1.00      0.41       627

    accuracy                           0.26      2397
   macro avg       0.07      0.25      0.10      2397
weighted avg       0.07      0.26      0.11      2397

Classification Report on Decision Tree:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       603
           1       1.00      1.00      1.00       571
           2       1.00      1.00      1.00       596
           3       1.00      1.00      1.00       627

    accuracy                           1.00      2397
   macro avg       1.00      1.00      1.00      2397
weighted avg       1.00      1.00      1.00      2397



In [5]:
BoVW_extractor = BoVW()
X_val, _ = BoVW_extractor.extract_BoVW(data=X_val, kmeans=kmeans)
y_val = np.array(y_val)
X_val.shape

(800, 112)

In [15]:
# Train and test each classifier
for name, clf in classifiers.items():    
    # Predict on the training data
    y_val_pred = clf.predict(X_val)
    
    # Calculate and print the result statistics
    print(f"Classification Report on {name}:\n", classification_report(y_val, y_val_pred))

Classification Report on Logistic Regression:
               precision    recall  f1-score   support

           0       0.98      0.99      0.98       181
           1       1.00      1.00      1.00       212
           2       0.92      0.96      0.94       215
           3       0.97      0.91      0.94       192

    accuracy                           0.96       800
   macro avg       0.97      0.96      0.96       800
weighted avg       0.96      0.96      0.96       800

Classification Report on LDA:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       181
           1       1.00      1.00      1.00       212
           2       0.99      1.00      0.99       215
           3       1.00      0.99      0.99       192

    accuracy                           0.99       800
   macro avg       0.99      0.99      0.99       800
weighted avg       0.99      0.99      0.99       800

Classification Report on QDA:
               precisi

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
# Ensure y_train is a 2D column vector
y_train = y_train.reshape(-1, 1)

# Combine the arrays
combined_array = np.hstack((X_train, y_train))

# Save the combined array to a file
np.savetxt("BoVW_features_training.csv", combined_array, delimiter=",", fmt='%f')

In [9]:
# Ensure y_train is a 2D column vector
y_val = y_val.reshape(-1, 1)

# Combine the arrays
combined_array = np.hstack((X_val, y_val))

# Save the combined array to a file
np.savetxt("BoVW_features_validation.csv", combined_array, delimiter=",", fmt='%f')

In [10]:
# Load the data from the file, assuming no missing values but using genfromtxt for its flexibility
data = np.genfromtxt("BoVW_features_training.csv", delimiter=",")

# Splitting into features and target variable
X_train_loaded = data[:, :-1]
y_train_loaded = data[:, -1]

In [11]:
# Load the data from the file, assuming no missing values but using genfromtxt for its flexibility
data = np.genfromtxt("BoVW_features_validation.csv", delimiter=",")

# Splitting into features and target variable
X_val_loaded = data[:, :-1]
y_val_loaded = data[:, -1]

In [12]:
np.all(y_train.reshape(-1) == y_train_loaded)

True

In [13]:
np.all(y_val.reshape(-1) == y_val_loaded)

True