In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load the data from the file, assuming no missing values but using genfromtxt for its flexibility
data = np.genfromtxt("BoVW_features_training.csv", delimiter=",")

# Splitting into features and target variable
X_train_BoVW = data[:, :-1]
y_train_BoVW = data[:, -1]

In [3]:
# Load the data from the file, assuming no missing values but using genfromtxt for its flexibility
data = np.genfromtxt("BoVW_features_validation.csv", delimiter=",")

# Splitting into features and target variable
X_val_BoVW = data[:, :-1]
y_val_BoVW = data[:, -1]

In [4]:
# Load the data from the file, assuming no missing values but using genfromtxt for its flexibility
data = np.genfromtxt("Gabor_features_training.csv", delimiter=",")

# Splitting into features and target variable
X_train_Gabor = data[:, :-1]
y_train_Gabor = data[:, -1]

In [5]:
# Load the data from the file, assuming no missing values but using genfromtxt for its flexibility
data = np.genfromtxt("Gabor_features_validation.csv", delimiter=",")

# Splitting into features and target variable
X_val_Gabor = data[:, :-1]
y_val_Gabor = data[:, -1]

In [6]:
# Load the data from the file, assuming no missing values but using genfromtxt for its flexibility
data = np.genfromtxt("Laws_features_training.csv", delimiter=",")

# Splitting into features and target variable
X_train_Laws = data[:, :-1]
y_train_Laws = data[:, -1]

In [7]:
# Load the data from the file, assuming no missing values but using genfromtxt for its flexibility
data = np.genfromtxt("Laws_features_validation.csv", delimiter=",")

# Splitting into features and target variable
X_val_Laws = data[:, :-1]
y_val_Laws = data[:, -1]

In [8]:
# Concatenate the feature vectors horizontally
X_train = np.concatenate((X_train_BoVW, X_train_Gabor), axis=1)
X_train = np.concatenate((X_train, X_train_Laws), axis=1)
y_train = y_train_BoVW 
scaler = MinMaxScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_train_normalized.shape

(2397, 212)

In [9]:
# Remove highly correlated features
def remove_highly_correlated_features(X, threshold=0.95):
    corr_matrix = pd.DataFrame(X).corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
    X_reduced = np.delete(X, to_drop, axis=1)
    return X_reduced, to_drop

In [10]:
X_train_reduced, dropped_features = remove_highly_correlated_features(X_train_normalized)

In [11]:
X_train.shape

(2397, 212)

In [12]:
# Define the classifiers
classifiers = {
    "Logistic Regression": LogisticRegression(),
    "LDA": LinearDiscriminantAnalysis(),
    "QDA": QuadraticDiscriminantAnalysis(),
    "SVM": svm.SVC(kernel='linear'),
    "Decision Tree": DecisionTreeClassifier()
}

# Train and test each classifier
for name, clf in classifiers.items():
    # Train the classifier
    clf.fit(X_train_reduced, y_train)
    
    # Predict on the training data
    y_train_pred = clf.predict(X_train_reduced)
    
    # Calculate and print the result statistics
    print(f"Classification Report on {name}:\n", classification_report(y_train, y_train_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Classification Report on Logistic Regression:
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       603
         1.0       1.00      1.00      1.00       571
         2.0       1.00      1.00      1.00       596
         3.0       1.00      1.00      1.00       627

    accuracy                           1.00      2397
   macro avg       1.00      1.00      1.00      2397
weighted avg       1.00      1.00      1.00      2397

Classification Report on LDA:
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       603
         1.0       1.00      1.00      1.00       571
         2.0       0.99      1.00      1.00       596
         3.0       1.00      1.00      1.00       627

    accuracy                           1.00      2397
   macro avg       1.00      1.00      1.00      2397
weighted avg       1.00      1.00      1.00      2397

Classification Report on QDA:
               precisi

In [13]:
# Concatenate the feature vectors horizontally
X_val = np.concatenate((X_val_BoVW, X_val_Gabor), axis=1)
X_val = np.concatenate((X_val, X_val_Laws), axis=1)
y_val = y_val_BoVW 
X_val_normalized = scaler.transform(X_val)
X_val_reduced = np.delete(X_val_normalized, dropped_features, axis=1)

In [14]:
X_val.shape

(800, 212)

In [15]:
X_val_reduced.shape

(800, 134)

In [16]:
# Train and test each classifier
for name, clf in classifiers.items():    
    # Predict on the training data
    y_val_pred = clf.predict(X_val_reduced)
    
    # Calculate and print the result statistics
    print(f"Classification Report on {name}:\n", classification_report(y_val, y_val_pred))

Classification Report on Logistic Regression:
               precision    recall  f1-score   support

         0.0       0.99      1.00      1.00       181
         1.0       1.00      1.00      1.00       212
         2.0       0.99      1.00      0.99       215
         3.0       1.00      0.98      0.99       192

    accuracy                           0.99       800
   macro avg       1.00      0.99      1.00       800
weighted avg       1.00      0.99      0.99       800

Classification Report on LDA:
               precision    recall  f1-score   support

         0.0       0.99      1.00      1.00       181
         1.0       1.00      1.00      1.00       212
         2.0       0.99      1.00      0.99       215
         3.0       1.00      0.99      0.99       192

    accuracy                           1.00       800
   macro avg       1.00      1.00      1.00       800
weighted avg       1.00      1.00      1.00       800

Classification Report on QDA:
               precisi

In [17]:
dropped_features

[56,
 97,
 115,
 116,
 117,
 120,
 122,
 123,
 124,
 126,
 128,
 129,
 134,
 135,
 136,
 138,
 139,
 140,
 141,
 142,
 144,
 146,
 147,
 148,
 150,
 151,
 152,
 153,
 159,
 160,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,
 185,
 186,
 187,
 188,
 189,
 190,
 191,
 192,
 193,
 194,
 195,
 196,
 197,
 198,
 199,
 200,
 201,
 202,
 203,
 204,
 205,
 206,
 207,
 208,
 209,
 210,
 211]

In [18]:
X_train_BoVW.shape

(2397, 112)

In [19]:
X_train_Gabor.shape

(2397, 50)

In [20]:
X_train_Laws.shape

(2397, 50)

In [22]:
# Ensure y_train is a 2D column vector
y_train = y_train.reshape(-1, 1)

# Combine the arrays
combined_array = np.hstack((X_train_reduced, y_train))

# Save the combined array to a file
np.savetxt("Final_features_training.csv", combined_array, delimiter=",", fmt='%f')

In [23]:
# Ensure y_train is a 2D column vector
y_val = y_val.reshape(-1, 1)

# Combine the arrays
combined_array = np.hstack((X_val_reduced, y_val))

# Save the combined array to a file
np.savetxt("Final_features_validation.csv", combined_array, delimiter=",", fmt='%f')