In [1]:
import numpy as np
import xgboost as xgb
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
import joblib

# Train an XGBoost classifier on the PCA features, validate it on a held-out
# split, and export predictions for the unlabeled test set.
# The last column of the labeled array holds 1-based class labels; the model is
# trained on 0-based labels, and every report / exported file is shifted back
# to the original 1-based labeling.

# Load the labeled training data (read the array once and close the archive)
with np.load("./data/split/Pca_traningSet_labeled.npz") as train_data_labeled:
    labeled_array = train_data_labeled["data"]
train_embeddings = labeled_array[:, :-1]  # Features
train_labels = labeled_array[:, -1].astype(int) - 1  # 0-based labels

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    train_embeddings, train_labels, test_size=0.2, random_state=42
)

# Train the XGBoost model with regularization
xgb_model = xgb.XGBClassifier(
    objective="multi:softmax",
    use_label_encoder=False,
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    reg_alpha=0.1,  # L1 regularization term
    reg_lambda=0.1,  # L2 regularization term
)
xgb_model.fit(X_train, y_train)

# Save the trained model
joblib.dump(xgb_model, "./models/xgboost_model.joblib")
print("Model training completed and saved.")

# Evaluate the model on the validation set
val_predictions = xgb_model.predict(X_val)

# Shift to the original 1-based labels for reporting only. New arrays are
# created instead of mutating in place, so y_val and val_predictions stay in
# the 0-based label space the model was trained on.
y_val_reported = y_val + 1
val_predictions_reported = val_predictions + 1

# Evaluate predictions
val_accuracy = accuracy_score(y_val_reported, val_predictions_reported)
val_f1 = classification_report(
    y_val_reported, val_predictions_reported, output_dict=True
)["weighted avg"]["f1-score"]

print(f"Validation Set Accuracy: {val_accuracy}")
print(f"Validation Set F1 Score: {val_f1}")
print(
    f"Validation Set Classification Report:\n{classification_report(y_val_reported, val_predictions_reported)}"
)

# Load the test data without labels
with np.load("./data/split/Pca_testSet.npz") as test_data:
    test_features = test_data["data"]  # Features only

# Ensure feature count matches the model's expected input
if test_features.shape[1] != X_train.shape[1]:
    raise ValueError("Feature count mismatch between training and test data.")

# Predict using the XGBoost model and shift back to the original 1-based labeling
test_predictions = xgb_model.predict(test_features) + 1

# Save predictions to a CSV file (one integer label per line)
np.savetxt("xgb_test_predictions.csv", test_predictions, delimiter=",", fmt="%d")
print("Predictions for the test set have been saved to 'xgb_test_predictions.csv'.")


Model training completed and saved.
Validation Set Accuracy: 0.7466307277628033
Validation Set F1 Score: 0.745721478052658
Validation Set Classification Report:
              precision    recall  f1-score   support

           1       0.72      0.81      0.77       410
           2       0.77      0.73      0.75       367
           3       0.76      0.68      0.72       336

    accuracy                           0.75      1113
   macro avg       0.75      0.74      0.74      1113
weighted avg       0.75      0.75      0.75      1113

Predictions for the test set have been saved to 'xgb_test_predictions.csv'.


In [4]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib

# Evaluate the saved Random Forest model on the full labeled set and export
# its predictions for the unlabeled test set.

# Load the Random Forest model.
# NOTE: joblib.load unpickles the file and can execute arbitrary code; only
# load model files that come from a trusted source.
rf_model = joblib.load("./models/random_forest_model.joblib")

# Load the labeled training data (read the array once and close the archive)
with np.load("./data/split/Pca_traningSet_labeled.npz") as train_data_labeled:
    labeled_array = train_data_labeled["data"]
train_embeddings = labeled_array[:, :-1]  # Features
train_labels = labeled_array[:, -1].astype(int) - 1  # 0-based labels

# Evaluate the model on the training set
train_predictions = rf_model.predict(train_embeddings)

# Evaluate predictions (both arrays are in the 0-based label space)
train_accuracy = accuracy_score(train_labels, train_predictions)
train_f1 = classification_report(train_labels, train_predictions, output_dict=True)["weighted avg"]["f1-score"]

print(f"Training Set Accuracy: {train_accuracy}")
print(f"Training Set F1 Score: {train_f1}")
print(f"Training Set Classification Report:\n{classification_report(train_labels, train_predictions)}")

# Load the test data without labels
with np.load("./data/split/Pca_testSet.npz") as test_data:
    test_features = test_data["data"]  # Features only

# Ensure feature count matches the model's expected input
if test_features.shape[1] != train_embeddings.shape[1]:
    raise ValueError("Feature count mismatch between training and test data.")

# Predict using the Random Forest model. The model is scored against 0-based
# labels above, so its output is shifted back to the original 1-based labeling
# before export (the same convention xgb_test_predictions.csv uses).
test_predictions = rf_model.predict(test_features) + 1

# Save predictions to a CSV file (one integer label per line)
np.savetxt("rf_test_predictions.csv", test_predictions, delimiter=",", fmt="%d")
print("Predictions for the test set have been saved to 'rf_test_predictions.csv'.")


Training Set Accuracy: 0.9422973215890706
Training Set F1 Score: 0.9422511100272639
Training Set Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.97      0.94      2134
           1       0.95      0.94      0.94      1817
           2       0.97      0.91      0.94      1612

    accuracy                           0.94      5563
   macro avg       0.95      0.94      0.94      5563
weighted avg       0.94      0.94      0.94      5563

Predictions for the test set have been saved to 'rf_test_predictions.csv'.


In [5]:
import numpy as np
from sklearn.metrics import classification_report, accuracy_score
import joblib

# Evaluate the saved Naive Bayes model on the full labeled set and export its
# predictions for the unlabeled test set.

# Load the Naive Bayes model.
# NOTE: joblib.load unpickles the file and can execute arbitrary code; only
# load model files that come from a trusted source.
nb_model = joblib.load("./models/naive_bayes_model.joblib")

# Load the labeled training data (read the array once and close the archive)
with np.load("./data/split/Pca_traningSet_labeled.npz") as train_data_labeled:
    labeled_array = train_data_labeled["data"]
train_embeddings = labeled_array[:, :-1]  # Features
train_labels = labeled_array[:, -1].astype(int) - 1  # 0-based labels

# Evaluate the model on the training set
train_predictions = nb_model.predict(train_embeddings)

# Evaluate predictions (both arrays are in the 0-based label space)
train_accuracy = accuracy_score(train_labels, train_predictions)
train_f1 = classification_report(train_labels, train_predictions, output_dict=True)["weighted avg"]["f1-score"]

print(f"Training Set Accuracy: {train_accuracy}")
print(f"Training Set F1 Score: {train_f1}")
print(f"Training Set Classification Report:\n{classification_report(train_labels, train_predictions)}")

# Load the test data without labels
with np.load("./data/split/Pca_testSet.npz") as test_data:
    test_features = test_data["data"]  # Features only

# Ensure feature count matches the model's expected input
if test_features.shape[1] != train_embeddings.shape[1]:
    raise ValueError("Feature count mismatch between training and test data.")

# Predict using the Naive Bayes model. The model is scored against 0-based
# labels above, so its output is shifted back to the original 1-based labeling
# before export (the same convention xgb_test_predictions.csv uses).
test_predictions = nb_model.predict(test_features) + 1

# Save predictions to a CSV file (one integer label per line)
np.savetxt("nb_test_predictions.csv", test_predictions, delimiter=",", fmt="%d")
print("Predictions for the test set have been saved to 'nb_test_predictions.csv'.")


Training Set Accuracy: 0.7073521481215171
Training Set F1 Score: 0.7069249145041674
Training Set Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.75      0.74      2134
           1       0.71      0.70      0.71      1817
           2       0.67      0.65      0.66      1612

    accuracy                           0.71      5563
   macro avg       0.70      0.70      0.70      5563
weighted avg       0.71      0.71      0.71      5563

Predictions for the test set have been saved to 'nb_test_predictions.csv'.


In [1]:
import numpy as np
from sklearn.metrics import classification_report, accuracy_score
import joblib

# Evaluate the saved Voting Classifier on the full labeled set and export its
# predictions for the unlabeled test set.

# Load the Voting Classifier model.
# NOTE: joblib.load unpickles the file and can execute arbitrary code; only
# load model files that come from a trusted source.
voting_model = joblib.load("./models/voting_classifier_model.joblib")

# Load the labeled training data (read the array once and close the archive)
with np.load("./data/split/Pca_traningSet_labeled.npz") as train_data_labeled:
    labeled_array = train_data_labeled["data"]
train_embeddings = labeled_array[:, :-1]  # Features
train_labels = labeled_array[:, -1].astype(int) - 1  # 0-based labels

# Evaluate the model on the training set
train_predictions = voting_model.predict(train_embeddings)

# Evaluate predictions (both arrays are in the 0-based label space)
train_accuracy = accuracy_score(train_labels, train_predictions)
train_f1 = classification_report(train_labels, train_predictions, output_dict=True)["weighted avg"]["f1-score"]

print(f"Training Set Accuracy: {train_accuracy}")
print(f"Training Set F1 Score: {train_f1}")
print(f"Training Set Classification Report:\n{classification_report(train_labels, train_predictions)}")

# Load the test data without labels
with np.load("./data/split/Pca_testSet.npz") as test_data:
    test_features = test_data["data"]  # Features only

# Ensure feature count matches the model's expected input
if test_features.shape[1] != train_embeddings.shape[1]:
    raise ValueError("Feature count mismatch between training and test data.")

# Predict using the Voting Classifier model. The model is scored against
# 0-based labels above, so its output is shifted back to the original 1-based
# labeling before export (the same convention xgb_test_predictions.csv uses).
test_predictions = voting_model.predict(test_features) + 1

# Save predictions to a CSV file (one integer label per line)
np.savetxt("voting_test_predictions.csv", test_predictions, delimiter=",", fmt="%d")
print("Predictions for the test set have been saved to 'voting_test_predictions.csv'.")


Training Set Accuracy: 0.9466115405356822
Training Set F1 Score: 0.9465628321096534
Training Set Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.97      0.95      2134
           1       0.96      0.94      0.95      1817
           2       0.96      0.92      0.94      1612

    accuracy                           0.95      5563
   macro avg       0.95      0.94      0.95      5563
weighted avg       0.95      0.95      0.95      5563

Predictions for the test set have been saved to 'voting_test_predictions.csv'.
