In [2]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
import joblib

# Load the pseudo-labelled data from CSV.
# NOTE(review): this is the very file the cell overwrites in its last step. Each row
# holds test-set features followed by a *predicted* label (see the np.savetxt call
# below), so the "labels" used here are model predictions, not ground truth, and
# re-running the cell changes its own input. Confirm this self-training loop is intended.
data = pd.read_csv("./xgb_test_predictions_with_embeddings.csv", header=None).to_numpy()

# The last column is the 1-based label and the rest are features
train_embeddings = data[:, :-1]
train_labels = data[:, -1].astype(int) - 1  # 0-based labels

# Split the data into training and validation sets (fixed seed for reproducibility)
X_train, X_val, y_train, y_val = train_test_split(
    train_embeddings, train_labels, test_size=0.2, random_state=42
)

# Train the XGBoost model with L1 (reg_alpha) and L2 (reg_lambda) regularization
xgb_model = xgb.XGBClassifier(
    objective="multi:softmax",
    use_label_encoder=False,
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    reg_alpha=0.1,
    reg_lambda=0.1,
)
xgb_model.fit(X_train, y_train)

# The model is NOT persisted in this cell: the dump below is disabled, presumably so
# that this pseudo-label model does not replace ./models/xgboost_model.joblib -- confirm.
# The message therefore must not claim that anything was saved.
# joblib.dump(xgb_model, "./models/xgboost_model.joblib")
print("Model training completed (model not saved).")

# Evaluate the model on the validation set
val_predictions = xgb_model.predict(X_val)

# Shift labels and predictions back to the original 1-based labeling for reporting
y_val += 1
val_predictions += 1

# Evaluate predictions.
# NOTE(review): y_val comes from the predictions CSV, so these scores measure agreement
# with previously predicted labels rather than accuracy against ground truth.
val_accuracy = accuracy_score(y_val, val_predictions)
val_f1 = classification_report(y_val, val_predictions, output_dict=True)["weighted avg"]["f1-score"]
conf_matrix = confusion_matrix(y_val, val_predictions)

print(f"Validation Set Accuracy: {val_accuracy}")
print(f"Validation Set F1 Score: {val_f1}")
print(f"Validation Set Classification Report:\n{classification_report(y_val, val_predictions)}")
print(f"Confusion Matrix:\n{conf_matrix}")

# Load the test data without labels. np.load returns a lazy NpzFile that keeps the
# archive open, so read the array inside a `with` block to release the file handle.
with np.load("./data/split/Pca_testSet.npz") as test_data:
    test_features = test_data["data"]  # Features only

# Ensure feature count matches the model's expected input
if test_features.shape[1] != X_train.shape[1]:
    raise ValueError("Feature count mismatch between training and test data.")

# Predict using the XGBoost model
test_predictions = xgb_model.predict(test_features)
test_predictions += 1  # Adjust predictions to original labeling

# Combine test features with predictions (prediction becomes the last column)
test_results = np.hstack((test_features, test_predictions.reshape(-1, 1)))

# Save predictions and embeddings to a CSV file.
# NOTE(review): this overwrites the CSV that was read at the top of the cell.
np.savetxt("xgb_test_predictions_with_embeddings.csv", test_results, delimiter=",", fmt="%.6f")
print("Predictions and embeddings for the test set have been saved to 'xgb_test_predictions_with_embeddings.csv'.")


Model training completed and saved.
Validation Set Accuracy: 0.82048223841145
Validation Set F1 Score: 0.8178946250757467
Validation Set Classification Report:
              precision    recall  f1-score   support

           1       0.85      0.84      0.85     33818
           2       0.80      0.90      0.85     37554
           3       0.80      0.66      0.72     21694

    accuracy                           0.82     93066
   macro avg       0.82      0.80      0.81     93066
weighted avg       0.82      0.82      0.82     93066

Predictions and embeddings for the test set have been saved to 'xgb_test_predictions_with_embeddings.csv'.


In [6]:
import numpy as np
import xgboost as xgb
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
import joblib

# Load the labeled training data.
# np.load returns a lazy NpzFile: every ["data"] access re-reads the array from the
# archive and the file stays open until closed. Read the array once inside a `with`
# block so the handle is released and the data is only decoded a single time.
with np.load("./data/split/Pca_traningSet_labeled.npz") as train_data_labeled:
    labeled_array = train_data_labeled["data"]
train_embeddings = labeled_array[:, :-1]  # Features (all columns but the last)
train_labels = labeled_array[:, -1].astype(int) - 1  # Last column, shifted to 0-based labels

# Split the data into training and validation sets (fixed seed for reproducibility)
X_train, X_val, y_train, y_val = train_test_split(
    train_embeddings, train_labels, test_size=0.2, random_state=42
)

# Train the XGBoost model with L1 (reg_alpha) and L2 (reg_lambda) regularization
xgb_model = xgb.XGBClassifier(
    objective="multi:softmax",
    use_label_encoder=False,
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    reg_alpha=0.1,
    reg_lambda=0.1,
)
xgb_model.fit(X_train, y_train)

# Save the trained model
joblib.dump(xgb_model, "./models/xgboost_model.joblib")
print("Model training completed and saved.")

# Evaluate the model on the validation set
val_predictions = xgb_model.predict(X_val)

# Shift labels and predictions back to the original 1-based labeling for reporting
y_val += 1
val_predictions += 1

# Evaluate predictions
val_accuracy = accuracy_score(y_val, val_predictions)
val_f1 = classification_report(y_val, val_predictions, output_dict=True)["weighted avg"]["f1-score"]

print(f"Validation Set Accuracy: {val_accuracy}")
print(f"Validation Set F1 Score: {val_f1}")
print(f"Validation Set Classification Report:\n{classification_report(y_val, val_predictions)}")

# Load the test data without labels (again closing the archive once the array is read)
with np.load("./data/split/Pca_testSet.npz") as test_data:
    test_features = test_data["data"]  # Features only

# Ensure feature count matches the model's expected input
if test_features.shape[1] != X_train.shape[1]:
    raise ValueError("Feature count mismatch between training and test data.")

# Predict using the XGBoost model
test_predictions = xgb_model.predict(test_features)
test_predictions += 1  # Adjust predictions to original labeling

# Combine test features with predictions (prediction becomes the last column)
test_results = np.hstack((test_features, test_predictions.reshape(-1, 1)))

# Save predictions and embeddings to a CSV file
np.savetxt("xgb_test_predictions_with_embeddings.csv", test_results, delimiter=",", fmt="%.6f")
print("Predictions and embeddings for the test set have been saved to 'xgb_test_predictions_with_embeddings.csv'.")


Model training completed and saved.
Validation Set Accuracy: 0.7466307277628033
Validation Set F1 Score: 0.745721478052658
Validation Set Classification Report:
              precision    recall  f1-score   support

           1       0.72      0.81      0.77       410
           2       0.77      0.73      0.75       367
           3       0.76      0.68      0.72       336

    accuracy                           0.75      1113
   macro avg       0.75      0.74      0.74      1113
weighted avg       0.75      0.75      0.75      1113

Predictions and embeddings for the test set have been saved to 'xgb_test_predictions_with_embeddings.csv'.


In [7]:
import numpy as np
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import joblib

# Load the Naive Bayes model.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code; only load
# model files that come from a trusted source.
nb_model = joblib.load("./models/naive_bayes_model.joblib")

# Load the labeled training data.
# np.load returns a lazy NpzFile: every ["data"] access re-reads the array from the
# archive and the file stays open until closed. Read the array once inside a `with`
# block so the handle is released and the data is only decoded a single time.
with np.load("./data/split/Pca_traningSet_labeled.npz") as train_data_labeled:
    labeled_array = train_data_labeled["data"]
train_embeddings = labeled_array[:, :-1]  # Features (all columns but the last)
train_labels = labeled_array[:, -1].astype(int) - 1  # 1-based labels adjusted to 0-based

# Evaluate the model on the training set.
# NOTE(review): this scores the whole labeled set; if the model was fitted on (part of)
# it, the numbers below are optimistic compared with a held-out split -- confirm.
train_predictions = nb_model.predict(train_embeddings)

# Shift labels and predictions back to the original 1-based labeling for reporting
train_labels_adjusted = train_labels + 1
train_predictions_adjusted = train_predictions + 1

# Evaluate predictions
train_accuracy = accuracy_score(train_labels_adjusted, train_predictions_adjusted)
train_f1 = classification_report(train_labels_adjusted, train_predictions_adjusted, output_dict=True)["weighted avg"]["f1-score"]

print(f"Training Set Accuracy: {train_accuracy}")
print(f"Training Set F1 Score: {train_f1}")
print(f"Training Set Classification Report:\n{classification_report(train_labels_adjusted, train_predictions_adjusted)}")

# Confusion Matrix
conf_matrix = confusion_matrix(train_labels_adjusted, train_predictions_adjusted)
print(f"Confusion Matrix:\n{conf_matrix}")

# Load the test data without labels (again closing the archive once the array is read)
with np.load("./data/split/Pca_testSet.npz") as test_data:
    test_features = test_data["data"]  # Features only

# Ensure feature count matches the model's expected input
if test_features.shape[1] != train_embeddings.shape[1]:
    raise ValueError("Feature count mismatch between training and test data.")

# Predict using the Naive Bayes model
test_predictions = nb_model.predict(test_features)
test_predictions += 1  # Adjust predictions to original labeling

# Combine test features with predictions (prediction becomes the last column)
test_results = np.hstack((test_features, test_predictions.reshape(-1, 1)))

# Save predictions and embeddings to a CSV file
np.savetxt("nb_test_predictions_with_embeddings.csv", test_results, delimiter=",", fmt="%.6f")
print("Predictions and embeddings for the test set have been saved to 'nb_test_predictions_with_embeddings.csv'.")


Training Set Accuracy: 0.7073521481215171
Training Set F1 Score: 0.7069249145041674
Training Set Classification Report:
              precision    recall  f1-score   support

           1       0.73      0.75      0.74      2134
           2       0.71      0.70      0.71      1817
           3       0.67      0.65      0.66      1612

    accuracy                           0.71      5563
   macro avg       0.70      0.70      0.70      5563
weighted avg       0.71      0.71      0.71      5563

Confusion Matrix:
[[1607  257  270]
 [ 297 1279  241]
 [ 301  262 1049]]
Predictions and embeddings for the test set have been saved to 'nb_test_predictions_with_embeddings.csv'.


In [8]:
import numpy as np
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import joblib

# Load the Voting Classifier model.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code; only load
# model files that come from a trusted source.
voting_model = joblib.load("./models/voting_classifier_model.joblib")

# Load the labeled training data.
# np.load returns a lazy NpzFile: every ["data"] access re-reads the array from the
# archive and the file stays open until closed. Read the array once inside a `with`
# block so the handle is released and the data is only decoded a single time.
with np.load("./data/split/Pca_traningSet_labeled.npz") as train_data_labeled:
    labeled_array = train_data_labeled["data"]
train_embeddings = labeled_array[:, :-1]  # Features (all columns but the last)
train_labels = labeled_array[:, -1].astype(int) - 1  # 1-based labels adjusted to 0-based

# Evaluate the model on the training set.
# NOTE(review): this scores the whole labeled set; if the model was fitted on (part of)
# it, the numbers below are optimistic compared with a held-out split -- confirm.
train_predictions = voting_model.predict(train_embeddings)

# Shift labels and predictions back to the original 1-based labeling for reporting
train_labels_adjusted = train_labels + 1
train_predictions_adjusted = train_predictions + 1

# Evaluate predictions
train_accuracy = accuracy_score(train_labels_adjusted, train_predictions_adjusted)
train_f1 = classification_report(train_labels_adjusted, train_predictions_adjusted, output_dict=True)["weighted avg"]["f1-score"]

print(f"Training Set Accuracy: {train_accuracy}")
print(f"Training Set F1 Score: {train_f1}")
print(f"Training Set Classification Report:\n{classification_report(train_labels_adjusted, train_predictions_adjusted)}")

# Confusion Matrix
conf_matrix = confusion_matrix(train_labels_adjusted, train_predictions_adjusted)
print(f"Confusion Matrix:\n{conf_matrix}")

# Load the test data without labels (again closing the archive once the array is read)
with np.load("./data/split/Pca_testSet.npz") as test_data:
    test_features = test_data["data"]  # Features only

# Ensure feature count matches the model's expected input
if test_features.shape[1] != train_embeddings.shape[1]:
    raise ValueError("Feature count mismatch between training and test data.")

# Predict using the Voting Classifier model
test_predictions = voting_model.predict(test_features)
test_predictions += 1  # Adjust predictions to original labeling

# Combine test features with predictions (prediction becomes the last column)
test_results = np.hstack((test_features, test_predictions.reshape(-1, 1)))

# Save predictions and embeddings to a CSV file
np.savetxt("voting_test_predictions_with_embeddings.csv", test_results, delimiter=",", fmt="%.6f")
print("Predictions and embeddings for the test set have been saved to 'voting_test_predictions_with_embeddings.csv'.")


Training Set Accuracy: 0.9466115405356822
Training Set F1 Score: 0.9465628321096534
Training Set Classification Report:
              precision    recall  f1-score   support

           1       0.93      0.97      0.95      2134
           2       0.96      0.94      0.95      1817
           3       0.96      0.92      0.94      1612

    accuracy                           0.95      5563
   macro avg       0.95      0.94      0.95      5563
weighted avg       0.95      0.95      0.95      5563

Confusion Matrix:
[[2070   36   28]
 [  76 1710   31]
 [  82   44 1486]]
Predictions and embeddings for the test set have been saved to 'voting_test_predictions_with_embeddings.csv'.
