In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import label_binarize
import xgboost as xgb

# Load preprocessed features.
# NOTE(security): read_pickle unpickles arbitrary objects, so only load files
# produced by a trusted pipeline.
df_tfidf = pd.read_pickle('tfidf.pkl')
df_bow = pd.read_pickle('bow.pkl')
df_ngram = pd.read_pickle('ngram.pkl')
df_lda = pd.read_pickle('lda.pkl')

# Prepare data: the TF-IDF frame is the single source of labels for every
# feature matrix built below.
labels = df_tfidf['label']
numeric_labels = df_tfidf['label_num'].values
classes = np.unique(numeric_labels)
y_binarized = label_binarize(numeric_labels, classes=classes)

# The other frames' label columns are dropped below and never consulted again,
# so verify here that each frame is row-aligned with the TF-IDF labels.
# Otherwise features would be scored against the wrong targets without error.
_misaligned = [
    fname
    for fname, frame in (('bow.pkl', df_bow), ('ngram.pkl', df_ngram), ('lda.pkl', df_lda))
    if not np.array_equal(frame['label_num'].values, numeric_labels)
]
if _misaligned:
    raise ValueError(
        "label_num in " + ", ".join(_misaligned)
        + " does not match tfidf.pkl row-for-row; the feature matrices cannot "
        "be combined or scored against the TF-IDF labels"
    )

# Feature matrices (drop label columns)
X_tfidf = df_tfidf.drop(['label', 'label_num'], axis=1).values
X_bow = df_bow.drop(['label', 'label_num'], axis=1).values
X_ngram = df_ngram.drop(['label', 'label_num'], axis=1).values
X_lda = df_lda.drop(['label', 'label_num'], axis=1).values

def combine_features(*arrays):
    """Stack the given feature arrays side by side with ``np.hstack``.

    Args:
        *arrays: Feature arrays to join, passed positionally.

    Returns:
        The array produced by ``np.hstack`` over the inputs, in the order given.
    """
    stacked = np.hstack(arrays)
    return stacked

# Feature sets to evaluate: each single representation, every pair and triple,
# and finally all four stacked together.
_single_sets = {
    'TF-IDF': X_tfidf,
    'BoW': X_bow,
    'N-grams': X_ngram,
    'LDA': X_lda,
}

# Combinations listed explicitly so the dictionary's insertion order is fixed.
_combo_names = [
    ('TF-IDF', 'BoW'),
    ('TF-IDF', 'N-grams'),
    ('TF-IDF', 'LDA'),
    ('BoW', 'N-grams'),
    ('BoW', 'LDA'),
    ('N-grams', 'LDA'),
    ('TF-IDF', 'BoW', 'N-grams'),
    ('TF-IDF', 'BoW', 'LDA'),
    ('TF-IDF', 'N-grams', 'LDA'),
    ('BoW', 'N-grams', 'LDA'),
]

feature_sets = dict(_single_sets)
for _combo in _combo_names:
    # Key is the member names joined by " + "; value is their horizontal stack.
    feature_sets[' + '.join(_combo)] = combine_features(
        *(_single_sets[_part] for _part in _combo)
    )
feature_sets['All Combined'] = combine_features(*_single_sets.values())

# Stratified, shuffled 10-fold splitter with a fixed seed (random_state=42).
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Maps feature-set name -> (mean accuracy, mean ROC-AUC) across the folds.
results = {}

# NOTE(review): the feature matrices are loaded pre-computed; if the
# vectorizers / topic model were fit on all rows before this split, the fold
# scores may be optimistic. Confirm against the preprocessing notebook.
for name, X in feature_sets.items():
    print(f"Training and evaluating with feature set: {name}")

    accuracies, roc_aucs = [], []
    for fold, (train_index, test_index) in enumerate(kf.split(X, numeric_labels), start=1):
        # Slice features and targets for this fold.
        X_train = X[train_index]
        X_test = X[test_index]
        y_train = numeric_labels[train_index]
        y_test = numeric_labels[test_index]
        y_test_binarized = y_binarized[test_index]

        # A fresh classifier per fold so no state carries over between folds.
        model = xgb.XGBClassifier(eval_metric='mlogloss', random_state=42)
        model.fit(X_train, y_train)

        # Hard predictions feed accuracy; class probabilities feed ROC-AUC.
        y_pred = model.predict(X_test)
        y_proba = model.predict_proba(X_test)

        acc = accuracy_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test_binarized, y_proba, multi_class='ovr')
        accuracies.append(acc)
        roc_aucs.append(roc_auc)

        print(f"  Fold {fold}: Accuracy = {acc:.4f}, ROC-AUC = {roc_auc:.4f}")

    # Collapse the per-fold scores into one mean pair for this feature set.
    avg_acc, avg_roc = np.mean(accuracies), np.mean(roc_aucs)
    results[name] = (avg_acc, avg_roc)
    print(f"Average Accuracy: {avg_acc:.4f}, Average ROC-AUC: {avg_roc:.4f}\n")

# Print one table row per feature set with its mean accuracy and mean ROC-AUC.
print("\nSummary of Results:")
print(f"{'Feature Set':<30} {'Accuracy':<10} {'ROC-AUC':<10}")
for key in results:
    acc, roc = results[key]
    print(f"{key:<30} {acc:<10.4f} {roc:<10.4f}")

# Pick the feature set with the highest mean accuracy; on a tie, max() keeps
# the entry that was inserted first.
best_name = max(results, key=lambda candidate: results[candidate][0])
best_feature_set = (best_name, results[best_name])
best_acc, best_roc = best_feature_set[1]

print(
    "\n=== Best Feature Set ===",
    f"Feature Set: {best_name}",
    f"Accuracy: {best_acc:.4f}",
    f"ROC-AUC: {best_roc:.4f}",
    sep="\n",
)


Training and evaluating with feature set: TF-IDF
  Fold 1: Accuracy = 0.8600, ROC-AUC = 0.9770
  Fold 2: Accuracy = 0.9200, ROC-AUC = 0.9897
  Fold 3: Accuracy = 0.9700, ROC-AUC = 0.9990
  Fold 4: Accuracy = 0.8800, ROC-AUC = 0.9821
  Fold 5: Accuracy = 0.8900, ROC-AUC = 0.9851
  Fold 6: Accuracy = 0.8700, ROC-AUC = 0.9910
  Fold 7: Accuracy = 0.9100, ROC-AUC = 0.9887
  Fold 8: Accuracy = 0.9000, ROC-AUC = 0.9835
  Fold 9: Accuracy = 0.8700, ROC-AUC = 0.9840
  Fold 10: Accuracy = 0.8300, ROC-AUC = 0.9744
Average Accuracy: 0.8900, Average ROC-AUC: 0.9855

Training and evaluating with feature set: BoW
  Fold 1: Accuracy = 0.8900, ROC-AUC = 0.9829
  Fold 2: Accuracy = 0.9100, ROC-AUC = 0.9945
  Fold 3: Accuracy = 0.9600, ROC-AUC = 0.9990
  Fold 4: Accuracy = 0.9200, ROC-AUC = 0.9890
  Fold 5: Accuracy = 0.8900, ROC-AUC = 0.9896
  Fold 6: Accuracy = 0.9200, ROC-AUC = 0.9949
  Fold 7: Accuracy = 0.9300, ROC-AUC = 0.9875
  Fold 8: Accuracy = 0.9100, ROC-AUC = 0.9834
  Fold 9: Accuracy = 0.89

In [17]:
# Save results to a file for later comparison
import pickle
import os

# Persist the per-feature-set (mean accuracy, mean ROC-AUC) mapping to disk.
xgb_file = "xgboost_results.pkl"
try:
    with open(xgb_file, "wb") as f:
        pickle.dump(results, f)
except Exception:
    # Any open/serialisation error lands here. An existence check after the
    # write could never report failure: the error would already have
    # propagated, and a stale file from an earlier run would pass the check.
    print("Failed to save XGBoost results.")
    raise
else:
    # Reached only when the dump completed and the file was closed cleanly.
    print(f"XGBoost results successfully saved to '{xgb_file}'.")


XGBoost results successfully saved to 'xgboost_results.pkl'.
