In [None]:
# Mount Google Drive into the Colab runtime at /content/drive; the feature
# files loaded later in this notebook are read from beneath that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.metrics import (confusion_matrix, classification_report,
                             roc_auc_score, roc_curve, auc)
from sklearn.preprocessing import label_binarize
from itertools import cycle
#  from scipy import interp

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score
import numpy as np



In [None]:
DPPpath = '/content/drive/My Drive/Colab/AS4/STEP2-feature_Engineering/'
# Load the pickled feature tables (one DataFrame per text representation).
# NOTE(review): unpickling can execute arbitrary code, so only load files
# that were produced by a trusted step of this project.
df_tfidf = pd.read_pickle(DPPpath + 'features_tfidf.pkl')
df_bow = pd.read_pickle(DPPpath + 'features_bow.pkl')
df_ngram = pd.read_pickle(DPPpath + 'features_ngram2.pkl')
df_lda = pd.read_pickle(DPPpath + 'features_lda.pkl')

# Prepare data. Labels are taken from the TF-IDF table only.
labels = df_tfidf['label']
numeric_labels = df_tfidf['label_num'].values
classes = np.unique(numeric_labels)
# One indicator column per class (label_binarize is imported in the imports cell).
y_binarized = label_binarize(numeric_labels, classes=classes)

# The feature matrices are later concatenated row by row and scored against
# the TF-IDF labels, so every table must list the same samples in the same
# order. Fail loudly here instead of silently producing meaningless scores.
misaligned_tables = [
    table_name
    for table_name, table in (('BoW', df_bow), ('N-grams', df_ngram), ('LDA', df_lda))
    if not np.array_equal(table['label_num'].values, numeric_labels)
]
if misaligned_tables:
    raise ValueError(
        "Feature tables are not row-aligned with the TF-IDF table "
        f"(label_num differs): {', '.join(misaligned_tables)}"
    )

# Feature matrices (drop label columns)
X_tfidf = df_tfidf.drop(['label', 'label_num'], axis=1).values
X_bow = df_bow.drop(['label', 'label_num'], axis=1).values
X_ngram = df_ngram.drop(['label', 'label_num'], axis=1).values
X_lda = df_lda.drop(['label', 'label_num'], axis=1).values

# Combine features function
def combine_features(*arrays):
    """Stack the given arrays horizontally with ``np.hstack``.

    Parameters
    ----------
    *arrays : array-like
        Arrays to join, in the order given. For 2-D inputs the columns of
        each array are appended after the columns of the previous one.

    Returns
    -------
    numpy.ndarray
        The horizontally stacked result.
    """
    stacked = np.hstack(arrays)
    return stacked



In [None]:

# Candidate feature sets keyed by display name: each single representation,
# then every two-way, three-way and four-way concatenation of them.
# Single-representation entries reuse the matrix as is; the rest are joined
# with combine_features in the order listed.
feature_sets = {
    set_name: parts[0] if len(parts) == 1 else combine_features(*parts)
    for set_name, parts in (
        ('TF-IDF', (X_tfidf,)),
        ('BoW', (X_bow,)),
        ('N-grams', (X_ngram,)),
        ('LDA', (X_lda,)),
        ('TF-IDF + BoW', (X_tfidf, X_bow)),
        ('TF-IDF + N-grams', (X_tfidf, X_ngram)),
        ('TF-IDF + LDA', (X_tfidf, X_lda)),
        ('BoW + N-grams', (X_bow, X_ngram)),
        ('BoW + LDA', (X_bow, X_lda)),
        ('N-grams + LDA', (X_ngram, X_lda)),
        ('TF-IDF + BoW + N-grams', (X_tfidf, X_bow, X_ngram)),
        ('TF-IDF + BoW + LDA', (X_tfidf, X_bow, X_lda)),
        ('TF-IDF + N-grams + LDA', (X_tfidf, X_ngram, X_lda)),
        ('BoW + N-grams + LDA', (X_bow, X_ngram, X_lda)),
        ('All Combined', (X_tfidf, X_bow, X_ngram, X_lda)),
    )
}
y = labels

In [None]:
# Evaluate a fixed-configuration KNN classifier on every feature set with
# 10-fold stratified cross-validation, then print a summary table and the
# feature set with the highest mean accuracy.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
results = {}  # feature-set name -> (mean accuracy, mean ROC-AUC)

# The hyperparameters are the same for every fold and feature set, so the
# dict is built once instead of inside the inner loop.
# NOTE(review): scikit-learn documents 'p' as the power parameter of the
# Minkowski metric, so it presumably has no effect with metric='cosine' —
# confirm before relying on it.
params = {'metric': 'cosine', 'n_neighbors': 11, 'p': 1, 'weights': 'uniform'}

# The full label vector never changes, so decide once which ROC-AUC call
# (binary vs. multiclass) applies rather than recomputing np.unique per fold.
is_binary = len(np.unique(numeric_labels)) == 2

for name, X in feature_sets.items():
    accuracies = []
    roc_aucs = []

    print(f"\nTraining and evaluating with feature set: {name}")
    for fold, (train_index, test_index) in enumerate(kf.split(X, numeric_labels), 1):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = numeric_labels[train_index], numeric_labels[test_index]
        y_test_binarized = y_binarized[test_index]

        # A fresh classifier per fold so no state carries over between folds.
        fin_knn = KNeighborsClassifier(**params)
        fin_knn.fit(X_train, y_train)

        y_pred = fin_knn.predict(X_test)
        y_proba = fin_knn.predict_proba(X_test)

        acc = accuracy_score(y_test, y_pred)

        # Handle binary/multiclass ROC-AUC
        if is_binary:
            # Score with the second predict_proba column.
            roc_auc = roc_auc_score(y_test, y_proba[:, 1])
        else:
            # Score the indicator matrix against the per-class probabilities.
            roc_auc = roc_auc_score(y_test_binarized, y_proba, multi_class='ovr')

        accuracies.append(acc)
        roc_aucs.append(roc_auc)

        print(f"  Fold {fold}: Accuracy = {acc:.4f}, ROC-AUC = {roc_auc:.4f}")

    avg_acc = np.mean(accuracies)
    avg_roc = np.mean(roc_aucs)
    results[name] = (avg_acc, avg_roc)

    print(f"Average Accuracy: {avg_acc:.4f}, Average ROC-AUC: {avg_roc:.4f}\n")

# Summary Table
print("\nSummary of Results:")
print(f"{'Feature Set':<30} {'Accuracy':<10} {'ROC-AUC':<10}")
for key, (acc, roc) in results.items():
    print(f"{key:<30} {acc:<10.4f} {roc:<10.4f}")

# Best Feature Set: ranked by mean accuracy (element 0 of each result tuple).
best_feature_set = max(results.items(), key=lambda x: x[1][0])
best_name, (best_acc, best_roc) = best_feature_set

print("\n=== Best Feature Set ===")
print(f"Feature Set: {best_name}")
print(f"Accuracy: {best_acc:.4f}")
print(f"ROC-AUC: {best_roc:.4f}")


Training and evaluating with feature set: TF-IDF
  Fold 1: Accuracy = 0.8400, ROC-AUC = 0.9752
  Fold 2: Accuracy = 0.8300, ROC-AUC = 0.9841
  Fold 3: Accuracy = 0.8900, ROC-AUC = 0.9868
  Fold 4: Accuracy = 0.8700, ROC-AUC = 0.9767
  Fold 5: Accuracy = 0.8900, ROC-AUC = 0.9833
  Fold 6: Accuracy = 0.8200, ROC-AUC = 0.9839
  Fold 7: Accuracy = 0.8000, ROC-AUC = 0.9759
  Fold 8: Accuracy = 0.8300, ROC-AUC = 0.9656
  Fold 9: Accuracy = 0.8300, ROC-AUC = 0.9756
  Fold 10: Accuracy = 0.8200, ROC-AUC = 0.9727
Average Accuracy: 0.8420, Average ROC-AUC: 0.9780


Training and evaluating with feature set: BoW
  Fold 1: Accuracy = 0.7600, ROC-AUC = 0.9228
  Fold 2: Accuracy = 0.7800, ROC-AUC = 0.9542
  Fold 3: Accuracy = 0.9100, ROC-AUC = 0.9748
  Fold 4: Accuracy = 0.7300, ROC-AUC = 0.9185
  Fold 5: Accuracy = 0.7500, ROC-AUC = 0.9420
  Fold 6: Accuracy = 0.7800, ROC-AUC = 0.9464
  Fold 7: Accuracy = 0.8100, ROC-AUC = 0.9690
  Fold 8: Accuracy = 0.7300, ROC-AUC = 0.9271
  Fold 9: Accuracy = 0.