In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Nouvelle section

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.preprocessing import label_binarize
from sklearn.metrics import accuracy_score, roc_auc_score

In [4]:
# Load preprocessed features
df_tfidf = pd.read_pickle('/content/drive/My Drive/Colab/AS4/STEP2-feature_Engineering/features_tfidf.pkl')
df_bow = pd.read_pickle('/content/drive/My Drive/Colab/AS4/STEP2-feature_Engineering/features_bow.pkl')
df_ngram = pd.read_pickle('/content/drive/My Drive/Colab/AS4/STEP2-feature_Engineering/features_ngram2.pkl')
df_lda = pd.read_pickle('/content/drive/My Drive/Colab/AS4/STEP2-feature_Engineering/features_lda.pkl')

# Prepare data
labels = df_tfidf['label']
numeric_labels = df_tfidf['label_num'].values
classes = np.unique(numeric_labels)
y_binarized = label_binarize(numeric_labels, classes=classes)

# Feature matrices (drop label columns)
X_tfidf = df_tfidf.drop(['label', 'label_num'], axis=1).values
X_bow = df_bow.drop(['label', 'label_num'], axis=1).values
X_ngram = df_ngram.drop(['label', 'label_num'], axis=1).values
X_lda = df_lda.drop(['label', 'label_num'], axis=1).values

# Combine features function
def combine_features(*arrays):
    return np.hstack(arrays)

# Dictionary of feature sets (include individual and combinations)
feature_sets = {
    'TF-IDF': X_tfidf,
    'BoW': X_bow,
    'N-grams': X_ngram,
    'LDA': X_lda,
    'TF-IDF + BoW': combine_features(X_tfidf, X_bow),
    'TF-IDF + N-grams': combine_features(X_tfidf, X_ngram),
    'TF-IDF + LDA': combine_features(X_tfidf, X_lda),
    'BoW + N-grams': combine_features(X_bow, X_ngram),
    'BoW + LDA': combine_features(X_bow, X_lda),
    'N-grams + LDA': combine_features(X_ngram, X_lda),
    'TF-IDF + BoW + N-grams': combine_features(X_tfidf, X_bow, X_ngram),
    'TF-IDF + BoW + LDA': combine_features(X_tfidf, X_bow, X_lda),
    'TF-IDF + N-grams + LDA': combine_features(X_tfidf, X_ngram, X_lda),
    'BoW + N-grams + LDA': combine_features(X_bow, X_ngram, X_lda),
    'All Combined': combine_features(X_tfidf, X_bow, X_ngram, X_lda),
}
y = labels

In [None]:
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
results = {}

for name, X in feature_sets.items():
    accuracies = []
    roc_aucs = []

    print(f"\nTraining and evaluating with feature set: {name}")
    for fold, (train_index, test_index) in enumerate(kf.split(X, numeric_labels), 1):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = numeric_labels[train_index], numeric_labels[test_index]
        y_test_binarized = y_binarized[test_index]

        # SVM (Linear Kernel, with probability enabled for ROC-AUC)
        model = SVC(kernel='linear', probability=True, random_state=42)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        y_proba = model.predict_proba(X_test)

        acc = accuracy_score(y_test, y_pred)

        # Handle binary/multiclass ROC-AUC
        if len(np.unique(numeric_labels)) == 2:
            roc_auc = roc_auc_score(y_test, y_proba[:, 1])
        else:
            roc_auc = roc_auc_score(y_test_binarized, y_proba, multi_class='ovr')

        accuracies.append(acc)
        roc_aucs.append(roc_auc)

        print(f"  Fold {fold}: Accuracy = {acc:.4f}, ROC-AUC = {roc_auc:.4f}")

    avg_acc = np.mean(accuracies)
    avg_roc = np.mean(roc_aucs)
    results[name] = (avg_acc, avg_roc)

    print(f"Average Accuracy: {avg_acc:.4f}, Average ROC-AUC: {avg_roc:.4f}\n")

# Summary Table
print("\nSummary of Results:")
print(f"{'Feature Set':<30} {'Accuracy':<10} {'ROC-AUC':<10}")
for key, (acc, roc) in results.items():
    print(f"{key:<30} {acc:<10.4f} {roc:<10.4f}")

# Best Feature Set
best_feature_set = max(results.items(), key=lambda x: x[1][0])
best_name, (best_acc, best_roc) = best_feature_set

print("\n=== Best Feature Set ===")
print(f"Feature Set: {best_name}")
print(f"Accuracy: {best_acc:.4f}")
print(f"ROC-AUC: {best_roc:.4f}")


Training and evaluating with feature set: TF-IDF
  Fold 1: Accuracy = 0.9400, ROC-AUC = 0.9920
  Fold 2: Accuracy = 0.9700, ROC-AUC = 0.9901
  Fold 3: Accuracy = 0.9700, ROC-AUC = 0.9992
  Fold 4: Accuracy = 0.9500, ROC-AUC = 0.9956
  Fold 5: Accuracy = 0.9700, ROC-AUC = 0.9994
  Fold 6: Accuracy = 0.9600, ROC-AUC = 0.9974
  Fold 7: Accuracy = 0.9500, ROC-AUC = 0.9961
  Fold 8: Accuracy = 0.9100, ROC-AUC = 0.9945
  Fold 9: Accuracy = 0.9400, ROC-AUC = 0.9996
  Fold 10: Accuracy = 0.9300, ROC-AUC = 0.9940
Average Accuracy: 0.9490, Average ROC-AUC: 0.9958


Training and evaluating with feature set: BoW
  Fold 1: Accuracy = 0.9200, ROC-AUC = 0.9880
  Fold 2: Accuracy = 0.9500, ROC-AUC = 0.9901
  Fold 3: Accuracy = 0.9500, ROC-AUC = 0.9979
  Fold 4: Accuracy = 0.9500, ROC-AUC = 0.9939
  Fold 5: Accuracy = 0.9500, ROC-AUC = 0.9969
  Fold 6: Accuracy = 0.9600, ROC-AUC = 0.9949
  Fold 7: Accuracy = 0.9100, ROC-AUC = 0.9949
  Fold 8: Accuracy = 0.9200, ROC-AUC = 0.9929
  Fold 9: Accuracy = 0.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.metrics import (confusion_matrix, classification_report,
                             roc_auc_score, roc_curve, auc)
from sklearn.preprocessing import label_binarize
from itertools import cycle
#  from scipy import interp

from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.preprocessing import label_binarize
from sklearn.metrics import accuracy_score, roc_auc_score

# CHANGE THIS ROW TO THE BEST DATASET
df_tfidf = pd.read_pickle('../Step2_Feature_Engineering/tfidf.pkl')

# Prepare data
labels = df_tfidf['label']

# Feature matrices (drop label columns)
X_tfidf = df_tfidf.drop(['label', 'label_num'], axis=1).values
X = X_tfidf

y = labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42, stratify=y)

y_test_bin = label_binarize(y_test, classes=[0, 1, 2, 3, 4])

model = SVC(kernel='linear', probability=True, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

roc_auc = roc_auc_score(y_test_bin, y_proba, multi_class='ovr')
print(f"\nROC-AUC Score (One-vs-Rest): {roc_auc:.4f}")

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(cm)


Classification Report:
                                           precision    recall  f1-score   support

                biological image analysis       0.97      0.90      0.94        40
               disease outcome prediction       0.95      1.00      0.98        40
                 gene expression analysis       0.93      0.93      0.93        40
protein structure and function prediction       0.89      0.97      0.93        40
    sequence classification and alignment       0.89      0.82      0.86        40

                                 accuracy                           0.93       200
                                macro avg       0.93      0.93      0.92       200
                             weighted avg       0.93      0.93      0.92       200


ROC-AUC Score (One-vs-Rest): nan

Confusion Matrix:
[[36  0  1  0  3]
 [ 0 40  0  0  0]
 [ 1  1 37  0  1]
 [ 0  1  0 39  0]
 [ 0  0  2  5 33]]


