In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score
import numpy as np

# Load the dataset from the local parquet file.
data = pd.read_parquet(r'C:\Users\antoi\OneDrive\Documents\IPSA\Aero_4\Ma412\project\train-00000-of-00001-b21313e511aa601a.parquet')


def _as_label_list(value):
    """Return *value* as a plain Python list of labels.

    NumPy arrays are converted with ``tolist()``, lists are returned as-is,
    and anything else (e.g. a missing value) becomes an empty list.
    """
    if isinstance(value, np.ndarray):
        return value.tolist()
    if isinstance(value, list):
        # A plain list has no ``tolist()`` method, so it must not go through
        # the ndarray branch.
        return value
    return []


# Preprocess abstracts and labels.
data['abstract'] = data['abstract'].fillna('')  # Replace missing abstracts with empty strings
data['verified_uat_labels'] = data['verified_uat_labels'].apply(_as_label_list)

# Collect the full set of labels that occur in the data.
# The preprocessing step turns every entry of the column into a plain list,
# so each one can be merged into the set directly.
all_labels = set()

for labels in data['verified_uat_labels']:
    all_labels.update(labels)

all_possible_labels = sorted(all_labels)

# MultiLabelBinarizer with the complete, explicitly ordered list of labels.
mlb = MultiLabelBinarizer(classes=all_possible_labels)
# fit() expects an iterable of label collections (one per sample), not a flat
# list of label strings, so the label list is wrapped as a single sample.
mlb.fit([all_possible_labels])

# Encode the labels of every row as a binary indicator matrix.
Y = mlb.transform(data['verified_uat_labels'])

# Hold out 20% of the samples for testing; the seed is fixed so the split is
# the same on every run.
X_train_raw, X_test_raw, Y_train, Y_test = train_test_split(
    data['abstract'],
    Y,
    test_size=0.2,
    random_state=42,
)

# TF-IDF features with sublinear term-frequency scaling. The vocabulary is
# learned on the training abstracts only and then reused for the test set.
vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
    sublinear_tf=True,
)
X_train = vectorizer.fit_transform(X_train_raw)
X_test = vectorizer.transform(X_test_raw)

# Compare several model configurations on the same train/test split.
def test_model_architectures(models=None):
    """Train and evaluate a set of one-vs-rest classifiers.

    Each estimator is wrapped in ``OneVsRestClassifier``, fitted on the
    module-level ``X_train`` / ``Y_train`` and evaluated on ``X_test`` /
    ``Y_test``. A per-label classification report is printed for every model.

    Args:
        models: Optional mapping of display name to an unfitted scikit-learn
            estimator. When ``None`` (the default), a built-in grid of
            logistic regression, linear SVM and multinomial naive Bayes
            configurations is used.

    Returns:
        A dict mapping each model name to a dict with the keys
        ``"accuracy"``, ``"f1_micro"`` and ``"f1_macro"``.
    """
    if models is None:
        models = {
            "Logistic Regression (default)": LogisticRegression(max_iter=2000, class_weight='balanced'),
            "Logistic Regression (C=0.5)": LogisticRegression(max_iter=2000, class_weight='balanced', C=0.5),
            "Logistic Regression (C=2.0)": LogisticRegression(max_iter=2000, class_weight='balanced', C=2.0),
            "Support Vector Machine (default)": LinearSVC(class_weight='balanced', max_iter=2000),
            "Support Vector Machine (C=0.5)": LinearSVC(class_weight='balanced', max_iter=2000, C=0.5),
            "Support Vector Machine (C=2.0)": LinearSVC(class_weight='balanced', max_iter=2000, C=2.0),
            "Naive Bayes (default)": MultinomialNB(),
            "Naive Bayes (alpha=0.5)": MultinomialNB(alpha=0.5),
            "Naive Bayes (alpha=1.5)": MultinomialNB(alpha=1.5),
        }

    results = {}

    for name, model in models.items():
        print(f"\n=== Testing {name} ===")
        # Train one binary classifier per label.
        classifier = OneVsRestClassifier(model)
        classifier.fit(X_train, Y_train)

        # Evaluate on the held-out set. Labels that are never predicted have
        # an undefined precision/F1; zero_division=0 scores them as 0 (the
        # same value the default uses) without emitting a warning per label.
        Y_pred = classifier.predict(X_test)
        # With multilabel indicator targets, accuracy_score is the subset
        # accuracy: a sample counts only if all of its labels match.
        accuracy = accuracy_score(Y_test, Y_pred)
        f1_micro = f1_score(Y_test, Y_pred, average='micro', zero_division=0)
        f1_macro = f1_score(Y_test, Y_pred, average='macro', zero_division=0)

        print("Rapport de classification :")
        print(classification_report(Y_test, Y_pred, target_names=mlb.classes_, zero_division=0))

        results[name] = {
            "accuracy": accuracy,
            "f1_micro": f1_micro,
            "f1_macro": f1_macro,
        }

    return results

# Run the comparison over all model configurations.
results = test_model_architectures()

# Print a summary: one section per model, one line per metric (4 decimals).
print("\n=== Résultats des architectures de modèles ===")
for method in results:
    metrics = results[method]
    print(f"\nModèle : {method}")
    for metric in metrics:
        value = metrics[metric]
        print(f"{metric.capitalize()} : {value:.4f}")

NameError: name 'df_train' is not defined