In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score
from scipy.sparse import csr_matrix
import numpy as np

# Load the dataset from the local parquet file.
data = pd.read_parquet(r'C:\Users\antoi\OneDrive\Documents\IPSA\Aero_4\Ma412\project\train-00000-of-00001-b21313e511aa601a.parquet')


def _as_label_list(value):
    """Return `value` as a plain Python list of labels.

    NumPy arrays are converted with `.tolist()`, lists are copied, and any
    other value (e.g. a missing entry) yields an empty list.
    """
    if isinstance(value, np.ndarray):
        return value.tolist()
    if isinstance(value, list):
        # Lists have no `.tolist()` method, so they must not share the
        # ndarray branch.
        return list(value)
    return []


# Preprocess abstracts and labels.
# Replace missing abstracts with empty strings.
data['abstract'] = data['abstract'].fillna('')
# Normalise every label entry to a plain list.
data['verified_uat_labels'] = data['verified_uat_labels'].apply(_as_label_list)

# Collect every label that occurs anywhere in the dataset.
all_labels = set()

for labels in data['verified_uat_labels']:
    if isinstance(labels, np.ndarray):
        all_labels.update(labels.tolist())
    elif isinstance(labels, list):
        all_labels.update(labels)
    else:
        # Report entries that are neither a list nor an array instead of
        # failing on them.
        print(f"Format inattendu pour les labels : {labels}")

# Sorted so the column order of the indicator matrix is deterministic.
all_possible_labels = sorted(all_labels)

# Binarizer with the column order pinned to the full label list.
mlb = MultiLabelBinarizer(classes=all_possible_labels)

# Fit on the per-sample label lists (an iterable of label collections, which is
# what the binarizer expects) and build the multi-label indicator matrix.
Y = mlb.fit_transform(data['verified_uat_labels'])

# Hold out 20% of the samples as a test set; the seed is fixed so the split
# is the same on every run.
X_train_raw, X_test_raw, Y_train, Y_test = train_test_split(
    data['abstract'],
    Y,
    test_size=0.2,
    random_state=42,
)

# TF-IDF features with sublinear term-frequency scaling. The vectorizer is
# fitted on the training abstracts only and then applied to the test abstracts.
vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    stop_words='english',
    max_features=5000,
)
X_train = vectorizer.fit_transform(X_train_raw)
X_test = vectorizer.transform(X_test_raw)

# Base estimators to compare, keyed by the display name used in the logs and
# as the key of the `predictions` dictionary.
models = {
    "Logistic Regression (C=2.0)": LogisticRegression(
        C=2.0,
        class_weight='balanced',
        max_iter=2000,
    ),
    "Support Vector Machine (C=0.5)": LinearSVC(
        C=0.5,
        class_weight='balanced',
        max_iter=2000,
    ),
    "Naive Bayes (alpha=0.5)": MultinomialNB(alpha=0.5),
}

# Wrap each estimator in a one-vs-rest classifier, train it on the training
# features and store its predictions for the test features.
predictions = {}
for name, model in models.items():
    print(f"\nTraining {name}...")
    classifier = OneVsRestClassifier(model).fit(X_train, Y_train)
    predictions[name] = classifier.predict(X_test)

# Combine the predictions of the three models.
print("\nCombining predictions...")

# A label is kept in the combined prediction as soon as at least one model
# predicts it: Logistic Regression is the base, then SVM and Naive Bayes add
# the labels still missing. On 0/1 indicator matrices this is the element-wise
# maximum. Every prediction is densified first (csr_matrix accepts dense or
# sparse input) so the combination never mixes dense and sparse operands and
# the result is a plain ndarray.
final_predictions = np.maximum.reduce([
    csr_matrix(predictions[model_name]).toarray()
    for model_name in (
        "Logistic Regression (C=2.0)",
        "Support Vector Machine (C=0.5)",
        "Naive Bayes (alpha=0.5)",
    )
])

# Score the combined predictions against the held-out labels.
accuracy = accuracy_score(Y_test, final_predictions)
f1_micro, f1_macro = (
    f1_score(Y_test, final_predictions, average=averaging)
    for averaging in ('micro', 'macro')
)

# Per-label report, with columns named after the binarizer's classes.
report = classification_report(
    Y_test,
    final_predictions,
    target_names=mlb.classes_,
)

print("\n=== Combined Model Results ===")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1-score (micro): {f1_micro:.4f}")
print(f"F1-score (macro): {f1_macro:.4f}")
print("Classification Report:")
print(report)


Training Logistic Regression (C=2.0)...


NameError: name 'X_train' is not defined