In [1]:
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer

# NOTE(review): this import failed at run time (ModuleNotFoundError) and PCA is
# not used in the visible cell -- confirm the module is available and still needed.
from pca_implementation import PCA


# Load the training and validation splits from their local parquet files.
train_path = r'C:\Users\antoi\OneDrive\Documents\IPSA\Aero_4\Ma412\project\train-00000-of-00001-b21313e511aa601a.parquet'
val_path = r'C:\Users\antoi\OneDrive\Documents\IPSA\Aero_4\Ma412\project\val-00000-of-00001-66ce8665444026dc.parquet'

df_train = pd.read_parquet(train_path)
df = pd.read_parquet(val_path)

# Sanity check: print the shape of the training frame.
print(df_train.shape)
# Collect every distinct label found in the training set.
all_labels = set()

for labels in df_train['verified_uat_labels']:
    # Only lists and NumPy arrays are accepted; anything else is reported and skipped.
    if not isinstance(labels, (list, np.ndarray)):
        print(f"Format inattendu pour les labels : {labels}")
        continue
    # NumPy arrays are converted to plain Python lists before being merged in.
    all_labels.update(labels if isinstance(labels, list) else labels.tolist())

# Show the distinct labels in sorted order, preceded by their count.
unique_labels = sorted(all_labels)
print(f"Nombre de labels uniques : {len(unique_labels)}")
print(unique_labels)

# Pre-process the abstracts and the labels.
# Missing abstracts are replaced by empty strings.
df_train['abstract'] = df_train['abstract'].fillna('')


def _as_label_list(value):
    """Return *value* as a plain Python list of labels.

    NumPy arrays are converted with ``tolist()``, lists are returned
    unchanged (they have no ``tolist`` method), and any other value
    becomes an empty list.
    """
    if isinstance(value, np.ndarray):
        return value.tolist()
    if isinstance(value, list):
        return value
    return []


df_train['verified_uat_labels'] = df_train['verified_uat_labels'].apply(_as_label_list)



# MultiLabelBinarizer configured with the complete, sorted list of known labels.
mlb = MultiLabelBinarizer(classes=unique_labels)
# fit() expects an iterable of label collections, so the label list is wrapped
# in an outer list rather than passed as a bare sequence of strings.
mlb.fit([unique_labels])

# Encode the label list of every training row into the indicator matrix Y.
Y = mlb.transform(df_train['verified_uat_labels'])

# Vectorize the abstracts into three alternative feature matrices.
abstracts = df_train['abstract']

# Case 1: term counts, English stop words removed, at most 5000 features.
vectorizer1 = CountVectorizer(max_features=5000, stop_words='english')
X1 = vectorizer1.fit_transform(abstracts)

# Case 2: TF-IDF weights, at most 5000 features.
vectorizer2 = TfidfVectorizer(max_features=5000)
X2 = vectorizer2.fit_transform(abstracts)

# Case 3: TF-IDF weights, at most 5000 features, with additional document
# frequency bounds (max_df=0.8, min_df=0.1) on the retained terms.
vectorizer3 = TfidfVectorizer(max_features=5000, max_df=0.8, min_df=0.1)
X3 = vectorizer3.fit_transform(abstracts)

# Split the data into training and test sets (80% / 20%, fixed seed).
# All three feature matrices and the label matrix go through ONE call so that
# every case is guaranteed to share the same train/test rows, instead of
# relying on three separate calls that each overwrite Y_train / Y_test.
(
    X_train1, X_test1,  # Case 1
    X_train2, X_test2,  # Case 2
    X_train3, X_test3,  # Case 3
    Y_train, Y_test,
) = train_test_split(X1, X2, X3, Y, test_size=0.2, random_state=42)

# Drop the label columns that are always present.
# NOTE(review): `always_present_labels` was not defined anywhere in the visible
# notebook (NameError). It is derived here as the labels set to 1 on every
# training row -- confirm this matches the intended definition.
always_present_mask = np.asarray(Y_train).all(axis=0)
always_present_labels = set(mlb.classes_[always_present_mask])

indices_to_keep = [i for i, label in enumerate(mlb.classes_) if label not in always_present_labels]
Y_train = Y_train[:, indices_to_keep]
Y_test = Y_test[:, indices_to_keep]
# Keep the label names aligned with the remaining columns; they are used as
# report headers. NOTE(review): after this reassignment `mlb` no longer matches
# the full label set it was fitted with, so avoid reusing it for transform().
mlb.classes_ = mlb.classes_[indices_to_keep]

# Train a logistic-regression model on each feature set and evaluate it.

# Probability above which a label is predicted as present (30%).
PREDICTION_THRESHOLD = 0.3


def _fit_and_report(X_train, X_test, Y_train, Y_test, label_names,
                    threshold=PREDICTION_THRESHOLD):
    """Fit a one-vs-rest logistic regression and print its test-set metrics.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        Y_train: Training label indicator matrix.
        Y_test: Test label indicator matrix.
        label_names: Label names used as row headers in the report.
        threshold: Predicted probability above which a label is set to 1.

    Returns:
        Tuple ``(classifier, Y_pred, accuracy, f1_micro, f1_macro)``.
    """
    classifier = OneVsRestClassifier(
        LogisticRegression(max_iter=2000, class_weight='balanced')
    )
    classifier.fit(X_train, Y_train)
    Y_pred = (classifier.predict_proba(X_test) > threshold).astype(int)

    print("Rapport de classification :")
    print(classification_report(Y_test, Y_pred, target_names=label_names))

    # For multilabel targets, scikit-learn's accuracy_score is the subset
    # accuracy: a sample counts as correct only if all its labels match.
    accuracy = accuracy_score(Y_test, Y_pred)
    f1_micro = f1_score(Y_test, Y_pred, average='micro')
    f1_macro = f1_score(Y_test, Y_pred, average='macro')

    print(f"Précision globale (Accuracy) : {accuracy:.4f}")
    print(f"F1-score (micro) : {f1_micro:.4f}")
    print(f"F1-score (macro) : {f1_macro:.4f}")

    return classifier, Y_pred, accuracy, f1_micro, f1_macro


# Case 1
classifier1, Y_pred1, accuracy1, f1_micro1, f1_macro1 = _fit_and_report(
    X_train1, X_test1, Y_train, Y_test, mlb.classes_
)

# Case 2 (its report is now built from Y_pred2, not from Case 1's predictions)
classifier2, Y_pred2, accuracy2, f1_micro2, f1_macro2 = _fit_and_report(
    X_train2, X_test2, Y_train, Y_test, mlb.classes_
)

# Case 3
classifier3, Y_pred3, accuracy3, f1_micro3, f1_macro3 = _fit_and_report(
    X_train3, X_test3, Y_train, Y_test, mlb.classes_
)

ModuleNotFoundError: No module named 'pca_implementation'