## Importation des bibliothèques

In [34]:
# Importation des bibliothèques
import numpy as np
import pandas as pd
import shap
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import mutual_info_classif, RFE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

## Importation de la base de données

In [35]:
# Importation de la base de données
data = pd.read_excel(r"../datasets/clinic_data.xlsx")
print("Base de données importée ✅✅")

Base de données importée ✅✅


## Préparation des données

In [36]:
num_col = data.select_dtypes(include=["int64","float64"]).columns.tolist()
cat_col = data.select_dtypes(include=["object"]).columns.difference(['Diagnostique','Traitement']).tolist()

num_transformer = Pipeline([
    ("impute", SimpleImputer(strategy="constant", fill_value=-1)),
    ("scaler", StandardScaler())
])

cat_transofmer = Pipeline([
    ("impute", SimpleImputer(strategy="constant", fill_value="Missing")),
    ("oneencoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_transformer, num_col),
        ("cat", cat_transofmer, cat_col)
    ]
)
data_transformed = preprocessor.fit_transform(data)

# Récuperér les colonnes encodées
cat_features = preprocessor.named_transformers_["cat"]["oneencoder"].get_feature_names_out(cat_col)
all_featues = num_col + cat_features.tolist()
data_transformed = pd.DataFrame(data_transformed, columns=all_featues)
print("Nouvelles base de données créée après prétraitement ✅✅")

# Définition des variables
x = data_transformed
y = data["Diagnostique"]
encoder = LabelEncoder()
y = encoder.fit_transform(y)

Nouvelles base de données créée après prétraitement ✅✅


## Application de Mutual Information

In [None]:
# Initialisation de Mutual Information
mutual_info = mutual_info_classif(x,y)

# Conversion en dataframe 
mutual = pd.DataFrame({'Feature': x.columns, "MI Scores": mutual_info})

# Tri des scores 
mutual_df = mutual.sort_values(by="MI Scores", ascending=False)

# Filtrage des features avec socre signficatif
mi_selected = mutual_df[mutual_df['MI Scores']>0.01]

# Affichage des résultats
print('Sortie des features pertinentes ✅✅')
print(mi_selected)

## Application de Random Forest Classifier

In [38]:
# Séparation en train/test
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Modèle de base
model = LogisticRegression()
model.fit(X_train, y_train)

# Sélection de features avec RFE
rfe_selector = RFE(model, n_features_to_select=15, step=1)
rfe_selector.fit(X_train, y_train)
rfe_features = X_train.columns[rfe_selector.support_]

print("✅ Features sélectionnées par RFE :", rfe_features)

✅ Features sélectionnées par RFE : Index(['Temperature', 'Pulse', 'BloodPressure', 'SpO2', 'RespiratoryRate',
       'BMI', 'FastingGlucose', 'Cholesterol', 'StressLevel',
       'Douleurs_musculaires_Absent', 'Douleurs_musculaires_Présent',
       'Fatigue_intense_Absent', 'Frissons_Absent', 'Perte_gout_odorat_Absent',
       'Toux_seche_Absent'],
      dtype='object')


## Application de SHAP

In [39]:
from sklearn.ensemble import RandomForestClassifier

# Séparation en train/test
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Modèle de base
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Analyse SHAP avec TreeExplainer
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

# Gestion des SHAP values (multiclasse ou non)
if isinstance(shap_values, list):
    shap_values_aggregated = np.mean([np.abs(sv) for sv in shap_values], axis=0)
else:
    shap_values_aggregated = np.abs(shap_values)

# Conversion explicite des noms de colonnes en array
columns_array = np.array(X_test.columns)
shap_importance = shap_values_aggregated.mean(axis=0)
shap_features = columns_array[np.argsort(-shap_importance)[:10]]  # Top 10 features SHAP

print("\nTop features selon SHAP :")
print(shap_features)



Top features selon SHAP :
[['Pulse' 'RespiratoryRate' 'Temperature' 'BloodPressure' 'SpO2']
 ['Temperature' 'BloodPressure' 'SpO2' 'RespiratoryRate' 'Pulse']
 ['SpO2' 'Pulse' 'Temperature' 'RespiratoryRate' 'BloodPressure']
 ['RespiratoryRate' 'BloodPressure' 'Pulse' 'Temperature' 'SpO2']
 ['Temperature' 'Pulse' 'BloodPressure' 'SpO2' 'RespiratoryRate']
 ['SpO2' 'Temperature' 'RespiratoryRate' 'BloodPressure' 'Pulse']
 ['SpO2' 'BloodPressure' 'RespiratoryRate' 'Temperature' 'Pulse']
 ['Temperature' 'Pulse' 'BloodPressure' 'SpO2' 'RespiratoryRate']
 ['SpO2' 'RespiratoryRate' 'BloodPressure' 'Temperature' 'Pulse']
 ['Temperature' 'SpO2' 'BloodPressure' 'Pulse' 'RespiratoryRate']]


In [40]:
# Listes des colonnes après RFE et SHAP
colonnes_pour_prediction = ["Diagnostique","Traitement","Temperature","Pulse","BloodPressure","SpO2","RespiratoryRate","BMI","FastingGlucose","Cholesterol","StressLevel","Fatigue_intense","Frissons","Perte_gout_odorat","Toux_seche"]

# Creation du nouveau jeu de données
data = data.copy()
data = data[colonnes_pour_prediction]

# Exportation du jeu de données
data.to_excel(r"../datasets/newDataClinic.xlsx", index=False)
print("Jeu de données importé ✅✅")

Jeu de données importé ✅✅


In [None]:
import numpy as np
df = pd.read_excel(r"..\datasets\clinic_data.xlsx")

colonnes_a_changer = ['Frissons','Fievre', 'Toux_seche', 'Perte_gout_odorat','Douleurs_musculaires', 'Fatigue_intense', "Diagnostique","Traitement"]

df[colonnes_a_changer] = df[colonnes_a_changer].replace({'Present': 'Présent'})

# Pour chaque colonne ciblée
for col in colonnes_a_changer:
    mask = df[col] == 'Présent'  # détecte les lignes à modifier
    indices = df[mask].sample(frac=0.3, random_state=42).index  # 30% des "Présent"
    df.loc[indices, col] = 'Absent'