# Analyse des données et prétraitement

## Regroupement des données

In [None]:
import os
import pandas as pd

# Build one flat table with a row per annotated respiratory cycle, combining:
#   - metadata encoded in each .wav file name,
#   - the patient's diagnosis and demographic information,
#   - the cycle annotations stored in the matching .txt file,
# and save the result to data/processed_data.csv.

# Paths to the input directory and metadata files
data_dir = "data/audio_and_txt_files"
diagnosis_file = "data/patient_diagnosis.csv"
demographics_file = "data/demographic_info.txt"

# Load the diagnosis file (no header row: patient id, diagnosis)
diagnosis_df = pd.read_csv(diagnosis_file, header=None, names=["patient", "diagnosis"])

# Load the demographic information file
demographics_df = pd.read_csv(
    demographics_file,
    # Runs of whitespace act as the separator. This is the documented
    # replacement for the deprecated ``delim_whitespace=True``.
    sep=r"\s+",
    header=None,
    names=["patient", "age", "sex", "adult_bmi", "child_weight", "child_height"],
    na_values=["NA"],  # Treat the literal "NA" as a missing value
)

# Build the per-patient lookups once instead of filtering the DataFrames for
# every audio file. ``drop_duplicates`` keeps the first row per patient, which
# matches taking ``.iloc[0]`` of the filtered frame.
diagnosis_by_patient = (
    diagnosis_df.drop_duplicates("patient").set_index("patient")["diagnosis"].to_dict()
)
demographic_columns = ["age", "sex", "adult_bmi", "child_weight", "child_height"]
demographics_by_patient = (
    demographics_df.drop_duplicates("patient")
    .set_index("patient")[demographic_columns]
    .to_dict("index")
)
# Values used when a patient has no demographic entry
missing_demographics = dict.fromkeys(demographic_columns)

# Accumulates one dict per annotated cycle
data = []

# Walk the audio files; sorting makes the output row order reproducible,
# since os.listdir returns entries in arbitrary order.
for filename in sorted(os.listdir(data_dir)):
    if not filename.endswith(".wav"):
        continue

    # Metadata is encoded in the file name as underscore-separated fields:
    # patient, recording index, chest location, acquisition mode, equipment
    parts = filename.split("_")
    patient = int(parts[0])
    recording_index = parts[1]
    chest_location = parts[2]
    acquisition_mode = parts[3]
    recording_equipment = parts[4].split(".")[0]

    # Attach the diagnosis ("Unknown" when the patient is not listed)
    diagnosis = diagnosis_by_patient.get(patient, "Unknown")

    # Attach the demographic information (all None when not listed)
    demographics = demographics_by_patient.get(patient, missing_demographics)

    # Locate the matching annotation file. Only the extension is swapped, so
    # a ".wav" occurring elsewhere in the name is left untouched.
    txt_filename = os.path.splitext(filename)[0] + ".txt"
    txt_filepath = os.path.join(data_dir, txt_filename)
    if not os.path.exists(txt_filepath):
        continue

    with open(txt_filepath, "r", encoding="utf-8") as file:
        for line in file:
            fields = line.split()
            if not fields:
                # Skip blank lines (e.g. a trailing empty line)
                continue
            begin_time, end_time, crackles, wheezes = map(float, fields)
            data.append({
                "patient": patient,
                "recording_index": recording_index,
                "chest_location": chest_location,
                "acquisition_mode": acquisition_mode,
                "recording_equipment": recording_equipment,
                "diagnosis": diagnosis,
                **demographics,
                "begin_time": begin_time,
                "end_time": end_time,
                "crackles": int(crackles),
                "wheezes": int(wheezes),
            })

# Create a DataFrame from the collected rows
df = pd.DataFrame(data)

# Save to a CSV file
df.to_csv("data/processed_data.csv", index=False)


  demographics_df = pd.read_csv(


   patient recording_index chest_location acquisition_mode  \
0      101             1b1             Al               sc   
1      101             1b1             Al               sc   
2      101             1b1             Al               sc   
3      101             1b1             Al               sc   
4      101             1b1             Al               sc   

  recording_equipment diagnosis  age sex  adult_bmi  child_weight  \
0            Meditron      URTI  3.0   F        NaN          19.0   
1            Meditron      URTI  3.0   F        NaN          19.0   
2            Meditron      URTI  3.0   F        NaN          19.0   
3            Meditron      URTI  3.0   F        NaN          19.0   
4            Meditron      URTI  3.0   F        NaN          19.0   

   child_height  begin_time  end_time  crackles  wheezes  
0          99.0       0.036     0.579         0        0  
1          99.0       0.579     2.450         0        0  
2          99.0       2.450     3.8

## Analyse des données