In [4]:
import pandas as pd
import numpy as np

import wave

In [16]:
import os


def get_infos_from_filename(filename: str):
    root_name = filename.split("/")[-1]
    root_name = root_name.split(".")[0]
    root_name = root_name.split("_")
    
    return pd.Series({
        "Patient ID": root_name[0],
        "Recording index": root_name[1],
        "Chest location": root_name[2],
        "Acquisition mode": root_name[3],
        "Recording equipment": root_name[4]
    })

def read_wav(filename: str):
    patient_infos = get_infos_from_filename(filename)
    wav_file = wave.open(filename)
    
    return patient_infos, wav_file

def read_file_info(filename: str):
    return pd.read_csv(filename, sep="\t", names=["Start time", "End time", "Wheeze", "Cackle"])

all_data = []
file_infos = pd.DataFrame()
diagnosis = pd.read_csv("data/patient_diagnosis.csv", names=["Patient ID", "Diagnosis"], dtype={"Patient ID": str})
demographic_infos =  pd.read_csv('data/demographic_info.txt', names = 
                                 ['Patient ID', 'Age', 'Sex' , 'Adult BMI (kg/m2)', 'Child Weight (kg)' , 'Child Height (cm)'],
                                 delimiter = ' ', dtype={"Patient ID": str})

for file in os.listdir("data/audio_and_txt_files"):
    if file.endswith(".wav"):
        # print("Reading wav " + file)
        patient_info, wav = read_wav("data/audio_and_txt_files/" + file)
        file_info = read_file_info("data/audio_and_txt_files/" + file.replace(".wav", ".txt"))
        file_info["Patient ID"] = patient_info["Patient ID"]
        
        all_data.append(patient_info)
        
        file_infos = pd.concat((file_infos, file_info))
        
all_data = pd.DataFrame(all_data)

merged = all_data.merge(file_infos, on="Patient ID", how="inner")


In [17]:
from typing import Optional


def get_data_for(patient_id: str, chest_location: Optional[str] = None, recording_equipment: Optional[str] = None):
    global merged
    
    result = merged[merged["Patient ID"] == patient_id]
    if chest_location:
        result = result[result["Chest location"] == chest_location]
    if recording_equipment:
        result = result[result["Recording equipment"] == recording_equipment]
    
    return result

def get_patient_diagnosis(patient_id: str):
    global diagnosis
    
    return diagnosis[diagnosis["Patient ID"] == patient_id]["Diagnosis"].values[0]

def get_patient_demographic_info(patient_id: str):
    global demographic_infos
    
    return demographic_infos[demographic_infos["Patient ID"] == patient_id]

In [None]:
patient_data = get_data_for("112")
p_id = patient_data.iloc[0]["Patient ID"]

COPD
   Patient ID   Age Sex  Adult BMI (kg/m2)  Child Weight (kg)  \
11        112  60.0   M              22.86                NaN   

    Child Height (cm)  
11                NaN  


Unnamed: 0,Patient ID,Recording index,Chest location,Acquisition mode,Recording equipment,Start time,End time,Wheeze,Cackle
7575,112,1b1,Ar,sc,Meditron,1.9370,4.8918,0,0
7576,112,1b1,Ar,sc,Meditron,4.8918,9.1707,0,0
7577,112,1b1,Ar,sc,Meditron,9.1707,14.5880,0,0
7578,112,1b1,Ar,sc,Meditron,14.5880,19.6980,0,0
7579,112,1b1,Ar,sc,Meditron,19.6980,25.6520,0,0
...,...,...,...,...,...,...,...,...,...
7855,112,1p1,Pr,sc,Litt3200,14.8650,17.7870,0,1
7856,112,1p1,Pr,sc,Litt3200,17.7870,20.7100,0,1
7857,112,1p1,Pr,sc,Litt3200,20.7100,23.8740,0,1
7858,112,1p1,Pr,sc,Litt3200,23.8740,26.6520,0,1


## Visualisations

In [None]:
import plotly.express as pe

bins = [i * 10 for i in range(11)]
labels = [f"{i * 10}-{(i + 1) * 10}" for i in range(10)]

sorted_age = pd.cut(demographic_infos["Age"], bins, labels=labels).value_counts().sort_index()
print(sorted_age)

pe.bar(data_frame=sorted_age, title="Distribution de l'âge", labels={"value": "Nombre d'individus"})

Age
0-10      41
10-20      9
20-30      1
30-40      0
40-50      2
50-60     10
60-70     30
70-80     26
80-90      5
90-100     1
Name: count, dtype: int64


In [103]:
demographic_infos.head()

bins = [i for i in range(5)]
labels = [f"{i}-{(i + 1)} ans" for i in range(4)]

before_five_year = demographic_infos[demographic_infos["Age"] <= 5]["Age"]

sorted_age = pd.cut(before_five_year, bins, labels=labels).value_counts().sort_index()

pe.bar(data_frame=sorted_age, title="Distribution de l'âge (avant 5 ans)", labels={"value": "Nombre d'individus"})

In [None]:
bmis = demographic_infos["Adult BMI (kg/m2)"].value_counts()

bins = [0, 16.0, 18.4, 24.9, 30, 35, 40, np.inf]
labels = ["Anorexie", "Sous-poids", "Normal", "Surpoids", "Obésité", "Obésité sévère", "Obésité morbide"]

sorted_bmis = pd.cut(demographic_infos["Adult BMI (kg/m2)"], bins, labels=labels).value_counts().sort_index()
print(sorted_bmis)

pe.bar(data_frame=sorted_bmis, title="Distribution de l'IMC", labels={"value": "Nombre d'individus", "Adult BMI (kg/m2)": "IMC (Indice)"})

Adult BMI (kg/m2)
Anorexie            0
Sous-poids          3
Normal             20
Surpoids           38
Obésité             9
Obésité sévère      4
Obésité morbide     1
Name: count, dtype: int64
