# Imports & Configs

In [None]:
import os
import random
import numpy as np
import pandas as pd
import librosa


from os import listdir
from os.path import join
from tqdm import tqdm

In [44]:

# Parameters
seed = 42

# Set seeds for reproducibility
np.random.seed(seed)
tf.random.set_seed(seed)
random.seed(seed)

# Path to data
data_path = "E:/projects/paper/myself/lung/download/Respiratory_Sound_Database/Respiratory_Sound_Database/audio_and_txt_files"
reduced_datapath = "E:/projects/paper/myself/reduced_audio_files"
diagnosis_path = "E:/projects/paper/myself/lung/download/Respiratory_Sound_Database/Respiratory_Sound_Database/patient_diagnosis.csv"
# os.makedirs(save_directory, exist_ok=True)

---

# Data Frame

### df PID DIS

In [6]:
df = pd.read_csv(diagnosis_path,   names=["Pid","dis"])
df

Unnamed: 0,Pid,dis
0,101,URTI
1,102,Healthy
2,103,Asthma
3,104,COPD
4,105,URTI
...,...,...
121,222,COPD
122,223,COPD
123,224,Healthy
124,225,Healthy


In [7]:
df.Pid=df.Pid.astype(int)

df.dis.value_counts()

dis
COPD              64
Healthy           26
URTI              14
Bronchiectasis     7
Pneumonia          6
Bronchiolitis      6
LRTI               2
Asthma             1
Name: count, dtype: int64

In [8]:
files=[i.split(".")[0] for i in listdir(data_path) if "txt" in i ]
l=[i.split("_")[0] for i in files]

d=pd.DataFrame(zip(l,files),columns=["Pid","file_name"]) 

### d pid file

In [13]:
d.head()

Unnamed: 0,Pid,file_name
0,101,101_1b1_Al_sc_Meditron
1,101,101_1b1_Pr_sc_Meditron
2,102,102_1b1_Ar_sc_Meditron
3,104,104_1b1_Al_sc_Litt3200
4,104,104_1b1_Ar_sc_Litt3200


In [14]:
d.Pid=d.Pid.astype(int)
d.file_name=d.file_name.astype(str)

### DF pid dis file X

In [15]:
DF=pd.merge(df,d,on="Pid")
DF

Unnamed: 0,Pid,dis,file_name
0,101,URTI,101_1b1_Al_sc_Meditron
1,101,URTI,101_1b1_Pr_sc_Meditron
2,102,Healthy,102_1b1_Ar_sc_Meditron
3,104,COPD,104_1b1_Al_sc_Litt3200
4,104,COPD,104_1b1_Ar_sc_Litt3200
...,...,...,...
912,224,Healthy,224_1b2_Al_sc_Meditron
913,225,Healthy,225_1b1_Pl_sc_Meditron
914,226,Pneumonia,226_1b1_Al_sc_Meditron
915,226,Pneumonia,226_1b1_Ll_sc_Meditron


In [16]:
DF.dis.value_counts()


dis
COPD              793
Pneumonia          37
Healthy            35
URTI               23
Bronchiectasis     16
Bronchiolitis      13
Name: count, dtype: int64

### check

In [40]:
# 3 less bcs d has audio and no audio for asthma and lrti
(d["Pid"]).unique().__len__() ,df.index.unique().__len__() ,  DF.__len__()


(123, 126, 917)

In [33]:
DF.groupby("Pid").count().sort_values("dis",ascending=0)[:20]


Unnamed: 0_level_0,dis,file_name
Pid,Unnamed: 1_level_1,Unnamed: 2_level_1
130,66,66
107,28,28
151,28,28
172,27,27
138,27,27
170,25,25
178,24,24
162,24,24
158,24,24
186,24,24


### reduced df

In [73]:
final_df=pd.read_csv(r'E:\projects\paper\myself\data_processing\down_sampled_dataset.csv')
final_df

Unnamed: 0,Pid,dis,file_name
0,101,URTI,101_1b1_Al_sc_Meditron
1,101,URTI,101_1b1_Pr_sc_Meditron
2,102,Healthy,102_1b1_Ar_sc_Meditron
3,105,URTI,105_1b1_Tc_sc_Meditron
4,111,Bronchiectasis,111_1b2_Tc_sc_Meditron
...,...,...,...
144,224,Healthy,224_1b2_Al_sc_Meditron
145,225,Healthy,225_1b1_Pl_sc_Meditron
146,226,Pneumonia,226_1b1_Ll_sc_Meditron
147,226,Pneumonia,226_1b1_Al_sc_Meditron


In [74]:
final_df['dis'].value_counts()

dis
Pneumonia         37
Healthy           35
COPD              25
URTI              23
Bronchiectasis    16
Bronchiolitis     13
Name: count, dtype: int64

# FE

In [81]:
def fe(file, max_length):
    audio, sr = librosa.load(file, sr=4800)  # Load audio
    return audio

## non_downsampled

### non_downsampled_x

In [47]:
x=[]
for i in tqdm(listdir(data_path)):
    if i.split(".")[1]=="wav":
        x.append(fe(f"{data_path}/{i}"))

100%|██████████| 1834/1834 [00:40<00:00, 44.78it/s]


In [48]:
x

[array([0.05343904, 0.10471563, 0.09451785, ..., 0.08763523, 0.08038723,
        0.08959479], dtype=float32),
 array([-0.02075644, -0.0404014 , -0.03602667, ..., -0.03796917,
        -0.03727765, -0.0449216 ], dtype=float32),
 array([ 0.00218729,  0.00407564,  0.00339912, ..., -0.00444467,
        -0.00759088, -0.01371126], dtype=float32),
 array([ 2.8901319e-05, -2.1220703e-04, -3.0317443e-04, ...,
        -2.4063427e-02, -3.0680586e-02, -1.8851658e-02], dtype=float32),
 array([ 0.00076797,  0.00020486, -0.00400467, ..., -0.01401444,
        -0.01255397,  0.        ], dtype=float32),
 array([ 2.8987200e-05, -2.7774258e-05, -2.4569940e-04, ...,
        -3.0343056e-02, -3.6847439e-02, -2.8113030e-02], dtype=float32),
 array([-0.00062679, -0.00062604,  0.00245718, ..., -0.04093595,
        -0.03717685,  0.        ], dtype=float32),
 array([ 0.00107154,  0.00085598, -0.00447832, ..., -0.01839567,
        -0.02388723, -0.01890163], dtype=float32),
 array([ 5.9669480e-07,  2.4567693e-05,  2

In [51]:
max_len = max(map(len, x))


In [52]:
max_len

413760

In [53]:
padded_x = np.array([np.pad(seq, (0, max_len - len(seq)), 'constant') for seq in x])


In [59]:
padded_x

array([[ 0.05343904,  0.10471563,  0.09451785, ...,  0.        ,
         0.        ,  0.        ],
       [-0.02075644, -0.0404014 , -0.03602667, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.00218729,  0.00407564,  0.00339912, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.00942281,  0.01889487,  0.01717605, ...,  0.        ,
         0.        ,  0.        ],
       [-0.05081851, -0.09910698, -0.08860061, ...,  0.        ,
         0.        ,  0.        ],
       [-0.05121048, -0.09565088, -0.07949052, ...,  0.        ,
         0.        ,  0.        ]], dtype=float32)

In [55]:
padded_x.shape

(917, 413760)

In [57]:
np.savez_compressed('x_data_non_downsampled.npz', x=padded_x)


### non_downsampled_y

In [60]:
from sklearn.preprocessing import LabelEncoder
y=DF.dis
le=LabelEncoder()
y=le.fit_transform(y)
set(le.inverse_transform(y))
y = np.array(y)


In [61]:
len(y)

917

In [80]:
le.classes_

array(['Bronchiectasis', 'Bronchiolitis', 'COPD', 'Healthy', 'Pneumonia',
       'URTI'], dtype=object)

In [62]:
y

array([5, 5, 3, 2, 2, 2, 2, 2, 2, 5, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 5, 2, 2, 2, 2, 2, 3, 3, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 3, 2, 2, 2, 2, 2, 2, 3, 3, 3, 2, 5, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 5, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 5, 5, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3,
       3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 5,

In [63]:
set(le.inverse_transform(y))

{'Bronchiectasis', 'Bronchiolitis', 'COPD', 'Healthy', 'Pneumonia', 'URTI'}

In [64]:
np.savez_compressed('y_data_non_downsampled.npz', x=y)


## downsampled

### downsampled_x

In [None]:
x_d=[]
for i in tqdm(listdir(reduced_datapath)):
    if i.split(".")[1]=="wav":
        x_d.append(fe(f"{reduced_datapath}/{i}"))

In [66]:
x_d

[array([0.05343904, 0.10471563, 0.09451785, ..., 0.08763523, 0.08038723,
        0.08959479], dtype=float32),
 array([-0.02075644, -0.0404014 , -0.03602667, ..., -0.03796917,
        -0.03727765, -0.0449216 ], dtype=float32),
 array([ 0.00218729,  0.00407564,  0.00339912, ..., -0.00444467,
        -0.00759088, -0.01371126], dtype=float32),
 array([ 0.00027351,  0.00646203, -0.01362575, ...,  0.03294948,
         0.02760839,  0.0339541 ], dtype=float32),
 array([-0.00161239, -0.00424626, -0.00268496, ..., -0.00322157,
        -0.00171977,  0.        ], dtype=float32),
 array([-0.00054496, -0.00147614, -0.00126609, ...,  0.01165519,
         0.01060031,  0.        ], dtype=float32),
 array([0.01050373, 0.02043646, 0.01868778, ..., 0.00629313, 0.00633676,
        0.        ], dtype=float32),
 array([-0.0052017 , -0.00989682, -0.00840468, ..., -0.01078835,
        -0.02435746,  0.        ], dtype=float32),
 array([ 0.00443251,  0.00870727,  0.0078141 , ..., -0.00105041,
        -0.00115508

In [67]:
max_len=max(map(len, x))

413760

In [68]:
padded_x_d = np.array([np.pad(seq, (0, max_len - len(seq)), 'constant') for seq in x_d])


In [69]:
padded_x_d

array([[ 0.05343904,  0.10471563,  0.09451785, ...,  0.        ,
         0.        ,  0.        ],
       [-0.02075644, -0.0404014 , -0.03602667, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.00218729,  0.00407564,  0.00339912, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.00942281,  0.01889487,  0.01717605, ...,  0.        ,
         0.        ,  0.        ],
       [-0.05081851, -0.09910698, -0.08860061, ...,  0.        ,
         0.        ,  0.        ],
       [-0.05121048, -0.09565088, -0.07949052, ...,  0.        ,
         0.        ,  0.        ]], dtype=float32)

In [70]:
padded_x_d.shape

(149, 413760)

In [71]:
np.savez_compressed('x_data_downsampled.npz', x=padded_x_d)

### downsampled_y

In [75]:
from sklearn.preprocessing import LabelEncoder
y_d=final_df.dis
le=LabelEncoder()
y_d=le.fit_transform(y_d)
set(le.inverse_transform(y_d))
y_d = np.array(y_d)


In [77]:
len(y_d)

149

In [78]:
y_d

array([5, 5, 3, 5, 0, 0, 0, 0, 2, 2, 5, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       3, 2, 3, 3, 3, 5, 2, 2, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       3, 5, 5, 2, 4, 4, 4, 4, 3, 3, 3, 2, 2, 5, 1, 1, 1, 5, 2, 3, 3, 2,
       2, 2, 3, 3, 3, 3, 1, 1, 5, 5, 5, 5, 1, 1, 0, 0, 0, 3, 2, 1, 2, 2,
       2, 2, 3, 3, 3, 3, 3, 3, 2, 2, 3, 5, 5, 5, 5, 5, 4, 4, 4, 2, 3, 3,
       0, 5, 5, 2, 0, 0, 0, 0, 0, 0, 3, 2, 1, 1, 1, 2, 3, 3, 5, 5, 2, 2,
       3, 0, 0, 1, 1, 3, 4, 4, 4, 4, 4, 3, 3, 3, 4, 4, 4])

In [79]:
np.savez_compressed('y_data_downsampled.npz', x=y_d)


In [83]:
pd.Series(y_d).value_counts()

4    37
3    35
2    25
5    23
0    16
1    13
Name: count, dtype: int64

In [84]:
le.classes_

array(['Bronchiectasis', 'Bronchiolitis', 'COPD', 'Healthy', 'Pneumonia',
       'URTI'], dtype=object)