In [15]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
import librosa as lb
import librosa.display as lbd
import os
import soundfile as sf

# Reading and Utilizing Provided Data

In [16]:
patient_data=pd.read_csv('data/patient_diagnosis.csv',names=['pid','disease'])
patient_data.head()

Unnamed: 0,pid,disease
0,101,URTI
1,102,Healthy
2,103,Asthma
3,104,COPD
4,105,URTI


In [17]:
path='data/selected_files/'
files=[s.split('.')[0] for s in os.listdir(path) if '.txt' in s]
files[:5]

['102_1b1_Ar_sc_Meditron',
 '103_2b2_Ar_mc_LittC2SE',
 '104_1b1_Al_sc_Litt3200',
 '104_1b1_Ar_sc_Litt3200',
 '104_1b1_Ll_sc_Litt3200']

In [18]:
def getFilenameInfo(file):
    return file.split('_')

In [19]:
getFilenameInfo('160_1b3_Al_mc_AKGC417L')

['160', '1b3', 'Al', 'mc', 'AKGC417L']

In [20]:
files_data=[]
for file in files:
    data=pd.read_csv(path + file + '.txt',sep='\t',names=['start','end','crackles','weezels'])
    name_data=getFilenameInfo(file)
    data['pid']=name_data[0]
    data['mode']=name_data[-2]
    data['filename']=file
    files_data.append(data)
files_df=pd.concat(files_data)
files_df.reset_index()
files_df.head()

Unnamed: 0,start,end,crackles,weezels,pid,mode,filename
0,0.264,1.736,0,0,102,sc,102_1b1_Ar_sc_Meditron
1,1.736,3.293,0,0,102,sc,102_1b1_Ar_sc_Meditron
2,3.293,5.307,0,0,102,sc,102_1b1_Ar_sc_Meditron
3,5.307,6.636,0,0,102,sc,102_1b1_Ar_sc_Meditron
4,6.636,8.036,0,0,102,sc,102_1b1_Ar_sc_Meditron


In [21]:
patient_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 126 entries, 0 to 125
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   pid      126 non-null    int64 
 1   disease  126 non-null    object
dtypes: int64(1), object(1)
memory usage: 2.1+ KB


In [22]:
files_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1005 entries, 0 to 4
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   start     1005 non-null   float64
 1   end       1005 non-null   float64
 2   crackles  1005 non-null   int64  
 3   weezels   1005 non-null   int64  
 4   pid       1005 non-null   object 
 5   mode      1005 non-null   object 
 6   filename  1005 non-null   object 
dtypes: float64(2), int64(2), object(3)
memory usage: 62.8+ KB


In [23]:
patient_data.pid=patient_data.pid.astype('int32')
files_df.pid=files_df.pid.astype('int32')

In [24]:
data=pd.merge(files_df,patient_data,on='pid')
data.head()

Unnamed: 0,start,end,crackles,weezels,pid,mode,filename,disease
0,0.264,1.736,0,0,102,sc,102_1b1_Ar_sc_Meditron,Healthy
1,1.736,3.293,0,0,102,sc,102_1b1_Ar_sc_Meditron,Healthy
2,3.293,5.307,0,0,102,sc,102_1b1_Ar_sc_Meditron,Healthy
3,5.307,6.636,0,0,102,sc,102_1b1_Ar_sc_Meditron,Healthy
4,6.636,8.036,0,0,102,sc,102_1b1_Ar_sc_Meditron,Healthy


In [25]:
data.to_csv('data/data.csv',index=False)

# Processing Audio files

In [26]:
if not os.path.exists('data/processed_audio_files'):
    os.makedirs('data/processed_audio_files')

In [27]:
def getPureSample(raw_data,start,end,sr=22050):
    '''
    Takes a numpy array and spilts its using start and end args
    
    raw_data=numpy array of audio sample
    start=time
    end=time
    sr=sampling_rate
    mode=mono/stereo
    
    '''
    max_ind = len(raw_data) 
    start_ind = min(int(start * sr), max_ind)
    end_ind = min(int(end * sr), max_ind)
    return raw_data[start_ind: end_ind]

In [28]:
i,c=0,0
for index,row in data.iterrows():
    maxLen=6
    start=row['start']
    end=row['end']
    filename=row['filename']
    
    #If len > maxLen , change it to maxLen
    if end-start>maxLen:
        end=start+maxLen
    
    audio_file_loc=path + filename + '.wav'
    
    if index > 0:
        #check if more cycles exits for same patient if so then add i to change filename
        if data.iloc[index-1]['filename']==filename:
            i+=1
        else:
            i=0
    filename= filename + '_' + str(i) + '.wav'
    
    save_path='data/processed_audio_files/' + filename
    c+=1
    
    audioArr,sampleRate=lb.load(audio_file_loc)
    pureSample=getPureSample(audioArr,start,end,sampleRate)
    
    #pad audio if pureSample len < max_len
    reqLen=6*sampleRate
    padded_data = lb.util.pad_center(pureSample, size=reqLen)
    
    sf.write(file=save_path,data=padded_data,samplerate=sampleRate)
print('Total Files Processed: ',c)

Total Files Processed:  1005
