In [None]:
pip freeze "requirements.txt"


In [67]:
import numpy as np
import librosa 
import librosa.display
import pandas as pd
import soundfile as sf
import sklearn
import matplotlib.pyplot as plt
from tqdm import tqdm
from audiomentations import *
import warnings
import scipy.signal
from pydub import AudioSegment
warnings.simplefilter(action='ignore', category=FutureWarning)

In [68]:
# augments
augment = Compose([
    Reverse(p=0.8),
    PolarityInversion(p=1)
])

augment_spec = SpecCompose([
    SpecFrequencyMask(p=1)
])

def augment_signal(S,sr,outputpath):
    augmented_signal = augment(S,sr)
    sf.write(outputpath,augmented_signal,sr)


# for balancing
def grab_id(amt,val_srs):
    temp_idx=[]
    temp_count=0
    for i,k in val_srs.iteritems():
        if temp_count >= amt:
            break
        else:
            temp_count+=k
            temp_idx.append(i)
    return temp_idx

# feature extraction functions
def mfccs_features_extract(S,sr):
    mfccs_features = librosa.feature.mfcc(y=S, sr=sr, n_mfcc= 40,n_fft=len(S))
    mfccs_scaled_features = np.mean(mfccs_features.T, axis= 0)
    return mfccs_scaled_features,mfccs_features

def segment_audio(t1,t2,fn):
    t1 = t1 * 1000
    t2 = t2 * 1000
    segemented_audio = AudioSegment.from_wav(fn)
    segemented_audio = segemented_audio[t1:t2]
    return segemented_audio


In [69]:
# paths
data_path = "./1.0.1/"
augment_path = "./1.0.1/training_data_balanced/"
records = data_path+ "RECORDS"

In [70]:
demographics = pd.read_csv(data_path+"training_data.csv")
print("Class distribution by patient id")
demographics["Murmur"].value_counts(normalize=True)

Class distribution by patient id


Absent     0.737792
Present    0.190021
Unknown    0.072187
Name: Murmur, dtype: float64

In [71]:
# read all audio recordings and augment
with open(records,'r') as r:
    filenames = r.readlines()

    # dataframe containing audio recording path with corresponding patient_id
    file_df = pd.DataFrame(filenames,columns=["filenames"])
    file_df["filenames"] = file_df["filenames"].str.strip() + '.wav'
    file_df["patient_id"] = file_df["filenames"].str.split("/").str[1]
    file_df["patient_id"] = file_df["patient_id"].str.split("_").str[0].astype(str)
    # merge with demographics to enrich dataframe
    demographics["Patient ID"] = demographics["Patient ID"].astype(str)
    file_df = file_df.merge(
        demographics[["Murmur","Patient ID"]],
        how='left',
        left_on='patient_id',
        right_on='Patient ID'
    )
    file_df.pop("Patient ID")
# class distribution by audio recordings
print("Class distribution by audio recordings"+ "\n" ,file_df["Murmur"].value_counts(normalize=True)) #Heavily biased to absent class

# grabbing only present and absent class
file_df_present = file_df.loc[file_df["Murmur"]=="Present"]
file_df_absent = file_df.loc[file_df["Murmur"]=="Absent"]

# creating test df
test_present_ids = grab_id(300,file_df_present["patient_id"].value_counts())
test_absent_ids = grab_id(300,file_df_absent["patient_id"].value_counts())
test_ids = test_absent_ids+test_present_ids
test_df = file_df.loc[file_df["patient_id"].isin(test_ids)]

# create train df
train_df = file_df.loc[~(file_df["patient_id"].isin(test_ids)) & (file_df["Murmur"] != "Unknown")]
print("\nTraining set class distribution (Pre-Augment)\n",train_df["Murmur"].value_counts())
print("\nAugmenting present class audio data\n")

print(train_df)


Class distribution by audio recordings
 Absent     0.755928
Present    0.194752
Unknown    0.049320
Name: Murmur, dtype: float64

Training set class distribution (Pre-Augment)
 Absent     2089
Present     314
Name: Murmur, dtype: int64

Augmenting present class audio data

                       filenames patient_id   Murmur
16    training_data/14241_AV.wav      14241  Present
17    training_data/14241_MV.wav      14241  Present
18    training_data/14241_PV.wav      14241  Present
19    training_data/14241_TV.wav      14241  Present
20    training_data/14998_AV.wav      14998   Absent
...                          ...        ...      ...
3158  training_data/85345_AV.wav      85345   Absent
3159  training_data/85345_PV.wav      85345   Absent
3160  training_data/85349_AV.wav      85349   Absent
3161  training_data/85349_PV.wav      85349   Absent
3162  training_data/85349_TV.wav      85349   Absent

[2403 rows x 3 columns]


In [72]:
# 
train_df["data"] = np.array(None)
train_df = train_df.reset_index(drop=True)
SEGMENTDIR = augment_path+"training_data/segmented_data"
new_train_df = pd.DataFrame(data=[],columns=["filenames","patient_id","Murmur","data"])
for i,k in tqdm(train_df.copy().iterrows(),total=train_df.copy().shape[0]):
    segments_ = k["filenames"].split("/")[1]
    tsv_name = segments_.replace(".wav",".tsv")
    tsv_df = pd.read_csv(f"{augment_path}/training_data/{tsv_name}",sep='\t',header=None)
    tsv_df_1 = tsv_df.loc[tsv_df[2] == 1][0].values.tolist()
    tsv_df_4 = tsv_df.loc[tsv_df[2] == 4][1].values.tolist()
    
    if len(tsv_df_1) < len(tsv_df_4):
        for n in range(len(tsv_df_1)):
            segemented_audio_name = segments_.split(".")[0]+f"_s{n}.wav"
            export_audio = segment_audio(tsv_df_1[n],tsv_df_4[n],augment_path+k["filenames"])
            export_audio.export(SEGMENTDIR+f"/{segemented_audio_name}",format="wav")
            full_export_path = SEGMENTDIR+f"/{segemented_audio_name}"
            new_train_df.loc[new_train_df.shape[0]] = [
                full_export_path,k["patient_id"],k["Murmur"],None
            ]
    else:
        for n in range(len(tsv_df_4)):
            segemented_audio_name = segments_.split(".")[0]+f"_s{n}.wav"
            export_audio = segment_audio(tsv_df_1[n],tsv_df_4[n],augment_path+k["filenames"])
            export_audio.export(SEGMENTDIR+f"/{segemented_audio_name}",format="wav")
            full_export_path = SEGMENTDIR+f"/{segemented_audio_name}"
            new_train_df.loc[new_train_df.shape[0]] = [
                full_export_path,k["patient_id"],k["Murmur"],None
            ]
print(new_train_df)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df["data"] = np.array(None)
100%|██████████| 2403/2403 [08:29<00:00,  4.72it/s]

                                               filenames patient_id   Murmur  \
0      ./1.0.1/training_data_balanced/training_data/s...      14241  Present   
1      ./1.0.1/training_data_balanced/training_data/s...      14241  Present   
2      ./1.0.1/training_data_balanced/training_data/s...      14241  Present   
3      ./1.0.1/training_data_balanced/training_data/s...      14241  Present   
4      ./1.0.1/training_data_balanced/training_data/s...      14241  Present   
...                                                  ...        ...      ...   
49892  ./1.0.1/training_data_balanced/training_data/s...      85349   Absent   
49893  ./1.0.1/training_data_balanced/training_data/s...      85349   Absent   
49894  ./1.0.1/training_data_balanced/training_data/s...      85349   Absent   
49895  ./1.0.1/training_data_balanced/training_data/s...      85349   Absent   
49896  ./1.0.1/training_data_balanced/training_data/s...      85349   Absent   

       data  
0      None  
1      None




In [75]:
print(new_train_df)
print(new_train_df["filenames"][0])
train_df = new_train_df


                                               filenames patient_id   Murmur  \
0      ./1.0.1/training_data_balanced/training_data/s...      14241  Present   
1      ./1.0.1/training_data_balanced/training_data/s...      14241  Present   
2      ./1.0.1/training_data_balanced/training_data/s...      14241  Present   
3      ./1.0.1/training_data_balanced/training_data/s...      14241  Present   
4      ./1.0.1/training_data_balanced/training_data/s...      14241  Present   
...                                                  ...        ...      ...   
49892  ./1.0.1/training_data_balanced/training_data/s...      85349   Absent   
49893  ./1.0.1/training_data_balanced/training_data/s...      85349   Absent   
49894  ./1.0.1/training_data_balanced/training_data/s...      85349   Absent   
49895  ./1.0.1/training_data_balanced/training_data/s...      85349   Absent   
49896  ./1.0.1/training_data_balanced/training_data/s...      85349   Absent   

       data  
0      None  
1      None

In [76]:
# augmenting present class audio data
filtered_train = train_df.loc[train_df["Murmur"]!="Absent"].copy()
for i,k in tqdm(filtered_train.iterrows(),total=filtered_train.shape[0]):
    S,sr = librosa.load(k["filenames"],sr=4000)
    augmented_signal = augment(S,sr)
    augmented_signal_filename = k["filenames"].replace(".wav","_a1.wav")
    augment_signal(S,sr,augmented_signal_filename)
    train_df = train_df.append(pd.DataFrame(
        data=[[augmented_signal_filename,k["patient_id"],k["Murmur"],np.array(None)]],
        columns=list(train_df.columns.values)
        ))
print("\nTraining set class distribution (Post Signal Augment)\n",train_df["Murmur"].value_counts())
    

100%|██████████| 6982/6982 [02:47<00:00, 41.66it/s]


Training set class distribution (Post Signal Augment)
 Absent     42915
Present    13964
Name: Murmur, dtype: int64





In [77]:
train_df["Murmur"].value_counts()
train_df["data"] = np.array(None)

In [78]:
# extracting features and augmenting mfcc spectogram for training set
print("\nExtracting features and augmenting present class mfcc spectogram\n")
train_df = train_df.reset_index(drop=True)
for i,k in tqdm(train_df.copy().iterrows(),total=train_df.copy().shape[0]):
    S,sr = librosa.load(k["filenames"],sr=4000)
    if len(S) != 0:
        scaled_mfccs , mfccs = mfccs_features_extract(S,sr)
        train_df.loc[i,"data"] = scaled_mfccs
        if k["Murmur"] != "Absent":
            mfccs_augment = augment_spec(mfccs)
            mfccs_augment = np.mean(mfccs_augment.T,axis=0)
            train_df = train_df.append(pd.DataFrame(
                data=[[
                    k["filenames"].replace(".wav","_augmentspect.wav"),
                    k["patient_id"],
                    k["Murmur"],
                    mfccs_augment
                ]],
                columns=list(train_df.columns.values)
            ))
    else:
        train_df.drop(index=i,axis=0)
        continue
print("\nTraining set class distribution (Post Spectrogram augment))\n",train_df["Murmur"].value_counts())


Extracting features and augmenting present class mfcc spectogram



  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
100%|██████████| 56879/56879 [19:00<00:00, 49.87it/s]


Training set class distribution (Post Spectrogram augment))
 Absent     42915
Present    20356
Name: Murmur, dtype: int64





In [79]:
# class balancing for training set
present_count = train_df["Murmur"].value_counts()[1]
print("Trim absent class data to match size of present class data and shuffle ")
absent_ids = grab_id(present_count,train_df.loc[train_df["Murmur"]=="Absent"]["patient_id"].value_counts())
train_df = pd.concat([
    train_df.loc[train_df["Murmur"]=="Present"],
    train_df.loc[(train_df["Murmur"]=="Absent")&(train_df["patient_id"].isin(absent_ids))]
]).sample(frac=1)
train_df = train_df.reset_index(drop=True)
print(train_df["Murmur"].value_counts())

Trim absent class data to match size of present class data and shuffle 
Absent     20382
Present    20356
Name: Murmur, dtype: int64


In [80]:

test_df["data"] = None
test_df = test_df.reset_index(drop=True)
for i,k in tqdm(test_df.copy().iterrows(),total=test_df.copy().shape[0]):
    S,sr = librosa.load(data_path+k["filenames"],sr=4000)
    test_scaled_mfccs, test_melspect = mfccs_features_extract(S,sr)
    test_df.loc[i,"data"] = test_scaled_mfccs
test_df = test_df.sample(frac=1).reset_index()
print("\nTesting set class distribution (final)\n",test_df["Murmur"].value_counts())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df["data"] = None
100%|██████████| 604/604 [05:11<00:00,  1.94it/s]


Testing set class distribution (final)
 Absent     302
Present    302
Name: Murmur, dtype: int64





In [81]:
# manual splitting and assigning data to variables
X_test = np.array(test_df["data"].tolist())
X_train = np.array(train_df["data"].tolist())
y_test = test_df["Murmur"]
y_train = train_df["Murmur"]


# export data 
import pickle 

pickle_out_X_train = open("./1.0.1/pickled_data/X_train_autosklearn_seg.pickle","wb")
pickle.dump(X_train,pickle_out_X_train)
pickle_out_X_train.close()

pickle_out_X_test = open("./1.0.1/pickled_data/X_test_autosklearn_seg.pickle","wb")
pickle.dump(X_test,pickle_out_X_test)
pickle_out_X_test.close()


pickle_out_y_train = open("./1.0.1/pickled_data/y_train_autosklearn_seg.pickle","wb")
pickle.dump(y_train,pickle_out_y_train)
pickle_out_y_train.close()

pickle_out_y_test = open("./1.0.1/pickled_data/y_test_autosklearn_seg.pickle","wb")
pickle.dump(y_test,pickle_out_y_test)
pickle_out_y_test.close()

  X_train = np.array(train_df["data"].tolist())


In [82]:
X_test[0].shape

(40,)