In [13]:
pip freeze "requirements.txt"


appdirs==1.4.4
argon2-cffi==21.3.0
argon2-cffi-bindings==21.2.0
asttokens==2.0.5
attrs==21.4.0
audiomentations==0.24.0
audioread==2.1.9
auto-sklearn==0.14.6
backcall==0.2.0
beautifulsoup4==4.11.1
bleach==5.0.0
certifi==2021.10.8
cffi==1.15.0
charset-normalizer==2.0.12
click==8.1.3
cloudpickle==2.0.0
ConfigSpace==0.4.14
cycler==0.11.0
Cython==0.29.28
dask==2022.5.0
debugpy==1.6.0
decorator==5.1.1
defusedxml==0.7.1
distributed==2022.5.0
distro==1.7.0
emcee==3.1.1
entrypoints==0.4
et-xmlfile==1.1.0
executing==0.8.3
fastjsonschema==2.15.3
fonttools==4.33.3
fsspec==2022.3.0
future==0.18.2
HeapDict==1.0.1
idna==3.3
importlib-resources==5.7.1
ipykernel==6.13.0
ipython==8.3.0
ipython-genutils==0.2.0
jedi==0.18.1
Jinja2==3.1.2
joblib==1.1.0
jsonschema==4.4.0
jupyter-client==7.3.0
jupyter-core==4.10.0
jupyterlab-pygments==0.2.2
kiwisolver==1.4.2
liac-arff==2.5.0
librosa==0.9.1
llvmlite==0.36.0
locket==1.0.0
MarkupSafe==2.1.1
matplotlib==3.5.2
matplotlib-inline==0.1.3
mistune==0.8.4
msgpack==1.0.

In [14]:
import numpy as np
import librosa 
import librosa.display
import pandas as pd
import soundfile as sf
import sklearn
import matplotlib.pyplot as plt
from tqdm import tqdm
from audiomentations import *
import warnings
import scipy.signal
from pydub import AudioSegment
warnings.simplefilter(action='ignore', category=FutureWarning)

In [15]:
from distutils.dir_util import copy_tree
import os
import shutil
try:
    shutil.rmtree("./1.0.1/training_data_balanced/training_data")
except:
    os.mkdir("./1.0.1/training_data_balanced/training_data")
copy_tree("./1.0.1/training_data","./1.0.1/training_data_balanced/training_data")


['./1.0.1/training_data_balanced/training_data/13918.txt',
 './1.0.1/training_data_balanced/training_data/13918_AV.hea',
 './1.0.1/training_data_balanced/training_data/13918_AV.tsv',
 './1.0.1/training_data_balanced/training_data/13918_AV.wav',
 './1.0.1/training_data_balanced/training_data/13918_MV.hea',
 './1.0.1/training_data_balanced/training_data/13918_MV.tsv',
 './1.0.1/training_data_balanced/training_data/13918_MV.wav',
 './1.0.1/training_data_balanced/training_data/13918_PV.hea',
 './1.0.1/training_data_balanced/training_data/13918_PV.tsv',
 './1.0.1/training_data_balanced/training_data/13918_PV.wav',
 './1.0.1/training_data_balanced/training_data/13918_TV.hea',
 './1.0.1/training_data_balanced/training_data/13918_TV.tsv',
 './1.0.1/training_data_balanced/training_data/13918_TV.wav',
 './1.0.1/training_data_balanced/training_data/14241.txt',
 './1.0.1/training_data_balanced/training_data/14241_AV.hea',
 './1.0.1/training_data_balanced/training_data/14241_AV.tsv',
 './1.0.1/trai

In [16]:
# augments
augment = Compose([
    Reverse(p=0.8),
    PolarityInversion(p=1)
])

augment_spec = SpecCompose([
    SpecFrequencyMask(p=1)
])

def augment_signal(S,sr,outputpath):
    augmented_signal = augment(S,sr)
    sf.write(outputpath,augmented_signal,sr)


# for balancing
def grab_id(amt,val_srs):
    temp_idx=[]
    temp_count=0
    for i,k in val_srs.iteritems():
        if temp_count >= amt:
            break
        else:
            temp_count+=k
            temp_idx.append(i)
    return temp_idx

# feature extraction functions
def mfccs_features_extract(S,sr):
    mfccs_features = librosa.feature.mfcc(y=S, sr=sr, n_mfcc= 40)
    mfccs_features = librosa.decompose.nn_filter(mfccs_features)
    mfccs_scaled_features = np.mean(mfccs_features.T, axis= 0)
    return mfccs_scaled_features,mfccs_features


In [17]:
# paths
data_path = "./1.0.1/"
augment_path = "./1.0.1/training_data_balanced/"
records = data_path+ "RECORDS"

In [18]:
demographics = pd.read_csv(data_path+"training_data.csv")
print("Class distribution by patient id")
demographics["Murmur"].value_counts(normalize=True)

Class distribution by patient id


Absent     0.737792
Present    0.190021
Unknown    0.072187
Name: Murmur, dtype: float64

In [19]:
# read all audio recordings and augment
with open(records,'r') as r:
    filenames = r.readlines()

    # dataframe containing audio recording path with corresponding patient_id
    file_df = pd.DataFrame(filenames,columns=["filenames"])
    file_df["filenames"] = file_df["filenames"].str.strip() + '.wav'
    file_df["patient_id"] = file_df["filenames"].str.split("/").str[1]
    file_df["patient_id"] = file_df["patient_id"].str.split("_").str[0].astype(str)
    # merge with demographics to enrich dataframe
    demographics["Patient ID"] = demographics["Patient ID"].astype(str)
    file_df = file_df.merge(
        demographics[["Murmur","Patient ID","Murmur locations"]],
        how='left',
        left_on='patient_id',
        right_on='Patient ID'
    )
    file_df.pop("Patient ID")
# class distribution by audio recordings
print("Class distribution by audio recordings"+ "\n" ,file_df["Murmur"].value_counts()) #Heavily biased to absent class

# grabbing only present and absent class
file_df_present = file_df.loc[file_df["Murmur"]=="Present"]
file_df_absent = file_df.loc[file_df["Murmur"]=="Absent"]

# creating test df
test_present_ids = grab_id(250,file_df_present["patient_id"].value_counts())
test_absent_ids = grab_id(250,file_df_absent["patient_id"].value_counts())
test_ids = test_absent_ids+test_present_ids
test_df = file_df.loc[file_df["patient_id"].isin(test_ids)]

# create train df
train_df = file_df.loc[~(file_df["patient_id"].isin(test_ids)) & (file_df["Murmur"] != "Unknown")]
print("\nTraining set class distribution (Pre-Augment)\n",train_df["Murmur"].value_counts())
print("\nAugmenting present class audio data\n")
print(train_df)
# augmenting present class audio data
filtered_train = train_df.loc[train_df["Murmur"]!="Absent"].copy()
for i,k in tqdm(filtered_train.iterrows(),total=filtered_train.shape[0]):
    list_of_murmur_locations = list(k["Murmur locations"].split("+"))
    ausc_location = k["filenames"].split("/")[1].split(".")[0].split("_")[1]
    if ausc_location.strip() in list_of_murmur_locations:
        S,sr = librosa.load(data_path+k["filenames"],sr=4000)
        augmented_signal = augment(S,sr)
        augmented_signal_filename = k["filenames"].split(".")[0]+"_a1.wav"
        augment_signal(S,sr,augment_path+augmented_signal_filename)
        train_df = train_df.append(pd.DataFrame(
            data=[[
                augmented_signal_filename,k["patient_id"],k["Murmur"],k["Murmur locations"]
                ]],
            columns=list(train_df.columns.values)
            ))
    else:
        train_df = train_df.drop(index=train_df.loc[train_df["filenames"]==k["filenames"]].index)
train_df.pop("Murmur locations")
print("\nTraining set class distribution (Post Signal Augment)\n",train_df["Murmur"].value_counts())

Class distribution by audio recordings
 Absent     2391
Present     616
Unknown     156
Name: Murmur, dtype: int64

Training set class distribution (Pre-Augment)
 Absent     2141
Present     366
Name: Murmur, dtype: int64

Augmenting present class audio data

                       filenames patient_id   Murmur Murmur locations
16    training_data/14241_AV.wav      14241  Present      AV+MV+PV+TV
17    training_data/14241_MV.wav      14241  Present      AV+MV+PV+TV
18    training_data/14241_PV.wav      14241  Present      AV+MV+PV+TV
19    training_data/14241_TV.wav      14241  Present      AV+MV+PV+TV
20    training_data/14998_AV.wav      14998   Absent              NaN
...                          ...        ...      ...              ...
3158  training_data/85345_AV.wav      85345   Absent              NaN
3159  training_data/85345_PV.wav      85345   Absent              NaN
3160  training_data/85349_AV.wav      85349   Absent              NaN
3161  training_data/85349_PV.wav      85

100%|██████████| 366/366 [00:10<00:00, 35.42it/s]


Training set class distribution (Post Signal Augment)
 Absent     2141
Present     598
Name: Murmur, dtype: int64





In [20]:
train_df["data"] = np.array(None)
train_df = train_df.reset_index(drop=True)
# extracting features and augmenting mfcc spectogram for training set
print("\nExtracting features and augmenting present class mfcc spectogram\n")
for i,k in tqdm(train_df.copy().iterrows(),total=train_df.copy().shape[0]):
    S,sr = librosa.load(augment_path+k["filenames"],sr=4000)
    scaled_mfccs , mfccs = mfccs_features_extract(S,sr)
    train_df.loc[i,"data"] = scaled_mfccs
    if k["Murmur"] != "Absent":
        mfccs_augment = augment_spec(mfccs)
        mfccs_augment = np.mean(mfccs_augment.T,axis=0)
        train_df = train_df.append(pd.DataFrame(
            data=[[
                k["filenames"].replace(".wav","_augmentspect.wav"),
                k["patient_id"],
                k["Murmur"],
                mfccs_augment
            ]],
            columns=list(train_df.columns.values)
        ))
print("\nTraining set class distribution (Post Spectrogram augment))\n",train_df["Murmur"].value_counts())


Extracting features and augmenting present class mfcc spectogram



100%|██████████| 2739/2739 [05:19<00:00,  8.56it/s]


Training set class distribution (Post Spectrogram augment))
 Absent     2141
Present    1196
Name: Murmur, dtype: int64





In [21]:
# class balancing for training set
present_count = train_df["Murmur"].value_counts()[1]
print("Trim absent class data to match size of present class data and shuffle ")
absent_ids = grab_id(present_count,train_df.loc[train_df["Murmur"]=="Absent"]["patient_id"].value_counts())
train_df = pd.concat([
    train_df.loc[train_df["Murmur"]=="Present"],
    train_df.loc[(train_df["Murmur"]=="Absent")&(train_df["patient_id"].isin(absent_ids))]
]).sample(frac=1)
train_df = train_df.reset_index(drop=True)
print(train_df["Murmur"].value_counts())

Trim absent class data to match size of present class data and shuffle 
Present    1196
Absent     1196
Name: Murmur, dtype: int64


In [22]:

test_df["data"] = None
test_df = test_df.reset_index(drop=True)
for i,k in tqdm(test_df.copy().iterrows(),total=test_df.copy().shape[0]):
    S,sr = librosa.load(data_path+k["filenames"],sr=4000)
    test_scaled_mfccs, test_melspect = mfccs_features_extract(S,sr)
    test_df.loc[i,"data"] = test_scaled_mfccs
test_df = test_df.sample(frac=1).reset_index()
print("\nTesting set class distribution (final)\n",test_df["Murmur"].value_counts())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df["data"] = None
100%|██████████| 500/500 [00:49<00:00, 10.09it/s]


Testing set class distribution (final)
 Present    250
Absent     250
Name: Murmur, dtype: int64





In [23]:
# manual splitting and assigning data to variables
X_test = np.array(test_df["data"].tolist())
X_train = np.array(train_df["data"].tolist())
y_test = test_df["Murmur"]
y_train = train_df["Murmur"]


# export data 
import pickle 

pickle_out_X_train = open("./1.0.1/pickled_data/X_train_autosklearn_v2.pickle","wb")
pickle.dump(X_train,pickle_out_X_train)
pickle_out_X_train.close()

pickle_out_X_test = open("./1.0.1/pickled_data/X_test_autosklearn_v2.pickle","wb")
pickle.dump(X_test,pickle_out_X_test)
pickle_out_X_test.close()


pickle_out_y_train = open("./1.0.1/pickled_data/y_train_autosklearn_v2.pickle","wb")
pickle.dump(y_train,pickle_out_y_train)
pickle_out_y_train.close()

pickle_out_y_test = open("./1.0.1/pickled_data/y_test_autosklearn_v2.pickle","wb")
pickle.dump(y_test,pickle_out_y_test)
pickle_out_y_test.close()

In [24]:
X_test[0].shape

(40,)