In [None]:
pip freeze "requirements.txt"


In [None]:
import numpy as np
import librosa 
import librosa.display
import pandas as pd
import soundfile as sf
import sklearn
import matplotlib.pyplot as plt
from tqdm import tqdm
from audiomentations import *
import warnings
import scipy.signal
from pydub import AudioSegment
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:

# for balancing
def grab_id(amt, val_srs):
    """Collect leading ids from ``val_srs`` until their counts reach ``amt``.

    Parameters
    ----------
    amt : int or float
        Target cumulative count. Selection stops as soon as the running
        total is greater than or equal to this value.
    val_srs : pandas.Series or mapping
        ``id -> count`` pairs, consumed in their existing order (for example
        the result of ``Series.value_counts()``).

    Returns
    -------
    list
        The selected ids in iteration order. The summed count may overshoot
        ``amt`` because ids are taken whole; the list is empty when
        ``amt <= 0`` or ``val_srs`` is empty.
    """
    selected_ids = []
    running_total = 0
    # Series.iteritems() was removed in pandas 2.0; items() exists on every
    # pandas version (and on plain dicts).
    for item_id, count in val_srs.items():
        if running_total >= amt:
            break
        running_total += count
        selected_ids.append(item_id)
    return selected_ids

# feature extraction functions
def mfccs_features_extract(S, sr, n_mfcc=40):
    """Compute MFCCs for one signal and reduce them to a fixed-length vector.

    Parameters
    ----------
    S : np.ndarray
        Audio samples, passed to ``librosa.feature.mfcc`` as ``y``.
    sr : int
        Sampling rate of ``S``.
    n_mfcc : int, optional
        Number of MFCC coefficients to compute. Defaults to 40, the value
        that was previously hard-coded.

    Returns
    -------
    tuple
        ``(mfccs_scaled_features, mfccs_features)``: the per-coefficient mean
        of the filtered MFCC matrix, and the filtered MFCC matrix itself.
        Despite its name, the first element is an average over the matrix's
        second axis; no scaling is applied.
    """
    mfccs_features = librosa.feature.mfcc(y=S, sr=sr, n_mfcc=n_mfcc)
    # nearest-neighbour filtering of the MFCC matrix (librosa defaults)
    mfccs_features = librosa.decompose.nn_filter(mfccs_features)
    # mean over axis 0 of the transpose == one value per coefficient row
    mfccs_scaled_features = np.mean(mfccs_features.T, axis=0)
    return mfccs_scaled_features, mfccs_features


In [None]:
# dataset locations, relative to the notebook's working directory
data_path = "./1.0.1/"
augment_path = "./1.0.1/training_data_balanced/"
# file listing every recording, one entry per line
records = f"{data_path}RECORDS"

In [None]:
# load the patient metadata table and show the relative frequency of each murmur label
training_csv = data_path + "training_data.csv"
demographics = pd.read_csv(training_csv)
print("Class distribution by patient id")
demographics["Murmur"].value_counts(normalize=True)

In [None]:
# Load the list of recordings and label each one with its patient's murmur data
with open(records, 'r') as record_list:
    filenames = record_list.readlines()

# one row per recording: the .wav path plus the patient id parsed from the file name
file_df = pd.DataFrame(filenames, columns=["filenames"])
file_df["filenames"] = file_df["filenames"].str.strip() + '.wav'
file_df["patient_id"] = (
    file_df["filenames"]
    .str.split("/").str[1]   # path component after the first "/"
    .str.split("_").str[0]   # token before the first "_"
    .astype(str)
)

# both join keys must be strings for the merge to match
demographics["Patient ID"] = demographics["Patient ID"].astype(str)
file_df = file_df.merge(
    demographics[["Murmur", "Patient ID", "Murmur locations"]],
    how='left',
    left_on='patient_id',
    right_on='Patient ID',
).drop(columns="Patient ID")  # duplicate of patient_id after the join

# class distribution by audio recordings
print("Class distribution by audio recordings"+ "\n" ,file_df["Murmur"].value_counts()) #Heavily biased to absent class

# split the recordings by murmur class
file_df_present = file_df.loc[file_df["Murmur"]=="Present"]
file_df_absent = file_df.loc[file_df["Murmur"]=="Absent"]
file_df_unknown = file_df.loc[file_df["Murmur"]=="Unknown"]

# creating test df: whole patients are held out, so no patient appears in both sets.
# value_counts() sorts by count descending, so grab_id picks the patients with the
# most recordings first until roughly 250/250/50 recordings per class are reached.
test_present_ids = grab_id(250,file_df_present["patient_id"].value_counts())
test_absent_ids = grab_id(250,file_df_absent["patient_id"].value_counts())
test_unknown_ids = grab_id(50,file_df_unknown["patient_id"].value_counts())
test_ids = test_absent_ids+test_present_ids + test_unknown_ids

# .copy() makes both frames independent of file_df, so later cells can add
# columns to them without SettingWithCopyWarning / writing into a slice.
in_test = file_df["patient_id"].isin(test_ids)
test_df = file_df.loc[in_test].copy()

# create train df
train_df = file_df.loc[~in_test].copy()
print("\nTraining set class distribution (Pre-Augment)\n",train_df["Murmur"].value_counts())
print("\nAugmenting present class audio data\n")
print(train_df)
# Filter the non-"Absent" training recordings: a recording is kept only when its
# auscultation location (second "_"-separated token of the file name) is listed in
# the patient's "+"-separated "Murmur locations". No audio is created or modified.
filtered_train = train_df.loc[train_df["Murmur"]!="Absent"].copy()
off_site_filenames = []
for row_idx, row in tqdm(filtered_train.iterrows(),total=filtered_train.shape[0]):
    murmur_locations = row["Murmur locations"]
    # A missing value arrives as NaN (a float) and has no .split(); without
    # location information the recording cannot be judged, so it is kept.
    if pd.isna(murmur_locations):
        continue
    ausc_location = row["filenames"].split("/")[1].split(".")[0].split("_")[1]
    if ausc_location.strip() not in murmur_locations.split("+"):
        off_site_filenames.append(row["filenames"])

# remove every rejected recording in one pass instead of one DataFrame.drop per row
train_df = train_df.loc[~train_df["filenames"].isin(off_site_filenames)]
train_df = train_df.drop(columns="Murmur locations")
print("\nTraining set class distribution (Post Signal Augment)\n",train_df["Murmur"].value_counts())

In [None]:
train_df = train_df.reset_index(drop=True)
# extracting features and augmenting mfcc spectogram for training set
print("\nExtracting features and augmenting present class mfcc spectogram\n")
# NOTE(review): training audio is read from augment_path while the test cell
# reads from data_path - confirm that this is intended.
train_features = []
for row_idx, row in tqdm(train_df.iterrows(),total=train_df.shape[0]):
    S,sr = librosa.load(augment_path+row["filenames"],sr=4000)
    scaled_mfccs , mfccs = mfccs_features_extract(S,sr)
    train_features.append(scaled_mfccs)
# Assign the whole column at once: writing an array into a single cell with
# .loc[i, "data"] is treated as a list-like to broadcast and can raise
# depending on the pandas version. dtype=object keeps one array per row.
train_df["data"] = pd.Series(train_features, index=train_df.index, dtype=object)
print("\nTraining set class distribution (Post Spectrogram augment))\n",train_df["Murmur"].value_counts())

In [None]:

test_df = test_df.reset_index(drop=True)
# extract one averaged-MFCC vector per test recording
test_features = []
for row_idx, row in tqdm(test_df.iterrows(),total=test_df.shape[0]):
    S,sr = librosa.load(data_path+row["filenames"],sr=4000)
    test_scaled_mfccs, test_mfccs = mfccs_features_extract(S,sr)
    test_features.append(test_scaled_mfccs)
# Assign the whole column at once: writing an array into a single cell with
# .loc[i, "data"] is treated as a list-like to broadcast and can raise
# depending on the pandas version. dtype=object keeps one array per row.
test_df["data"] = pd.Series(test_features, index=test_df.index, dtype=object)
# Seeded shuffle so Restart & Run All reproduces the same row order.
# reset_index() (without drop) keeps the pre-shuffle position in an "index" column.
test_df = test_df.sample(frac=1, random_state=42).reset_index()
print("\nTesting set class distribution (final)\n",test_df["Murmur"].value_counts())

In [None]:
# manual splitting and assigning data to variables
X_test = np.array(test_df["data"].tolist())
X_train = np.array(train_df["data"].tolist())
y_test = test_df["Murmur"]
y_train = train_df["Murmur"]


# export data
import os
import pickle

# destination -> object; written in the same order as before
pickle_targets = {
    "./1.0.1/pickled_data/X_train_imbalanced_noaug_unknown.pickle": X_train,
    "./1.0.1/pickled_data/X_test_imbalanced_noaug_unknown.pickle": X_test,
    "./1.0.1/pickled_data/y_train_imbalanced_noaug_unknown.pickle": y_train,
    "./1.0.1/pickled_data/y_test_imbalanced_noaug_unknown.pickle": y_test,
}
for pickle_path, payload in pickle_targets.items():
    # create the output folder on a fresh checkout instead of failing in open()
    os.makedirs(os.path.dirname(pickle_path), exist_ok=True)
    # `with` closes the file even when pickle.dump raises
    with open(pickle_path, "wb") as pickle_out:
        pickle.dump(payload, pickle_out)

In [None]:
# dimensions of the first test feature vector
np.shape(X_test[0])