In [None]:
%load_ext nb_black

# 0. Initiate

In [None]:
# Load packages
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import random
from scipy.io.wavfile import write
from tqdm import tqdm
import librosa
import librosa.display
import math

import warnings
warnings.simplefilter(action="ignore")

from team_code_old import *
from helper_code import *
from DeepNet.HumBugDB.lib.PyTorch.vggish.vggish_input import waveform_to_examples

In [None]:
# Define paths
# Folder with the raw training data; passed to find_patient_files and load_recordings below.
data_folder = "data/00_raw/training_data/"
# Output folder (not referenced by any of the cells shown in this part of the notebook).
output_folder = "data/test/"

In [None]:
def get_features(data, recordings):
    """Build the flat feature vector for one patient.

    Parameters
    ----------
    data
        Patient header data, passed unchanged to the ``get_*`` helper functions.
    recordings
        Sequence of array-likes, expected to line up one-to-one with
        ``get_locations(data)``. If the two lengths differ, all recording
        features are left at zero.

    Returns
    -------
    numpy.ndarray
        1-D array laid out as ``[age_group, age, female, male, height, weight,
        is_pregnant]`` followed by ``[present, mean, var, skew]`` for each of
        the locations AV, MV, PV, TV, PhC (in that order). Because the
        age-group label is stacked together with the numeric values, NumPy is
        expected to coerce the whole vector to strings.
    """
    # Imported locally so the function does not rely on an `sp` alias leaking
    # into the notebook namespace through a star import.
    from scipy.stats import skew

    # Extract the age group and replace with the (approximate) number of months for the middle of the age group.
    age_group = get_age(data)

    if compare_strings(age_group, 'Neonate'):
        age = 0.5
    elif compare_strings(age_group, 'Infant'):
        age = 6
    elif compare_strings(age_group, 'Child'):
        age = 6 * 12
    elif compare_strings(age_group, 'Adolescent'):
        age = 15 * 12
    elif compare_strings(age_group, 'Young Adult'):
        age = 20 * 12
    else:
        age = float('nan')

    # Extract sex. Use one-hot encoding: index 0 is female, index 1 is male.
    sex = get_sex(data)

    sex_features = np.zeros(2, dtype=int)
    if compare_strings(sex, 'Female'):
        sex_features[0] = 1
    elif compare_strings(sex, 'Male'):
        sex_features[1] = 1

    # Extract height and weight.
    height = get_height(data)
    weight = get_weight(data)

    # Extract pregnancy status.
    is_pregnant = get_pregnancy_status(data)

    # Extract recording locations and data. Identify when a location is present, and compute the mean, variance, and skewness of
    # each recording. If there are multiple recordings for one location, then extract features from the last recording.
    locations = get_locations(data)

    recording_locations = ['AV', 'MV', 'PV', 'TV', 'PhC']
    num_recording_locations = len(recording_locations)
    # One row per location: [present flag, mean, variance, skewness].
    recording_features = np.zeros((num_recording_locations, 4), dtype=float)
    num_locations = len(locations)
    num_recordings = len(recordings)
    if num_locations==num_recordings:
        for i in range(num_locations):
            for j in range(num_recording_locations):
                # Empty recordings are skipped, so their location keeps an all-zero row.
                if compare_strings(locations[i], recording_locations[j]) and np.size(recordings[i])>0:
                    recording_features[j, 0] = 1
                    recording_features[j, 1] = np.mean(recordings[i])
                    recording_features[j, 2] = np.var(recordings[i])
                    recording_features[j, 3] = skew(recordings[i])
    # Row-major flatten: all four statistics of AV first, then MV, and so on.
    recording_features = recording_features.flatten()

    features = np.hstack(([age_group], [age], sex_features, [height], [weight], [is_pregnant], recording_features))

    return np.asarray(features)

# 1. Load data

In [None]:
# Load recordings.
def load_recordings(data_folder, data, get_frequencies=False):
    """Load every recording listed in a patient's header text.

    The lines following the first line of ``data`` are read as one row per
    recording, with space-separated fields: field 0 is kept as the location
    label and field 2 is used as the recording file name relative to
    ``data_folder``.

    Parameters
    ----------
    data_folder
        Folder containing the recording files.
    data
        Patient header text.
    get_frequencies
        When true, the per-recording sampling frequencies are returned as well.

    Returns
    -------
    tuple
        ``(recordings, locations)`` or, if ``get_frequencies`` is true,
        ``(recordings, locations, frequencies)``; all lists are in header order.
    """
    # Local import: the notebook does not import `os` itself and would
    # otherwise only see it if a star import happens to re-export it.
    import os

    num_locations = get_num_locations(data)
    # Skip the first header line and keep exactly one row per recording.
    recording_information = data.split('\n')[1:num_locations+1]

    recordings = list()
    locations = list()
    frequencies = list()
    for i in range(num_locations):
        entries = recording_information[i].split(' ')
        recording_file = entries[2]
        filename = os.path.join(data_folder, recording_file)
        recording, frequency = load_wav_file(filename)
        recordings.append(recording)
        locations.append(entries[0])
        frequencies.append(frequency)

    if get_frequencies:
        return recordings, locations, frequencies
    else:
        return recordings, locations

In [None]:
 # Find the patient data files.
patient_files = find_patient_files(data_folder)
num_patient_files = len(patient_files)

# Label vocabularies; the position of a class in its list is its one-hot index.
murmur_classes = ['Present', 'Unknown', 'Absent']
num_murmur_classes = len(murmur_classes)
outcome_classes = ['Abnormal', 'Normal']
num_outcome_classes = len(outcome_classes)

# Per-patient accumulators, filled in the same order as `patient_files`.
features = list()
murmurs = list()
outcomes = list()
recordings = list()
locations = list()
for i in tqdm(range(num_patient_files)):

    # Load the current patient data and recordings.
    current_patient_data = load_patient_data(patient_files[i])
    current_recordings, current_locations = load_recordings(data_folder, current_patient_data)
    # Scale the samples by 2**15 (presumably 16-bit PCM input -> [-1, 1); confirm against load_wav_file).
    current_recordings = [r / 32768 for r in current_recordings]
    num_recordings_code = get_num_locations(current_patient_data)
    num_recordings = len(current_recordings)
    recordings.append(current_recordings)
    locations.append(current_locations)

    # Extract features.
    current_features = get_features(current_patient_data, current_recordings)
    # Field 0 of the first header line is stored as "id" and field 2 as "hz".
    # Parse that line on its own: taking the third space-separated token of the
    # whole text and chopping three characters off only works when the first
    # recording row starts with a two-character location label.
    header_fields = current_patient_data.splitlines()[0].split()
    current_features = np.insert(current_features,0,header_fields[0])
    current_features = np.insert(current_features,1,header_fields[2])
    current_features = np.insert(current_features,2,num_recordings)
    current_features = np.insert(current_features,3,num_recordings_code)
    features.append(current_features)

    # Extract labels and use one-hot encoding.
    ## Murmur
    current_murmur = np.zeros(num_murmur_classes, dtype=int)
    murmur = get_murmur(current_patient_data)
    if murmur in murmur_classes:
        j = murmur_classes.index(murmur)
        current_murmur[j] = 1
    murmurs.append(current_murmur)
    ## Outcome
    current_outcome = np.zeros(num_outcome_classes, dtype=int)
    outcome = get_outcome(current_patient_data)
    if outcome in outcome_classes:
        j = outcome_classes.index(outcome)
        current_outcome[j] = 1
    outcomes.append(current_outcome)

# Stack the per-patient rows into 2-D arrays (one row per patient).
features = np.vstack(features)
murmurs = np.vstack(murmurs)
outcomes = np.vstack(outcomes)

# Combine dataframes
# The first four columns are the values inserted in the loading loop; the rest
# follows the layout of the vector returned by get_features.
base_feature_columns = ["id", "hz", "num_recordings", "num_recordings_code", "age_group", "age", "female", "male", "height", "weight", "is_pregnant"]
# get_features flattens a (location, statistic) matrix row by row, so the
# location index has to vary slowest for the names to match the values.
recording_feature_columns = [
    f"standard_recording_features_{i}_{t}"
    for i in range(5)
    for t in ["1", "mean", "var", "skew"]
]
features_pd = pd.DataFrame(features, columns=base_feature_columns + recording_feature_columns)
murmurs_pd = pd.DataFrame(murmurs, columns=murmur_classes)
outcomes_pd = pd.DataFrame(outcomes, columns=outcome_classes)

# One row per patient and one column per location label; each cell holds that
# patient's recording for the location (NaN when there is none). When a label
# occurs more than once for a patient, the last recording wins. Building the
# frame from records avoids DataFrame.append (removed in pandas 2.0) and
# chained `.iloc` assignment.
location_names = np.unique([item for sublist in locations for item in sublist])
recordings_pd = pd.DataFrame(
    [
        dict(zip(patient_locations, patient_recordings))
        for patient_locations, patient_recordings in zip(locations, recordings)
    ],
    columns=location_names,
)

# Patient-level table: demographics, labels and raw recordings (the summary
# recording statistics are kept in features_pd only).
complete_pd = pd.concat([features_pd.loc[:, [c for c in features_pd.columns if "standard_recording_features_" not in c]], murmurs_pd, outcomes_pd, recordings_pd], axis=1)

# Prep
# Age is stored in months; convert to years.
# NOTE(review): `astype(int, errors='ignore')` silently keeps the float values
# when the integer cast fails, so whether fractional months are truncated
# presumably depends on the data (e.g. missing ages) - confirm this is intended.
features_pd["age_year"] = features_pd['age'].astype(float).astype(int, errors='ignore')/12
complete_pd["age_year"] = complete_pd['age'].astype(float).astype(int, errors='ignore')/12
# Collapse the one-hot murmur label into one code: 1 = Present, -1 = Absent, 0 = anything else.
complete_pd["murmur"]= [1 if p==1 else -1 if a==1 else 0 for p,a  in zip(complete_pd.Present, complete_pd.Absent)]
# Every value other than the string "False" is mapped to 1.
complete_pd["is_pregnant"] = complete_pd["is_pregnant"].apply(lambda x: 0 if x=="False" else 1)

# Get recordings length
# NOTE(review): only columns whose name contains "location" are treated as
# recording columns - confirm the location labels produced by load_recordings
# follow that naming, otherwise no length column is created here.
for c in complete_pd.columns:
    if "location" in c and "length" not in c:
        # Replace missing recordings by empty lists so that len() is defined for
        # every row. The column is reassigned as a whole instead of writing
        # through `complete_pd[c].loc[...]`, which only updates a temporary
        # object when pandas copy-on-write is active.
        is_missing = complete_pd[c].isnull()
        complete_pd[c] = pd.Series(
            [[] if missing else recording for recording, missing in zip(complete_pd[c], is_missing)],
            index=complete_pd.index,
            dtype=object,
        )
        # Length in seconds = number of samples / sampling frequency.
        complete_pd[f"length_sec_{c}"] = complete_pd[c].apply(len)/complete_pd['hz'].astype(float).astype("Int64")

# Label nans correctly
complete_pd = complete_pd.replace("nan", np.nan)

In [None]:
# Columns written to the wide-format export: patient metadata, labels and the
# per-location recording lengths.
complete_pd_export_columns = [
    'id', 'hz', 'num_recordings', 'num_recordings_code', 'age_group', 'age',
    'female', 'male', 'height', 'weight', 'is_pregnant',
    'Present', 'Unknown', 'Absent', 'Abnormal', 'Normal',
    'age_year', 'murmur',
    'length_sec_location_0', 'length_sec_location_1', 'length_sec_location_2',
    'length_sec_location_3', 'length_sec_location_4', 'length_sec_location_5',
]
complete_pd[complete_pd_export_columns].to_csv("data/00_raw/complete_pd.csv", index=False)

In [None]:
# Reshape to long format: the suffix of every "length_sec_location_<k>" column
# becomes the "location" index level, with "id" identifying the patient.
complete_pd_long = pd.wide_to_long(
    complete_pd,
    stubnames=['length_sec_location_'],
    i='id',
    j='location',
).reset_index()

In [None]:
# Columns written to the long-format export: patient metadata, labels, the
# location index and the recording length for that location.
complete_pd_long_export_columns = [
    'id', 'hz', 'num_recordings', 'num_recordings_code', 'age_group', 'age',
    'female', 'male', 'height', 'weight', 'is_pregnant',
    'Present', 'Unknown', 'Absent', 'Abnormal', 'Normal',
    'age_year', 'murmur',
    'location', 'length_sec_location_',
]
complete_pd_long[complete_pd_long_export_columns].to_csv("data/00_raw/complete_pd_long.csv", index=False)

In [None]:
# Shape of the long table restricted to rows with a positive recording length.
complete_pd_long[complete_pd_long["length_sec_location_"] > 0].shape

# 2. Data distributions

In [None]:
# Descriptive statistics of the patient-level table
# (describe() already returns a DataFrame, so it is displayed directly).
complete_pd.describe()

In [None]:
# Missing values: one heatmap row per patient, one column per variable
missing_value_columns = ['age', 'female', 'male', 'height', 'weight', 'is_pregnant', 'murmur']
fig, ax = plt.subplots(figsize=(10,7))
is_missing = complete_pd[missing_value_columns].astype(float).isna()
sns.heatmap(is_missing, cbar=False, cmap="Greys", ax=ax)
plt.xticks(fontsize=12, rotation=0)
# Hide the per-patient tick labels on the y axis.
ax.set_yticks([])

In [None]:
# Missing values, restricted to patients whose age is missing
fig, ax = plt.subplots(figsize=(10,7))
rows_without_age = complete_pd[complete_pd["age"].isnull()]
sns.heatmap(rows_without_age.isnull(), cbar=False, cmap="Greys", ax=ax)

In [None]:
# Missing values, restricted to patients flagged as pregnant
fig, ax = plt.subplots(figsize=(10,7))
pregnant_rows = complete_pd[complete_pd["is_pregnant"] == 1]
sns.heatmap(pregnant_rows.isnull(), cbar=False, cmap="Greys", ax=ax)

In [None]:
# Weight distribution
fig, ax = plt.subplots(figsize=(10,7))
# The feature table is built from a string-typed array, so cast to float to get
# numeric bins instead of one category per distinct string.
sns.histplot(complete_pd.weight.astype(float), bins=50, ax=ax)

In [None]:
# Height distribution
fig, ax = plt.subplots(figsize=(10,7))
# The feature table is built from a string-typed array, so cast to float to get
# numeric bins instead of one category per distinct string.
sns.histplot(complete_pd.height.astype(float), bins=50, ax=ax)

In [None]:
# Height vs. weight, coloured by age in years
fig, ax = plt.subplots(figsize=(10,7))
# x and y are passed by keyword (positional x/y is rejected by seaborn >= 0.12)
# and cast to float because the feature columns are stored as strings.
sns.scatterplot(
    x=complete_pd.weight.astype(float),
    y=complete_pd.height.astype(float),
    hue=complete_pd.age_year,
    ax=ax,
)

In [None]:
# Height vs. weight, coloured by the female indicator
fig, ax = plt.subplots(figsize=(10,7))
# x and y are passed by keyword (positional x/y is rejected by seaborn >= 0.12)
# and cast to float because the feature columns are stored as strings.
sns.scatterplot(
    x=complete_pd.weight.astype(float),
    y=complete_pd.height.astype(float),
    hue=complete_pd.female,
    ax=ax,
)


In [None]:
# Age distribution (in years)
fig, ax = plt.subplots(figsize=(10,7))
age_in_years = complete_pd["age_year"]
sns.histplot(data=age_in_years, bins=50, ax=ax)

In [None]:
# Height vs. age, coloured by the female indicator
fig, ax = plt.subplots(figsize=(10,7))
# x and y are passed by keyword (positional x/y is rejected by seaborn >= 0.12);
# height is cast to float because the feature columns are stored as strings.
sns.scatterplot(
    x=complete_pd.age_year,
    y=complete_pd.height.astype(float),
    hue=complete_pd.female,
    ax=ax,
)

In [None]:
# Weight vs. age, coloured by the female indicator
fig, ax = plt.subplots(figsize=(10,7))
# x and y are passed by keyword (positional x/y is rejected by seaborn >= 0.12);
# weight is cast to float because the feature columns are stored as strings.
sns.scatterplot(
    x=complete_pd.age_year,
    y=complete_pd.weight.astype(float),
    hue=complete_pd.female,
    ax=ax,
)

In [None]:
# Descriptive statistics for the patients flagged as pregnant
pregnant_patients = complete_pd[complete_pd["is_pregnant"] == 1]
pregnant_patients.describe()

In [None]:
# Labels: clinical outcome (rows) against the murmur labels (columns)
outcome_label_rows = [complete_pd["Abnormal"], complete_pd["Normal"]]
murmur_label_columns = [complete_pd["Present"], complete_pd["Unknown"], complete_pd["Absent"], complete_pd["murmur"]]
pd.crosstab(index=outcome_label_rows, columns=murmur_label_columns)

In [None]:
# Labels: age in years (rows) against the murmur labels (columns)
age_rows = [complete_pd["age_year"]]
murmur_label_columns = [complete_pd["Present"], complete_pd["Unknown"], complete_pd["Absent"], complete_pd["murmur"]]
pd.crosstab(index=age_rows, columns=murmur_label_columns)

In [None]:
# Labels: female indicator (rows) against the murmur labels (columns)
female_rows = [complete_pd["female"]]
murmur_label_columns = [complete_pd["Present"], complete_pd["Unknown"], complete_pd["Absent"], complete_pd["murmur"]]
pd.crosstab(index=female_rows, columns=murmur_label_columns)

In [None]:
# Labels: number of recordings listed in the header (rows) against the murmur labels (columns)
num_recordings_rows = [complete_pd["num_recordings_code"]]
murmur_label_columns = [complete_pd["Present"], complete_pd["Unknown"], complete_pd["Absent"], complete_pd["murmur"]]
pd.crosstab(index=num_recordings_rows, columns=murmur_label_columns)

In [None]:
# Height vs. weight, coloured by the murmur code (1 = Present, -1 = Absent, 0 = otherwise)
fig, ax = plt.subplots(figsize=(10,7))
# x and y are passed by keyword (positional x/y is rejected by seaborn >= 0.12)
# and cast to float because the feature columns are stored as strings.
sns.scatterplot(
    x=complete_pd.weight.astype(float),
    y=complete_pd.height.astype(float),
    hue=complete_pd.murmur,
    ax=ax,
)


In [None]:
# Pair every per-location length column with the murmur code. Plain column
# indexing replaces the original eval() on a formatted attribute path.
recordings_lengths_aux = [[complete_pd[f"length_sec_location_{i}"], complete_pd.murmur] for i in range(6)]
# Flatten the six (lengths, murmur) pairs into two parallel lists.
recordings_lengths = [e[0] for e in recordings_lengths_aux]
recordings_lengths = [item for sublist in recordings_lengths for item in sublist]
murmur_label = [e[1] for e in recordings_lengths_aux]
murmur_label = [item for sublist in murmur_label for item in sublist]
df_lengths = pd.DataFrame({'length': recordings_lengths, 'murmur': murmur_label})
# Keep only entries with a positive length, i.e. drop the empty recording slots.
df_lengths = df_lengths.loc[df_lengths.length>0, :]

In [None]:
# Number of recordings that belong to patients with murmur code 1 (Present)
murmur_present_lengths = df_lengths[df_lengths["murmur"] == 1]
len(murmur_present_lengths)

In [None]:
# Recording length: all recordings split by murmur code, then one panel per code
fig, ax = plt.subplots(figsize=(10,20),nrows=4)
# Binning shared by the four panels.
length_hist_kwargs = dict(x="length", stat="count", binrange=[0,65], binwidth=1)
sns.histplot(df_lengths, hue="murmur", ax=ax[0], **length_hist_kwargs)
sns.histplot(df_lengths[df_lengths["murmur"] == -1], ax=ax[1], color="yellow", **length_hist_kwargs)
sns.histplot(df_lengths[df_lengths["murmur"] == 0], ax=ax[2], color="red", **length_hist_kwargs)
sns.histplot(df_lengths[df_lengths["murmur"] == 1], ax=ax[3], color="black", **length_hist_kwargs)


In [None]:
# Number of recordings per patient: all patients split by murmur code, then one panel per code
fig, ax = plt.subplots(figsize=(10,20),nrows=4)
# num_recordings comes from the string-typed feature array; cast it to a number
# so the numeric binrange/binwidth below apply to counts instead of categories.
num_recordings_pd = complete_pd[["num_recordings", "murmur"]].astype({"num_recordings": float})
sns.histplot(num_recordings_pd, x="num_recordings", hue="murmur", ax=ax[0], stat="count", binrange=[0,6], binwidth=1)
sns.histplot(num_recordings_pd.loc[num_recordings_pd.murmur==-1], x="num_recordings", ax=ax[1], stat="count", binrange=[0,6], binwidth=1, color="yellow")
sns.histplot(num_recordings_pd.loc[num_recordings_pd.murmur==0], x="num_recordings", ax=ax[2], stat="count", binrange=[0,6], binwidth=1, color="red")
sns.histplot(num_recordings_pd.loc[num_recordings_pd.murmur==1], x="num_recordings", ax=ax[3], stat="count", binrange=[0,6], binwidth=1, color="black")
