In [None]:
from google.colab import drive
import pandas as pd
import os
import wave
import numpy as np
import re

In [None]:
# Mount Google Drive so that the dataset folders are reachable from the Colab runtime.
drive.mount('/content/drive') # Change the mount point if needed
# The two paths below are empty placeholders and must be filled in before running the rest of the notebook.
raw_database_path = "" # Path of the raw_database/ directory (the spreadsheet and the .wav files are read from it)
databases_infos_path = "" # Directory where the .xlsx files with database information are read and saved

In [None]:
# Load the dataset spreadsheet (FetalPCGSpreadsheet.xlsx, expected inside raw_database_path) into a DataFrame.
database_info = pd.read_excel(os.path.join(raw_database_path, "FetalPCGSpreadsheet.xlsx"))

In [None]:
# Plain-text dump of the spreadsheet as loaded, to inspect its columns before any cleaning.
print(database_info)

In [None]:
# Keep only the patient identifier ('Subject ID') and the CTG heart-rate column, whose long
# spreadsheet header is shortened to 'BPM'. According to that header, each number is the average
# fetal heart rate over 10 seconds of the signal and brackets denote unreported values.
database_info = (
    database_info
    .rename(columns={'CTG Heart-rate (BPM). Each number corresponds to the average fetal heart-rate over 10 seconds of the signal, whenever available. Brackets denote unreported values': 'BPM'})
    .loc[:, ['Subject ID', 'BPM']]
)
print(database_info)

In [None]:
# Count the missing (NaN) values in each column.
database_info.isna().sum()

In [None]:
# Drop the rows that have no 'Subject ID'.
# This can be done because the rows without an identifier were only used to add extra comments to the same patient.
database_info = database_info.dropna(subset=['Subject ID'])

# Show the remaining rows followed by the per-column count of missing values.
print(database_info, "\n", database_info.isna().sum())

In [None]:
# Make the patient IDs match the recording file names, except for twins.
# The IDs are cast to text first; every ID starting with "F93" becomes "f" followed by its last
# three characters read as an integer (so leading zeros are dropped). Other IDs are left as they are.
database_info["Subject ID"] = (
    database_info["Subject ID"]
    .astype(str)
    .map(lambda raw_id: f"f{int(raw_id[-3:])}" if raw_id.startswith("F93") else raw_id)
)

print(database_info['Subject ID'])

In [None]:
# Rewrites the IDs of the twin instances, whose notation differs, into the file-name form.
def transform_subject_id(subject_id):
    """Return ``subject_id`` as ``f<digits>[-<digits>]`` when it starts with such a number.

    The ID must begin with an optional upper-case ``F`` followed by digits, optionally a
    hyphen and more digits. Whatever follows on the same line (for example a ``:`` and a
    note) is discarded. An ID that does not start this way is returned unchanged.
    """
    parsed = re.match(r'^(F?\d+-?\d*):?.*$', subject_id)
    if parsed is None:
        return subject_id
    core = parsed.group(1).replace('F', '').lower()
    return f"f{core}"

# Raw string: the value is unchanged, but "\d" no longer counts as an invalid string escape
# (which raises a DeprecationWarning/SyntaxWarning on recent Python versions).
# NOTE(review): `pattern` is not referenced by any visible cell; confirm before removing it.
pattern = r'^f\d+$'

# Rewrite every ID through transform_subject_id (IDs it does not recognise are left unchanged).
database_info['Subject ID'] = database_info['Subject ID'].apply(transform_subject_id)

# to_string() prints every row instead of pandas' truncated preview.
print(database_info['Subject ID'].to_string())

In [None]:
# Discard the recordings without a BPM label: cells holding exactly "[]" are turned into NaN,
# the rows whose BPM is NaN are dropped, and the index is renumbered because dropping rows
# leaves it non-sequential.
database_info = (
    database_info
    .replace("[]", np.nan)
    .dropna(subset=['BPM'])
    .reset_index(drop=True)
)
print(database_info.to_string())

In [None]:
# Confirm that no missing values remain: every per-column count is expected to be 0.
database_info.isna().sum()

In [None]:
# Collect the IDs of the labelled recordings (those with a BPM value) and compare their count with
# the number of fetal .wav files (file name starting with 'f') found in the raw database folder.
# The printed messages are in Portuguese: "number of fetal recordings with BPM" and
# "total number of fetal recordings".

lista_gravacoes_feto = database_info['Subject ID'].tolist()
print("Número de gravações do feto com BPM:", len(lista_gravacoes_feto))
lista_gravacoes_feto_dataset_completo = [
    file_name
    for file_name in os.listdir(raw_database_path)
    if file_name.startswith('f') and file_name.endswith(".wav")
]
print("Número de gravações totais do feto:", len(lista_gravacoes_feto_dataset_completo))


In [None]:
# Theoretical duration (in seconds) of each recording, derived from its BPM label: the label is a
# "-"-separated list of values and each value corresponds to 10 seconds of signal, so the duration
# is (number of "-" + 1) * 10.
# The labels are cast to text before counting so that a label pandas parsed as a bare number
# (a single value, hence no "-") yields 10 instead of raising AttributeError.
database_info['suposed_duration'] = (
    database_info['BPM'].astype(str).str.count("-").add(1).mul(10)
)
database_info

Now adding more audio file information (number of channels, sample width, frame rate, number of frames and duration) to `database_info`.

In [None]:
def get_wav_info(file_name):
    """Read the header of a WAV file and report its basic audio properties.

    Parameters
    ----------
    file_name
        Path of the WAV file; it is opened read-only and closed before returning.

    Returns
    -------
    tuple
        ``(num_channels, sample_width, frame_rate, num_frames, duration)``, where
        ``sample_width`` is the sample width in bytes and ``duration`` is
        ``num_frames / frame_rate``.
    """
    with wave.open(file_name, 'rb') as wav_file:
        header = wav_file.getparams()

    duration = header.nframes / header.framerate
    return header.nchannels, header.sampwidth, header.framerate, header.nframes, duration

In [None]:
# Read the header of every labelled recording ("<Subject ID>.wav" inside raw_database_path) and
# append one column per WAV property to database_info.
# The column order matches the order of the tuple returned by get_wav_info.
wav_info_columns = ('num_channels', 'sample_width', 'frame_rate', 'num_frames', 'duration')

wav_info_rows = [
    get_wav_info(os.path.join(raw_database_path, f"{subject_id}.wav"))
    for subject_id in database_info['Subject ID']
]

# Assign by position so that the columns are still created when there are no recordings.
for position, column_name in enumerate(wav_info_columns):
    database_info[column_name] = [row[position] for row in wav_info_rows]

In [None]:
# Display the table, which now also carries the WAV header columns.
database_info

In [None]:
# Check whether the audio files are standardized: for the channel count, the frame rate and the
# sample width, print how many recordings have each distinct value.
for header_field in ("num_channels", "frame_rate", "sample_width"):
    print(database_info[header_field].value_counts())

In [None]:
# Discard the recordings whose frame rate is 8000 (the value that differs from the rest),
# then print the frame-rate counts again to confirm the result.
database_info = database_info.loc[database_info["frame_rate"] != 8000]
print(database_info["frame_rate"].value_counts())

In [None]:
# Difference between the duration read from each WAV header and the theoretical duration derived
# from the BPM label (positive when the audio is longer than its label implies).
# `assign` returns a new frame instead of writing a column into a frame that is the result of
# boolean row filtering, which avoids pandas' SettingWithCopyWarning.
database_info = database_info.assign(
    duration_diff=database_info["duration"] - database_info["suposed_duration"]
)
database_info

In [None]:
# Summary statistics of the measured duration and of the theoretical duration, printed one after the other.
for duration_column in ("duration", "suposed_duration"):
    print(database_info[duration_column].describe())

In [None]:
# Save the cleaned table as database_info.xlsx inside databases_infos_path.
# NOTE(review): to_excel also writes the DataFrame index by default, and the index is no longer
# sequential after the row filtering; confirm that readers of this file expect that column.
database_info.to_excel(os.path.join(databases_infos_path, "database_info.xlsx"))