In [3]:
import librosa
import soundfile as sf
import os
import re

In [4]:
def extract_cm_lines(transcript_path, out_path="cm_lines.txt"):
    """Collect CM's lines from a speaker-labelled transcript.

    Every transcript line that starts with the ``{CM}`` tag is kept. The tag
    is dropped and the remaining text is normalised so it can be matched
    against segment text later: it is lower-cased, curly apostrophes are
    converted to straight ones, and every character that is not a word
    character, whitespace, an apostrophe or a hyphen is removed.

    Args:
        transcript_path: Path of the transcript file, read as UTF-8.
        out_path: File that receives the normalised lines, one per line,
            written as UTF-8. Defaults to ``"cm_lines.txt"`` in the current
            working directory. Pass ``None` ` to skip writing.

    Returns:
        list[str]: The normalised CM lines in transcript order.
    """
    tag = "{CM}"
    cm_lines = []
    with open(transcript_path, 'r', encoding='utf-8') as transcript:
        for line in transcript:
            line = line.strip()
            if not line.startswith(tag):
                continue
            line_text = line[len(tag):].strip().lower()
            # Unify apostrophes first so the filter below only needs to
            # whitelist the straight form.
            line_text = line_text.replace('’', "'")
            line_text = re.sub(r"[^\w\s'-]", '', line_text)
            cm_lines.append(line_text)

    # Written after the transcript is closed, and with the same encoding it
    # was read with, so accented characters survive on every platform.
    if out_path is not None:
        with open(out_path, 'w', encoding='utf-8') as out_file:
            out_file.writelines(text + '\n' for text in cm_lines)
    return cm_lines

In [5]:
def load_segments_and_text(segments_path, prefix):
    """Read the timestamps of every utterance whose line starts with ``prefix``.

    Each matching line is split on whitespace; the first field is used as the
    utterance ID and the third and fourth fields are parsed as the start and
    end times. The second field is ignored.

    Args:
        segments_path: Path of the segments file.
        prefix: Only lines beginning with this string are kept.

    Returns:
        dict: Utterance ID mapped to a ``(start, end)`` tuple of floats.
    """
    timestamps = {}
    with open(segments_path, 'r') as handle:
        for raw in handle:
            if not raw.startswith(prefix):
                continue
            fields = raw.strip().split()
            key = fields[0]
            timestamps[key] = (float(fields[2]), float(fields[3]))
    return timestamps

In [None]:
def extract_segments(audio_path, segments_dict, out_dir):
    """Cut one long recording into a separate WAV file per utterance.

    Args:
        audio_path: Path of the recording to split. It is loaded with its
            original sample rate (no resampling).
        segments_dict: Mapping of utterance ID to a ``(start, end)`` pair.
            Each time is multiplied by the sample rate and truncated to an
            integer sample index.
        out_dir: Directory that receives the clips; created if missing.
            Each clip is written as ``<utt_id>.wav``,
            e.g. ``output/cm/CM-BBC_CMI_1995_06_21-0190.wav``.
    """
    os.makedirs(out_dir, exist_ok=True)
    # sr=None keeps the file's own rate (noted as 16000 for this recording);
    # it may be adjusted later when extracting spectrograms.
    waveform, rate = librosa.load(audio_path, sr=None)
    for name, (t0, t1) in segments_dict.items():
        lo, hi = int(t0 * rate), int(t1 * rate)
        target = os.path.join(out_dir, f"{name}.wav")
        sf.write(target, waveform[lo:hi], rate)


In [None]:
# The recording ID appears in the transcript name, the audio name and the
# utterance-ID prefixes, so it is defined once to keep them in sync.
RECORDING_ID = 'BBC_CMI_1995_06_21'
DATA_ROOT = 'data/program_choinnich_speaker_labelled'

# Transcript file: full transcript, with {CM} preceding his lines.
# Speaker changes are marked by line breaks.
transcript_path = f'{DATA_ROOT}/transcripts/{RECORDING_ID}.txt'
# Segments file: utterance ID and timestamps in seconds.
segments_path = f'{DATA_ROOT}/data/segments'
# Text file: utterance ID and the first few words of the transcript, normalised.
# text_path = 'data/program_choinnich_speaker_labelled/data/text'
# Audio is a single WAV file for the whole recording.
audio_path = f'{DATA_ROOT}/wavs/{RECORDING_ID}.wav'

cm_lines = extract_cm_lines(transcript_path)

# CM's utterance IDs carry a "CM-" prefix; all other IDs start with the bare
# recording ID.
non_cm_segments = load_segments_and_text(segments_path, RECORDING_ID)
cm_segments = load_segments_and_text(segments_path, f"CM-{RECORDING_ID}")

# Save CM and non-CM segments into different folders.
extract_segments(audio_path, cm_segments, 'segmented_wavs/cm')
extract_segments(audio_path, non_cm_segments, 'segmented_wavs/non_cm')


16000
16000
