# SongSay Lyrics Alignment

Dataset:

.ass files - ground-truth start time of each sentence; .txt files - previously predicted start time of each sentence; _all.txt files - previously predicted start time of each word in each sentence; .wav files - 700 full sound tracks, roughly 3-5 minutes each; _source_3.wav files - separated vocal tracks

In [None]:
!pip install -r requirements.txt

In [None]:
##imports
import os
import pickle
import re

import boto3
import crepe
import librosa
import madmom
import numpy as np
import scipy
from tqdm import tqdm

### Loading Audio Files

In [None]:
# Ensure the download directory exists
# NOTE(review): DOWNLOAD_DIR is not assigned in this cell or in any cell above it. The only
# visible assignment is in the later .ass download cell ("/work/ass_files/"), while the
# recorded output of this cell shows files landing in /work/audio/ - set it explicitly
# before running this cell (TODO confirm the intended value).
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

# Initialize S3 client with credentials
# NOTE(review): AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and AWS_REGION are assigned in the
# text-file download cell further down, so on a fresh kernel that cell's settings must be
# executed first or this call raises NameError.
# The client is stored in the module-level name `s3_client`, which the download helpers
# read as a global.
s3_client = boto3.client(
    "s3",
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    region_name=AWS_REGION,
)

def download_wav_files(bucket, prefix, download_dir, max_files=100):
    """
    Download separated vocal tracks (keys ending in "source_3.wav") from S3.

    Uses the module-level `s3_client`. Objects under `prefix` are listed page by
    page; each matching object is saved into `download_dir` under its base name.

    Parameters
    ----------
    bucket : S3 bucket name
    prefix : key prefix to list under
    download_dir : local directory that receives the files (must already exist)
    max_files : int, optional
        Maximum number of matching files to download. Default is 100.

    Returns
    -------
    None
    """
    listing = s3_client.get_paginator("list_objects_v2").paginate(
        Bucket=bucket, Prefix=prefix
    )

    downloaded = 0  # number of files fetched so far

    for page in listing:
        # Pages without a "Contents" entry carry no objects
        if "Contents" not in page:
            continue

        for entry in page["Contents"]:
            # The limit is checked before looking at the next object, so the
            # message only appears if another object follows the last download
            if downloaded >= max_files:
                print(f"Reached the limit of {max_files} files. Stopping.")
                return

            key = entry["Key"]
            if not key.endswith("source_3.wav"):
                continue

            local_path = os.path.join(download_dir, os.path.basename(key))

            print(f"Downloading {key} to {local_path}...")
            s3_client.download_file(bucket, key, local_path)
            print(f"Downloaded {key}")
            downloaded += 1

# Download the first 100 matching files
download_wav_files(BUCKET_NAME, FOLDER_PREFIX, DOWNLOAD_DIR, max_files=100)


Downloading large-folder/1047_v360P_V1_source_3.wav to /work/audio/1047_v360P_V1_source_3.wav...
Downloaded large-folder/1047_v360P_V1_source_3.wav
Downloading large-folder/10617_v360P_V1_source_3.wav to /work/audio/10617_v360P_V1_source_3.wav...
Downloaded large-folder/10617_v360P_V1_source_3.wav
Downloading large-folder/10788_v360P_V1_source_3.wav to /work/audio/10788_v360P_V1_source_3.wav...
Downloaded large-folder/10788_v360P_V1_source_3.wav
Downloading large-folder/10932_v360P_V1_source_3.wav to /work/audio/10932_v360P_V1_source_3.wav...
Downloaded large-folder/10932_v360P_V1_source_3.wav
Downloading large-folder/10962_v360P_V1_source_3.wav to /work/audio/10962_v360P_V1_source_3.wav...
Downloaded large-folder/10962_v360P_V1_source_3.wav
Downloading large-folder/10965_v360P_V1_source_3.wav to /work/audio/10965_v360P_V1_source_3.wav...
Downloaded large-folder/10965_v360P_V1_source_3.wav
Downloading large-folder/10966_v360P_V1_source_3.wav to /work/audio/10966_v360P_V1_source_3.wav..

### Loading and Processing Text Files

In [None]:
# Ensure the download directory exists
# NOTE(review): DOWNLOAD_DIR has no visible assignment before this point in the notebook;
# it must already be defined in the kernel for this line to run.
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

# Define S3 bucket and folder
# These two names are also used by the audio and .ass download cells.
BUCKET_NAME = "sagemaker-eu-north-1-205098052109"
FOLDER_PREFIX = "large-folder/"

# AWS credentials
# Left blank here (presumably redacted before sharing - fill in before running).
# NOTE(review): the bucket name suggests the region is eu-north-1 - confirm.
AWS_ACCESS_KEY_ID = ""
AWS_SECRET_ACCESS_KEY = ""
AWS_REGION = ""

# Initialize S3 client with credentials
# Rebinds the module-level `s3_client` that the download helpers read as a global.
s3_client = boto3.client(
    "s3",
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    region_name=AWS_REGION,
)

def download_txt_files(bucket, prefix, download_dir):
    """
    Download every object whose key ends in ".txt" from S3.

    Uses the module-level `s3_client`. Listing pages are wrapped in tqdm so a
    progress counter advances once per page. Each matching object is saved into
    `download_dir` under its base name.

    Parameters
    ----------
    bucket : S3 bucket name
    prefix : key prefix to list under
    download_dir : local directory that receives the files (must already exist)

    Returns
    -------
    None
    """
    page_iter = s3_client.get_paginator("list_objects_v2").paginate(
        Bucket=bucket, Prefix=prefix
    )

    for page in tqdm(page_iter):
        # Pages without a "Contents" entry carry no objects
        if "Contents" not in page:
            continue

        object_keys = (item["Key"] for item in page["Contents"])
        for key in object_keys:
            if not key.endswith(".txt"):
                continue

            local_path = os.path.join(download_dir, os.path.basename(key))

            print(f"Downloading {key} to {local_path}...")
            s3_client.download_file(bucket, key, local_path)
            print(f"Downloaded {key}")

# Download all .txt files
download_txt_files(BUCKET_NAME, FOLDER_PREFIX, DOWNLOAD_DIR)


In [None]:
# Ensure the download directory exists
# NOTE: this rebinds the notebook-wide DOWNLOAD_DIR to the .ass folder, so re-running an
# earlier download cell after this one would also write into /work/ass_files/.
DOWNLOAD_DIR = "/work/ass_files/"
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

# Initialize S3 client with credentials
# Relies on AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and AWS_REGION from the text-file
# download cell; rebinds the module-level `s3_client` used by the download helpers.
s3_client = boto3.client(
    "s3",
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    region_name=AWS_REGION,
)

def download_ass_files(bucket, prefix, download_dir, max_files=100):
    """
    Download subtitle files (keys ending in ".ass") from S3.

    Uses the module-level `s3_client`. Objects under `prefix` are listed page by
    page; each matching object is saved into `download_dir` under its base name.

    Parameters
    ----------
    bucket : S3 bucket name
    prefix : key prefix to list under
    download_dir : local directory that receives the files (must already exist)
    max_files : int, optional
        Maximum number of matching files to download. Default is 100.

    Returns
    -------
    None
    """
    paginator = s3_client.get_paginator("list_objects_v2")
    fetched = 0  # number of .ass files saved so far

    for listing_page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        # Pages without a "Contents" entry carry no objects
        if "Contents" not in listing_page:
            continue

        for s3_object in listing_page["Contents"]:
            # Stop as soon as the quota is met and another object is encountered
            if fetched >= max_files:
                print(f"Reached the limit of {max_files} files. Stopping.")
                return

            key = s3_object["Key"]
            if not key.endswith(".ass"):
                continue

            target_name = os.path.basename(key)
            local_path = os.path.join(download_dir, target_name)

            print(f"Downloading {key} to {local_path}...")
            s3_client.download_file(bucket, key, local_path)
            print(f"Downloaded {key}")
            fetched += 1

# Download the first 100 matching files
download_ass_files(BUCKET_NAME, FOLDER_PREFIX, DOWNLOAD_DIR, max_files=100)

Downloading large-folder/10284_V3.ass to /work/ass_files/10284_V3.ass...
Downloaded large-folder/10284_V3.ass
Downloading large-folder/10417_V3.ass to /work/ass_files/10417_V3.ass...
Downloaded large-folder/10417_V3.ass
Downloading large-folder/1047_V3.ass to /work/ass_files/1047_V3.ass...
Downloaded large-folder/1047_V3.ass
Downloading large-folder/10617_V3.ass to /work/ass_files/10617_V3.ass...
Downloaded large-folder/10617_V3.ass
Downloading large-folder/10662_V2.ass to /work/ass_files/10662_V2.ass...
Downloaded large-folder/10662_V2.ass
Downloading large-folder/10686_V2.ass to /work/ass_files/10686_V2.ass...
Downloaded large-folder/10686_V2.ass
Downloading large-folder/10739_V2.ass to /work/ass_files/10739_V2.ass...
Downloaded large-folder/10739_V2.ass
Downloading large-folder/10788_V3.ass to /work/ass_files/10788_V3.ass...
Downloaded large-folder/10788_V3.ass
Downloading large-folder/10932_V3.ass to /work/ass_files/10932_V3.ass...
Downloaded large-folder/10932_V3.ass
Downloading l

In [None]:
# parse_ass_file(input_file)

def parse_ass_file(input_file, output_directory):
    """
    Extract "<start timestamp>,<text>" rows from an .ass subtitle file.

    The file ID is taken from the first line matching "File: ...<digits>.mp4"
    and used as the output name "<file_id>.txt" inside `output_directory`.
    Only Dialogue events whose Style is "Default" are exported; the start time
    is written exactly as it appears in the file (e.g. "0:00:12.34").

    Dialogue lines are split into fields according to the "Format:" line of the
    [Events] section (falling back to the standard ASS layout when none is
    present). Text is the last field, so commas inside the lyrics are preserved
    and the margin/effect fields no longer leak into the exported text.

    Parameters
    ----------
    input_file : path of the .ass file to read (UTF-8)
    output_directory : existing directory that receives "<file_id>.txt"

    Raises
    ------
    ValueError
        If no file ID can be found in the file.
    """
    with open(input_file, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Extract the file ID from the header
    file_id = None
    for line in lines:
        match = re.search(r"File:\s.*?(\d+)\.mp4", line)
        if match:
            file_id = match.group(1)
            break

    if not file_id:
        raise ValueError(f"File ID not found in the header of file {input_file}.")

    # Prepare the output filename
    output_file = os.path.join(output_directory, f"{file_id}.txt")

    # Standard ASS event layout; replaced if the [Events] section declares its own
    fields = ["layer", "start", "end", "style", "name",
              "marginl", "marginr", "marginv", "effect", "text"]

    events_started = False
    rows = []
    for line in lines:
        if "[Events]" in line:
            events_started = True
            continue

        if not events_started:
            continue

        if line.startswith("Format:"):
            declared = [name.strip().lower() for name in line[len("Format:"):].split(",")]
            # Only trust a layout that has the columns we need, with Text last
            if {"start", "style"} <= set(declared) and declared[-1] == "text":
                fields = declared
            continue

        if not line.startswith("Dialogue:"):
            continue

        # Split at most len(fields) - 1 times so commas in the text survive
        values = line[len("Dialogue:"):].strip().split(",", len(fields) - 1)
        if len(values) != len(fields):
            continue  # malformed event line

        event = dict(zip(fields, values))
        if event["style"].strip() != "Default":
            continue

        start_time = event["start"].strip()
        text = event["text"].strip()
        rows.append(f"{start_time},{text}")

    # Write to output file
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write("\n".join(rows))

    print(f"File '{output_file}' created successfully with {len(rows)} rows.")

In [None]:
def time_to_seconds(timestamp):
    """
    Convert a timestamp (hh:mm:ss.xx) to seconds.

    Parameters
    ----------
    timestamp : str
        Colon-separated "hours:minutes:seconds", where seconds may be fractional.

    Returns
    -------
    float
        hours * 3600 + minutes * 60 + seconds.

    Raises
    ------
    ValueError
        If a component is not numeric.
    IndexError
        If fewer than three colon-separated components are present.
    """
    parts = timestamp.split(":")
    hours = int(parts[0])
    minutes = int(parts[1])
    seconds = float(parts[2])
    return hours * 3600 + minutes * 60 + seconds


def parse_ass_file(input_file, output_directory):
    """
    Extract "<start seconds>,<text>" rows from an .ass subtitle file.

    The file ID is taken from the first line matching "File: ...<digits>.mp4"
    and used as the output name "<file_id>.txt" inside `output_directory`.
    Only Dialogue events whose Style is "Default" are exported; the start time
    is converted to seconds and written with two decimals.

    Dialogue lines are split into fields according to the "Format:" line of the
    [Events] section (falling back to the standard ASS layout when none is
    present). Text is the last field, so commas inside the lyrics are preserved
    and the margin/effect fields no longer leak into the exported text.

    Parameters
    ----------
    input_file : path of the .ass file to read (UTF-8)
    output_directory : existing directory that receives "<file_id>.txt"

    Raises
    ------
    ValueError
        If no file ID can be found, or a start timestamp is not numeric.
    """
    with open(input_file, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Extract the file ID from the header
    file_id = None
    for line in lines:
        match = re.search(r"File:\s.*?(\d+)\.mp4", line)
        if match:
            file_id = match.group(1)
            break

    if not file_id:
        raise ValueError(f"File ID not found in the header of file {input_file}.")

    # Prepare the output filename
    output_file = os.path.join(output_directory, f"{file_id}.txt")

    # Standard ASS event layout; replaced if the [Events] section declares its own
    fields = ["layer", "start", "end", "style", "name",
              "marginl", "marginr", "marginv", "effect", "text"]

    events_started = False
    rows = []
    for line in lines:
        if "[Events]" in line:
            events_started = True
            continue

        if not events_started:
            continue

        if line.startswith("Format:"):
            declared = [name.strip().lower() for name in line[len("Format:"):].split(",")]
            # Only trust a layout that has the columns we need, with Text last
            if {"start", "style"} <= set(declared) and declared[-1] == "text":
                fields = declared
            continue

        if not line.startswith("Dialogue:"):
            continue

        # Split at most len(fields) - 1 times so commas in the text survive
        values = line[len("Dialogue:"):].strip().split(",", len(fields) - 1)
        if len(values) != len(fields):
            continue  # malformed event line

        event = dict(zip(fields, values))
        if event["style"].strip() != "Default":
            continue

        start_time_seconds = time_to_seconds(event["start"].strip())  # Convert to seconds
        text = event["text"].strip()
        rows.append(f"{start_time_seconds:.2f},{text}")

    # Write to output file
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write("\n".join(rows))

    print(f"File '{output_file}' created successfully with {len(rows)} rows.")

def process_ass_files(input_directory, output_directory):
    """
    Run parse_ass_file on every ".ass" file found directly in `input_directory`.

    The output directory is created if needed. A ValueError raised while
    parsing one file is printed and the remaining files are still processed.

    Parameters
    ----------
    input_directory : directory scanned (non-recursively) for .ass files
    output_directory : directory that receives the generated .txt files
    """
    os.makedirs(output_directory, exist_ok=True)

    ass_names = (name for name in os.listdir(input_directory) if name.endswith('.ass'))
    for name in ass_names:
        source_path = os.path.join(input_directory, name)
        try:
            parse_ass_file(source_path, output_directory)
        except ValueError as err:
            # Report the problem and move on to the next file
            print(err)


# Folder holding the downloaded .ass subtitle files
input_directory = '/work/ass_files/'
# Folder that receives one "<file_id>.txt" ground-truth file per parsed .ass file
output_directory = '/work/ground_truth/'

# Process all .ass files
process_ass_files(input_directory, output_directory)


File '/work/ground_truth/10284.txt' created successfully with 33 rows.
File '/work/ground_truth/10417.txt' created successfully with 50 rows.
File '/work/ground_truth/1047.txt' created successfully with 89 rows.
File '/work/ground_truth/10617.txt' created successfully with 44 rows.
File '/work/ground_truth/10662.txt' created successfully with 124 rows.
File '/work/ground_truth/10686.txt' created successfully with 80 rows.
File '/work/ground_truth/10739.txt' created successfully with 23 rows.
File '/work/ground_truth/10788.txt' created successfully with 22 rows.
File '/work/ground_truth/10932.txt' created successfully with 47 rows.
File '/work/ground_truth/10962.txt' created successfully with 49 rows.
File '/work/ground_truth/10965.txt' created successfully with 49 rows.
File '/work/ground_truth/10966.txt' created successfully with 28 rows.
File '/work/ground_truth/11018.txt' created successfully with 29 rows.
File '/work/ground_truth/11027.txt' created successfully with 57 rows.
File '

### Defining Feature Extraction Functions

In [None]:
# F_0 estimation

def estimate_pitch(y, sr, voicing_threshold=0.3, use_viterbi=False):
    """
    Estimate the fundamental frequency (pitch) of an audio signal with CREPE.

    Frames whose confidence falls below `voicing_threshold` are treated as
    unvoiced and their frequency is overwritten with 0 Hz.

    Parameters
    ----------
    y : audio samples passed straight to crepe.predict
    sr : sampling rate of `y`
    voicing_threshold : float, optional
        Confidence below which a frame is marked unvoiced (set to 0 Hz).
        Default is 0.3.
    use_viterbi : bool, optional
        Forwarded to crepe.predict as `viterbi`; when True, Viterbi decoding is
        applied to smooth the pitch track. Default is False.

    Returns
    -------
    time : np.ndarray
        Time stamp of each frame, as returned by CREPE.
    frequency : np.ndarray
        Estimated pitch per frame in Hz, with unvoiced frames set to 0.
    confidence : np.ndarray
        Confidence of the pitch estimate for each frame.
    activation : np.ndarray
        Activation matrix returned by CREPE, useful for visualizing the
        estimation process.
    """
    prediction = crepe.predict(y, sr, viterbi=use_viterbi)
    time, frequency, confidence, activation = prediction

    # NOTE(review): confidence is understood to be the per-frame maximum of the
    # activation matrix - confirm against the CREPE documentation.
    unvoiced = confidence < voicing_threshold
    frequency[unvoiced] = 0  # zero out the unvoiced frames in place

    return time, frequency, confidence, activation

In [None]:
def generate_audio_features(audio_filename, output_dir, segment_length=30):
    """
    Compute per-segment audio features for one file and pickle them.

    The audio is loaded at its native sampling rate and cut into consecutive
    segments of `segment_length` seconds (the last one may be shorter). For each
    segment a dB-scaled STFT magnitude spectrogram, MFCCs and a CREPE pitch
    track are computed.

    Parameters
    ----------
    audio_filename : path of the audio file to analyse
    output_dir : directory that receives "<audio base name>_features.pkl";
        created if it does not exist
    segment_length : float, optional
        Segment length in seconds. Default is 30.

    Returns
    -------
    dict
        Maps "<audio path without extension>_segment_<k><extension>" (k starts
        at 1) to a dict with keys "spectrogram", "mfcc" (first coefficient row
        removed) and "f_0 estimation". Empty if the audio has no samples.

    Raises
    ------
    ValueError
        If `segment_length` does not cover at least one sample.
    """
    y, sr = librosa.load(audio_filename, sr=None)

    samples_per_segment = int(segment_length * sr)
    if samples_per_segment < 1:
        raise ValueError("segment_length must cover at least one audio sample")

    # librosa needs an integer hop; sr * 0.01 is a float. 10 ms hop, presumably
    # chosen to line up with CREPE's default 10 ms step - confirm.
    hop_length = max(1, int(round(sr * 0.01)))

    os.makedirs(output_dir, exist_ok=True)

    # Computed once up front so it is defined even when there are no segments
    file_name, file_extension = os.path.splitext(audio_filename)

    # Dictionary to store results
    audio_features = {}

    # Divide audio into segments of `segment_length` seconds
    for i, start_sample in enumerate(range(0, len(y), samples_per_segment)):
        segment = y[start_sample:start_sample + samples_per_segment]

        # Generate spectrogram (dB relative to the segment's peak magnitude)
        S = librosa.stft(segment, hop_length=hop_length)
        S_db = librosa.amplitude_to_db(np.abs(S), ref=np.max)

        # Extract MFCCs
        mfccs = librosa.feature.mfcc(y=segment, sr=sr, n_mfcc=20, hop_length=hop_length)

        # F_0 estimation (only the frequency track is kept)
        _, frequency, _, _ = estimate_pitch(segment, sr, voicing_threshold=0.3, use_viterbi=True)

        # Keys keep the original path-based naming so existing consumers still match
        segment_name = f"{file_name}_segment_{i+1}{file_extension}"

        audio_features[segment_name] = {
            "spectrogram": S_db,
            "mfcc": mfccs[1:],  # remove first row of mfcc
            "f_0 estimation": frequency
        }

    # Use only the base name: joining output_dir with a path that contains
    # directories (or is absolute) would bypass output_dir entirely.
    output_file = os.path.join(output_dir, f"{os.path.basename(file_name)}_features.pkl")
    with open(output_file, 'wb') as f:
        pickle.dump(audio_features, f)

    return audio_features

features = generate_audio_features('/work/11108_v360P_V1_source_3.wav', 'output_features')


2024-11-27 04:40:00.769927: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-27 04:40:01.019685: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-11-27 04:40:01.019712: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2024-11-27 04:40:01.045602: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-27 04:40:01.772015: W tensorflow/stream_executor/pla