In [74]:
# Imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import librosa
import librosa.display
import gc

from sklearn.preprocessing import LabelEncoder

In [75]:
import os
import pandas as pd

# === df_train + df_valid ===
# Project locations. NOTE(review): BASE_DIR is an absolute, machine-specific path.
BASE_DIR = '/Users/jeonsang-eon/ECE6254-Voice-Feature-Extraction/'
RAW_DATA_DIR = os.path.join(BASE_DIR, 'raw_data')
TRAIN_CSV_PATH = os.path.join(RAW_DATA_DIR, 'cv-valid-train.csv')

# Label vocabularies; only `accents` is used for filtering in this cell.
accents = ['england', 'indian', 'australia', 'african']
ages = ['teens', 'twenties', 'thirties', 'fourties', 'fifties', 'sixties']

df_train = pd.read_csv(TRAIN_CSV_PATH)

# Keep rows that carry a gender label and belong to one of the target accents.
# isin() never matches a missing value, so rows without an accent are removed as well.
has_gender = df_train['gender'].notna()
has_target_accent = df_train['accent'].isin(accents) & df_train['accent'].notna()
df_train = df_train[has_gender & has_target_accent]

# Keep only the rows whose audio file is actually present under RAW_DATA_DIR.
audio_present = df_train['filename'].map(
    lambda name: os.path.exists(os.path.join(RAW_DATA_DIR, name))
)
df_train = df_train[audio_present]

# Retain just the columns used downstream.
df_train = df_train[['filename', 'gender', 'accent', 'age']]

print("File-existence filtered accent counts:", df_train['accent'].value_counts(), sep="\n")

File-existence filtered accent counts:
accent
england      14648
indian        4382
australia     4020
african       1133
Name: count, dtype: int64


In [76]:
# === df_test ===
TEST_CSV_PATH = os.path.join(RAW_DATA_DIR, 'cv-valid-test.csv')

df_test = pd.read_csv(TEST_CSV_PATH)

# Keep rows that carry a gender label and belong to one of the target accents.
# isin() never matches a missing value, so rows without an accent are removed as well.
has_gender = df_test['gender'].notna()
has_target_accent = df_test['accent'].isin(accents) & df_test['accent'].notna()
df_test = df_test[has_gender & has_target_accent]

# Keep only the rows whose audio file is actually present under RAW_DATA_DIR.
audio_present = df_test['filename'].map(
    lambda name: os.path.exists(os.path.join(RAW_DATA_DIR, name))
)
df_test = df_test[audio_present]

# Retain just the columns used downstream.
df_test = df_test[['filename', 'gender', 'accent', 'age']]

print("File-existence filtered accent counts for test set:", df_test['accent'].value_counts(), sep="\n")

File-existence filtered accent counts for test set:
accent
england      298
indian        90
australia     90
african       24
Name: count, dtype: int64


In [77]:
# Balance the training data so that, inside every (gender, age) cell, each accent
# contributes the same number of rows.
df_temp = df_train.copy()

# Accents present in the training data.
all_accents = df_temp['accent'].unique()
num_accents = len(all_accents)

# A (gender, age) cell is usable only if every accent has at least one row in it.
combo_counts = df_temp.groupby(['gender', 'age'])['accent'].nunique()
valid_combos = combo_counts[combo_counts == num_accents].index.tolist()

balanced_groups = []
resample_info_lines = []  # human-readable log of the resampling applied per group

for gender, age in valid_combos:
    in_cell = (df_temp['gender'] == gender) & (df_temp['age'] == age)
    subset = df_temp[in_cell]

    # Rows available per accent inside this cell.
    accent_counts = subset.groupby('accent').size()

    # Per-accent target: at most 1.5x the rarest accent, and never above the most common one.
    T = min(accent_counts.max(), int(1.5 * accent_counts.min()))

    for accent, group in subset.groupby('accent'):
        current_count = len(group)
        factor = T / current_count

        # Groups smaller than T are drawn with replacement; the others are subsampled.
        needs_oversampling = bool(current_count < T)
        method = 'oversampled' if needs_oversampling else 'undersampled'
        sampled = group.sample(n=T, replace=needs_oversampling, random_state=42)
        balanced_groups.append(sampled)

        resample_info_lines.append(
            f"Gender: {gender}, Age: {age}, Accent: {accent}, "
            f"Original count: {current_count}, Target count: {T}, "
            f"Factor: {factor:.2f} ({method})"
        )

df_balanced = pd.concat(balanced_groups, ignore_index=True)

# Persist the resampling log next to the generated datasets.
output_dir = os.path.join(BASE_DIR, "data")  # Using absolute path
os.makedirs(output_dir, exist_ok=True)
resample_txt_path = os.path.join(output_dir, 'train_dataset_info.txt')
with open(resample_txt_path, "w") as f:
    f.write("Resample Factor Information:\n")
    f.writelines(f"{line}\n" for line in resample_info_lines)
print("Resample factor information saved to", resample_txt_path)




Resample factor information saved to /Users/jeonsang-eon/ECE6254-Voice-Feature-Extraction/data/train_dataset_info.txt


In [78]:
# Sanity check: every accent should end up with the same number of rows.
accent_totals = df_balanced.groupby('accent').size()
print("\nTotal samples per accent:", accent_totals, sep="\n")

# Accent x age cross-tabulation of the balanced set.
confusion_matrix_age = pd.crosstab(index=df_balanced['accent'], columns=df_balanced['age'])
print("Balanced Accent x Age Confusion Matrix:", confusion_matrix_age, sep="\n")

# Accent x gender cross-tabulation of the balanced set.
confusion_matrix_gender = pd.crosstab(index=df_balanced['accent'], columns=df_balanced['gender'])
print("\nBalanced Accent x Gender Confusion Matrix:", confusion_matrix_gender, sep="\n")

print("\nTotal number of records in balanced dataset:", len(df_balanced))


Total samples per accent:
accent
african      1264
australia    1264
england      1264
indian       1264
dtype: int64
Balanced Accent x Age Confusion Matrix:
age        fifties  fourties  sixties  teens  thirties  twenties
accent                                                          
african        107        67       48    180       382       480
australia      107        67       48    180       382       480
england        107        67       48    180       382       480
indian         107        67       48    180       382       480

Balanced Accent x Gender Confusion Matrix:
gender     female  male
accent                 
african       230  1034
australia     230  1034
england       230  1034
indian        230  1034

Total number of records in balanced dataset: 5056


In [79]:
# Validation candidates are the training rows whose file was not drawn into df_balanced.
in_training = df_train['filename'].isin(df_balanced['filename'])
df_valid_candidate = df_train[~in_training]

# Draw up to 500 rows per accent; an accent with fewer than 500 candidates contributes all of them.
balanced_valid_groups = [
    group.sample(n=min(500, len(group)), random_state=42)
    for _, group in df_valid_candidate.groupby('accent')
]

# Combine the per-accent samples into the validation DataFrame.
df_valid = pd.concat(balanced_valid_groups, ignore_index=True)

# Report how many rows each accent contributes.
accent_counts = df_valid['accent'].value_counts()
print("Accent counts in validation set:", accent_counts, sep="\n")


Accent counts in validation set:
accent
australia    500
england      500
indian       500
african      436
Name: count, dtype: int64


In [80]:
from sklearn.preprocessing import LabelEncoder
import os

# Fit the encoder on the balanced training accents, then reuse the same mapping
# for the validation and test splits so integer labels agree across all three.
label_encoder = LabelEncoder()
df_balanced['accent_encoded'] = label_encoder.fit_transform(df_balanced['accent'])
for split_df in (df_valid, df_test):
    split_df['accent_encoded'] = label_encoder.transform(split_df['accent'])

# Print counts for verification
print("Balanced dataset accent counts:", df_balanced['accent_encoded'].value_counts(), sep="\n")
print("\nValidation dataset accent counts:", df_valid['accent_encoded'].value_counts(), sep="\n")
print("\nLabel classes:", label_encoder.classes_, sep="\n")

# Record the integer -> accent mapping. The file is opened in append mode, so
# existing content of label_mapping_info.txt is kept and this section is added after it.
txt_out_path = os.path.join(BASE_DIR, 'data', 'label_mapping_info.txt')
mapping_lines = [f"{code}: {name}\n" for code, name in enumerate(label_encoder.classes_)]
with open(txt_out_path, "a") as f:
    f.write("\nAccent Label Mapping:\n")
    f.writelines(mapping_lines)

print("Accent label mapping appended to", txt_out_path)


Balanced dataset accent counts:
accent_encoded
0    1264
1    1264
2    1264
3    1264
Name: count, dtype: int64

Validation dataset accent counts:
accent_encoded
1    500
2    500
3    500
0    436
Name: count, dtype: int64

Label classes:
['african' 'australia' 'england' 'indian']
Accent label mapping appended to /Users/jeonsang-eon/ECE6254-Voice-Feature-Extraction/data/label_mapping_info.txt


In [81]:
# Resolve (and create) the output directory before any path is built from it, so this
# cell does not depend on an `output_dir` value left behind by an earlier cell.
output_dir = os.path.join(BASE_DIR, 'data')
os.makedirs(output_dir, exist_ok=True)

# Save the balanced validation DataFrame to a CSV file
df_valid_csv_path = os.path.join(output_dir, 'df_valid.csv')
df_valid.to_csv(df_valid_csv_path, index=False)
print("Balanced validation DataFrame saved to", df_valid_csv_path)

# Save the balanced training DataFrame to a CSV file in the same directory
df_balanced_csv_path = os.path.join(output_dir, 'df_train_balanced.csv')
df_balanced.to_csv(df_balanced_csv_path, index=False)
print("Balanced DataFrame saved to", df_balanced_csv_path)

# Save the test DataFrame to a CSV file in the same directory
df_test_csv_path = os.path.join(output_dir, 'df_test.csv')
df_test.to_csv(df_test_csv_path, index=False)
print("Test DataFrame saved to", df_test_csv_path)

Balanced validation DataFrame saved to /Users/jeonsang-eon/ECE6254-Voice-Feature-Extraction/data/df_valid.csv
Balanced DataFrame saved to /Users/jeonsang-eon/ECE6254-Voice-Feature-Extraction/data/df_train_balanced.csv
Test DataFrame saved to /Users/jeonsang-eon/ECE6254-Voice-Feature-Extraction/data/df_test.csv


Completed df_balanced, df_valid, and df_test

## MFCC and tempogram

In [82]:
import os
import librosa
import numpy as np
import gc
from tqdm import tqdm
from scipy.signal import butter, lfilter

def lowpass_filter(data, sr, cutoff=4000, order=5):
    """
    Low-pass filter an audio signal with a digital Butterworth filter.

    Parameters:
        data (np.ndarray): Audio time series.
        sr (int): Sampling rate of the audio in Hz.
        cutoff (float): Cutoff frequency in Hz (default 4000 Hz).
        order (int): Filter order; a higher order gives a steeper rolloff.

    Returns:
        np.ndarray: The filtered signal, as returned by scipy.signal.lfilter.
    """
    # butter() takes the critical frequency as a fraction of the Nyquist rate (sr / 2).
    numerator, denominator = butter(order, cutoff / (0.5 * sr), btype='low', analog=False)
    # Single forward pass of the filter over the signal.
    return lfilter(numerator, denominator, data)

def extract_features(file_path, h=13, cutoff=5000):
    """
    Extract MFCCs and a height-matched tempogram from a low-pass-filtered audio file.

    Parameters:
        file_path (str): Path to the audio file.
        h (int): Number of MFCCs to extract; the tempogram is cut to the same
            number of rows so both matrices share a height (default 13).
        cutoff (float): Cutoff frequency in Hz passed to lowpass_filter
            (default 5000 Hz, the value this pipeline has always used).

    Returns:
        tuple: (mfccs, tempogram) where:
            - mfccs (np.ndarray): MFCC matrix with h rows, one column per frame.
            - tempogram (np.ndarray): The first h rows of the tempogram, one
              column per frame, stored as an independent array.
    """
    # Load the audio signal (librosa.load is called with its default settings)
    audio, sr = librosa.load(file_path)

    # Attenuate content above `cutoff` Hz before feature extraction
    audio = lowpass_filter(audio, sr, cutoff=cutoff)

    # Frame geometry derived from the sampling rate
    win_length = int(0.025 * sr)   # 25 ms window length
    hop_length = int(0.01 * sr)    # 10 ms hop length
    n_fft = win_length             # FFT size equals the window length

    # MFCC matrix with h coefficients per frame
    mfccs = librosa.feature.mfcc(
        y=audio,
        sr=sr,
        n_mfcc=h,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        window='hann'
    )

    # Onset strength envelope, required as input for the tempogram
    oenv = librosa.onset.onset_strength(y=audio, sr=sr, hop_length=hop_length)

    # Tempogram computed from the onset envelope, using the same hop as the MFCCs
    tempogram = librosa.feature.tempogram(onset_envelope=oenv, sr=sr, hop_length=hop_length)

    # Keep only the first h rows to match the MFCC height. The slice is copied:
    # a plain slice is a view that would keep the full tempogram array alive for
    # as long as the result is held, which adds up across thousands of files.
    tempogram = tempogram[:h, :].copy()

    return mfccs, tempogram


def _extract_split_features(df, desc):
    """
    Extract (mfcc, tempogram) for every row of `df`, in row order.

    Each distinct filename is processed once; rows that repeat a filename reuse
    the tuple already computed for it. The returned list therefore may contain
    the same array objects several times, so callers must not modify them in place.

    Parameters:
        df (pd.DataFrame): Split to process; must have a 'filename' column
            holding paths relative to RAW_DATA_DIR.
        desc (str): Label shown on the tqdm progress bar.

    Returns:
        list: One (mfcc, tempogram) tuple per row of `df`.
    """
    cache = {}
    features = []
    for filename in tqdm(df['filename'], total=len(df), desc=desc):
        if filename not in cache:
            cache[filename] = extract_features(os.path.join(RAW_DATA_DIR, filename))
        features.append(cache[filename])
    return features


# === Process Training Data (df_balanced) ===
# df_balanced is oversampled with replacement, so it contains repeated filenames;
# the per-filename cache avoids decoding those files again.
print("Extracting features (MFCC & tempogram) for training set:")
all_features_train = _extract_split_features(df_balanced, "Train features")

# === Process Validation Data (df_valid) ===
print("Extracting features (MFCC & tempogram) for validation set:")
all_features_valid = _extract_split_features(df_valid, "Valid features")

# === Process Test Data (df_test) ===
print("Extracting features (MFCC & tempogram) for test set:")
all_features_test = _extract_split_features(df_test, "Test features")

gc.collect()


Extracting features (MFCC & tempogram) for training set:


Train features: 100%|██████████| 5056/5056 [04:55<00:00, 17.10it/s]


Extracting features (MFCC & tempogram) for validation set:


Valid features: 100%|██████████| 1936/1936 [01:54<00:00, 16.85it/s]


Extracting features (MFCC & tempogram) for test set:


Test features: 100%|██████████| 502/502 [00:24<00:00, 20.64it/s]


1754

In [83]:
import os
import numpy as np
import gc
from tensorflow.keras.utils import to_categorical

# Inputs: all_features_train / all_features_valid / all_features_test, each a list of
# (mfcc, tempogram) tuples whose arrays are laid out as (rows, time_frames).

# Frame count of every sample, taken from the MFCC matrix.
time_frames_train = [mfcc.shape[1] for mfcc, _ in all_features_train]
time_frames_valid = [mfcc.shape[1] for mfcc, _ in all_features_valid]
time_frames_test = [mfcc.shape[1] for mfcc, _ in all_features_test]

# The common target length is the median frame count over all three splits.
# NOTE(review): the test split contributes to this statistic - confirm that is intended.
all_time_frames = [*time_frames_train, *time_frames_valid, *time_frames_test]
median_time_frames = int(np.median(all_time_frames))
print("Median time frames (train+valid+test):", median_time_frames)

# Define a helper function to pad or truncate a feature matrix along the time axis
def pad_or_truncate(feature_matrix, target_length):
    """Return feature_matrix with exactly target_length columns (time frames).

    Shorter matrices are zero-padded on the right; matrices that are already
    long enough are cut to their first target_length columns.
    """
    missing_frames = target_length - feature_matrix.shape[1]
    if missing_frames > 0:
        # Rows are left untouched; only the end of the time axis is padded.
        return np.pad(feature_matrix, ((0, 0), (0, missing_frames)), mode='constant')
    return feature_matrix[:, :target_length]

# Bring every MFCC / tempogram matrix to the common length of median_time_frames columns.

# Training split
padded_mfccs_train = [pad_or_truncate(m, median_time_frames) for m, _ in all_features_train]
padded_tempograms_train = [pad_or_truncate(t, median_time_frames) for _, t in all_features_train]

# Validation split
padded_mfccs_valid = [pad_or_truncate(m, median_time_frames) for m, _ in all_features_valid]
padded_tempograms_valid = [pad_or_truncate(t, median_time_frames) for _, t in all_features_valid]

# Test split
padded_mfccs_test = [pad_or_truncate(m, median_time_frames) for m, _ in all_features_test]
padded_tempograms_test = [pad_or_truncate(t, median_time_frames) for _, t in all_features_test]

# Pair each sample's MFCC and tempogram along a new leading axis:
# index 0 holds the MFCC matrix and index 1 the tempogram.
features_train = [
    np.stack([m, t], axis=0)
    for m, t in zip(padded_mfccs_train, padded_tempograms_train)
]

features_valid = [
    np.stack([m, t], axis=0)
    for m, t in zip(padded_mfccs_valid, padded_tempograms_valid)
]

features_test = [
    np.stack([m, t], axis=0)
    for m, t in zip(padded_mfccs_test, padded_tempograms_test)
]



Median time frames (train+valid+test): 388


In [84]:
# Stack the per-sample features into one array per split; the leading axis indexes samples.
X_train = np.stack(features_train)
X_valid = np.stack(features_valid)
X_test = np.stack(features_test)

# One-hot encode labels. The width is pinned to the number of classes known to the
# encoder; otherwise to_categorical sizes each matrix from the largest label present
# in that split, and a split missing the last class would come out narrower.
num_classes = len(label_encoder.classes_)
y_train = to_categorical(df_balanced['accent_encoded'].values, num_classes=num_classes)
y_valid = to_categorical(df_valid['accent_encoded'].values, num_classes=num_classes)
y_test = to_categorical(df_test['accent_encoded'].values, num_classes=num_classes)

# Save each dataset to a compressed NPZ file with arrays stored under the keys 'X' and 'y'
output_dir = os.path.join(BASE_DIR, "data")
os.makedirs(output_dir, exist_ok=True)

train_npz_path = os.path.join(output_dir, "train-dataset.npz")
np.savez_compressed(train_npz_path, X=X_train, y=y_train)
print("Saved training dataset to", train_npz_path)

valid_npz_path = os.path.join(output_dir, "valid-dataset.npz")
np.savez_compressed(valid_npz_path, X=X_valid, y=y_valid)
print("Saved validation dataset to", valid_npz_path)

test_npz_path = os.path.join(output_dir, "test-dataset.npz")
np.savez_compressed(test_npz_path, X=X_test, y=y_test)
print("Saved test dataset to", test_npz_path)

gc.collect()


Saved training dataset to /Users/jeonsang-eon/ECE6254-Voice-Feature-Extraction/data/train-dataset.npz
Saved validation dataset to /Users/jeonsang-eon/ECE6254-Voice-Feature-Extraction/data/valid-dataset.npz
Saved test dataset to /Users/jeonsang-eon/ECE6254-Voice-Feature-Extraction/data/test-dataset.npz


0