In [89]:
# Imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import librosa
import librosa.display
import gc

from sklearn.preprocessing import LabelEncoder

In [90]:
import os
import pandas as pd

# === df_train + df_valid ===
# Project root, and the folder that holds both the metadata CSVs and the audio clips.
BASE_DIR = '/Users/jeonsang-eon/ECE6254-Voice-Feature-Extraction/'
RAW_DATA_DIR = os.path.join(BASE_DIR, 'raw_data')
TRAIN_CSV_PATH = os.path.join(RAW_DATA_DIR, 'cv-valid-train.csv')

# Accent classes kept for the task, and the age-bucket labels.
# ('fourties' is spelled the way the dataset's `age` column spells it.)
accents = ['us', 'england', 'indian', 'australia']
ages = ['teens', 'twenties', 'thirties', 'fourties', 'fifties', 'sixties']

df_train = pd.read_csv(TRAIN_CSV_PATH)

# Keep rows that carry a gender label and whose accent is one of `accents`.
# `isin` evaluates to False for a missing accent, so rows without an accent
# are removed by the same mask.
df_train = df_train[df_train['gender'].notna() & df_train['accent'].isin(accents)]

# Keep only rows whose audio file is actually present under RAW_DATA_DIR.
df_train = df_train[
    df_train['filename'].map(lambda name: os.path.exists(os.path.join(RAW_DATA_DIR, name)))
]

# Only these four columns are used from here on.
df_train = df_train[['filename', 'gender', 'accent', 'age']]

print("File-existence filtered accent counts:")
print(df_train['accent'].value_counts())

File-existence filtered accent counts:
accent
us           29777
england      14648
indian        4382
australia     4020
Name: count, dtype: int64


In [91]:
# === df_test ===
TEST_CSV_PATH = os.path.join(RAW_DATA_DIR, 'cv-valid-test.csv')

df_test = pd.read_csv(TEST_CSV_PATH)

# Keep rows that carry a gender label and whose accent is one of `accents`.
# `isin` evaluates to False for a missing accent, so rows without an accent
# are removed by the same mask.
df_test = df_test[df_test['gender'].notna() & df_test['accent'].isin(accents)]

# Keep only rows whose audio file is actually present under RAW_DATA_DIR.
df_test = df_test[
    df_test['filename'].map(lambda name: os.path.exists(os.path.join(RAW_DATA_DIR, name)))
]

# Only these four columns are used from here on.
df_test = df_test[['filename', 'gender', 'accent', 'age']]

print("File-existence filtered accent counts for test set:")
print(df_test['accent'].value_counts())

File-existence filtered accent counts for test set:
accent
us           626
england      298
indian        90
australia     90
Name: count, dtype: int64


In [92]:
# Balance the training metadata so that, inside every (gender, age) cell,
# each accent contributes the same number of rows.
df_temp = df_train.copy()

# Distinct accents present in the data, and how many there are.
all_accents = df_temp['accent'].unique()
num_accents = len(all_accents)

# A (gender, age) cell is usable only if every accent appears in it.
combo_counts = df_temp.groupby(['gender', 'age'])['accent'].nunique()
valid_combos = combo_counts[combo_counts.eq(num_accents)].index.tolist()

balanced_groups = []       # one resampled DataFrame per (gender, age, accent)
resample_info_lines = []   # one human-readable report line per resampled group

for gender, age in valid_combos:
    # Rows belonging to the current (gender, age) cell.
    in_cell = (df_temp['gender'] == gender) & (df_temp['age'] == age)
    subset = df_temp[in_cell]

    # Rows available per accent inside this cell.
    accent_counts = subset.groupby('accent').size()

    # Per-accent target size: 1.5x the rarest accent (rounded down),
    # capped at the size of the most common accent.
    T = min(accent_counts.max(), int(1.5 * accent_counts.min()))

    for accent, group in subset.groupby('accent'):
        current_count = len(group)
        # Draw with replacement only when the group is smaller than the target;
        # otherwise take T distinct rows.
        needs_oversampling = bool(current_count < T)
        sampled = group.sample(n=T, replace=needs_oversampling, random_state=42)
        balanced_groups.append(sampled)

        method = 'oversampled' if needs_oversampling else 'undersampled'
        factor = T / current_count
        resample_info_lines.append(
            f"Gender: {gender}, Age: {age}, Accent: {accent}, "
            f"Original count: {current_count}, Target count: {T}, "
            f"Factor: {factor:.2f} ({method})"
        )

df_balanced = pd.concat(balanced_groups, ignore_index=True)

# Write the per-group resampling report next to the other generated data.
# `output_dir` is also read by the cell that saves the CSV files.
output_dir = os.path.join(BASE_DIR, "data")  # Using absolute path
os.makedirs(output_dir, exist_ok=True)
resample_txt_path = os.path.join(output_dir, 'train_dataset_info.txt')
with open(resample_txt_path, "w") as f:
    f.write("Resample Factor Information:\n")
    f.writelines(f"{line}\n" for line in resample_info_lines)
print("Resample factor information saved to", resample_txt_path)




Resample factor information saved to /Users/jeonsang-eon/ECE6254-Voice-Feature-Extraction/data/train_dataset_info.txt


In [93]:
# Row totals per accent; they should all be equal after balancing.
accent_totals = df_balanced.groupby('accent').size()
print("\nTotal samples per accent:", accent_totals, sep="\n")

# Accent x Age contingency table of the balanced set.
confusion_matrix_age = pd.crosstab(index=df_balanced['accent'], columns=df_balanced['age'])
print("Balanced Accent x Age Confusion Matrix:", confusion_matrix_age, sep="\n")

# Accent x Gender contingency table of the balanced set.
confusion_matrix_gender = pd.crosstab(index=df_balanced['accent'], columns=df_balanced['gender'])
print("\nBalanced Accent x Gender Confusion Matrix:", confusion_matrix_gender, sep="\n")

print(f"\nTotal number of records in balanced dataset: {len(df_balanced)}")


Total samples per accent:
accent
australia    2724
england      2724
indian       2724
us           2724
dtype: int64
Balanced Accent x Age Confusion Matrix:
age        fifties  fourties  sixties  teens  thirties  twenties
accent                                                          
australia      665        67       73    198       769       952
england        665        67       73    198       769       952
indian         665        67       73    198       769       952
us             665        67       73    198       769       952

Balanced Accent x Gender Confusion Matrix:
gender     female  male
accent                 
australia    1052  1672
england      1052  1672
indian       1052  1672
us           1052  1672

Total number of records in balanced dataset: 10896


In [94]:
# Validation candidates: training rows whose file was not drawn into df_balanced.
df_valid_candidate = df_train.loc[~df_train['filename'].isin(df_balanced['filename'])]

# Draw up to 500 rows per accent (all of them when an accent has fewer than 500),
# without replacement, so the validation set is balanced by accent.
balanced_valid_groups = [
    group.sample(n=min(500, len(group)), random_state=42)
    for _, group in df_valid_candidate.groupby('accent')
]

# Stack the per-accent samples into a single validation DataFrame.
df_valid = pd.concat(balanced_valid_groups, ignore_index=True)

# Report how many rows each accent contributes to the validation set.
accent_counts = df_valid['accent'].value_counts()
print("Accent counts in validation set:")
print(accent_counts)


Accent counts in validation set:
accent
australia    500
england      500
indian       500
us           500
Name: count, dtype: int64


In [95]:
from sklearn.preprocessing import LabelEncoder
import os

# Fit the label encoder on the training accents only, then reuse the same
# fitted encoder for validation and test so that every split shares one
# accent -> integer mapping.
label_encoder = LabelEncoder()
df_balanced['accent_encoded'] = label_encoder.fit_transform(df_balanced['accent'])

# Use the same label encoder to transform the 'accent' column of df_valid
df_valid['accent_encoded'] = label_encoder.transform(df_valid['accent'])

df_test['accent_encoded'] = label_encoder.transform(df_test['accent'])

# Print counts for verification
print("Balanced dataset accent counts:")
print(df_balanced['accent_encoded'].value_counts())
print("\nValidation dataset accent counts:")
print(df_valid['accent_encoded'].value_counts())
print("\nLabel classes:")
print(label_encoder.classes_)

# Save the accent label mapping to its own text file.
# The file is rewritten ("w") rather than appended to, so re-running this cell
# does not pile up duplicate copies of the mapping.
txt_out_path = os.path.join(BASE_DIR, 'data', 'label_mapping_info.txt')
os.makedirs(os.path.dirname(txt_out_path), exist_ok=True)  # do not rely on an earlier cell having created it
with open(txt_out_path, "w") as f:
    f.write("\nAccent Label Mapping:\n")
    for encoded_value, accent in enumerate(label_encoder.classes_):
        f.write(f"{encoded_value}: {accent}\n")

print("Accent label mapping written to", txt_out_path)


Balanced dataset accent counts:
accent_encoded
0    2724
1    2724
2    2724
3    2724
Name: count, dtype: int64

Validation dataset accent counts:
accent_encoded
0    500
1    500
2    500
3    500
Name: count, dtype: int64

Label classes:
['australia' 'england' 'indian' 'us']
Accent label mapping appended to /Users/jeonsang-eon/ECE6254-Voice-Feature-Extraction/data/label_mapping_info.txt


In [96]:
# Resolve (and create) the output directory before anything is written, so this
# cell does not depend on `output_dir` having been set by another cell.
output_dir = os.path.join(BASE_DIR, 'data')
os.makedirs(output_dir, exist_ok=True)

# Save the balanced validation DataFrame to a CSV file
df_valid_csv_path = os.path.join(output_dir, 'df_valid.csv')
df_valid.to_csv(df_valid_csv_path, index=False)
print("Balanced validation DataFrame saved to", df_valid_csv_path)

# Save the balanced DataFrame to a CSV file in the same directory
df_balanced_csv_path = os.path.join(output_dir, 'df_train_balanced.csv')
df_balanced.to_csv(df_balanced_csv_path, index=False)
print("Balanced DataFrame saved to", df_balanced_csv_path)

# Save the df_test DataFrame to a CSV file
df_test_csv_path = os.path.join(output_dir, 'df_test.csv')
df_test.to_csv(df_test_csv_path, index=False)
print("Test DataFrame saved to", df_test_csv_path)

Balanced validation DataFrame saved to /Users/jeonsang-eon/ECE6254-Voice-Feature-Extraction/data/df_valid.csv
Balanced DataFrame saved to /Users/jeonsang-eon/ECE6254-Voice-Feature-Extraction/data/df_train_balanced.csv
Test DataFrame saved to /Users/jeonsang-eon/ECE6254-Voice-Feature-Extraction/data/df_test.csv


`df_balanced`, `df_valid`, and `df_test` are now complete.

## Extracting MFCCs

In [97]:
import os
import librosa
import numpy as np
from tensorflow.keras.utils import to_categorical
import gc
from tqdm import tqdm

# Ensure RAW_DATA_DIR is defined (the folder containing your audio files)
# For example:
# RAW_DATA_DIR = '/path/to/your/audio/files'

def extract_mfcc(file_path, n_mfcc=20):
    """Load one audio file and return its MFCC matrix.

    Parameters
    ----------
    file_path : str
        Path of the audio file, passed to ``librosa.load`` with its default
        arguments (so the sample rate is whatever that call returns).
    n_mfcc : int, default 20
        Number of MFCC coefficients requested from ``librosa.feature.mfcc``.

    Returns
    -------
    The value returned by ``librosa.feature.mfcc``; callers in this notebook
    read the number of time frames from ``.shape[1]``.
    """
    signal, rate = librosa.load(file_path)

    # Framing expressed in samples: 25 ms analysis window, 10 ms step.
    frame_len = int(0.025 * rate)
    frame_step = int(0.01 * rate)

    # The FFT size is set equal to the window length (no zero-padding of frames).
    return librosa.feature.mfcc(
        y=signal,
        sr=rate,
        n_mfcc=n_mfcc,
        n_fft=frame_len,
        hop_length=frame_step,
        win_length=frame_len,
        window='hann',
    )

def _extract_split_mfccs(df, desc):
    """Return one MFCC matrix per row of `df`, in row order.

    Each row's 'filename' is resolved against RAW_DATA_DIR and passed to
    `extract_mfcc`; `desc` is the label shown on the tqdm progress bar.
    """
    return [
        extract_mfcc(os.path.join(RAW_DATA_DIR, filename))
        for filename in tqdm(df['filename'], total=len(df), desc=desc)
    ]


# === Process Training Data (df_balanced) ===
print("Extracting MFCCs for training set:")
all_mfccs_train = _extract_split_mfccs(df_balanced, "Train MFCCs")

# === Process Validation Data (df_valid) ===
print("Extracting MFCCs for validation set:")
all_mfccs_valid = _extract_split_mfccs(df_valid, "Valid MFCCs")

# === Process Test Data (df_test) ===
print("Extracting MFCCs for test set:")
all_mfccs_test = _extract_split_mfccs(df_test, "Test MFCCs")

# Release temporaries created while decoding audio (one collection pass is enough).
gc.collect()


Extracting MFCCs for training set:


Train MFCCs: 100%|██████████| 10896/10896 [02:20<00:00, 77.74it/s] 


Extracting MFCCs for validation set:


Valid MFCCs: 100%|██████████| 2000/2000 [00:25<00:00, 78.06it/s]


Extracting MFCCs for test set:


Test MFCCs: 100%|██████████| 1104/1104 [00:14<00:00, 76.81it/s]


0

In [98]:
# Compute the median number of time frames across the training and validation MFCCs.
# The test split is deliberately kept out of this statistic: the target length is a
# preprocessing parameter, and it should not be tuned on data reserved for evaluation.
time_frames_train = [mfcc.shape[1] for mfcc in all_mfccs_train]
time_frames_valid = [mfcc.shape[1] for mfcc in all_mfccs_valid]
time_frames_test = [mfcc.shape[1] for mfcc in all_mfccs_test]  # kept for inspection only
all_time_frames = time_frames_train + time_frames_valid
if not all_time_frames:
    # np.median of an empty list is NaN, which int() cannot convert; fail with a clear message.
    raise ValueError("No training/validation MFCCs available to compute the median length.")
median_time_frames = int(np.median(all_time_frames))
print("Median time frames (train+valid):", median_time_frames)

# Pad or truncate MFCCs to have the same number of time frames (equal to the median)
def pad_or_truncate(mfcc, target_length):
    """Force a 2-D array to exactly `target_length` columns.

    Arrays with fewer columns are right-padded with zeros along axis 1;
    arrays with at least `target_length` columns are cut to the first
    `target_length` columns. Axis 0 is never changed.
    """
    shortfall = target_length - mfcc.shape[1]
    if shortfall <= 0:
        # Already long enough: keep the leading columns only.
        return mfcc[:, :target_length]
    # Too short: append `shortfall` zero columns on the right.
    return np.pad(mfcc, ((0, 0), (0, shortfall)), mode='constant')

# Bring every MFCC matrix of every split to `median_time_frames` columns.
padded_mfccs_train, padded_mfccs_valid, padded_mfccs_test = (
    [pad_or_truncate(features, median_time_frames) for features in split]
    for split in (all_mfccs_train, all_mfccs_valid, all_mfccs_test)
)

Median time frames (train+valid): 381


In [99]:
# Stack the equal-sized MFCC matrices of each split into one array per split.
X_train = np.stack(padded_mfccs_train)
X_valid = np.stack(padded_mfccs_valid)
X_test = np.stack(padded_mfccs_test)

# One-hot encode the labels. The number of classes is taken from the fitted
# encoder instead of being inferred per split, so all three label arrays have
# the same width even if a split happens to lack the highest-numbered class.
num_classes = len(label_encoder.classes_)
y_train = to_categorical(df_balanced['accent_encoded'].values, num_classes=num_classes)
y_valid = to_categorical(df_valid['accent_encoded'].values, num_classes=num_classes)
y_test = to_categorical(df_test['accent_encoded'].values, num_classes=num_classes)

# Features and labels must stay aligned row-for-row within each split.
assert len(X_train) == len(y_train), "train features/labels are misaligned"
assert len(X_valid) == len(y_valid), "validation features/labels are misaligned"
assert len(X_test) == len(y_test), "test features/labels are misaligned"

# Save each dataset to an NPZ file
output_dir = os.path.join(BASE_DIR, "data")
os.makedirs(output_dir, exist_ok=True)

train_npz_path = os.path.join(output_dir, "train-dataset.npz")
np.savez(train_npz_path, X=X_train, y=y_train)
print("Saved training dataset to", train_npz_path)

valid_npz_path = os.path.join(output_dir, "valid-dataset.npz")
np.savez(valid_npz_path, X=X_valid, y=y_valid)
print("Saved validation dataset to", valid_npz_path)

test_npz_path = os.path.join(output_dir, "test-dataset.npz")
np.savez(test_npz_path, X=X_test, y=y_test)
print("Saved test dataset to", test_npz_path)


Saved training dataset to /Users/jeonsang-eon/ECE6254-Voice-Feature-Extraction/data/train-dataset.npz
Saved validation dataset to /Users/jeonsang-eon/ECE6254-Voice-Feature-Extraction/data/valid-dataset.npz
Saved test dataset to /Users/jeonsang-eon/ECE6254-Voice-Feature-Extraction/data/test-dataset.npz
