In [1]:
# Imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import librosa
import librosa.display
import gc

from sklearn.preprocessing import LabelEncoder

In [2]:
import os
import pandas as pd

# === df_train + df_valid ===
# Project root. The original machine-specific location stays the default so
# existing runs are unaffected; set the ECE6254_BASE_DIR environment variable
# to run the notebook from a different checkout.
BASE_DIR = os.environ.get(
    'ECE6254_BASE_DIR',
    '/Users/jeonsang-eon/ECE6254-Voice-Feature-Extraction/',
)
RAW_DATA_DIR = os.path.join(BASE_DIR, 'raw_data')
TRAIN_CSV_PATH = os.path.join(RAW_DATA_DIR, 'cv-valid-train.csv')

accents = ['us' ,'england', 'indian', 'australia']
ages = ['teens', 'twenties', 'thirties', 'fourties', 'fifties', 'sixties']

df_train = pd.read_csv(TRAIN_CSV_PATH)

# Drop rows with a missing gender or accent label in one pass.
df_train = df_train.dropna(subset=['gender', 'accent'])

# Keep only the accents of interest.
df_train = df_train[df_train['accent'].isin(accents)]

# Keep only rows whose audio file is actually on disk. `isfile` is used
# instead of `exists` so that an empty filename (which would resolve to the
# raw-data directory itself) is rejected, and non-string filenames (NaN) are
# filtered out instead of crashing os.path.join.
df_train = df_train[
    df_train['filename'].apply(
        lambda x: isinstance(x, str) and os.path.isfile(os.path.join(RAW_DATA_DIR, x))
    )
]

# Retain only the columns used downstream.
df_train = df_train[['filename', 'gender', 'accent', 'age']]

print("File-existence filtered accent counts:")
print(df_train['accent'].value_counts())

File-existence filtered accent counts:
accent
us           29777
england      14648
indian        4382
australia     4020
Name: count, dtype: int64


In [3]:
# === df_test ===
TEST_CSV_PATH = os.path.join(RAW_DATA_DIR, 'cv-valid-test.csv')

df_test = pd.read_csv(TEST_CSV_PATH)

# Drop rows with a missing gender or accent label in one pass.
df_test = df_test.dropna(subset=['gender', 'accent'])

# Keep only the accents of interest.
df_test = df_test[df_test['accent'].isin(accents)]

# Keep only rows whose audio file is actually on disk. `isfile` is used
# instead of `exists` so that an empty filename (which would resolve to the
# raw-data directory itself) is rejected, and non-string filenames (NaN) are
# filtered out instead of crashing os.path.join.
df_test = df_test[
    df_test['filename'].apply(
        lambda x: isinstance(x, str) and os.path.isfile(os.path.join(RAW_DATA_DIR, x))
    )
]

# Retain only the columns used downstream.
df_test = df_test[['filename', 'gender', 'accent', 'age']]

print("File-existence filtered accent counts for test set:")
print(df_test['accent'].value_counts())

File-existence filtered accent counts for test set:
accent
us           626
england      298
indian        90
australia     90
Name: count, dtype: int64


In [4]:
# Work on a copy of the training data
df_temp = df_train.copy()

# Get all unique accents and count them
all_accents = df_temp['accent'].unique()
num_accents = len(all_accents)

# Identify (gender, age) combinations that are present in every accent.
# NOTE: groupby drops NaN keys by default, so rows with a missing gender or
# age never form a cell and are therefore left out of the balanced set.
combo_counts = df_temp.groupby(['gender', 'age'])['accent'].nunique()
valid_combos = combo_counts[combo_counts == num_accents].index.tolist()

# Fail with a clear message instead of the opaque error pd.concat raises on
# an empty list further down.
if not valid_combos:
    raise ValueError(
        "No (gender, age) combination is present in every accent; "
        "cannot build a balanced training set."
    )

balanced_groups = []
resample_info_lines = []  # List to store resample factor info

# Process each valid (gender, age) combination
for gender, age in valid_combos:
    # Subset data for the current (gender, age) cell
    subset = df_temp[(df_temp['gender'] == gender) & (df_temp['age'] == age)]

    # Compute available counts per accent for this cell
    accent_counts = subset.groupby('accent').size()

    # Target size T: at most 1.5x the smallest accent group, and never more
    # than the largest group actually available.
    T = min(accent_counts.max(), int(1.5 * accent_counts.min()))

    # For each accent, draw exactly T records.
    for accent, group in subset.groupby('accent'):
        current_count = len(group)
        factor = T / current_count

        # Label the operation accurately; a group that already has exactly T
        # rows is neither over- nor undersampled.
        if current_count < T:
            method = 'oversampled'
        elif current_count > T:
            method = 'undersampled'
        else:
            method = 'unchanged'

        # Sampling with replacement is only needed when the group is too small.
        sampled = group.sample(n=T, replace=current_count < T, random_state=42)
        balanced_groups.append(sampled)

        # Record resample factor info for this group
        info_line = (f"Gender: {gender}, Age: {age}, Accent: {accent}, "
                     f"Original count: {current_count}, Target count: {T}, "
                     f"Factor: {factor:.2f} ({method})")
        resample_info_lines.append(info_line)


df_balanced = pd.concat(balanced_groups, ignore_index=True)


# Save resample factor information to a text file
output_dir = os.path.join(BASE_DIR, "data")  # Using absolute path
os.makedirs(output_dir, exist_ok=True)
resample_txt_path = os.path.join(output_dir, 'train_dataset_info.txt')
with open(resample_txt_path, "w", encoding="utf-8") as f:
    f.write("Resample Factor Information:\n")
    for line in resample_info_lines:
        f.write(line + "\n")
print("Resample factor information saved to", resample_txt_path)




Resample factor information saved to /Users/jeonsang-eon/ECE6254-Voice-Feature-Extraction/data/train_dataset_info.txt


In [5]:
# Build the three summary tables up front, then print them in a fixed order.

# Per-accent row totals; these should all be equal after balancing.
accent_totals = df_balanced.groupby('accent').size()

# Accent x age and accent x gender contingency tables of the balanced set.
confusion_matrix_age = pd.crosstab(df_balanced['accent'], df_balanced['age'])
confusion_matrix_gender = pd.crosstab(df_balanced['accent'], df_balanced['gender'])

# Each report is a heading line followed by the table on the next line.
print("\nTotal samples per accent:", accent_totals, sep="\n")
print("Balanced Accent x Age Confusion Matrix:", confusion_matrix_age, sep="\n")
print("\nBalanced Accent x Gender Confusion Matrix:", confusion_matrix_gender, sep="\n")

print(f"\nTotal number of records in balanced dataset: {len(df_balanced)}")


Total samples per accent:
accent
australia    2724
england      2724
indian       2724
us           2724
dtype: int64
Balanced Accent x Age Confusion Matrix:
age        fifties  fourties  sixties  teens  thirties  twenties
accent                                                          
australia      665        67       73    198       769       952
england        665        67       73    198       769       952
indian         665        67       73    198       769       952
us             665        67       73    198       769       952

Balanced Accent x Gender Confusion Matrix:
gender     female  male
accent                 
australia    1052  1672
england      1052  1672
indian       1052  1672
us           1052  1672

Total number of records in balanced dataset: 10896


In [6]:
# Any file already drawn into df_balanced is excluded, so the validation pool
# shares no filename with the balanced training set.
used_for_training = df_train['filename'].isin(df_balanced['filename'])
df_valid_candidate = df_train[~used_for_training]

# Draw up to 500 rows per accent (all rows when an accent has fewer than 500)
# with a fixed seed.
balanced_valid_groups = [
    accent_rows.sample(n=min(500, len(accent_rows)), random_state=42)
    for _, accent_rows in df_valid_candidate.groupby('accent')
]

# Stack the per-accent draws into the validation DataFrame.
df_valid = pd.concat(balanced_valid_groups, ignore_index=True)

# Report how many rows each accent contributed.
accent_counts = df_valid['accent'].value_counts()
print("Accent counts in validation set:")
print(accent_counts)


Accent counts in validation set:
accent
australia    500
england      500
indian       500
us           500
Name: count, dtype: int64


In [7]:
from sklearn.preprocessing import LabelEncoder
import os

# Fit the encoder on the training accents only, then apply that single
# mapping to every split so the integer codes agree across train/valid/test.
label_encoder = LabelEncoder().fit(df_balanced['accent'])
df_balanced['accent_encoded'] = label_encoder.transform(df_balanced['accent'])

for frame in (df_valid, df_test):
    frame['accent_encoded'] = label_encoder.transform(frame['accent'])

# Print counts for verification
print("Balanced dataset accent counts:", df_balanced['accent_encoded'].value_counts(), sep="\n")
print("\nValidation dataset accent counts:", df_valid['accent_encoded'].value_counts(), sep="\n")
print("\nLabel classes:", label_encoder.classes_, sep="\n")

# Record the code -> accent mapping in label_mapping_info.txt.
# NOTE(review): the file is opened in append mode, so every re-run of this
# cell adds another copy of the mapping — confirm whether that is intended.
txt_out_path = os.path.join(BASE_DIR, 'data', 'label_mapping_info.txt')
mapping_lines = [
    f"{code}: {accent_name}\n"
    for code, accent_name in enumerate(label_encoder.classes_)
]
with open(txt_out_path, "a") as f:  # append mode
    f.write("\nAccent Label Mapping:\n")
    f.writelines(mapping_lines)

print("Accent label mapping appended to", txt_out_path)


Balanced dataset accent counts:
accent_encoded
0    2724
1    2724
2    2724
3    2724
Name: count, dtype: int64

Validation dataset accent counts:
accent_encoded
0    500
1    500
2    500
3    500
Name: count, dtype: int64

Label classes:
['australia' 'england' 'indian' 'us']
Accent label mapping appended to /Users/jeonsang-eon/ECE6254-Voice-Feature-Extraction/data/label_mapping_info.txt


In [8]:
# Resolve and create the output directory BEFORE the first write, so this
# cell no longer depends on `output_dir` having been left behind by an
# earlier cell.
output_dir = os.path.join(BASE_DIR, 'data')
os.makedirs(output_dir, exist_ok=True)

# Save the balanced validation DataFrame to a CSV file
df_valid_csv_path = os.path.join(output_dir, 'df_valid.csv')
df_valid.to_csv(df_valid_csv_path, index=False)
print("Balanced validation DataFrame saved to", df_valid_csv_path)

# Save the balanced training DataFrame to a CSV file in the same directory
df_balanced_csv_path = os.path.join(output_dir, 'df_train_balanced.csv')
df_balanced.to_csv(df_balanced_csv_path, index=False)
print("Balanced DataFrame saved to", df_balanced_csv_path)

# Save the df_test DataFrame to a CSV file
df_test_csv_path = os.path.join(output_dir, 'df_test.csv')
df_test.to_csv(df_test_csv_path, index=False)
print("Test DataFrame saved to", df_test_csv_path)

Balanced validation DataFrame saved to /Users/jeonsang-eon/ECE6254-Voice-Feature-Extraction/data/df_valid.csv
Balanced DataFrame saved to /Users/jeonsang-eon/ECE6254-Voice-Feature-Extraction/data/df_train_balanced.csv
Test DataFrame saved to /Users/jeonsang-eon/ECE6254-Voice-Feature-Extraction/data/df_test.csv


Completed df_balanced, df_valid, and df_test

## Multifeatures

In [12]:
import os
import librosa
import numpy as np
import gc
import scipy.stats
import pandas as pd
from tqdm import tqdm
from scipy.signal import butter, lfilter

def lowpass_filter(data, sr, cutoff=4000, order=5):
    """
    Apply a Butterworth low-pass filter to the data.

    The filter is designed with `scipy.signal.butter` and applied in a single
    forward pass with `scipy.signal.lfilter`.

    Parameters
    ----------
    data : array-like
        Samples to filter.
    sr : int or float
        Sampling rate of `data` in Hz. Must be positive.
    cutoff : float, optional
        Cutoff frequency in Hz. Must lie strictly between 0 and the Nyquist
        frequency (sr / 2).
    order : int, optional
        Order of the Butterworth design.

    Returns
    -------
    numpy.ndarray
        The filtered samples.

    Raises
    ------
    ValueError
        If `sr` is not positive, or `cutoff` is not strictly between 0 and
        the Nyquist frequency.
    """
    # Validate up front: sr == 0 would otherwise surface as a division error
    # and an out-of-range cutoff as an opaque error from the filter design.
    if sr <= 0:
        raise ValueError(f"sr must be positive, got {sr}")
    nyquist = 0.5 * sr
    if not 0 < cutoff < nyquist:
        raise ValueError(
            f"cutoff must be strictly between 0 and the Nyquist frequency "
            f"({nyquist} Hz for sr={sr}), got {cutoff}"
        )

    # butter expects the cutoff as a fraction of the Nyquist frequency.
    normal_cutoff = cutoff / nyquist
    b, a = butter(order, normal_cutoff, btype='low', analog=False)
    return lfilter(b, a, data)

def extract_global_features(file_path, n_mfcc=13, cutoff=5000, fmin=80, fmax=300):
    """
    Extract a concise set of global (file-level) features for accent classification.

    The audio is loaded with `librosa.load` (no explicit sample rate is
    requested), low-pass filtered at `cutoff` Hz, and summarised as:

    1. Energy (sum of squares)                     -> 'energy'
    2. RMS energy                                  -> 'rms'
    3. Zero-crossing COUNT over the whole file     -> 'zero_crossings'
    4. MFCC mean & std across time                 -> 'mfcc_mean', 'mfcc_std'
    5. Delta-MFCC mean & std across time           -> 'mfcc_delta_mean', 'mfcc_delta_std'
    6. Fundamental frequency mean & std (YIN)      -> 'f0_mean', 'f0_std'
    7. Jitter: mean |F0[t+1] - F0[t]| / mean F0    -> 'jitter'
    8. Spectral centroid & bandwidth (mean)        -> 'spectral_centroid_mean',
                                                      'spectral_bandwidth_mean'

    Parameters
    ----------
    file_path : str
        Path of the audio file to analyse.
    n_mfcc : int, optional
        Number of MFCC coefficients; the four MFCC entries are arrays of
        shape (n_mfcc,).
    cutoff : float, optional
        Low-pass cutoff in Hz passed to `lowpass_filter`.
    fmin, fmax : float, optional
        F0 search range in Hz passed to `librosa.yin`.

    Returns
    -------
    dict
        Mapping from the feature names listed above to their values. All
        entries are scalars except the four MFCC entries, which are arrays.
        NOTE(review): array-valued entries end up as object cells in a
        DataFrame and are written as text by `to_csv` — confirm downstream
        readers parse them as intended.
    """
    # Load the audio file (mono).
    audio, sr = librosa.load(file_path)

    # Low-pass filter to remove content above `cutoff` Hz.
    audio = lowpass_filter(audio, sr, cutoff=cutoff)

    features = {}

    # 1. Energy: sum of squares
    features['energy'] = np.sum(audio**2)

    # 2. RMS Energy
    features['rms'] = np.sqrt(np.mean(audio**2))

    # 3. Zero crossings (total count over the file, not a per-frame rate)
    features['zero_crossings'] = np.sum(librosa.zero_crossings(audio, pad=False))

    # 4. MFCC (mean & std across time)
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
    features['mfcc_mean'] = np.mean(mfcc, axis=1)  # shape: (n_mfcc,)
    features['mfcc_std']  = np.std(mfcc, axis=1)

    # 5. Delta MFCC (mean & std)
    mfcc_delta = librosa.feature.delta(mfcc)
    features['mfcc_delta_mean'] = np.mean(mfcc_delta, axis=1)
    features['mfcc_delta_std']  = np.std(mfcc_delta, axis=1)

    # 6. Fundamental frequency (librosa.yin): mean & std of the F0 contour.
    #    The sample rate is passed explicitly so the estimate stays correct
    #    even if the loading rate ever differs from yin's own default.
    f0 = librosa.yin(y=audio, fmin=fmin, fmax=fmax, sr=sr)
    f0_mean = np.nanmean(f0)
    features['f0_mean'] = f0_mean
    features['f0_std']  = np.nanstd(f0)

    # 7. Jitter: relative mean absolute difference of consecutive F0 values.
    #    Needs at least two F0 frames and a positive mean F0; otherwise NaN.
    diff_f0 = np.abs(np.diff(f0))
    if diff_f0.size and f0_mean > 0:
        features['jitter'] = np.nanmean(diff_f0) / f0_mean
    else:
        features['jitter'] = np.nan

    # 8. Spectral centroid & bandwidth (mean over all frames)
    spec_centroid = librosa.feature.spectral_centroid(y=audio, sr=sr)
    features['spectral_centroid_mean'] = np.mean(spec_centroid)

    spec_bandwidth = librosa.feature.spectral_bandwidth(y=audio, sr=sr)
    features['spectral_bandwidth_mean'] = np.mean(spec_bandwidth)

    return features

def _extract_features_for(df, desc):
    """
    Run `extract_global_features` for every row of `df`, in row order.

    Results are cached per filename, so a file that appears in several rows
    (the balanced training set is sampled with replacement) is decoded and
    analysed only once. Each row still gets its own dict.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'filename' column holding paths relative to RAW_DATA_DIR.
    desc : str
        Label shown on the tqdm progress bar.

    Returns
    -------
    list of dict
        One feature dict per row of `df`, aligned with `df` by position.
    """
    cache = {}
    rows = []
    for filename in tqdm(df['filename'], total=len(df), desc=desc):
        if filename not in cache:
            path = os.path.join(RAW_DATA_DIR, filename)
            cache[filename] = extract_global_features(path, n_mfcc=13)
        # Shallow copy so rows never share the same dict object.
        rows.append(dict(cache[filename]))
    return rows


# === Process Training Data (df_balanced) ===
print("Extracting features for training set:")
all_features_train = _extract_features_for(df_balanced, "Train features")

# === Process Validation Data (df_valid) ===
print("Extracting features for validation set:")
all_features_valid = _extract_features_for(df_valid, "Valid features")

# === Process Test Data (df_test) ===
print("Extracting features for test set:")
all_features_test = _extract_features_for(df_test, "Test features")

gc.collect()

# --- Save extracted features to CSV files ---
output_dir = os.path.join(BASE_DIR, "data")
os.makedirs(output_dir, exist_ok=True)

# One row per input row, in the same order as the source frames; the feature
# frames carry no filename column and are aligned with them by position only.
df_train_features = pd.DataFrame(all_features_train)
df_valid_features = pd.DataFrame(all_features_valid)
df_test_features  = pd.DataFrame(all_features_test)

train_csv_path = os.path.join(output_dir, "train_features.csv")
df_train_features.to_csv(train_csv_path, index=False)
print("Saved training features to", train_csv_path)

valid_csv_path = os.path.join(output_dir, "valid_features.csv")
df_valid_features.to_csv(valid_csv_path, index=False)
print("Saved validation features to", valid_csv_path)

test_csv_path = os.path.join(output_dir, "test_features.csv")
df_test_features.to_csv(test_csv_path, index=False)
print("Saved test features to", test_csv_path)


Extracting features for training set:


Train features: 100%|██████████| 10896/10896 [39:33<00:00,  4.59it/s] 


Extracting features for validation set:


Valid features: 100%|██████████| 2000/2000 [07:20<00:00,  4.54it/s]


Extracting features for test set:


Test features: 100%|██████████| 1104/1104 [03:46<00:00,  4.88it/s]


Saved training features to /Users/jeonsang-eon/ECE6254-Voice-Feature-Extraction/data/train_features.csv
Saved validation features to /Users/jeonsang-eon/ECE6254-Voice-Feature-Extraction/data/valid_features.csv
Saved test features to /Users/jeonsang-eon/ECE6254-Voice-Feature-Extraction/data/test_features.csv


In [13]:
def _attach_features(meta_df, feature_df):
    """Place the extracted feature columns next to the metadata columns, matching rows by position."""
    return pd.concat(
        [meta_df.reset_index(drop=True), feature_df.reset_index(drop=True)],
        axis=1,
    )


# Merge the extracted features into the existing DataFrames column-wise.
df_balanced_updated = _attach_features(df_balanced, df_train_features)
df_valid_updated    = _attach_features(df_valid, df_valid_features)
df_test_updated     = _attach_features(df_test, df_test_features)

# Print the first few rows of each merged DataFrame to check the merge.
for heading, merged in (
    ("Updated df_balanced (first 5 rows):", df_balanced_updated),
    ("\nUpdated df_valid (first 5 rows):", df_valid_updated),
    ("\nUpdated df_test (first 5 rows):", df_test_updated),
):
    print(heading)
    print(merged.head())

# Save the merged results to CSV files.
output_dir = os.path.join(BASE_DIR, "data")
balanced_csv_path = os.path.join(output_dir, "df_balanced_updated.csv")
valid_csv_path = os.path.join(output_dir, "df_valid_updated.csv")
test_csv_path = os.path.join(output_dir, "df_test_updated.csv")

for merged, csv_path, message in (
    (df_balanced_updated, balanced_csv_path, "Updated balanced data saved to"),
    (df_valid_updated, valid_csv_path, "Updated valid data saved to"),
    (df_test_updated, test_csv_path, "Updated test data saved to"),
):
    merged.to_csv(csv_path, index=False)
    print(message, csv_path)


Updated df_balanced (first 5 rows):
                           filename  gender     accent      age  \
0  cv-valid-train/sample-033345.mp3  female  australia  fifties   
1  cv-valid-train/sample-140829.mp3  female  australia  fifties   
2  cv-valid-train/sample-086038.mp3  female  australia  fifties   
3  cv-valid-train/sample-033921.mp3  female  australia  fifties   
4  cv-valid-train/sample-026004.mp3  female  australia  fifties   

   accent_encoded      energy       rms  zero_crossings  \
0               0   33.239445  0.024938            6092   
1               0  145.872191  0.034694            3281   
2               0  176.136347  0.044118           10197   
3               0   25.483028  0.013635            3743   
4               0  660.516930  0.079196           16344   

                                           mfcc_mean  \
0  [-537.9019644079377, 94.94824728793137, -4.160...   
1  [-519.6272294839237, 116.2643277268801, 50.740...   
2  [-388.4322792946102, 119.9831987336