In [29]:
import json
import os

import librosa
import pandas as pd

data_dir = 'data'

# One dictionary per .wav file; converted to the `metadata` DataFrame after the scan.
data_list = []

# Define audio and feature parameters
sample_rate = 16000  # sampling rate (Hz) requested from librosa.load
n_mfcc = 13          # number of MFCC coefficients per frame
n_fft = 400          # FFT window length in samples (25 ms at 16 kHz)
hop_length = 160     # samples between successive frames (10 ms at 16 kHz)

# Walk through the main data directory. Names are sorted because os.listdir
# and os.walk return entries in arbitrary, filesystem-dependent order, and the
# row order of `metadata` decides what a seeded train/val/test split contains.
for speaker_dir in sorted(os.listdir(data_dir)):
    speaker_path = os.path.join(data_dir, speaker_dir)
    if not os.path.isdir(speaker_path):
        continue

    # NOTE(review): speaker_dir appears twice in this path
    # (<data_dir>/<speaker>/<speaker>/transcript) -- confirm that this matches
    # the dataset layout; otherwise every transcript lookup below will miss.
    transcripts_dir = os.path.join(speaker_path, speaker_dir, 'transcript')

    # For each speaker, traverse their directory
    for root, dirs, files in os.walk(speaker_path):
        dirs.sort()  # sorting in place makes os.walk descend in a stable order
        for wav_name in sorted(files):
            if not wav_name.endswith('.wav'):
                continue
            file_path = os.path.join(root, wav_name)

            # The transcript is looked up by the audio file's base name
            base_filename, _ = os.path.splitext(wav_name)
            transcript_path = os.path.join(transcripts_dir, f'{base_filename}.txt')

            if os.path.isfile(transcript_path):
                # Distinct handle name: the original reused `file`, shadowing
                # the loop variable that names the current .wav file.
                with open(transcript_path, 'r', encoding='utf-8') as transcript_file:
                    transcript = transcript_file.read().strip()
            else:
                # Best-effort placeholder; note that it ends up in the
                # 'transcript' column like any real transcript.
                transcript = "Transcript not found"

            # Load the audio file at the requested sampling rate
            audio, _ = librosa.load(file_path, sr=sample_rate)

            # Extract MFCC features
            mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc, n_fft=n_fft, hop_length=hop_length)

            # Append the data as a dictionary to the list (features stored transposed)
            data_list.append({'file_path': file_path, 'transcript': transcript, 'mfcc_features': mfccs.T})

# Explicit columns keep the frame well-formed even when no .wav file was found.
metadata = pd.DataFrame(data_list, columns=['file_path', 'transcript', 'mfcc_features'])

# Saving the metadata to a CSV file for reference.
# A CSV cell can only hold text. Writing the arrays directly would store
# NumPy's printed form, which abbreviates arrays of more than 1000 elements
# with "..." and cannot be parsed back. Each feature matrix is therefore
# written as a JSON list of lists; decode a cell with
# np.array(json.loads(cell)). The in-memory `metadata` keeps the real arrays.
metadata_for_csv = metadata.assign(
    mfcc_features=metadata['mfcc_features'].map(lambda frames: json.dumps(frames.tolist()))
)
metadata_for_csv.to_csv('metadata.csv', index=False)
print("pre-processing done!")

pre-processing done!


In [30]:
import pandas as pd

# Reload the table that the pre-processing cell wrote to disk, replacing the
# in-memory `metadata`. NOTE: a CSV holds text only, so the 'mfcc_features'
# column comes back as strings rather than numeric arrays.
metadata_csv = 'metadata.csv'
metadata = pd.read_csv(metadata_csv)

In [31]:
from sklearn.model_selection import train_test_split

# Roughly 80/10/10: hold out 20% of the rows, then cut that hold-out in half
# into validation and test. Both calls use the same seed so the split repeats.
split_seed = 42
train_data, holdout_data = train_test_split(metadata, test_size=0.2, random_state=split_seed)
val_data, test_data = train_test_split(holdout_data, test_size=0.5, random_state=split_seed)

In [32]:
# Pull the feature column and the transcript column out of each split as
# plain Python lists, in train / val / test order.
train_features, val_features, test_features = (
    list(split_frame['mfcc_features']) for split_frame in (train_data, val_data, test_data)
)

train_labels, val_labels, test_labels = (
    list(split_frame['transcript']) for split_frame in (train_data, val_data, test_data)
)

In [33]:
# Example: One-hot encode labels for classification
from sklearn.preprocessing import LabelBinarizer

# Encode straight from the split frames instead of re-assigning each
# *_labels variable from itself. The original `train_labels =
# fit_transform(train_labels)` gave a different result when the cell was run
# a second time, because the already-encoded matrix was fed back into the
# encoder. Reading the 'transcript' column makes this cell idempotent.
# The encoder is fitted on the training split only and reused for val/test.
# NOTE(review): transcripts that never occur in the training split are
# presumably encoded as all-zero rows by transform() -- confirm against the
# LabelBinarizer documentation before relying on val/test labels.
label_encoder = LabelBinarizer()
train_labels = label_encoder.fit_transform(list(train_data['transcript']))
val_labels = label_encoder.transform(list(val_data['transcript']))
test_labels = label_encoder.transform(list(test_data['transcript']))

In [37]:
# Train a model