In [19]:
import os
import torch
import librosa
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
from torch.utils.data import Dataset, DataLoader

In [20]:
AUDIO_DIR = "E:\\Momenta_task\\LJSpeech-1.1\\wavs"
METADATA_PATH = "E:\\Momenta_task\\LJSpeech-1.1\\metadata.csv"

In [21]:
metadata = pd.read_csv(METADATA_PATH, sep="|", header=None, 
                      names=["ID", "Transcript", "Normalized"], 
                      usecols=["ID"])
metadata['Label'] = np.random.randint(0, 2, size=len(metadata))  # Replace with real labels

train_df, temp_df = train_test_split(metadata, test_size=0.2, stratify=metadata['Label'])
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df['Label'])

In [22]:
def extract_features(audio_path):
    try:
        y, sr = librosa.load(audio_path, sr=16000, duration=4)  # Fixed duration
        y = librosa.util.fix_length(y, size=64000)  # 4 seconds
        
        # MFCC with stable dimensions
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40, n_fft=512, hop_length=256)
        mfcc = mfcc[:, :250]  # Fix time dimension
        
        # Normalize
        mfcc = (mfcc - np.mean(mfcc)) / np.std(mfcc)
        return torch.FloatTensor(mfcc.T)  # (250, 40)
    
    except Exception as e:
        print(f"Error processing {audio_path}: {str(e)}")
        return torch.zeros((250, 40))  # Fallback

In [23]:
class AudioDataset(Dataset):
    def __init__(self, df, audio_dir):
        self.df = df
        self.audio_dir = audio_dir
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        audio_file = self.df.iloc[idx]['ID'] + '.wav'  # LJSpeech uses .wav
        audio_path = os.path.join(self.audio_dir, audio_file)
        features = extract_features(audio_path)
        label = self.df.iloc[idx]['Label']
        return features, torch.tensor(label)

def collate_fn(batch):
    features, labels = zip(*batch)
    return torch.stack(features), torch.stack(labels)

# Initialize loaders
train_loader = DataLoader(AudioDataset(train_df, AUDIO_DIR), 
                         batch_size=32, collate_fn=collate_fn, shuffle=True)
val_loader = DataLoader(AudioDataset(val_df, AUDIO_DIR), batch_size=32, collate_fn=collate_fn)
test_loader = DataLoader(AudioDataset(test_df, AUDIO_DIR), batch_size=32, collate_fn=collate_fn)

In [24]:
class VGGLSTM(torch.nn.Module):
    def __init__(self):
        super().__init__()
        
        # Stable CNN for fixed input size
        self.cnn = torch.nn.Sequential(
            torch.nn.Conv1d(40, 64, kernel_size=5),
            torch.nn.ReLU(),
            torch.nn.MaxPool1d(4),
            torch.nn.Conv1d(64, 128, kernel_size=3),
            torch.nn.ReLU(),
            torch.nn.MaxPool1d(2)
        )
        
        # LSTM with attention
        self.lstm = torch.nn.LSTM(
            input_size=128,
            hidden_size=64,
            num_layers=1,
            bidirectional=True,
            batch_first=True
        )
        
        self.fc = torch.nn.Linear(128, 2)

    def forward(self, x):
        # x shape: (batch, time, features)
        x = x.permute(0, 2, 1)  # (batch, features, time)
        
        # CNN processing
        x = self.cnn(x)
        x = x.permute(0, 2, 1)  # (batch, time, features)
        
        # LSTM processing
        out, (h_n, c_n) = self.lstm(x)
        
        # Use final hidden state
        out = self.fc(out[:, -1, :])
        return out

In [26]:
model = VGGLSTM()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
criterion = torch.nn.CrossEntropyLoss()

# %% [8] Training Loop (CPU-optimized)
for epoch in range(5):
    model.train()
    total_loss = 0
    
    for inputs, labels in tqdm(train_loader):
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels.long())  # Fix here
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    # Validation
    model.eval()
    val_preds = []
    val_labels = []
    
    with torch.no_grad():
        for inputs, labels in val_loader:
            outputs = model(inputs)
            preds = torch.argmax(outputs, dim=1)
            val_preds.extend(preds.numpy())
            val_labels.extend(labels.numpy())  # labels already converted in dataset
    
    val_f1 = f1_score(val_labels, val_preds)
    print(f"Epoch {epoch+1} | Loss: {total_loss/len(train_loader):.4f} | Val F1: {val_f1:.4f}")

100%|████████████████████████████████████████████████████████████████████████████████| 328/328 [05:16<00:00,  1.03it/s]


Epoch 1 | Loss: 0.6939 | Val F1: 0.0000


100%|████████████████████████████████████████████████████████████████████████████████| 328/328 [01:54<00:00,  2.86it/s]


Epoch 2 | Loss: 0.6931 | Val F1: 0.3518


100%|████████████████████████████████████████████████████████████████████████████████| 328/328 [01:54<00:00,  2.87it/s]


Epoch 3 | Loss: 0.6930 | Val F1: 0.6403


100%|████████████████████████████████████████████████████████████████████████████████| 328/328 [01:54<00:00,  2.85it/s]


Epoch 4 | Loss: 0.6926 | Val F1: 0.6372


100%|████████████████████████████████████████████████████████████████████████████████| 328/328 [01:55<00:00,  2.84it/s]


Epoch 5 | Loss: 0.6925 | Val F1: 0.4608


In [None]:
def evaluate(model, loader):
    model.eval()
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for inputs, labels in loader:
            outputs = model(inputs)
            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.numpy())
            all_labels.extend(labels.numpy())
    
    print(classification_report(all_labels, all_preds))
    print(f"F1 Score: {f1_score(all_labels, all_preds):.4f}")

print("Test Set Evaluation:")
evaluate(model, test_loader)

Test Set Evaluation:
