In [1]:
import os
import json
import numpy as np
import pandas as pd
import torch
import torchaudio
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

**1. DATA PRE-PROCESSING**

1.1 READ DATA

In [2]:
# Read training data
train_df = pd.read_csv("../input/common-voice/cv-valid-train.csv")
train_df.head()

# Filter rows with missing gender or accent
train_filter_df = train_df[train_df["gender"].notnull()]
train_filter_df = train_filter_df[train_filter_df["accent"].notnull()]

def gender_to_labels(gender):
    """
    Convert gender to one-hot label.
    Args:
        gender (str): Gender in string format.

    Returns:
        list: One-hot encoded gender label.
    """
    genders = ["female", "male", "other"]
    onehot = [0] * len(genders)

    if gender in genders:
        index = genders.index(gender)
        onehot[index] = 1
    else:
        print("Invalid gender:", gender)

    return onehot

def accent_to_onehot(accent):
    """
    Convert accent to one-hot label.
    Args:
        accent (str): Accent in string format.

    Returns:
        list: One-hot encoded accent label.
    """
    accents = ["us", "australia", "england", "canada", "philippines", "ireland", "hongkong", "indian", "malaysia", "newzealand", "scotland", "singapore", "southatlandtic", "african", "wales", "bermuda"]
    onehot = [0] * len(accents)
    index = accents.index(accent)
    onehot[index] = 1

    return onehot

# Apply gender and accent conversion to labels
train_filter_df["gender_label"] = train_filter_df['gender'].apply(gender_to_labels)
train_filter_df["accent_label"] = train_filter_df['accent'].apply(accent_to_onehot)


1.2 CONVERT DATA

In [3]:
# Limit dataset size
train_filter_df = train_filter_df.head(1200)

# Set dataset path and target properties
DATASET_PATH = "/kaggle/input/common-voice/cv-valid-train/"
target_sample_rate = 16000
target_length = 160000

def pad_or_trim(waveform, target_length):
    """
    Pad or trim a waveform to the target length.
    Args:
        waveform (torch.Tensor): Input waveform.
        target_length (int): Desired length of the output waveform.

    Returns:
        torch.Tensor: Padded or trimmed waveform.
    """
    length = waveform.shape[1]
    
    if length > target_length:
        waveform = waveform[:, :target_length]
    elif length < target_length:
        padding = torch.zeros(waveform.shape[0], target_length - length)
        waveform = torch.cat((waveform, padding), dim=1)
    
    return waveform

# Initialize arrays for audio, gender labels, and accent labels
all_audio = []
all_gender_labels = []
all_accent_labels = []

# Process each row in the filtered dataframe
for index, row in train_filter_df.iterrows():
    audio_path = os.path.join(DATASET_PATH, row['filename'])
    waveform, sample_rate = torchaudio.load(audio_path)

    # Pad or trim waveform to target length
    waveform = pad_or_trim(waveform, target_length)

    all_audio.append(waveform)
    all_gender_labels.append(row['gender_label'])
    all_accent_labels.append(row['accent_label'])

# Convert lists to NumPy arrays
inputs = np.array(all_audio)
inputs_gender_labels = np.array(all_gender_labels)
inputs_accent_labels = np.array(all_accent_labels)



1.3 SPLIT DATASET

In [4]:
class AudioDataset(Dataset):
    def __init__(self, inputs, gender_labels, accent_labels):
        self.inputs = inputs
        self.gender_labels = gender_labels
        self.accent_labels = accent_labels

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return self.inputs[idx], self.gender_labels[idx], self.accent_labels[idx]

# Prepare the data for training
# Convert NumPy arrays to float32
inputs = [audio_input.numpy().astype(np.float32) for audio_input in inputs]

# Convert NumPy arrays to PyTorch tensors
inputs = torch.tensor(inputs)
inputs_gender_labels = torch.tensor(inputs_gender_labels, dtype=torch.float32)
inputs_accent_labels = torch.tensor(inputs_accent_labels, dtype=torch.float32)

# Split the dataset
train_inputs, val_inputs, train_gender_labels, val_gender_labels, train_accent_labels, val_accent_labels = train_test_split(
    inputs, inputs_gender_labels, inputs_accent_labels, test_size=0.2, random_state=42)

# Create training and validation datasets
train_dataset = AudioDataset(train_inputs, train_gender_labels, train_accent_labels)
val_dataset = AudioDataset(val_inputs, val_gender_labels, val_accent_labels)

# Create data loaders
batch_size = 32
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)



**2. RNN-GENDER CLASSIFIER**

In [5]:
# Define the RNN model for gender recognition
class GenderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(GenderRNN, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # Pass input through RNN layers
        out, _ = self.rnn(x)
        
        # Pass the output of the last RNN layer through a fully connected layer
        out = self.fc(out[:, -1, :])
        
        return out

# Hyperparameters
input_size = 160000
hidden_size = 64
num_layers = 2
output_size = len(train_dataset[0][1])  # Number of gender labels
batch_size = 32
num_epochs = 20
learning_rate = 0.001

# Initialize the RNN model, loss function and optimizer
model = GenderRNN(input_size, hidden_size, num_layers, output_size)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
for epoch in range(num_epochs):
    for i, (audio, gender_labels, _) in enumerate(train_dataloader):
        # Forward pass
        outputs = model(audio)
        loss = criterion(outputs, torch.max(gender_labels, 1)[1])

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Print the loss for this epoch
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

# Test the model
with torch.no_grad():
    correct = 0
    total = 0
    for audio, gender_labels, _ in val_dataloader:
        outputs = model(audio)
        _, predicted = torch.max(outputs.data, 1)
        total += gender_labels.size(0)
        correct += (predicted == torch.max(gender_labels, 1)[1]).sum().item()

    print(f"Test Accuracy: {100 * correct / total}%")

Epoch [1/20], Loss: 0.9264
Epoch [2/20], Loss: 0.2022
Epoch [3/20], Loss: 0.0895
Epoch [4/20], Loss: 0.0121
Epoch [5/20], Loss: 0.0105
Epoch [6/20], Loss: 0.0032
Epoch [7/20], Loss: 0.0050
Epoch [8/20], Loss: 0.0070
Epoch [9/20], Loss: 0.0067
Epoch [10/20], Loss: 0.0015
Epoch [11/20], Loss: 0.0042
Epoch [12/20], Loss: 0.0021
Epoch [13/20], Loss: 0.0016
Epoch [14/20], Loss: 0.0022
Epoch [15/20], Loss: 0.0062
Epoch [16/20], Loss: 0.0015
Epoch [17/20], Loss: 0.0010
Epoch [18/20], Loss: 0.0015
Epoch [19/20], Loss: 0.0013
Epoch [20/20], Loss: 0.0014
Test Accuracy: 59.166666666666664%


**3. 1DCNN-GENDER AND ACCENT CLASSIFIER**

In [6]:
class Simple1DCNN(nn.Module):
    def __init__(self, input_length, num_classes_gender, num_classes_accent):
        super(Simple1DCNN, self).__init__()

        # Define convolutional layers
        self.conv1 = nn.Conv1d(1, 16, kernel_size=3, stride=1, padding=1)
        self.pool1 = nn.MaxPool1d(kernel_size=2, stride=2)
        
        self.conv2 = nn.Conv1d(16, 32, kernel_size=3, stride=1, padding=1)
        self.pool2 = nn.MaxPool1d(kernel_size=2, stride=2)
        
        self.conv3 = nn.Conv1d(32, 64, kernel_size=3, stride=1, padding=1)
        self.pool3 = nn.MaxPool1d(kernel_size=2, stride=2)

        # Calculate the length of the fully connected layer input
        self.fc_length = input_length // (2**3)
        
        # Define fully connected layers for gender and accent classification
        self.fc_gender = nn.Linear(64 * self.fc_length, num_classes_gender)
        self.fc_accent = nn.Linear(64 * self.fc_length, num_classes_accent)

    def forward(self, x):
        # Pass input through convolutional and pooling layers
        x = self.pool1(F.relu(self.conv1(x)))
        x = self.pool2(F.relu(self.conv2(x)))
        x = self.pool3(F.relu(self.conv3(x)))

        # Flatten the tensor
        x = x.view(x.size(0), -1)

        # Pass the output through fully connected layers for gender and accent
        out_gender = self.fc_gender(x)
        out_accent = self.fc_accent(x)

        return out_gender, out_accent

# Model initialization
input_length = 160000
num_classes_gender = 3
num_classes_accent = 16
model = Simple1DCNN(input_length, num_classes_gender, num_classes_accent)

# Set device for the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, gender_labels, accent_labels in tqdm(train_dataloader):
        # Move inputs and labels to the device
        inputs = inputs.to(device)
        gender_labels = torch.argmax(gender_labels, dim=1).to(device)
        accent_labels = torch.argmax(accent_labels, dim=1).to(device)
        
        # Zero the gradients
        optimizer.zero_grad()
        
        # Forward pass
        out_gender, out_accent = model(inputs)
        loss_gender = criterion(out_gender, gender_labels)
        loss_accent = criterion(out_accent, accent_labels)
        
        # Calculate total loss and backpropagate
        loss = loss_gender + loss_accent
        loss.backward()
        optimizer.step()
        
        # Accumulate loss
        running_loss += loss.item()

    # Print loss for the current epoch
    print(f"Epoch {epoch+1}, Loss: {running_loss/len(train_dataloader)}")

    # Evaluation loop
    model.eval()
    with torch.no_grad():
        gender_correct = 0
        accent_correct = 0
        total = 0
        for inputs, gender_labels, accent_labels in val_dataloader:
            inputs = inputs.to(device)
            gender_labels = gender_labels.to(device)
            accent_labels = accent_labels.to(device)
            
            out_gender, out_accent = model(inputs)
            _, gender_predicted = torch.max(out_gender, 1)
            _, accent_predicted = torch.max(out_accent, 1)
            
            gender_labels = torch.argmax(gender_labels, dim=1)
            accent_labels = torch.argmax(accent_labels, dim=1)
            gender_correct += (gender_predicted == gender_labels).sum().item()
            accent_correct += (accent_predicted == accent_labels).sum().item()
            total += gender_labels.size(0)
        
        print(f"Validation Accuracy (Gender): {gender_correct/total * 100:.2f}%")
        print(f"Validation Accuracy (Accent): {accent_correct/total * 100:.2f}%")

100%|██████████| 30/30 [03:29<00:00,  6.98s/it]


Epoch 1, Loss: 10.685899114608764
Validation Accuracy (Gender): 77.08%
Validation Accuracy (Accent): 25.42%


100%|██████████| 30/30 [03:19<00:00,  6.66s/it]


Epoch 2, Loss: 2.2978541294733685
Validation Accuracy (Gender): 77.08%
Validation Accuracy (Accent): 32.92%


100%|██████████| 30/30 [03:20<00:00,  6.69s/it]


Epoch 3, Loss: 1.9255502303441365
Validation Accuracy (Gender): 77.08%
Validation Accuracy (Accent): 37.08%
