In [4]:
# Imports and constants

import librosa
import os
import pandas
import torchaudio
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.functional import pad
from torch.utils.data import DataLoader, Dataset

WAV_PATH = "CREMA-D/AudioWAV/"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
SAMPLE_RATE = 16000

In [5]:
# Load the dataset

with open("CREMA-D/VideoDemographics.csv") as csv_file:
    label_to_num = {"Male": 0, "Female": 1}
    sexes = pandas.read_csv(csv_file).to_dict(orient='list')['Sex']
    data, labels = [], []
    maxl = 0

    for file in os.listdir(WAV_PATH):
        audio = librosa.load(WAV_PATH + file, sr=SAMPLE_RATE, mono=False)[0]
        mfcc = torchaudio.transforms.MFCC(sample_rate=SAMPLE_RATE)(torch.tensor(audio))
        data.append(mfcc)

        labels.append(label_to_num[sexes[int(file.split('_')[0][-1]) - 1]])

        maxl = max(maxl, mfcc.size(1))

    data = [pad(mfcc, (0, maxl - mfcc.size(1))) if mfcc.size(1) < maxl else mfcc for mfcc in data]

    split_idx = int(2 / 3 * len(data)) + 1
    train_features, train_labels = data[ : split_idx], labels[ : split_idx]
    test_features, test_labels = data[split_idx : ], labels[split_idx : ]

train_features = torch.stack(train_features)
train_labels = torch.tensor(train_labels, dtype=torch.long)
test_features = torch.stack(test_features)
test_labels = torch.tensor(test_labels, dtype=torch.long)



In [6]:
class AudioDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

train_dataset = AudioDataset(train_features, train_labels)
test_dataset = AudioDataset(test_features, test_labels)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [7]:


class GenderClassificationModel(nn.Module):
    def __init__(self):
        super(GenderClassificationModel, self).__init__()
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.fc1 = nn.Linear(64000, 128)
        self.fc2 = nn.Linear(128, 2)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

model = GenderClassificationModel()

In [8]:

CELoss = nn.CrossEntropyLoss()
AdamOpt = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10

for epoch in range(num_epochs):
    running_loss = 0.0
    model.train().to(device=DEVICE)
    for inputs, labels in train_loader:
        inputs = inputs.unsqueeze(1)
        AdamOpt.zero_grad()
        outputs = model(inputs)
        loss = CELoss(outputs, labels)
        loss.backward()
        AdamOpt.step()
        running_loss += loss.item()

    print(f'Epoch {epoch + 1}, Loss: {running_loss / len(train_loader)}')

print('Finished Training')


Epoch 1, Loss: 2.2881007893727374
Epoch 2, Loss: 0.5124909718258258
Epoch 3, Loss: 0.5134516316346633
Epoch 4, Loss: 0.5046506917629486
Epoch 5, Loss: 0.49209693953012806
Epoch 6, Loss: 0.46869257875742054
Epoch 7, Loss: 0.45229366794228554
Epoch 8, Loss: 0.41323056396765584
Epoch 9, Loss: 0.3854704375068347
Epoch 10, Loss: 0.3184789529022498
Finished Training


In [9]:
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.unsqueeze(1)
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f'Accuracy: {100 * correct / total}%')


Accuracy: 74.19354838709677%


In [79]:
from collections import Counter

def size_mfccs(mfcc):
    l = mfcc.size(1)
    
    if l < maxl:
        mfcc = [pad(mfcc, (0, maxl - l))]
    else:
        chunks = list(torch.split(mfcc, maxl, dim=1))
        if chunks[-1].size(1) < maxl:
            chunks[-1] = pad(chunks[-1], (0, maxl - chunks[-1].size(1)))
        mfcc = chunks
    
    return mfcc

def single_prediction(mfcc):
    preds = []
    for inputs in mfcc:
        inputs = inputs.unsqueeze(0).unsqueeze(0)
        outputs = model(inputs)
        _, pred = torch.max(outputs, 1)
        preds.append(pred.item())
    return num_to_label[Counter(preds).most_common(1)[0][0]]

num_to_label = {0: "Male", 1: "Female"}

wav_cristina = librosa.load("Cristina.wav", sr=SAMPLE_RATE, mono=False)[0]
wav_cosmin = librosa.load("Cosmin.wav", sr=SAMPLE_RATE, mono=False)[0]

mfcc_cristina = torchaudio.transforms.MFCC(sample_rate=SAMPLE_RATE)(torch.tensor(wav_cristina))
mfcc_cosmin = torchaudio.transforms.MFCC(sample_rate=SAMPLE_RATE)(torch.tensor(wav_cosmin))

mfcc_cristina = size_mfccs(mfcc_cristina)
mfcc_cosmin = size_mfccs(mfcc_cosmin)

with torch.no_grad():
    print("Cristina ?:", single_prediction(mfcc_cristina))

    print("Cosmin ?:", single_prediction(mfcc_cosmin))

Cristina ?: Female
Cosmin ?: Female
