# Gender classification

## Libraries

In [156]:
import tarfile
import os
import pandas as pd
import numpy as np
import librosa
import matplotlib.pyplot as plt
import soundfile as sf
import opensmile
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from torchinfo import summary

## Default Device

In [101]:
# Report which accelerator (if any) is visible, then pick the default device
# string used when moving models and tensors.
cuda_ready = torch.cuda.is_available()
print(torch.cuda.get_device_name() if cuda_ready else 'cpu')
default_device = 'cuda' if cuda_ready else 'cpu'

NVIDIA GeForce GTX 1660 Ti


## Extract .tar.gz file

In [2]:
# Compressed corpus archive to unpack (relative path).
input_tar_gz_file  = 'cv-corpus-12.0-delta-2022-12-07-en.tar.gz'
# Directory that receives the archive's contents.
output_dir  = 'extracted_files'

In [3]:
# Make sure the extraction target exists; exist_ok=True keeps re-runs of this
# cell from failing when the directory is already there.
os.makedirs(output_dir, exist_ok=True)

In [3]:
# Unpack the gzip-compressed archive into output_dir.
# filter='data' makes tarfile refuse members that would land outside the
# target directory (absolute paths, '..' components, links pointing outside)
# and ignore special files, so a crafted archive cannot overwrite arbitrary
# files on disk.
with tarfile.open(input_tar_gz_file, 'r:gz') as tar:
    tar.extractall(path=output_dir, filter='data')

## Open Dataset

In [4]:
# Load the clip metadata (tab-separated) and keep only rows whose gender is
# explicitly 'male' or 'female'; any other value is dropped.
df = pd.read_csv('./extracted_files/cv-corpus-12.0-delta-2022-12-07/en/other.tsv', sep='\t')
df = df[df['gender'].isin(['male', 'female'])]

# .copy() gives filtered_df its own data, so the relabelling below is not a
# write through a slice of df (no SettingWithCopyWarning, no ambiguity about
# which frame is modified).
filtered_df = df[['path', 'gender']].copy()

# Encode the target as integers: male -> 1, female -> 0. Plain column
# assignment stores the mapped values as an integer column.
filtered_df['gender'] = filtered_df['gender'].map({'male': 1, 'female': 0})

## Add Gaussian noise for data augmentation

In [5]:
def add_gaussian_noise(signal, noise_level=0.005):
    """Return a copy of ``signal`` with zero-mean Gaussian noise added.

    Args:
        signal: Audio samples as an array-like of any shape (mono ``(n,)``
            or multi-channel such as ``(channels, n)``).
        noise_level: Standard deviation of the noise, in the same units as
            the samples.

    Returns:
        A new array with the same shape as ``signal``; the input is not
        modified. The noise is drawn from NumPy's global random state, so
        seed it with ``np.random.seed`` for repeatable output.
    """
    # Size the noise from the full shape rather than len(signal): len() only
    # covers the first axis, which fails to broadcast for multi-channel input.
    noise = np.random.normal(0, noise_level, np.shape(signal))
    return signal + noise

In [6]:
# Directory holding the audio clips referenced by the 'path' column; the
# noisy copies generated below are written into the same directory.
sound_dir = './extracted_files/cv-corpus-12.0-delta-2022-12-07/en/clips'

In [7]:
# Augment the female class (label 0): for every female clip, write a noisy
# copy next to the original and register it in filtered_df with the same label.
female_paths = filtered_df.loc[filtered_df['gender'] == 0, 'path']

# Collect the new rows and concatenate once after the loop; calling pd.concat
# per clip copies the whole frame every iteration (quadratic in the row count).
augmented_rows = []
for clip_path in female_paths:
    waveform, sr = librosa.load(f'{sound_dir}/{clip_path}')
    noisy = add_gaussian_noise(waveform)

    # Same naming as before: text up to the first '.', prefixed with 'noisy_',
    # saved as WAV.
    noisy_name = f"noisy_{clip_path.split('.')[0]}.wav"
    sf.write(f'{sound_dir}/{noisy_name}', noisy, sr)

    augmented_rows.append({'path': noisy_name, 'gender': 0})

# Leave filtered_df untouched when there was nothing to augment.
if augmented_rows:
    filtered_df = pd.concat([filtered_df, pd.DataFrame(augmented_rows)], ignore_index=True)

In [8]:
# Count rows per label (1 = male, 0 = female) after the noisy copies were appended.
filtered_df['gender'].value_counts()

gender
1    16911
0    11136
Name: count, dtype: int64

In [105]:
# Pull the two columns out as parallel NumPy arrays: paths[i] is the clip file
# name and labels[i] its encoded gender.
paths = np.array(filtered_df['path'])
labels = np.array(filtered_df['gender'])

## Dataloader

In [125]:
class CustomLoader(Dataset):
    """Dataset that turns audio clips into openSMILE feature vectors.

    Each item is a ``(features, label)`` pair, where ``features`` is the
    ComParE 2016 functionals of one clip flattened to a 1-D vector with one
    entry per name in ``self.smile.feature_names``.

    Args:
        paths: Sequence of clip file names, relative to ``data_dir``.
        labels: Sequence of labels, aligned with ``paths``.
        data_dir: Directory that contains the clips.
        target_sr: Sampling rate (Hz) the clips are loaded at and that the
            feature extractor is configured for.

    Raises:
        ValueError: If ``paths`` and ``labels`` differ in length.
    """

    def __init__(self, paths, labels, data_dir, target_sr):
        # Fail early instead of hitting an IndexError in the middle of an epoch.
        if len(paths) != len(labels):
            raise ValueError(
                f'paths and labels must have the same length, '
                f'got {len(paths)} and {len(labels)}'
            )
        self.paths = paths
        self.labels = labels
        self.dir = data_dir
        self.target_sr = target_sr
        self.smile = opensmile.Smile(feature_set=opensmile.FeatureSet.ComParE_2016,
                                     feature_level=opensmile.FeatureLevel.Functionals,
                                     sampling_rate=target_sr)

    def __len__(self):
        """Return the number of clips in the dataset."""
        return len(self.paths)

    def __getitem__(self, index):
        """Load clip ``index`` and return ``(features, label)``."""
        # Load directly at the target rate. Without sr=..., librosa.load first
        # resamples to its own default rate, and the clip then had to be
        # resampled a second time to reach target_sr.
        waveform, _ = librosa.load(f'{self.dir}/{self.paths[index]}', sr=self.target_sr)

        # Flatten the extractor output to a vector of len(feature_names) values.
        features = self.smile(waveform, self.target_sr).reshape([len(self.smile.feature_names),])
        label = self.labels[index]

        return features, label

In [141]:
# Stratified 80/20 split of clip paths and labels. The fixed random_state makes
# the split identical on every run, so results from re-executed cells are
# comparable.
# NOTE(review): `paths` already contains the 'noisy_' copies written during
# augmentation, so a clip and its noisy twin can end up on opposite sides of
# this split; consider splitting before augmenting.
X_train, X_test, y_train, y_test = train_test_split(paths,
                                                    labels,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    stratify=labels,
                                                    random_state=42)

In [168]:
# Batch size shared by both loaders, and the sampling rate (Hz) the clips are
# converted to before feature extraction.
batch_size = 128
target_sr = 16000

train_set = CustomLoader(paths=X_train, labels=y_train, data_dir=sound_dir, target_sr=target_sr)
valid_set = CustomLoader(paths=X_test, labels=y_test, data_dir=sound_dir, target_sr=target_sr)

# Shuffle only the training data. Validation is iterated in a fixed order so
# every evaluation pass sees the same batches.
train_loader = DataLoader(dataset=train_set, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(dataset=valid_set, batch_size=batch_size, shuffle=False)

## Model

In [162]:
class MLP(nn.Module):
    """Fully connected binary classifier over a flat feature vector.

    Five ReLU-activated hidden layers (2048, 1024, 512, 256 and 64 units)
    feed a single sigmoid unit, so ``forward`` maps ``(batch, in_features)``
    to ``(batch, 1)`` with values between 0 and 1.

    Args:
        in_features: Length of each input feature vector. Defaults to 6373,
            the width this notebook feeds the model.
    """

    def __init__(self, in_features=6373):
        super().__init__()
        # Layer names (fc1..fc6) and creation order are kept stable so
        # state_dict keys and seeded initialisation stay the same.
        self.fc1 = nn.Linear(in_features=in_features, out_features=2048)
        self.fc2 = nn.Linear(in_features=2048, out_features=1024)
        self.fc3 = nn.Linear(in_features=1024, out_features=512)
        self.fc4 = nn.Linear(in_features=512, out_features=256)
        self.fc5 = nn.Linear(in_features=256, out_features=64)
        self.fc6 = nn.Linear(in_features=64, out_features=1)
        # The output is already a probability: use a loss that expects
        # probabilities (e.g. nn.BCELoss). nn.BCEWithLogitsLoss would apply
        # the sigmoid a second time.
        self.sigmoid = nn.Sigmoid()
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return per-sample sigmoid outputs of shape ``(batch, 1)``."""
        for hidden in (self.fc1, self.fc2, self.fc3, self.fc4, self.fc5):
            x = self.relu(hidden(x))
        return self.sigmoid(self.fc6(x))

In [163]:
# Build the network, then move its parameters to the selected device
# (Module.to works in place on the module).
mlp = MLP()
mlp.to(default_device)

In [169]:
# Print the layer-by-layer output shapes and parameter counts for one batch of
# 6373-dimensional inputs.
summary(mlp, input_size=[batch_size, 6373])

Layer (type:depth-idx)                   Output Shape              Param #
MLP                                      [128, 1]                  --
├─Linear: 1-1                            [128, 2048]               13,053,952
├─ReLU: 1-2                              [128, 2048]               --
├─Linear: 1-3                            [128, 1024]               2,098,176
├─ReLU: 1-4                              [128, 1024]               --
├─Linear: 1-5                            [128, 512]                524,800
├─ReLU: 1-6                              [128, 512]                --
├─Linear: 1-7                            [128, 256]                131,328
├─ReLU: 1-8                              [128, 256]                --
├─Linear: 1-9                            [128, 64]                 16,448
├─ReLU: 1-10                             [128, 64]                 --
├─Linear: 1-11                           [128, 1]                  65
├─Sigmoid: 1-12                          [128, 1]       

## rest