# Gender classification

## Libraries

In [100]:
import tarfile
import os
import pandas as pd
import numpy as np
import librosa
import matplotlib.pyplot as plt
import soundfile as sf
import opensmile
import torch
from torch.utils.data import DataLoader, Dataset

## Default Device

In [101]:
# Remember which device string to use, then report the hardware behind it:
# the GPU's name when CUDA is usable, otherwise the literal 'cpu'.
default_device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(torch.cuda.get_device_name() if default_device == 'cuda' else 'cpu')

NVIDIA GeForce GTX 1660 Ti


## Extract .tar.gz file

In [2]:
# Folder the archive is unpacked into, and the compressed corpus archive itself.
output_dir = 'extracted_files'
input_tar_gz_file = 'cv-corpus-12.0-delta-2022-12-07-en.tar.gz'

In [3]:
# Ensure the extraction target exists; an already-present folder is not an error.
os.makedirs(output_dir, exist_ok=True)

In [3]:
# Unpack the gzip-compressed archive into output_dir.
# filter='data' makes tarfile reject members that would be written outside
# output_dir (absolute paths, '..' components, escaping links) instead of
# trusting whatever paths the archive declares.
with tarfile.open(input_tar_gz_file, 'r:gz') as tar:
    tar.extractall(path=output_dir, filter='data')

## Open Dataset

In [4]:
# Load the tab-separated clip metadata and keep only rows labelled male/female.
df = pd.read_csv('./extracted_files/cv-corpus-12.0-delta-2022-12-07/en/other.tsv', sep='\t')
df = df[df['gender'].isin(['male', 'female'])]
# .copy() gives filtered_df its own data, so the assignment below does not
# write into a slice of df (this is what triggers SettingWithCopyWarning).
filtered_df = df[['path', 'gender']].copy()
# Encode the label numerically: male -> 1, female -> 0.
filtered_df['gender'] = filtered_df['gender'].map({'male': 1, 'female': 0})

## Add Gaussian noise for data augmentation

In [5]:
def add_gaussian_noise(signal, noise_level=0.005):
    """Return a copy of ``signal`` with zero-mean Gaussian noise added.

    Args:
        signal: Audio samples as an array-like of any shape, e.g. mono
            ``(n,)`` or multi-channel ``(n, channels)``.
        noise_level: Standard deviation of the added noise.

    Returns:
        A new ``numpy.ndarray`` with the same shape as ``signal``; the input
        itself is not modified.
    """
    signal = np.asarray(signal)
    # Draw one noise value per sample. Sizing by ``len(signal)`` only covered
    # the first axis and failed to broadcast for multi-dimensional input.
    noise = np.random.normal(0, noise_level, signal.shape)
    return signal + noise

In [6]:
# Directory holding the audio clips; entries of the 'path' column are file
# names relative to this folder.
sound_dir = './extracted_files/cv-corpus-12.0-delta-2022-12-07/en/clips'

In [7]:
# Augment class 0 (female, per the label mapping): for every such clip, write
# a noisy copy next to the original and register it as a new dataframe row.
augmented_rows = []
for _, row in filtered_df[filtered_df['gender'] == 0].iterrows():
    waveform, sr = librosa.load(f"{sound_dir}/{row['path']}")
    noisy = add_gaussian_noise(waveform)

    # Same file stem as the source clip, saved as WAV with a 'noisy_' prefix.
    noisy_name = f"noisy_{row['path'].split('.')[0]}.wav"
    sf.write(f"{sound_dir}/{noisy_name}", noisy, sr)
    augmented_rows.append({'path': noisy_name, 'gender': 0})

# Append every new row with a single concat: concatenating inside the loop
# re-copied the whole dataframe on each iteration (quadratic in clip count).
if augmented_rows:
    filtered_df = pd.concat([filtered_df, pd.DataFrame(augmented_rows)], ignore_index=True)

In [8]:
# Number of clips per label after augmentation (1 = male, 0 = female).
filtered_df['gender'].value_counts()

gender
1    16911
0    11136
Name: count, dtype: int64

## Dataloader

## Feature extraction on a random sample

In [71]:
# Draw a random row index in [0, len(filtered_df)) and display it.
rand = int(len(filtered_df) * np.random.random())
rand

27699

In [72]:
# File name of the randomly chosen clip (lookup by index label `rand`).
filtered_df.loc[rand, 'path']

'noisy_common_voice_en_36509320.wav'

In [73]:
# Label of the randomly chosen clip (1 = male, 0 = female).
filtered_df.loc[rand, 'gender']

0

In [74]:
# Decode the chosen clip; `sr` is the sampling rate librosa loaded it at.
waveform, sr = librosa.load(f"{sound_dir}/{filtered_df['path'][rand]}")

In [75]:
# Shape of the loaded sample array.
waveform.shape

(71442,)

In [96]:
# Target rate shared with the openSMILE extractor; convert the clip to it.
# NOTE: `waveform` is overwritten while `sr` is not, so re-running this cell
# would resample the already-converted audio a second time.
sample_rate = 16000
waveform = librosa.resample(y=waveform, orig_sr=sr, target_sr=sample_rate)

In [94]:
# openSMILE extractor: ComParE 2016 feature set at the functionals level,
# configured for the same sampling rate the clip was resampled to.
smile = opensmile.Smile(
    feature_set=opensmile.FeatureSet.ComParE_2016,
    feature_level=opensmile.FeatureLevel.Functionals,
    sampling_rate=sample_rate,
)

In [97]:
# Extract the features for the clip and flatten them to a 1-D vector with one
# entry per feature name.
smile(waveform, sample_rate).reshape(len(smile.feature_names))

array([ 2.118374  ,  0.25      ,  0.89912283, ..., 48.841408  ,
       82.55347   , 40.46855   ], dtype=float32)

In [99]:
# Raw shape of the extractor output before flattening.
smile(waveform, sample_rate).shape

(1, 6373, 1)