# Gender classification

## Libraries

In [147]:
import tarfile
import os
import pandas as pd
import numpy as np
import librosa
import matplotlib.pyplot as plt
import soundfile as sf

## Extract .tar.gz file

In [48]:
# Source archive to unpack and the folder its contents are extracted into.
input_tar_gz_file = 'cv-corpus-12.0-delta-2022-12-07-en.tar.gz'
output_dir = 'extracted_files'

In [49]:
# Make sure the extraction target exists; an already existing folder is left as-is.
os.makedirs(output_dir, exist_ok=True)

In [50]:
# Open the .tar.gz file and unpack everything into output_dir.
with tarfile.open(input_tar_gz_file, 'r:gz') as tar:
    # A tar archive may contain absolute paths, '..' components or links that
    # point outside output_dir. The 'data' extraction filter rejects such
    # members; it is only passed when this Python build provides it, so the
    # cell still runs on interpreters without extraction-filter support.
    if hasattr(tarfile, 'data_filter'):
        tar.extractall(path=output_dir, filter='data')
    else:
        tar.extractall(path=output_dir)

## Open Dataset

In [102]:
# Read the tab-separated clip metadata and keep only rows whose gender is
# 'male' or 'female'.
df = pd.read_csv('./extracted_files/cv-corpus-12.0-delta-2022-12-07/en/other.tsv', sep='\t')
df = df[df['gender'].isin(['male', 'female'])]

# Take an explicit copy of the two columns that are needed. Writing into a
# slice of `df` raises SettingWithCopyWarning and may not behave the same
# across pandas versions.
filtered_df = df[['path', 'gender']].copy()

# Encode the label numerically: male -> 1, female -> 0. Assigning the mapped
# column (rather than writing through .loc) gives the column an integer dtype.
filtered_df['gender'] = filtered_df['gender'].map({'male': 1, 'female': 0})

## Add Gaussian noise for data augmentation

In [104]:
def add_gaussian_noise(signal, noise_level=0.005, rng=None):
    """Return ``signal`` with zero-mean Gaussian noise added to every sample.

    Parameters
    ----------
    signal : array_like
        Samples to perturb. Any shape is accepted (for example a 1-D array
        or a 2-D ``(channels, n)`` array); the noise has the same shape.
    noise_level : float, default 0.005
        Standard deviation of the noise.
    rng : numpy.random.Generator, optional
        Source of randomness. When omitted, NumPy's global random state is
        used, so ``np.random.seed`` controls reproducibility.

    Returns
    -------
    numpy.ndarray
        ``signal + noise``. The input itself is not modified.
    """
    samples = np.asarray(signal)
    # Draw one noise value per sample. Sizing by the full shape (not len())
    # keeps multi-dimensional input from raising or being broadcast wrongly.
    if rng is None:
        noise = np.random.normal(0, noise_level, samples.shape)
    else:
        noise = rng.normal(0, noise_level, samples.shape)
    return samples + noise

In [143]:
# Folder containing the audio clips; the noisy copies are written back into it.
sound_dir = './extracted_files/cv-corpus-12.0-delta-2022-12-07/en/clips'

In [159]:
# Augment the clips labelled 0: for each one, write a noisy WAV copy next to
# the original clip and add a matching row to filtered_df.

# Work from the original clips only, so re-running this cell does not pick up
# its own 'noisy_*' rows and augment them a second time.
original_df = filtered_df[~filtered_df['path'].str.startswith('noisy_', na=False)]

noisy_rows = []
for clip_name in original_df.loc[original_df['gender'] == 0, 'path']:
    waveform, sr = librosa.load(f'{sound_dir}/{clip_name}')
    noisy = add_gaussian_noise(waveform)

    # Same file name with a 'noisy_' prefix and a .wav extension. splitext
    # strips only the final extension, so names containing extra dots survive.
    noisy_name = f'noisy_{os.path.splitext(clip_name)[0]}.wav'
    sf.write(f'{sound_dir}/{noisy_name}', noisy, sr)

    noisy_rows.append({'path': noisy_name, 'gender': 0})

# Append all new rows with a single concat instead of rebuilding the frame
# once per clip.
if noisy_rows:
    filtered_df = pd.concat([original_df, pd.DataFrame(noisy_rows)], ignore_index=True)
else:
    filtered_df = original_df.reset_index(drop=True)

In [161]:
# Show the combined table: original clips followed by the generated noisy_* rows.
filtered_df

Unnamed: 0,path,gender
0,common_voice_en_34925857.mp3,0
1,common_voice_en_34925861.mp3,0
2,common_voice_en_34925867.mp3,0
3,common_voice_en_34925868.mp3,0
4,common_voice_en_34925869.mp3,0
...,...,...
28042,noisy_common_voice_en_36528129.wav,0
28043,noisy_common_voice_en_36528130.wav,0
28044,noisy_common_voice_en_36528131.wav,0
28045,noisy_common_voice_en_36528132.wav,0


## rest