# Imports

In [1]:
import os

import librosa
import matplotlib.pyplot as plt
import numpy as np
import json
import pandas as pd
from scipy.io import wavfile
import scipy.stats as stats
import re
import aiofiles
import asyncio
import time
import nest_asyncio
import parselmouth

# Extract MFCCs

In [None]:
def extract_MFCCs(files):
    """Compute the frame-averaged MFCC vector and gender label of each ``.wav`` file.

    Relies on two notebook-level globals that must exist before the call:
    ``current_path`` (base directory) and ``data_dict`` (speaker id -> 'F'/'M').

    Args:
        files: Sized collection of file names located in
            ``<current_path>/MLS/Full_split``. The speaker id is parsed from
            the text before the first underscore. Entries that do not end in
            ``.wav`` are skipped.

    Returns:
        A list with one ``[file_name, mfccs_mean, gender]`` entry per ``.wav``
        file. ``mfccs_mean`` is a list of plain Python floats and ``gender``
        is 0 for 'F' and 1 for 'M'.

    Raises:
        KeyError: if the speaker id is not present in ``data_dict``.
        ValueError: if the file name does not start with an integer speaker
            id, or if the speaker's gender is neither 'F' nor 'M'.
    """
    gender_codes = {'F': 0, 'M': 1}
    data_list = []
    print("Extracting features...")
    for i, file in enumerate(files):
        if file.endswith('.wav'):
            # Resolve the label first so a bad file name fails before the
            # (expensive) audio decoding.
            speaker_id = int(file.split('_')[0])
            gender = gender_codes.get(data_dict[speaker_id])
            if gender is None:
                # Previously this case silently reused the previous file's
                # label (or raised UnboundLocalError on the first file).
                raise ValueError(
                    f"Unsupported gender {data_dict[speaker_id]!r} for speaker {speaker_id}"
                )

            file_path = os.path.join(current_path, "MLS", "Full_split", file)
            audio_data, sample_rate = librosa.load(file_path)
            mfccs = librosa.feature.mfcc(y=audio_data, sr=sample_rate)
            # .tolist() yields plain Python floats, so the list is written to
            # CSV as "[1.0, 2.0, ...]" (JSON-parsable) instead of
            # "[np.float32(1.0), ...]" under NumPy >= 2.
            mfccs_mean = np.mean(mfccs.T, axis=0).tolist()

            # Only append inside the .wav branch: non-audio entries used to
            # hit this line with undefined or stale values.
            data_list.append([str(file), mfccs_mean, gender])

        print(f"\r{i/len(files)*100:.3f}% complete",end='')
    print("\nMFCCs features extracted successfully")
    return data_list

In [None]:
# Build the MFCC table for every clip in MLS/Full_split and persist it as CSV.
current_path = os.getcwd()
file_path = os.path.join(current_path, "MLS", "Full_split")
audio_files = os.listdir(file_path)

# Create the output folder before the long extraction so a missing 'data/'
# directory cannot make to_csv() fail after all the work has been done.
os.makedirs('data', exist_ok=True)

data_list = extract_MFCCs(audio_files)
df = pd.DataFrame(data_list, columns=['FileName', 'MFCCs', 'Gender'])
df.to_csv('data/MSL_split_MFCCs_data.csv', index=False)

In [None]:
def get_features(frequencies):
    """Summarise a 1-D sequence of numbers with descriptive statistics.

    Args:
        frequencies: 1-D array-like of numeric values (at least one element).

    Returns:
        A 12-tuple ``(nobs, mean, skew, kurtosis, median, mode, std, low,
        peak, q25, q75, iqr)``. ``std`` is computed with ``np.std`` (its
        default ``ddof``), ``low``/``peak`` are the minimum and maximum, and
        ``iqr`` is ``q75 - q25``.
    """
    # The variance reported by describe() is not part of the returned tuple.
    nobs, (low, peak), mean, _variance, skew, kurtosis = stats.describe(frequencies)
    median = np.median(frequencies)
    # Older SciPy returns a length-1 array for the mode of 1-D input, SciPy
    # >= 1.11 returns a scalar (where ``.mode[0]`` raises IndexError).
    # atleast_1d() makes both shapes indexable.
    mode = np.atleast_1d(stats.mode(frequencies).mode)[0]
    std = np.std(frequencies)
    q75, q25 = np.percentile(frequencies, [75, 25])
    iqr = q75 - q25
    return nobs, mean, skew, kurtosis, median, mode, std, low, peak, q25, q75, iqr

In [None]:
# Reload the per-file MFCC table from disk and preview its first rows.
df = pd.read_csv(filepath_or_buffer='data/MSL_split_MFCCs_data.csv')
df.head(n=5)

In [None]:
# Turn every stored MFCC vector into one row of summary statistics.
# (`json` is already imported in the notebook's import cell.)
stat_columns = ['nobs', 'mean', 'skew', 'kurtosis', 'median', 'mode', 'std', 'low', 'peak', 'q25', 'q75', 'iqr']
total_rows = len(df)

features_list = []
for i in range(total_rows):
    # The 'MFCCs' column holds the vector as text; decode it back to a list.
    mfcc_values = json.loads(df.loc[i, 'MFCCs'])
    row = [df.loc[i, 'FileName'], *get_features(mfcc_values), df.loc[i, 'Gender']]
    features_list.append(row)
    print(f"\r{i/len(df)*100:.3f}% complete",end='')

df_final = pd.DataFrame(features_list, columns=['FileName', *stat_columns, 'Gender'])
df_final.to_csv('data/MLS_split_MFCCs_statistics_data.csv', index=False)


# Extract frequency features

## Get metadata

In [6]:
# Read the speaker metadata that ships with the MLS/Full folder.
current_path = os.getcwd()
source = os.path.join(current_path, "MLS", "Full")
metadata = pd.read_csv(source + "/metainfo.csv")

# Speaker id -> gender lookup read by the extraction functions.
data = metadata.loc[:, ['SPEAKER', 'GENDER']]
data_dict = dict(zip(data['SPEAKER'], data['GENDER']))

# Exploration helpers: entries whose gender contains 'F', and the rows of one speaker.
filtered_dict = {speaker: gender for speaker, gender in data_dict.items() if 'F' in gender}
one_speaker = data[data['SPEAKER'] == 12249]

# Display the full metadata table as the cell output.
metadata

Unnamed: 0,SPEAKER,GENDER,PARTITION,MINUTES,BOOK ID,TITLE,CHAPTER,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12
0,12249,F,train,35645,10229,Canaã,Capítulo X,,,,,,
1,12249,F,train,65432,10229,Canaã,Capítulo III,,,,,,
2,12249,F,train,40280,10229,Canaã,Capítulo IV,,,,,,
3,12249,F,train,11896,12742,Cartas de Inglaterra,O Natal,,,,,,
4,12249,F,train,21897,12742,Cartas de Inglaterra,A Irlanda e a Liga Agraria,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1256,7925,F,test,3294,6390,Antologia Brasileira,Coletânea em Prosa e Verso de Escritores Nacio...,Volume 1,O Caboclo do Amazonas,,,,
1257,7925,F,test,4524,6390,Antologia Brasileira,Coletânea em Prosa e Verso de Escritores Nacio...,Volume 1,O Campo dos Ciganos,,,,
1258,7925,F,test,3861,6390,Antologia Brasileira,Coletânea em Prosa e Verso de Escritores Nacio...,Volume 1,O Berço,,,,
1259,7925,F,test,2511,6390,Antologia Brasileira,Coletânea em Prosa e Verso de Escritores Nacio...,Volume 1,Escravo,,,,


## Example: magnitude spectrum of a single clip

In [None]:
# Load one clip and plot the magnitude of its Fourier transform.
current_path = os.getcwd()
file_path = os.path.join(current_path,"MLS", "Full_split", "3050_2941_000000.wav_4.wav")
audio_data, sample_rate = librosa.load(file_path)

# Not read anywhere in this cell; kept only in case other cells rely on these
# names - TODO confirm and remove.
step = int(3200)
window_frequencies = []
top_freq = []

# rfft() returns only the non-negative-frequency half of the spectrum, and
# rfftfreq() gives the exact frequency of each of those bins (k * sr / N).
# The previous np.linspace(0, sample_rate, N) axis included the endpoint and
# therefore placed bin k at k * sr / (N - 1).
ft = np.fft.rfft(audio_data)
magnitude = np.abs(ft)
frequency = np.fft.rfftfreq(len(audio_data), d=1 / sample_rate)

plt.figure()
plt.plot(frequency, magnitude)
plt.xlabel("Frequency")
plt.ylabel("Magnitude")
plt.show()



## Extract features:

In [7]:
def get_frequencies(files, window_size=3200):
    """Find the strongest FFT frequency of every fixed-size window of each ``.wav`` file.

    Relies on two notebook-level globals that must exist before the call:
    ``current_path`` (base directory) and ``data_dict`` (speaker id -> 'F'/'M').

    Args:
        files: Sized collection of file names located in
            ``<current_path>/MLS/Full``. The speaker id is parsed from the text
            before the first underscore. Entries that do not end in ``.wav``
            are skipped.
        window_size: Number of samples per analysis window (default 3200, the
            value that used to be hard-coded). The last window of a file may
            be shorter.

    Returns:
        ``(frequencies_list, gender_list, FileName_list)``: three parallel
        lists with one entry per ``.wav`` file. Each ``frequencies_list``
        entry is a list holding, for every window, the frequency in Hz of the
        FFT coefficient with the largest magnitude. Gender is 0 for 'F' and
        1 for 'M'.

    Raises:
        KeyError: if the speaker id is not present in ``data_dict``.
        ValueError: if the file name does not start with an integer speaker
            id, or if the speaker's gender is neither 'F' nor 'M'.
    """
    gender_codes = {'F': 0, 'M': 1}
    frequencies_list = []
    gender_list = []
    FileName_list = []
    print("Extracting frequencies...")
    for k, file in enumerate(files):
        if file.endswith('.wav'):
            file_path = os.path.join(current_path,"MLS", "Full", file)
            speaker_id = int(file.split('_')[0])
            gender = gender_codes.get(data_dict[speaker_id])
            if gender is None:
                # Previously this case silently reused the previous file's
                # label (or raised UnboundLocalError on the first file).
                raise ValueError(
                    f"Unsupported gender {data_dict[speaker_id]!r} for speaker {speaker_id}"
                )
            audio_data, sample_rate = librosa.load(file_path)

            # Each window spans window_size / sample_rate seconds. 3200 samples
            # are 1/5 s only at 16 kHz; librosa.load() resamples to 22050 Hz
            # unless `sr` is passed, which makes the default about 0.145 s.
            window_frequencies = []
            for start in range(0, len(audio_data), window_size):
                ft = np.fft.fft(audio_data[start:start + window_size])  # complex FFT coefficients
                freqs = np.fft.fftfreq(len(ft))  # bin frequencies in cycles per sample
                imax = np.argmax(np.abs(ft))
                # Convert to Hz; abs() folds a negative-frequency bin onto its
                # positive counterpart.
                freq_in_hz = abs(freqs[imax] * sample_rate)
                window_frequencies.append(freq_in_hz)
            FileName_list.append(file)
            gender_list.append(gender)
            frequencies_list.append(window_frequencies)
            print(f"\r{k/len(files)*100:.3f}% complete",end='')
    return frequencies_list, gender_list, FileName_list

In [8]:
def get_features(frequencies):
    """Summarise a 1-D sequence of numbers with descriptive statistics.

    Args:
        frequencies: 1-D array-like of numeric values (at least one element).

    Returns:
        A 12-tuple ``(nobs, mean, skew, kurtosis, median, mode, std, low,
        peak, q25, q75, iqr)``. ``std`` is computed with ``np.std`` (its
        default ``ddof``), ``low``/``peak`` are the minimum and maximum, and
        ``iqr`` is ``q75 - q25``.
    """
    # The variance reported by describe() is not part of the returned tuple.
    nobs, (low, peak), mean, _variance, skew, kurtosis = stats.describe(frequencies)
    median = np.median(frequencies)
    # Older SciPy returns a length-1 array for the mode of 1-D input, SciPy
    # >= 1.11 returns a scalar (where ``.mode[0]`` raises IndexError).
    # atleast_1d() makes both shapes indexable.
    mode = np.atleast_1d(stats.mode(frequencies).mode)[0]
    std = np.std(frequencies)
    q75, q25 = np.percentile(frequencies, [75, 25])
    iqr = q75 - q25
    return nobs, mean, skew, kurtosis, median, mode, std, low, peak, q25, q75, iqr

In [9]:
# Extract the per-window dominant frequencies of every clip in MLS/Full,
# summarise each clip with get_features() and persist the table as CSV.
current_path = os.getcwd()
file_path = os.path.join(current_path,"MLS", "Full")
audio_files = os.listdir(file_path)

# Create the output folder before the long extraction so a missing 'data/'
# directory cannot make to_csv() fail after all the work has been done.
os.makedirs('data', exist_ok=True)

frequencies, gender_list, FileName_list = get_frequencies(audio_files)

# The three lists are parallel (one entry per .wav file), so walk them together.
features_list = []
for file_name, frequency, gender in zip(FileName_list, frequencies, gender_list):
    features_list.append([file_name, *get_features(frequency), gender])

df = pd.DataFrame(features_list, columns = ['FileName', 'nobs', 'mean', 'skew', 'kurtosis', 'median', 'mode', 'std', 'low', 'peak', 'q25', 'q75', 'iqr', 'Gender'])
df.to_csv('data/MLS_Features_data.csv', index=False)

Extracting frequencies...
99.885% complete

# Extract MFCCs

## Get Metadata

In [2]:
# Read the speaker metadata stored alongside the MLS/Full_split clips.
current_path = os.getcwd()
source = os.path.join(current_path, "MLS", "Full_split")
metadata = pd.read_csv(source + "/metainfo.csv")

# Speaker id -> gender lookup read by the extraction coroutine.
data = metadata.loc[:, ['SPEAKER', 'GENDER']]
data_dict = dict(zip(data['SPEAKER'], data['GENDER']))

## Extract MFCCs

In [4]:
async def extract_MFCCs(count, file):
    """Append one CSV row (file name, mean MFCCs, gender) for ``file``.

    Relies on notebook-level globals that must exist before the coroutine
    runs: ``sem`` (asyncio semaphore), ``current_path``, ``data_dict``
    (speaker id -> 'F'/'M') and ``audio_files`` (only used for the progress
    display).

    Args:
        count: 1-based position of the file, shown in the progress output.
        file: Name of a ``.wav`` file inside ``<current_path>/MLS/Full``. The
            speaker id is parsed from the text before the first underscore.

    Returns:
        None. The row is appended to ``MLS_MFCCs_test.csv`` in the working
        directory, preceded by a newline.

    Raises:
        KeyError: if the speaker id is not present in ``data_dict``.
        ValueError: if the file name does not start with an integer speaker
            id, or if the speaker's gender is neither 'F' nor 'M'.
    """
    gender_codes = {'F': 0, 'M': 1}
    async with sem:
        # Resolve the label first so a bad file name fails before decoding.
        speaker_id = int(file.split('_')[0])
        gender = gender_codes.get(data_dict[speaker_id])
        if gender is None:
            # Previously `gender` was left unbound here (UnboundLocalError).
            raise ValueError(
                f"Unsupported gender {data_dict[speaker_id]!r} for speaker {speaker_id}"
            )

        file_path = os.path.join(current_path,"MLS", "Full", file)

        # These calls are plain synchronous functions with no await, so the
        # coroutine does not yield to other tasks until the file write below.
        audio_data, sample_rate = librosa.load(file_path)
        mfccs = librosa.feature.mfcc(y=audio_data, sr=sample_rate)
        mfccs_mean = list(np.mean(mfccs.T, axis= 0))

        # Row layout: file name, one mean per MFCC coefficient, gender.
        sample_features = [str(file), *mfccs_mean, gender]

        string = ','.join(str(item) for item in sample_features)
        print(f"\r{count}/{len(audio_files)}",end='')
        async with aiofiles.open('MLS_MFCCs_test.csv', mode='a') as f:
            await f.write(f'\n{string}')

        return

In [5]:
# Start time of the run, used to report the elapsed time at the end.
start_time = time.time()

# Event loop that drives the extraction tasks.
loop = asyncio.get_event_loop()

# Upper bound on how many coroutines may hold the semaphore at the same time.
sem = asyncio.Semaphore(600)

# Scheduled tasks, one per .wav file.
sents = []

# Patch asyncio so run_until_complete() can be called while a loop is already
# running (as is the case inside Jupyter).
nest_asyncio.apply()

# Result accumulators; nothing in this cell reads them (the coroutine appends
# its rows directly to the CSV file).
gender_list = []
file_list = []
features_list = []


current_path = os.getcwd()
file_path = os.path.join(current_path,"MLS", "Full")
audio_files = os.listdir(file_path)

for k, file in enumerate(audio_files):
    if file.endswith('.wav'):
        sent = asyncio.ensure_future(extract_MFCCs(count=k+1, file=file))
        sents.append(sent)

# asyncio.wait() raises ValueError on an empty collection, so only wait when
# at least one task was scheduled.
if sents:
    done, _ = loop.run_until_complete(asyncio.wait(sents))
else:
    done = set()

# asyncio.wait() does not re-raise exceptions from the tasks; report them
# explicitly so files that failed are not silently missing from the CSV.
failed = [task for task in done if not task.cancelled() and task.exception() is not None]
elapsed = time.time() - start_time
print(f"\n{len(done) - len(failed)}/{len(done)} files processed in {elapsed:.1f}s")
for task in failed:
    print(f"Task failed: {task.exception()!r}")

872/872

# Extract f0

## Get Metadata

In [3]:
# Read the speaker metadata that ships with the MLS/Full folder.
current_path = os.getcwd()
source = os.path.join(current_path, "MLS", "Full")
metadata = pd.read_csv(source + "/metainfo.csv")

# Speaker id -> gender lookup read by the extraction coroutine.
data = metadata.loc[:, ['SPEAKER', 'GENDER']]
data_dict = dict(zip(data['SPEAKER'], data['GENDER']))

## Extract Features

In [4]:
async def extract_F0(count, file):
    """Append one CSV row of pitch (F0) statistics and gender for ``file``.

    Relies on notebook-level globals that must exist before the coroutine
    runs: ``sem`` (asyncio semaphore), ``current_path``, ``data_dict``
    (speaker id -> 'F'/'M') and ``audio_files`` (only used for the progress
    display).

    Args:
        count: 1-based position of the file, shown in the progress output.
        file: Name of a ``.wav`` file inside ``<current_path>/MLS/Full``. The
            speaker id is parsed from the text before the first underscore.

    Returns:
        None. The row ``file, nobs, mean, skew, kurtosis, median, mode, std,
        low, peak, q25, q75, iqr, gender`` is appended to
        ``MLS_split_F0_test.csv`` in the working directory, preceded by a
        newline.

    Raises:
        KeyError: if the speaker id is not present in ``data_dict``.
        ValueError: if the file name does not start with an integer speaker
            id, or if the speaker's gender is neither 'F' nor 'M'.
    """
    gender_codes = {'F': 0, 'M': 1}
    async with sem:
        # Resolve the label first so a bad file name fails before decoding.
        speaker_id = int(file.split('_')[0])
        gender = gender_codes.get(data_dict[speaker_id])
        if gender is None:
            # Previously `gender` was left unbound here (UnboundLocalError).
            raise ValueError(
                f"Unsupported gender {data_dict[speaker_id]!r} for speaker {speaker_id}"
            )

        file_path = os.path.join(current_path,"MLS", "Full", file)

        audio_data = parselmouth.Sound(file_path)
        pitch = audio_data.to_pitch()
        pitch_values = pitch.selected_array['frequency']
        # NOTE(review): Parselmouth's examples treat 0 in this array as
        # "unvoiced frame"; if that holds here, the zeros are included in
        # every statistic below - confirm whether they should be filtered out.

        # The variance reported by describe() is not written to the CSV.
        (nobs_pitch, (low_pitch, peak_pitch), mean_pitch, _variance_pitch,
         skew_pitch, kurtosis_pitch) = stats.describe(pitch_values)
        median_pitch = np.median(pitch_values)
        # Older SciPy returns a length-1 array for the mode of 1-D input,
        # SciPy >= 1.11 returns a scalar (where ``.mode[0]`` raises
        # IndexError). atleast_1d() makes both shapes indexable.
        mode_pitch = np.atleast_1d(stats.mode(pitch_values).mode)[0]
        std_pitch = np.std(pitch_values)
        q75_pitch, q25_pitch = np.percentile(pitch_values, [75 ,25])
        iqr_pitch = q75_pitch - q25_pitch

        sample_features = [
            str(file),
            nobs_pitch, mean_pitch, skew_pitch, kurtosis_pitch, median_pitch,
            mode_pitch, std_pitch, low_pitch, peak_pitch, q25_pitch, q75_pitch,
            iqr_pitch,
            gender,
        ]

        string = ','.join(str(item) for item in sample_features)
        print(f"\r{count}/{len(audio_files)}",end='')
        async with aiofiles.open('MLS_split_F0_test.csv', mode='a') as f:
            await f.write(f'\n{string}')

        return

In [5]:
# Start time of the run, used to report the elapsed time at the end.
start_time = time.time()

# Event loop that drives the extraction tasks.
loop = asyncio.get_event_loop()

# Upper bound on how many coroutines may hold the semaphore at the same time.
sem = asyncio.Semaphore(600)

# Scheduled tasks, one per .wav file.
sents = []

# Patch asyncio so run_until_complete() can be called while a loop is already
# running (as is the case inside Jupyter).
nest_asyncio.apply()

# Result accumulators; nothing in this cell reads them (the coroutine appends
# its rows directly to the CSV file).
gender_list = []
file_list = []
features_list = []


current_path = os.getcwd()
file_path = os.path.join(current_path,"MLS", "Full")
audio_files = os.listdir(file_path)

for k, file in enumerate(audio_files):
    if file.endswith('.wav'):
        sent = asyncio.ensure_future(extract_F0(count=k+1, file=file))
        sents.append(sent)

# asyncio.wait() raises ValueError on an empty collection, so only wait when
# at least one task was scheduled.
if sents:
    done, _ = loop.run_until_complete(asyncio.wait(sents))
else:
    done = set()

# asyncio.wait() does not re-raise exceptions from the tasks; report them
# explicitly so files that failed are not silently missing from the CSV.
failed = [task for task in done if not task.cancelled() and task.exception() is not None]
elapsed = time.time() - start_time
print(f"\n{len(done) - len(failed)}/{len(done)} files processed in {elapsed:.1f}s")
for task in failed:
    print(f"Task failed: {task.exception()!r}")

872/872