# Imports

In [1]:
#!/usr/bin/python
# -*- coding: utf-8 -*-

import os
import librosa
import asyncio
import numpy as np
import pandas as pd
import scipy.stats as stats
import time
import nest_asyncio
from speechpy import feature
import aiofiles
import csv
import math
import parselmouth
import matplotlib.pyplot as plt
import amfm_decompy.pYAAPT as pYAAPT

# Frequency Features

## Extract Features

In [2]:
async def sleep_sent(count):
    """Report progress, then pause for a tenth of a second.

    Prints the seconds elapsed since the module-level ``start_time`` together
    with ``count``, then awaits ``asyncio.sleep`` so other tasks can run.

    Args:
        count: Value shown after "Count Sent:" in the progress line.
    """
    pause_seconds = 0.1
    print(f'Time: {time.time() - start_time:.2f} Count Sent: {count}')
    await asyncio.sleep(pause_seconds)

In [3]:
async def get_frequencies(file, sr=16000, step=3200):
    """Return the dominant frequency of each fixed-size window of one recording.

    The recording is read from ``<project_root>/CETUC/Full/<file>`` (``project_root``
    is a module-level name) and split into consecutive windows of ``step``
    samples; the last window may be shorter. For every window the FFT bin with
    the largest magnitude is converted to Hz.

    NOTE: nothing in this coroutine awaits, so loading and the FFT work run
    synchronously once the coroutine starts.

    Args:
        file: File name inside the ``CETUC/Full`` folder. Its first character
            encodes the speaker gender ('F' or 'M').
        sr: Sample rate passed to ``librosa.load``. Defaults to 16000.
        step: Window length in samples. The default of 3200 corresponds to
            1/5 of a second at 16000 Hz.

    Returns:
        A tuple ``(window_frequencies, gender, file)`` where
        ``window_frequencies`` is a list with one frequency in Hz per window,
        ``gender`` is 0 for 'F' and 1 for 'M', and ``file`` is the input name.

    Raises:
        ValueError: If ``file`` does not start with 'F' or 'M'.
    """
    # Validate the label first so a badly named file fails fast with a clear
    # message instead of an unbound-variable error after the audio is loaded.
    prefix = file[:1]
    if prefix == 'F':
        gender = 0
    elif prefix == 'M':
        gender = 1
    else:
        raise ValueError(
            f"Cannot infer gender from file name {file!r}: expected it to start with 'F' or 'M'"
        )

    file_path = os.path.join(project_root, "CETUC", "Full", file)
    audio_data, sample_rate = librosa.load(file_path, sr=sr)

    window_frequencies = []
    for start in range(0, len(audio_data), step):
        # One complex coefficient per sample of the window.
        spectrum = np.fft.fft(audio_data[start:start + step])
        # Frequency of every coefficient in cycles per sample.
        bin_frequencies = np.fft.fftfreq(len(spectrum))
        strongest_bin = np.argmax(np.abs(spectrum))
        # abs() folds a negative-frequency bin onto its positive mirror.
        freq_in_hz = abs(bin_frequencies[strongest_bin] * sample_rate)
        window_frequencies.append(freq_in_hz)

    return window_frequencies, gender, file

In [4]:
async def get_features(count, file):
    """Summarise the per-window dominant frequencies of one recording.

    Awaits ``get_frequencies(file)`` while holding the module-level semaphore
    ``sem``, computes descriptive statistics over the returned frequencies,
    appends one row to the module-level ``features_list`` and prints a progress
    counter.

    Args:
        count: 1-based position of this file, shown in the progress counter.
        file: File name forwarded to ``get_frequencies``.

    Returns:
        The tuple ``(nobs, mean, skew, kurtosis, median, mode, std, low, peak,
        q25, q75, iqr)``. The row appended to ``features_list`` holds the same
        values preceded by the file name and followed by the gender label.
    """
    async with sem:
        frequencies, gender, file_name = await get_frequencies(file)

        # describe() also yields the sample variance; it is not part of the output.
        nobs, minmax, mean, _variance, skew, kurtosis = stats.describe(frequencies)
        low, peak = minmax
        median = np.median(frequencies)
        # SciPy >= 1.11 returns a scalar mode for 1-D input while older releases
        # return a length-1 array; atleast_1d makes the indexing valid for both.
        mode = np.atleast_1d(stats.mode(frequencies).mode)[0]
        std = np.std(frequencies)
        q75, q25 = np.percentile(frequencies, [75, 25])
        iqr = q75 - q25

        summary = (nobs, mean, skew, kurtosis, median, mode, std, low, peak, q25, q75, iqr)
        features_list.append([file_name, *summary, gender])

        # "\r" rewrites the same output line instead of printing one line per file.
        print(f"\r{count}/{len(audio_files)}", end='')

        return summary

In [5]:
# Reference timestamp for measuring how long the dispatch takes.
start_time = time.time()

# Event loop that drives the extraction tasks.
loop = asyncio.get_event_loop()

# Upper bound on how many tasks may hold the semaphore at the same time.
sem = asyncio.Semaphore(6000)

# Scheduled tasks, one per audio file.
sents = []

# Patch asyncio so run_until_complete() may be called re-entrantly.
nest_asyncio.apply()

# features_list receives one row per processed file; gender_list and file_list
# are not filled by the code in this section and are kept only so the names exist.
gender_list = []
file_list = []
features_list = []


project_root = os.path.dirname(os.path.dirname(os.getcwd()))

file_path = os.path.join(project_root, "CETUC", "Full")
# os.listdir() returns entries in arbitrary order; sorting keeps the processing
# order (and therefore the row order) reproducible between runs.
audio_files = sorted(os.listdir(file_path))
for k, file in enumerate(audio_files):
    sent = asyncio.ensure_future(get_features(count=k+1, file=file))
    sents.append(sent)

# asyncio.wait() rejects an empty collection, so only wait when there is work.
done = set()
if sents:
    done, _ = loop.run_until_complete(asyncio.wait(sents))

# asyncio.wait() leaves exceptions stored inside the tasks. Report them so that
# files which failed are not silently missing from features_list.
failed = [task for task in done if not task.cancelled() and task.exception() is not None]
if failed:
    print(f"\n{len(failed)}/{len(sents)} files failed; first error: {failed[0].exception()!r}")

26075/100998

KeyboardInterrupt: 

26481/100998

## Save to dataframe and to .csv

In [None]:
# One row per processed file; the column order matches the rows that
# get_features() appends to features_list.
dataframe_features = pd.DataFrame(features_list, columns = ['FileName', 'nobs', 'mean', 'skew', 'kurtosis', 'median', 'mode', 'std', 'low', 'peak', 'q25', 'q75', 'iqr', 'Gender'])
# to_csv() does not create missing folders; create the target directory first so
# the results of a long extraction run are not lost to a save error.
os.makedirs('data/CETUC', exist_ok=True)
dataframe_features.to_csv('data/CETUC/Features_data.csv', index=False)

# MFCCs 

## Extract MFCCs

In [None]:
async def extract_MFCCs(count, file):
    """Compute the mean MFCC vector of one recording and store it as a row.

    While holding the module-level semaphore ``sem``, loads
    ``<project_root>/CETUC/Full/<file>`` with ``librosa.load``, computes its
    MFCCs, averages every coefficient and appends
    ``[file name, mean coefficients..., gender]`` to the module-level
    ``features_list``. A progress counter is printed for each file.

    Args:
        count: 1-based position of this file, shown in the progress counter.
        file: File name inside the ``CETUC/Full`` folder. Its first character
            encodes the speaker gender ('F' -> 0, 'M' -> 1).

    Returns:
        None. The result is delivered through ``features_list``.

    Raises:
        ValueError: If ``file`` does not start with 'F' or 'M'.
    """
    async with sem:
        # Validate the label before the expensive audio work so a badly named
        # file fails fast with a clear message instead of an unbound variable.
        prefix = file[:1]
        if prefix == 'F':
            gender = 0
        elif prefix == 'M':
            gender = 1
        else:
            raise ValueError(
                f"Cannot infer gender from file name {file!r}: expected it to start with 'F' or 'M'"
            )

        file_path = os.path.join(project_root, "CETUC", "Full", file)

        audio_data, sample_rate = librosa.load(file_path)

        mfccs = librosa.feature.mfcc(y=audio_data, sr=sample_rate)

        # Averaging the transposed matrix over axis 0 yields one value per
        # row of ``mfccs`` (i.e. per coefficient).
        mfccs_mean = list(np.mean(mfccs.T, axis=0))

        sample_features = [str(file), *mfccs_mean, gender]

        # "\r" rewrites the same output line instead of printing one line per file.
        print(f"\r{count}/{len(audio_files)}", end='')
        features_list.append(sample_features)

        return

In [None]:
# Reference timestamp for measuring how long the dispatch takes.
start_time = time.time()

# Event loop that drives the extraction tasks.
loop = asyncio.get_event_loop()

# Upper bound on how many tasks may hold the semaphore at the same time.
sem = asyncio.Semaphore(600)

# Scheduled tasks, one per audio file.
sents = []

# Patch asyncio so run_until_complete() may be called re-entrantly.
nest_asyncio.apply()

# features_list receives one row per processed file; gender_list and file_list
# are not filled by the code in this section and are kept only so the names exist.
gender_list = []
file_list = []
features_list = []


project_root = os.path.dirname(os.path.dirname(os.getcwd()))
file_path = os.path.join(project_root, "CETUC", "Full")
# os.listdir() returns entries in arbitrary order; sorting keeps the processing
# order (and therefore the row order) reproducible between runs.
audio_files = sorted(os.listdir(file_path))

for k, file in enumerate(audio_files):
    sent = asyncio.ensure_future(extract_MFCCs(count=k+1, file=file))
    sents.append(sent)

# asyncio.wait() rejects an empty collection, so only wait when there is work.
done = set()
if sents:
    done, _ = loop.run_until_complete(asyncio.wait(sents))

# asyncio.wait() leaves exceptions stored inside the tasks. Report them so that
# files which failed are not silently missing from features_list.
failed = [task for task in done if not task.cancelled() and task.exception() is not None]
if failed:
    print(f"\n{len(failed)}/{len(sents)} files failed; first error: {failed[0].exception()!r}")

In [None]:
# Tabulate the collected rows: file name first, then the 20 mean MFCC columns,
# then the gender label, matching the layout of the rows in features_list.
dataframe_features = pd.DataFrame(
    data=features_list,
    columns=[
        'FileName',
        'MFCC_1', 'MFCC_2', 'MFCC_3', 'MFCC_4', 'MFCC_5',
        'MFCC_6', 'MFCC_7', 'MFCC_8', 'MFCC_9', 'MFCC_10',
        'MFCC_11', 'MFCC_12', 'MFCC_13', 'MFCC_14', 'MFCC_15',
        'MFCC_16', 'MFCC_17', 'MFCC_18', 'MFCC_19', 'MFCC_20',
        'Gender',
    ],
)
# Bare expression so the notebook renders the frame as the cell output.
dataframe_features

In [None]:
# to_csv() does not create missing folders; create the target directory first so
# the results of a long extraction run are not lost to a save error.
os.makedirs('data/CETUC', exist_ok=True)
dataframe_features.to_csv('data/CETUC/MFCCs_data.csv', index=False)

# F0

## Extract F0

In [4]:
async def extract_F0(count, file):
    """Compute pitch statistics of one recording and store them as a row.

    While holding the module-level semaphore ``sem``, opens
    ``<project_root>/CETUC/Full/<file>`` with ``parselmouth.Sound``, extracts
    the pitch track, computes descriptive statistics over its frequency values
    and appends ``[file name, statistics..., gender]`` to the module-level
    ``features_list``. A progress counter is printed for each file.

    Args:
        count: 1-based position of this file, shown in the progress counter.
        file: File name inside the ``CETUC/Full`` folder. Its first character
            encodes the speaker gender ('F' -> 0, 'M' -> 1).

    Returns:
        None. The result is delivered through ``features_list``.

    Raises:
        ValueError: If ``file`` does not start with 'F' or 'M'.
    """
    async with sem:
        # Validate the label before the expensive audio work so a badly named
        # file fails fast with a clear message instead of an unbound variable.
        prefix = file[:1]
        if prefix == 'F':
            gender = 0
        elif prefix == 'M':
            gender = 1
        else:
            raise ValueError(
                f"Cannot infer gender from file name {file!r}: expected it to start with 'F' or 'M'"
            )

        file_path = os.path.join(project_root, "CETUC", "Full", file)
        audio_data = parselmouth.Sound(file_path)
        pitch = audio_data.to_pitch()
        pitch_values = pitch.selected_array['frequency']
        # NOTE(review): Praat pitch tracks presumably report 0 Hz for unvoiced
        # frames, in which case the statistics below include them - confirm
        # whether that is intended before relying on low/mode/mean.

        # describe() also yields the sample variance; it is not part of the output.
        nobs_pitch, minmax_pitch, mean_pitch, _variance_pitch, skew_pitch, kurtosis_pitch = stats.describe(pitch_values)
        low_pitch, peak_pitch = minmax_pitch
        median_pitch = np.median(pitch_values)
        # SciPy >= 1.11 returns a scalar mode for 1-D input while older releases
        # return a length-1 array; atleast_1d makes the indexing valid for both.
        mode_pitch = np.atleast_1d(stats.mode(pitch_values).mode)[0]
        std_pitch = np.std(pitch_values)
        q75_pitch, q25_pitch = np.percentile(pitch_values, [75, 25])
        iqr_pitch = q75_pitch - q25_pitch

        sample_features = [
            str(file),
            nobs_pitch, mean_pitch, skew_pitch, kurtosis_pitch,
            median_pitch, mode_pitch, std_pitch,
            low_pitch, peak_pitch, q25_pitch, q75_pitch, iqr_pitch,
            gender,
        ]

        # "\r" rewrites the same output line instead of printing one line per file.
        print(f"\r{count}/{len(audio_files)}", end='')
        features_list.append(sample_features)

        return

In [None]:
# Reference timestamp for measuring how long the dispatch takes.
start_time = time.time()

# Event loop that drives the extraction tasks.
loop = asyncio.get_event_loop()

# Upper bound on how many tasks may hold the semaphore at the same time.
sem = asyncio.Semaphore(600)

# Scheduled tasks, one per audio file.
sents = []

# Patch asyncio so run_until_complete() may be called re-entrantly.
nest_asyncio.apply()

# features_list receives one row per processed file; gender_list and file_list
# are not filled by the code in this section and are kept only so the names exist.
gender_list = []
file_list = []
features_list = []


project_root = os.path.dirname(os.path.dirname(os.getcwd()))
file_path = os.path.join(project_root, "CETUC", "Full")
# os.listdir() returns entries in arbitrary order; sorting keeps the processing
# order (and therefore the row order) reproducible between runs.
audio_files = sorted(os.listdir(file_path))

for k, file in enumerate(audio_files):
    sent = asyncio.ensure_future(extract_F0(count=k+1, file=file))
    sents.append(sent)

# asyncio.wait() rejects an empty collection, so only wait when there is work.
done = set()
if sents:
    done, _ = loop.run_until_complete(asyncio.wait(sents))

# asyncio.wait() leaves exceptions stored inside the tasks. Report them so that
# files which failed are not silently missing from features_list.
failed = [task for task in done if not task.cancelled() and task.exception() is not None]
if failed:
    print(f"\n{len(failed)}/{len(sents)} files failed; first error: {failed[0].exception()!r}")

In [None]:
# Tabulate the collected rows: file name first, then the twelve pitch
# statistics, then the gender label, matching the rows in features_list.
dataframe_features = pd.DataFrame(
    data=features_list,
    columns=[
        'FileName',
        'nobs_pitch', 'mean_pitch', 'skew_pitch',
        'kurtosis_pitch', 'median_pitch', 'mode_pitch',
        'std_pitch', 'low_pitch', 'peak_pitch',
        'q25_pitch', 'q75_pitch', 'iqr_pitch',
        'Gender',
    ],
)
# Bare expression so the notebook renders the frame as the cell output.
dataframe_features

In [None]:
# to_csv() does not create missing folders; create the target directory first so
# the results of a long extraction run are not lost to a save error.
os.makedirs('data/CETUC', exist_ok=True)
dataframe_features.to_csv('data/CETUC/F0_data.csv', index=False)