# Imports

In [1]:
#!/usr/bin/python
# -*- coding: utf-8 -*-

import os
import librosa
import asyncio
import numpy as np
import pandas as pd
import scipy.stats as stats
import time
import nest_asyncio
from speechpy import feature

# Frequency Features

## Extract Features

In [None]:
async def sleep_sent(count):
    """Report progress, then yield to the event loop for a short pause.

    Prints the seconds elapsed since the module-level ``start_time`` together
    with ``count``, then sleeps asynchronously for 0.1 s.

    Args:
        count: Value shown after "Count Sent:" in the progress line.
    """
    pause_seconds = 0.1
    print(f'Time: {time.time() - start_time:.2f} Count Sent: {count}')
    await asyncio.sleep(pause_seconds)

In [None]:
async def get_frequencies(file):
    """Return the dominant frequency of every 3200-sample window of a recording.

    The recording is read from ``<current_path>/CETUC/Full/<file>`` at 16 kHz
    and cut into consecutive, non-overlapping windows of 3200 samples (the
    last window may be shorter). For each window the FFT bin with the largest
    magnitude is located and its frequency is converted to Hz.

    Args:
        file: File name inside the ``CETUC/Full`` directory. Its first
            character encodes the label: ``'F'`` -> 0, ``'M'`` -> 1.

    Returns:
        A tuple ``(window_frequencies, gender, file)`` where
        ``window_frequencies`` is a list with one frequency in Hz per window,
        ``gender`` is 0 or 1, and ``file`` is the name that was passed in.

    Raises:
        ValueError: If ``file`` does not start with ``'F'`` or ``'M'``.
    """
    # Validate the label before doing any I/O. Previously an unexpected
    # prefix left `gender` unbound and only failed at the final `return`.
    prefix = file[:1]
    if prefix == 'F':
        gender = 0
    elif prefix == 'M':
        gender = 1
    else:
        raise ValueError(
            f"Cannot infer gender from file name {file!r}: "
            "expected it to start with 'F' or 'M'"
        )

    file_path = os.path.join(current_path, "CETUC", "Full", file)

    # librosa.load is an ordinary blocking function that returns a tuple, so
    # it cannot be awaited directly. Running it in the default executor keeps
    # this coroutine awaitable and lets other tasks progress while it loads.
    event_loop = asyncio.get_event_loop()
    audio_data, sample_rate = await event_loop.run_in_executor(
        None, lambda: librosa.load(file_path, sr=16000)
    )

    step = 3200  # 3200 samples = 1/5 s at the 16 kHz load rate
    window_frequencies = []

    for start in range(0, len(audio_data), step):
        spectrum = np.fft.fft(audio_data[start:start + step])
        # fftfreq yields frequencies in cycles per sample for each FFT bin.
        bin_frequencies = np.fft.fftfreq(len(spectrum))
        strongest_bin = np.argmax(np.abs(spectrum))
        # Scale to Hz; abs() folds the negative-frequency half onto the positive.
        window_frequencies.append(abs(bin_frequencies[strongest_bin] * sample_rate))

    return window_frequencies, gender, file

In [None]:
async def get_features(count, file):
    """Compute summary statistics of one file's per-window dominant frequencies.

    Holds the module-level semaphore ``sem`` while it works, obtains the
    frequencies from ``get_frequencies``, appends one row to the module-level
    ``features_list`` and prints a ``count/total`` progress indicator.

    Args:
        count: 1-based position of this file, used only for the progress line.
        file: File name forwarded to ``get_frequencies``.

    Returns:
        The tuple ``(nobs, mean, skew, kurtosis, median, mode, std, low, peak,
        q25, q75, iqr)`` computed over the file's window frequencies.
    """
    async with sem:
        frequencies, gender, file_name = await get_frequencies(file)

        # The variance reported by describe() is not part of the feature set.
        nobs, minmax, mean, _variance, skew, kurtosis = stats.describe(frequencies)
        low, peak = minmax
        median = np.median(frequencies)
        # Depending on the SciPy release, stats.mode returns either a scalar
        # or a length-1 array for 1-D input; indexing a scalar with [0]
        # raises, so normalise to 1-D before taking the first element.
        mode = np.atleast_1d(stats.mode(frequencies).mode)[0]
        std = np.std(frequencies)
        q75, q25 = np.percentile(frequencies, [75, 25])
        iqr = q75 - q25

        features_list.append([file_name, nobs, mean, skew, kurtosis, median, mode, std, low, peak, q25, q75, iqr, gender])

        # "\r" rewrites the same console line so progress stays on one line.
        print(f"\r{count}/{len(audio_files)}", end='')

        return nobs, mean, skew, kurtosis, median, mode, std, low, peak, q25, q75, iqr

In [None]:
# Reference time for elapsed-time reporting
start_time = time.time()

# Event loop that drives the extraction tasks
loop = asyncio.get_event_loop()

# Upper bound on how many files are processed at the same time
sem = asyncio.Semaphore(6000)

# Scheduled tasks, one per audio file
sents = []

# Allow run_until_complete to be called while a loop is already running
# (as is the case inside a notebook)
nest_asyncio.apply()

# Result accumulators; get_features appends its rows to features_list
gender_list = []
file_list = []
features_list = []


current_path = os.getcwd()
file_path = os.path.join(current_path, "CETUC", "Full")
audio_files = os.listdir(file_path)

for k, file in enumerate(audio_files):
    sent = asyncio.ensure_future(get_features(count=k+1, file=file))
    sents.append(sent)

# asyncio.wait raises ValueError when given no tasks, so handle an empty
# directory explicitly.
if sents:
    done, _ = loop.run_until_complete(asyncio.wait(sents))
else:
    done, _ = set(), set()

# asyncio.wait never re-raises exceptions from the tasks it waits on, so
# without this check every failed file would be dropped without any notice.
failed_tasks = [
    task for task in done
    if not task.cancelled() and task.exception() is not None
]
if failed_tasks:
    print(
        f"\n{len(failed_tasks)} of {len(done)} files failed; "
        f"first error: {failed_tasks[0].exception()!r}"
    )

## Save to a DataFrame and export to .csv

In [None]:
# One row per audio file: file name, frequency statistics and gender label.
dataframe_features = pd.DataFrame(features_list, columns = ['FileName', 'nobs', 'mean', 'skew', 'kurtosis', 'median', 'mode', 'std', 'low', 'peak', 'q25', 'q75', 'iqr', 'Gender'])
# to_csv does not create missing directories, so make sure 'data' exists.
os.makedirs('data', exist_ok=True)
dataframe_features.to_csv('data/CETUC_Features_data.csv', index=False)

# Log Mel Filterbank Energies (LMFE)

In [10]:
async def LMFE(count, file):
    """Compute the ``speechpy.feature.lmfe`` output for one recording.

    The recording is read from ``<current_path>/CETUC/Full/<file>`` at 16 kHz
    while holding the module-level semaphore ``sem``. The result is appended
    to the module-level ``features_list`` as ``[file, logenergy, gender]`` and
    a ``count/total`` progress indicator is printed.

    Args:
        count: 1-based position of this file, used only for the progress line.
        file: File name inside the ``CETUC/Full`` directory. Its first
            character encodes the label: ``'F'`` -> 0, ``'M'`` -> 1.

    Returns:
        The value returned by ``feature.lmfe`` (presumably one row of 40
        log Mel filterbank energies per frame; confirm against speechpy).

    Raises:
        ValueError: If ``file`` does not start with ``'F'`` or ``'M'``.
    """
    # Validate the label before taking the semaphore or touching the disk.
    # Previously an unexpected prefix left `gender` unbound and only failed
    # after the audio had been loaded and processed.
    prefix = file[:1]
    if prefix == 'F':
        gender = 0
    elif prefix == 'M':
        gender = 1
    else:
        raise ValueError(
            f"Cannot infer gender from file name {file!r}: "
            "expected it to start with 'F' or 'M'"
        )

    async with sem:
        file_path = os.path.join(current_path, "CETUC", "Full", file)

        # librosa.load blocks; calling it directly meant this coroutine never
        # yielded, so the tasks ran strictly one after another. The default
        # executor lets the loads overlap.
        event_loop = asyncio.get_event_loop()
        audio_data, sample_rate = await event_loop.run_in_executor(
            None, lambda: librosa.load(file_path, sr=16000)
        )

        logenergy = feature.lmfe(audio_data, sampling_frequency=sample_rate,
                                 frame_length=0.020, frame_stride=0.01,
                                 num_filters=40, fft_length=512,
                                 low_frequency=0, high_frequency=None)

        # Length of the first axis of the result, printed for inspection.
        print(len(logenergy))

        features_list.append([file, logenergy, gender])

        # "\r" rewrites the same console line so progress stays on one line.
        print(f"\r{count}/{len(audio_files)}", end='')

        return logenergy

In [11]:
# Reference time for elapsed-time reporting
start_time = time.time()

# Event loop that drives the extraction tasks
loop = asyncio.get_event_loop()

# Upper bound on how many files are processed at the same time
sem = asyncio.Semaphore(3000)

# Scheduled tasks, one per audio file
sents = []

# Allow run_until_complete to be called while a loop is already running
# (as is the case inside a notebook)
nest_asyncio.apply()

# Result accumulators; LMFE appends its rows to features_list
gender_list = []
file_list = []
features_list = []


current_path = os.getcwd()
file_path = os.path.join(current_path, "CETUC", "Full")
audio_files = os.listdir(file_path)

# NOTE(review): only the first file is processed here ([:1]); this looks like
# a debugging limit - confirm before relying on the exported CSV.
for k, file in enumerate(audio_files[:1]):
    sent = asyncio.ensure_future(LMFE(count=k+1, file=file))
    sents.append(sent)

# asyncio.wait raises ValueError when given no tasks, so handle an empty
# directory explicitly.
if sents:
    done, _ = loop.run_until_complete(asyncio.wait(sents))
else:
    done, _ = set(), set()

# asyncio.wait never re-raises exceptions from the tasks it waits on, so
# without this check every failed file would be dropped without any notice.
failed_tasks = [
    task for task in done
    if not task.cancelled() and task.exception() is not None
]
if failed_tasks:
    print(
        f"\n{len(failed_tasks)} of {len(done)} files failed; "
        f"first error: {failed_tasks[0].exception()!r}"
    )

471
1/100998

In [None]:
import json

# One row per audio file: file name, LMFE array and gender label.
dataframe_features = pd.DataFrame(features_list, columns = ['FileName', 'LMFE', 'Gender'])

# to_csv does not create missing directories, so make sure 'data' exists.
os.makedirs('data', exist_ok=True)

# Array cells are written to CSV through str(), and NumPy abbreviates large
# arrays with "..." in that form, so most values would be lost. Serialise
# each array as a JSON nested list instead; json.loads restores it in full.
# The in-memory dataframe keeps the original arrays.
lmfe_as_json = dataframe_features['LMFE'].map(
    lambda matrix: json.dumps(np.asarray(matrix).tolist())
)
dataframe_features.assign(LMFE=lmfe_as_json).to_csv('data/CETUC_LMFE_data.csv', index=False)

In [None]:
# List the CETUC/Full directory under the working directory and show how
# many entries lie at index 50000 or later.
current_path = os.getcwd()
file_path = os.path.join(current_path, *("CETUC", "Full"))
audio_files = os.listdir(file_path)
max(len(audio_files) - 50000, 0)