# Step 1) Data Preparation - Frame Segmentation, Feature Extraction, and Dataset Creation - 

In [177]:
import pandas as pd
import csv
import re   #regular expression library

#import matplotlib
import essentia
import essentia.standard as estd

import sys

import matplotlib
from os.path import isfile, join 
from os import listdir
from math import floor
import numpy as np

#importing parselmouth library for jitter and shimmer calculations based on Praat
import parselmouth
from parselmouth.praat import call

from IPython.display import display

path_to_annot = "preprocessing/csv/"
path_to_music = "preprocessing/audio/"
path_to_intervals = "preprocessing/scripts/"

frame_dataset = 'frame_analysis.csv'

sampling_rate = 44100.0

files = ['8842c1f0-e261-4069-bd59-768bb9a3315c.wav',
         'b11237b9-d45b-4b3a-a97b-ab7d198f927f.wav',
         'f7bcb9af-6abb-4192-ae3d-37fa811034ce.wav',
         '7448d9c2-5261-4e70-bd98-6ed8416f908f.wav']

apply_file = 'apply/a025eef0-d130-496f-9581-f5ce40bd1783.wav'
apply_frames = 'apply_analysis.csv'

Here, the audio files are loaded, and they sections are created based on the annotations of vocal/instrumental. 
Following the approach of the 'paper', 2-3 second audio clips are extracted. This is divided into 2 parts: identifying vocal/non vocal, then chopping up vocal/non vocal into segments.

## Identifying vocal sections 

The identify_vocal_sections function returns 2 lists: first is a list of intervals for vocal sections and last is a list of intervals for instrumental sections. Function assumes that path is a csv file that already has timing information about vocal vs non vocal sections. In our case, this file was generated out of annotations created manually using Praat software. Function also receives the path to the directory where all the relevant music files are. The name of an audio file is expected to have the format mbid.wav and its corresponding annotations should have the name mbid.csv

In [178]:
#return a dictionary of 2 lists: one for vocal intervals and one for instrumental intervals. The dictionary key is the recording mbid
#unit of the intervals is seconds
def identify_vocal_sections(path_to_annot, path_to_music): 
    annot_files = [fi for fi in listdir(path_to_annot) if isfile(join(path_to_annot, fi))]
    music_files = [fi for fi in listdir(path_to_music) if isfile(join(path_to_music, fi))]
    
    files2segments = {}
    for af in annot_files:
        segments = { 'vocal':[], 'instrumental':[] }
        af_dataframe = pd.read_csv(join(path_to_annot, af), header=0, encoding = "ISO-8859-1") 
        if not af_dataframe.dropna().empty:  #for now to get around the failure of pandas to read one of my csvs..
            re_string_v = "(v|V)ocal*"
            re_string_i = "(i|I)nstrumental*"
            vocal = [x[1].tolist() for x in af_dataframe.iterrows() if re.match(re_string_v, str(x[1][0]))]
            instrumental = [x[1].tolist() for x in af_dataframe.iterrows() if re.match(re_string_i, str(x[1][0]))]

            vocal_df = pd.DataFrame(vocal)
            segments['vocal'] = list(zip(vocal_df.iloc[:, 1], vocal_df.iloc[:, 2]))
        
            instrumental_df = pd.DataFrame(instrumental)
            segments['instrumental'] = list(zip(instrumental_df.iloc[:, 1], instrumental_df.iloc[:, 2]))
            files2segments[af[:-4]] = segments
        
    return files2segments

In [179]:
files2segments = identify_vocal_sections(path_to_annot, path_to_music)
#print(files2segments['f7bcb9af-6abb-4192-ae3d-37fa811034ce']['vocal'])

The following function receives a list of tuples, each representing the start and end time of an interval. It chops the intervals into sections of length section_length at most (sections could be shorter if they were at the end of an interval)

In [180]:
#section length in seconds
def chop_sections(intervals, section_length):
    sections = []
    for interval in intervals:
        if interval[1] - interval[0] < section_length:
            sections.append(interval)
        else:
            section_start_samples = np.arange(interval[0], interval[1], section_length)
            section_intervals = zip(section_start_samples[:-1], section_start_samples[1:])
            sections.extend(section_intervals)
    
    return sections

Demonstrating the use of chop sections on a single file. Here, the sections are chopped to segments of 1 seconds

In [181]:
instrumental_sections = chop_sections(files2segments['7448d9c2-5261-4e70-bd98-6ed8416f908f']['instrumental'], 1)
vocal_sections = chop_sections(files2segments['7448d9c2-5261-4e70-bd98-6ed8416f908f']['vocal'], 1)
print(len(vocal_sections))
print(len(instrumental_sections))
#print("sections: {}".format(instrumental_sections))

2973
1119


## Generation and Analysis of Audio Fragments
This portion generates and analyzes the relevant audio fragments. The audio sections are chopped into frames based on frame_duration

In [182]:
#Audio file needs to be 44100 
frame_duration = 40/1000 #40 ms

#utility functions to move from frame number to seconds and vice versa
def s_to_frame(sampling_rate, time_s): 
    return floor(time_s * sampling_rate)

def frame_to_s(sampling_rate, frame):
    return float(frame)/float(sampling_rate)

frame_samples = s_to_frame(44100, frame_duration)

#overlap is if we want to have overlaps between consecutive frames
#estd_audio and prsltng_audio are the 
def get_audio_frames(section_intervals, sampling_rate, frame_size, overlap_ms, estd_audio): 
    analysis_output = []  # Here we'll store the analysis results for each chunk (frame) of the audio file
    
    if frame_size is None:
        frame_size = 44100          #If no frame_size is given, use frames corresponding to 1 second
    if frame_size % 2 != 0:
        frame_size = frame_size + 1 # Make frame size even
    
    audio_frames_essentia = []
    
    frame_overlap = s_to_frame(sampling_rate, float(overlap_ms/1000.0))
    all_frame_intervals = []
   
    for section_interval in section_intervals:
        start_s = section_interval[0]
        end_s = section_interval[1]
        start = int(s_to_frame(sampling_rate, start_s))
        end = int(s_to_frame(sampling_rate, end_s))
        
        #to account for overlapping:
        if end - start <= frame_overlap: #disregard sections shorter than the overlap
            continue
            
        #adjusting for the overlap
        adj_end = int(end - frame_overlap)
        
        frame_intervals_start = np.arange(start, adj_end, (frame_size-frame_overlap))
        
        frame_intervals = list(zip(frame_intervals_start[:-1], (frame_intervals_start[1:] + int(frame_overlap))))
        all_frame_intervals.extend(frame_intervals)
        
        for (f_start, f_end) in frame_intervals:
            audio_frames_essentia.append(audio[int(f_start):int(f_end)])
   
    return audio_frames_essentia, all_frame_intervals

In [183]:
# Demonstrating the use of get_audio_frames on the instrumental sections of a single file
loader = estd.MonoLoader(filename=path_to_music +'7448d9c2-5261-4e70-bd98-6ed8416f908f.wav')
audio = loader()

segmented_audio_essentia, frame_intervals = get_audio_frames(instrumental_sections, sampling_rate, frame_samples, 10, audio)

In [184]:
print(len(segmented_audio_essentia))
print(frame_intervals[:5])

35808
[(11433333, 11435097), (11434656, 11436420), (11435979, 11437743), (11437302, 11439066), (11438625, 11440389)]


The following cells: 1) Cut the instrumental and vocal intervals corresponding to our files into frame intervals. 2) Write the timing(s) information of these frame intervals into files, to be passed on for processing by external scripts as explained below. 3) Return a list of the essentia audio frames corresponding to each interval, for us to compute our features from the audio array. 

The result is a dictionary as follows:
    intervals = { mbid : { 'vocal': { 'audio' : list, 'frame_intervals': list }, 'instrumental': { 'audio' : list, 'frame_intervals': list } } } (Note: replace with proper code snippet)

In [185]:
def frame_intervals2files(filename, frame_intervals):
    with open(path_to_intervals + filename, 'w') as fp:
        fp.write('\n'.join('{:.5f}, {:.5f}'.format(frame_to_s(sampling_rate, x[0]), 
                                          frame_to_s(sampling_rate, x[1])) for x in frame_intervals))

In [186]:
#For each file, generate all the necessary segments from it, and dump the intervals in seconds to a file for 
#other script to process it
intervals = {}

for f in files:
    loader = estd.MonoLoader(filename=path_to_music + f)
    audio = loader()
    instrumental_sections = files2segments[f[:-4]]['instrumental']
    vocal_sections = files2segments[f[:-4]]['vocal']
    
    intervals[f[:-4]] = { 'vocal': {'audio': [], 'frame_intervals': [] }, 
                        'instrumental':{'audio': [], 'frame_intervals': [] } }
        
    segmented_audio_instrumental, frame_intervals_instrumental = get_audio_frames(instrumental_sections, sampling_rate, frame_samples, 10, audio)
    segmented_audio_vocal, frame_intervals_vocal = get_audio_frames(vocal_sections, sampling_rate, frame_samples, 10, audio)
    
    intervals[f[:-4]]['vocal']['audio'] = segmented_audio_vocal
    intervals[f[:-4]]['vocal']['frame_intervals'] = frame_intervals_vocal
    intervals[f[:-4]]['instrumental']['audio'] = segmented_audio_instrumental
    intervals[f[:-4]]['instrumental']['frame_intervals'] = frame_intervals_instrumental
    
    frame_intervals2files('instrumental_'+f[:-4]+'.csv', frame_intervals_instrumental)
    frame_intervals2files('vocal_'+f[:-4]+'.csv', frame_intervals_vocal)

## Feature Extraction 1) Jitter and Shimmer Calculations from parseltongue_analysis.py

After writing the frame intervals to files in the cells above, the parseltongue_analysis.py script must be run on the generated files. To reproduce the results it might be necessary to check the locations of files and where the script is. The script was not included in the notebook because it causes it to crash. However, running it from the command line finishes in around 4 minutes and generates the necessary contents. Parameters of shimmer and jitter can be found on the first few lines of parseltongue_analysis.py, and more info about these parameters and the calculations can be found in the Praat Manual

Update: Currently, Jitter and Shimmer values are not included in the dataset. A discussion on the perspectives regarding these features can be found in the accompanying report.

## Feature Extraction 2) Essentia Features and Delta MFCCs

First, the essentia features will be run on the extracted audio fragments. Then, the files generated from parseltongue_analysis.py will be read and merged with the essentia features. The resulting datasets will be saved in the file indicated by frame_dataset variable of the first cell. Essentia features calculated on single frames are calculated by the essentia_analysis function.

To get the derivatives of MFCCs, the equation
$$d_{t}= \frac{\sum_{n=1}^N n(c_{t + n} - c_{t - n})}{2\sum_{n=1}^N n^2}$$
from [1] was used. Since this implementation of derivatives takes into account future frames as well as past frames, all frame mfccs will be calculated after the single frame level essentia feature extraction of the essentia_analysis function. 


In [187]:
def delta(window): #window is a np array of 5 x 13. The function implements N as 2.
    res = np.zeros(13)
    N = 2
    t = 2 #the central index of an array len(5)
    
    for coef in range(0, len(res)): #for all the coefficients
        num = 0
        denom = 0
        for n in range(1, N):
            num = num + float(n*(window[t+n][coef] - window[t-n][coef]))
            denom = denom + float(n*n)
            
        res[coef] = float(num/(2*denom))
    return res

In [188]:
def essentia_analysis(audio, mbid, start_s, end_s, label): #receives the relevant audio subset for analysis
    frame_output = {
        'mbid': mbid ,
        'start': start_s,
        'end': end_s,
        'vocal': int(label == 'vocal')
    }
    
    # Extract MFCC coefficients (why is hann used with the MFCC coefficients? Maybe I should try and find a paper
    # on that)
    w_algo = estd.Windowing(type = 'hann')
    spectrum_algo = estd.Spectrum()
    mfcc_algo = estd.MFCC()
    spec = spectrum_algo(w_algo(audio))
    _, mfcc_coeffs = mfcc_algo(spec)
    frame_output.update({'mfcc_{0}'.format(j): mfcc_coeffs[j] for j in range(0, len(mfcc_coeffs))})
        
    #Extract Pitch Statistics
    eq_loudness_algo = estd.EqualLoudness()
    processed_audio = eq_loudness_algo(audio_frame)
    
    pitch_algo = estd.PitchMelodia(guessUnvoiced = True)
    pitch_result = pitch_algo(processed_audio)
    
    mean_algo = estd.Mean()
    pitch_mean = mean_algo(pitch_result[0])
    std_dev = np.std(pitch_result[0])
    minimum = np.min(pitch_result[0])
    maximum = np.max(pitch_result[0])
    mean_pitchconf = np.mean(pitch_result[1])
    
    frame_output.update({'pitch_confidence_mean': mean_pitchconf, 
                        'pitch_stddev': std_dev,
                        'pitch_maximum': maximum,
                        'pitch_mean': pitch_mean, 
                        'pitch_minimum': minimum })
    
    return frame_output

DISCLAIMER: Execution time

The second cell take lots of time to execute!

In [189]:
#testing function on one audio frame
mbid = '8842c1f0-e261-4069-bd59-768bb9a3315c'
v = 'vocal'
i = 5
audio_frame = intervals[mbid][v]['audio'][i]
start_s = intervals[mbid][v]['frame_intervals'][i][0]
end_s = intervals[mbid][v]['frame_intervals'][i][1]

test_frame = essentia_analysis(audio_frame, mbid, start_s, end_s, v)

In [190]:
frame_list = []
for mbid in intervals.keys(): #go over all mbid names
    for v in intervals[mbid].keys(): #vocal/instrumental
        for i, audio_frames in enumerate(intervals[mbid][v]['audio']):
            start_s = intervals[mbid][v]['frame_intervals'][i][0]
            end_s = intervals[mbid][v]['frame_intervals'][i][1]
            frame_output = essentia_analysis(audio_frames, mbid, start_s, end_s, v)
            frame_list.append(frame_output)


In [191]:
frame_info_df = pd.DataFrame(frame_list)
#frame_info_df.to_csv('temp_without_deltas.csv')
print(len(frame_list))
display(frame_info_df[:5]) #displaying first 5 rows of the dataset

396204


Unnamed: 0,end,mbid,mfcc_0,mfcc_1,mfcc_10,mfcc_11,mfcc_12,mfcc_2,mfcc_3,mfcc_4,...,mfcc_7,mfcc_8,mfcc_9,pitch_confidence_mean,pitch_maximum,pitch_mean,pitch_minimum,pitch_stddev,start,vocal
0,9013032,8842c1f0-e261-4069-bd59-768bb9a3315c,-898.696045,173.708099,-17.574087,6.142464,1.878799,-1.725494,-2.42152,-14.171719,...,-14.982674,-1.562737,-19.733223,0.0,0.0,0.0,0.0,0.0,9011268,1
1,9014355,8842c1f0-e261-4069-bd59-768bb9a3315c,-917.660217,183.330566,-11.43421,2.022614,3.519569,13.719147,-15.431164,-8.970795,...,-10.624817,-6.943726,-13.495575,0.0,0.0,0.0,0.0,0.0,9012591,1
2,9015678,8842c1f0-e261-4069-bd59-768bb9a3315c,-922.92157,186.002686,4.789326,7.186165,2.732925,20.652985,-10.885296,-1.362686,...,-3.433777,-7.902298,-4.743416,0.0,0.0,0.0,0.0,0.0,9013914,1
3,9017001,8842c1f0-e261-4069-bd59-768bb9a3315c,-898.149292,209.407623,0.085117,6.842201,2.207619,18.181244,-26.062233,-12.646114,...,-8.68251,-4.784363,-4.648674,0.0,0.0,0.0,0.0,0.0,9015237,1
4,9018324,8842c1f0-e261-4069-bd59-768bb9a3315c,-880.911011,218.87616,-5.20822,3.049095,11.509537,13.176208,-30.289192,-19.948208,...,-13.471031,-11.653229,-6.105812,0.0,0.0,0.0,0.0,0.0,9016560,1


## Calculating the Deltas
Although Pool Aggregator exists in essentia and this has the potential of calculating several statistics relating to value pools, there is no direct way to get the derivatives of MFCCs in a flexible manner (for example, varying the number of frames taken in the derivatve window). So in the Step 1) notebook, a function to calculate the derivatives using the above formula was implemented.
This section adds the mfcc deltas to the features dataframe using the deltas function defined above
First, the 1st order derivative is calculated, then the second order is calculated.

In [192]:
def make_window(indexes, i, frame_info_df): #indexes is an array of indexes in frame_info_df that fullfil a certain condition
                                            #i is the index of the indexes array that we want to create a window for
                                           
    #generate the mfccs window array: 5 x 13 array
    mfcc_window_array = []
    for j in range(i-2, i+3): #from i-2 to i+2 inclusive
        new_row = []
        if j<0 or j>= len(indexes):
            new_row = np.zeros((1, 13))
        else:
            #create a 1x13 array with the mfcc coefficients of the relevant index
            new_row = [frame_info_df.iloc[indexes[j]][mfcc_labels].values]
                 
        if len(mfcc_window_array) == 0:
            mfcc_window_array = new_row
        else:
            mfcc_window_array = np.append(mfcc_window_array, new_row, axis=0)
    return mfcc_window_array

def set_mfcc_deltas_in_df(frame_info_df, mfcc_delta_labels, mfcc_labels): #can be used for first order or second order derivative. 
                                                           #function returns a dataframe with columns of mfcc_delta labels set
    #add mfcc derivative columns to the frame analysis df
    frame_info_df[mfcc_delta_labels] = pd.DataFrame([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], index=frame_info_df.index)

    #1. return them based on sorted order of start indexes per mbid
    frame_info_df.sort_values(['start'], inplace=True)
    mbids = frame_info_df['mbid'].unique()

    for mbid in mbids:
        indexes = frame_info_df.loc[frame_info_df['mbid'] == mbid].index
      
        for i in range(0, len(indexes)):
            mfcc_window_array = make_window(indexes, i, frame_info_df)  
            delta_mfccs = delta(mfcc_window_array)
            #assign each mfcc index:
            for mfcc_id in range(0, len(mfcc_delta_labels)):
                frame_info_df.at[indexes[i], mfcc_delta_labels[mfcc_id]] = delta_mfccs[mfcc_id] 
    return frame_info_df

DISCLAIMER: The following cell takes lots of time to execute. It could be made more efficient by reusing windows from consecutive indexes in the above function, but for now this was not implemented and rather a new window is created for each frame index. 

In [195]:
mfcc_delta_labels = ['d_mfcc_0', 'd_mfcc_1', 'd_mfcc_2', 'd_mfcc_3', 'd_mfcc_4', 'd_mfcc_5', 
              'd_mfcc_6', 'd_mfcc_7', 'd_mfcc_8', 'd_mfcc_9', 'd_mfcc_10', 
              'd_mfcc_11', 'd_mfcc_12']
mfcc_labels = ['mfcc_0', 'mfcc_1', 'mfcc_2', 'mfcc_3', 'mfcc_4', 'mfcc_5', 
              'mfcc_6', 'mfcc_7', 'mfcc_8', 'mfcc_9', 'mfcc_10', 
              'mfcc_11', 'mfcc_12']

frame_info_df = set_mfcc_deltas_in_df(frame_info_df, mfcc_delta_labels, mfcc_labels)
frame_info_df.sort_values(['start','mbid'], inplace=True)

display(frame_info_df[:10])

Unnamed: 0,end,mbid,mfcc_0,mfcc_1,mfcc_10,mfcc_11,mfcc_12,mfcc_2,mfcc_3,mfcc_4,...,d_mfcc_3,d_mfcc_4,d_mfcc_5,d_mfcc_6,d_mfcc_7,d_mfcc_8,d_mfcc_9,d_mfcc_10,d_mfcc_11,d_mfcc_12
81623,6395006,b11237b9-d45b-4b3a-a97b-ab7d198f927f,-822.269653,114.73732,-3.547222,-1.562574,-13.89045,13.18354,8.423687,2.79834,...,-3,-2,-15,-12,-14,-24,-4,9,12,-1
81624,6396329,b11237b9-d45b-4b3a-a97b-ab7d198f927f,-764.307983,82.565025,-3.509365,-5.950539,-4.364056,14.276085,3.125572,1.845177,...,-3,-1,-1,2,5,-3,5,-6,0,-1
81625,6397652,b11237b9-d45b-4b3a-a97b-ab7d198f927f,-792.079285,79.074844,4.59971,-5.323975,-10.983316,17.313128,15.569201,-0.522655,...,17,-18,29,14,20,19,7,-6,-19,0
81626,6398975,b11237b9-d45b-4b3a-a97b-ab7d198f927f,-743.051331,67.009544,-4.661604,-2.143501,-4.111473,-26.007431,15.372347,-8.244789,...,0,-6,3,-6,1,7,0,11,-2,-2
81627,6400298,b11237b9-d45b-4b3a-a97b-ab7d198f927f,-771.705933,80.820953,-8.854465,-7.0599,-11.404778,-2.714058,11.14065,5.018948,...,-14,15,-24,-13,-11,-14,-9,5,15,0
81628,6401621,b11237b9-d45b-4b3a-a97b-ab7d198f927f,-715.195618,80.476028,-6.829981,0.012865,-10.746695,-21.50732,7.624414,-12.220551,...,-3,15,1,4,0,-8,0,-8,6,1
81629,6402944,b11237b9-d45b-4b3a-a97b-ab7d198f927f,-738.049622,90.538826,-6.111431,-8.988768,-12.044825,-13.902971,17.05493,0.186092,...,19,-4,24,12,7,11,1,-14,-21,1
81630,6404267,b11237b9-d45b-4b3a-a97b-ab7d198f927f,-718.471863,82.442734,1.818819,-0.206799,-9.100965,-15.902157,6.045748,-13.355854,...,10,-14,-5,-2,-3,2,1,7,-1,-1
81631,6405590,b11237b9-d45b-4b3a-a97b-ab7d198f927f,-765.311279,98.591446,-0.973898,-0.769405,1.354244,5.341904,9.28381,-2.500385,...,-11,1,-32,-10,-12,-6,0,4,17,0
81632,6406913,b11237b9-d45b-4b3a-a97b-ab7d198f927f,-720.955078,78.618263,2.422918,-2.051275,-3.298769,-11.082256,16.611044,-10.571404,...,-17,6,5,-4,-2,-2,-1,-1,3,4


## Saving Dataset

In [197]:
#writing to csv without the indexes column
frame_info_df.to_csv(frame_dataset, index=False)

## Generating File to apply model to
Other than the train/test dataset, it would be interesting to see the results of applying the model to be trained on an un-annotated file. As such, this portion of the code does the frame extraction corresponding to a full audio file so that it can be used in the Step 2) notebook of this series.

In [164]:
loader = estd.MonoLoader(filename=path_to_music + apply_file)
audio = loader()

intervals = [(0, frame_to_s(sampling_rate, len(audio)))] #put all file into 1 interval
sections = chop_sections(intervals, 1)
    
segmented_audio_essentia, frame_intervals = get_audio_frames(sections, sampling_rate, frame_samples, 10, audio)

frame_list = []

v = 0 #essentia analysis function will set all frames as having an instrumental label, but this tag will be 
      #removed anyway
mbid = 'a025eef0-d130-496f-9581-f5ce40bd1783'  
for i, audio_frames in enumerate(segmented_audio_essentia):
    start_s = frame_intervals[i][0]
    end_s = frame_intervals[i][1]
    frame_output = essentia_analysis(audio_frames, mbid, start_s, end_s, v)
    frame_list.append(frame_output)
    

In [168]:
#write the frame list to a pandas dataframe
#write to pandas dataframe
print(len(frame_list))
apply_frame_info_df = pd.DataFrame(frame_list)

#drop the 'vocal' column
apply_frame_info_df.drop(['vocal'], inplace=True, axis=1)

#add the MFCC deltas info
mfcc_delta_labels = ['d_mfcc_0', 'd_mfcc_1', 'd_mfcc_2', 'd_mfcc_3', 'd_mfcc_4', 'd_mfcc_5', 
              'd_mfcc_6', 'd_mfcc_7', 'd_mfcc_8', 'd_mfcc_9', 'd_mfcc_10', 
              'd_mfcc_11', 'd_mfcc_12']

mfcc_labels = ['mfcc_0', 'mfcc_1', 'mfcc_2', 'mfcc_3', 'mfcc_4', 'mfcc_5', 
              'mfcc_6', 'mfcc_7', 'mfcc_8', 'mfcc_9', 'mfcc_10', 
              'mfcc_11', 'mfcc_12']

apply_frame_info_df = set_mfcc_deltas_in_df(apply_frame_info_df, mfcc_delta_labels, mfcc_labels)

display(apply_frame_info_df[:5]) #displaying first 5 rows of the dataset

20672


Unnamed: 0,end,mbid,mfcc_0,mfcc_1,mfcc_10,mfcc_11,mfcc_12,mfcc_2,mfcc_3,mfcc_4,...,d_mfcc_3,d_mfcc_4,d_mfcc_5,d_mfcc_6,d_mfcc_7,d_mfcc_8,d_mfcc_9,d_mfcc_10,d_mfcc_11,d_mfcc_12
0,1764,a025eef0-d130-496f-9581-f5ce40bd1783,-1138.420044,1.1e-05,-2.7e-05,-3.1e-05,-3.8e-05,-6.1e-05,8e-05,-6.1e-05,...,0,0,0,0,0,0,0,0,0,0
1,3087,a025eef0-d130-496f-9581-f5ce40bd1783,-1138.420044,1.1e-05,-2.7e-05,-3.1e-05,-3.8e-05,-6.1e-05,8e-05,-6.1e-05,...,0,0,0,0,0,0,0,0,0,0
2,4410,a025eef0-d130-496f-9581-f5ce40bd1783,-1138.420044,1.1e-05,-2.7e-05,-3.1e-05,-3.8e-05,-6.1e-05,8e-05,-6.1e-05,...,0,0,0,0,0,0,0,0,0,0
3,5733,a025eef0-d130-496f-9581-f5ce40bd1783,-1138.420044,1.1e-05,-2.7e-05,-3.1e-05,-3.8e-05,-6.1e-05,8e-05,-6.1e-05,...,0,0,0,0,0,0,0,0,0,0
4,7056,a025eef0-d130-496f-9581-f5ce40bd1783,-1138.420044,1.1e-05,-2.7e-05,-3.1e-05,-3.8e-05,-6.1e-05,8e-05,-6.1e-05,...,0,0,0,0,0,0,0,0,0,0


In [196]:
#write to the apply file dataset
apply_frame_info_df.to_csv(apply_frames, index=False)

## Data Preprocessing Limitations

## Limitations
Some things that were not addressed in the paper, and are slightly vague: (Note; make sure it is 25 ms in my impl)
1) No mention of window size used for extraction of features other than the MFCCs.
2) No parameters given to calculate the jitter and shimmer, which are the 2 new features proposed by the paper
3) How can I check for sure the sampling rate of an audio file
4) Since the results will be quite bad anyway, it might be a useful excercise to try and do visualizations on the features that were used, and this could be the fruit of the project & the notebook. To try and understand why a feature was good or bad.
5) the 0 pitch minimum reflects that in some cases, the pitch detection in the audio frame returns 0.
Say something about this info msg TriangularBands: input spectrum size (4411) does not correspond to the "inputSize" parameter (1025). Recomputing the filter bank.

Why are the pitch values completely unreliable? how do i check that the settings used for frame calculation are actually ok?

## References
[1] http://practicalcryptography.com/miscellaneous/machine-learning/guide-mel-frequency-cepstral-coefficients-mfccs/