In [23]:
import os
import pandas as pd
import numpy as np
import librosa, librosa.display
import matplotlib.pyplot as plt
import matplotlib.style as style
import seaborn as sns
import regex
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from ast import literal_eval
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from funcs import *

In [24]:
tqdm.pandas() # Registering instance of tqdm with pandas so I can check progress on .apply()

  from pandas import Panel


In [25]:
style.use('fivethirtyeight')

In [26]:
songs = pd.read_csv('./data/ten_genres_500_samp.csv')

Needed to further clean up my data - my filenames and locations didn't match.

In [27]:
for folder in os.listdir('./scrapes/'):
    songs.loc[songs['genre'] == folder, 'filename'] = os.listdir(f'./scrapes/{folder}')

In [28]:
for folder in os.listdir('./scrapes/'):
    new_location = [f'./scrapes/{folder}/{file}' for file in os.listdir(f'./scrapes/{folder}')]
    songs.loc[songs['genre'] == folder, 'location'] = new_location     

## Calculating Energy, RMSE, and MFCC

**Energy** is the total magnitude of the signal, **RMSE** stands for Root Mean Squared Energy. These are different ways of measuring loudness of a signal and require us to set windows of time.[1]

Calculating the **Mel-Frequency Cepstral Coefficient (MFCC)** involves many steps, but helps to create a non-linear representation of a sound's spectral envelope.[2] The steps are[3]:

1. Take the Fourier transform of (a windowed excerpt of) a signal.
2. Map the powers of the spectrum obtained above onto the mel scale, using triangular overlapping windows.
3. Take the logs of the powers at each of the mel frequencies.
4. Take the discrete cosine transform of the list of mel log powers, as if it were a signal.
5. The MFCCs are the amplitudes of the resulting spectrum.


In [29]:
# Creating empty columns to fill
songs['energy_calc'] = np.zeros_like(songs['location'])
songs['rms_calc'] = np.zeros_like(songs['location'])
songs['mfcc_calc'] = np.zeros_like(songs['location'])


In [30]:
for location in tqdm(songs.iloc[:900]['location']):
    new_features(songs, location, n_mfcc=12)

100%|██████████| 900/900 [12:22<00:00,  1.21it/s]


In [31]:
for location in tqdm(songs.iloc[901:]['location']):
    new_features(songs, location, n_mfcc=12)

100%|██████████| 4099/4099 [59:29<00:00,  1.15it/s]  


In [32]:
songs = songs.drop(index=900)

In [33]:
calc_features(songs)

100%|██████████| 4999/4999 [00:13<00:00, 375.84it/s]
100%|██████████| 4999/4999 [00:00<00:00, 22512.93it/s]
100%|██████████| 4999/4999 [00:00<00:00, 66638.25it/s]
100%|██████████| 4999/4999 [00:00<00:00, 113586.79it/s]


In [34]:
nfilt = 12
modeling_df = split_features(songs, nfilt)

When I removed row 900 (it wasn't being recognized by `librosa`), it looks like pandas still filled in that index with calculations. I need to shift all of those down a row now

In [35]:
columns_to_shift = modeling_df.columns[36:]

In [36]:
modeling_df.iloc[900:][columns_to_shift] = modeling_df.iloc[900:][columns_to_shift].shift(axis=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [37]:
modeling_df.drop(900, inplace=True)

In [38]:
modeling_df.iloc[900]['mfcc_mean_00'] # Sanity check to make sure the mfcc_mean_00 is correct

-319.7559814453125

In [39]:
modeling_df['energy_calc'] = modeling_df['energy_calc'].apply(lambda x: x.tolist())
modeling_df['rms_calc'] = modeling_df['rms_calc'].apply(lambda x: x.tolist())
modeling_df['mfcc_calc'] = modeling_df['mfcc_calc'].apply(lambda x: x.tolist())
modeling_df['mfcc_mean'] = modeling_df['mfcc_mean'].apply(lambda x: x.tolist())
modeling_df['mfcc_min'] = modeling_df['mfcc_min'].apply(lambda x: x.tolist())
modeling_df['mfcc_max'] = modeling_df['mfcc_max'].apply(lambda x: x.tolist())

In [40]:
modeling_df['mfcc_max_00'].tail()

4995   -19.002405
4996    26.285011
4997    67.084000
4998    54.184166
4999    13.625891
Name: mfcc_max_00, dtype: float64

In [41]:
songs['mfcc_calc'][3].shape

(2584, 12)

The shape of this feature can be calculated as:  

$\frac{Sample Rate * ClipLengthInSeconds}{FrameLength}$

Or in this case:

$\frac{44100 * 30}{512} \approx 2584$

The answer is rounded up to the nearest integer

In [42]:
modeling_df.to_csv('./data/ten_g_500_modeling_12filt.csv',
         sep='|',
         header=True,
         index=False,
         chunksize=100000,
         compression='gzip',
         encoding='utf-8')

In [21]:
hop_length = 256

## **Sources**

1. https://musicinformationretrieval.com/energy.html
2. https://musicinformationretrieval.com/mfcc.html
3.  Sahidullah, Md.; Saha, Goutam (May 2012). "Design, analysis and experimental evaluation of block based transformation in MFCC computation for speaker recognition". Speech Communication. 54 (4): 543–565. doi:10.1016/j.specom.2011.11.004.