In [1]:
import librosa
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import kurtosis
from scipy.stats import skew
%matplotlib inline
import os
import csv
# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
#Keras
import keras
from keras import models
from keras import layers

In [None]:
!pip install librosa --upgrade

# Audio feature extraction

In [2]:
# Genre folder name -> integer class label (labels follow the listing order)
genres = {name: label for label, name in enumerate(('classical', 'rock', 'electronic', 'pop'))}

In [8]:
def read_process_songs(src_dir, debug = True):
    """
    Walk the genre folders under a dataset root and extract features from every MP3.

            Parameters:
                    src_dir (string): root folder holding one sub-folder per key of
                            the module-level ``genres`` dict
                    debug (bool): when True, print the folders and files being processed

            Returns:
                    arr_features (list of dict): one dict per audio file, as built by
                            ``get_features`` plus a 'genre' entry holding the genre label

            Raises:
                    ValueError: if an MP3 file name (without extension) is not an integer
    """
    # List of dicts with the processed features from all files
    arr_features = []

    for genre_index, (genre_name, genre_label) in enumerate(genres.items()):
        # Each successive genre shifts its track ids by a further 100 (0, 100, 200, ...);
        # presumably file names repeat across genre folders - verify against the dataset.
        id_offset = 100 * genre_index

        # os.path.join works whether or not src_dir ends with a path separator
        folder = os.path.join(src_dir, genre_name)
        if debug:
            print(folder)
            print(genre_name)

        for root, _subdirs, files in os.walk(folder):
            for file in files:
                stem, extension = os.path.splitext(file)
                # Skip anything that is not an MP3 (hidden/system files, metadata, ...)
                # instead of crashing on the integer conversion below
                if extension.lower() != ".mp3":
                    continue

                # Join with the directory os.walk is currently visiting, so files
                # inside nested sub-folders resolve to their real path
                file_name = os.path.join(root, file)
                if debug:
                    print(file_name)
                    print("Reading file: {}".format(file_name))

                # Read (at most) the first 30 seconds of the audio file
                signal, sr = librosa.load(file_name,duration=30)

                # Pre-emphasis before extracting features
                signal_filt = librosa.effects.preemphasis(signal)

                # The file name without its extension is the per-genre track number
                track_id = int(stem)

                # Append the result to the data structure
                features = get_features(signal_filt, sr, track_id + id_offset)
                features['genre'] = genre_label
                arr_features.append(features)
    return arr_features

In [9]:
def get_features(y, sr, id):
    '''
    Summarise one audio signal as a flat dictionary of scalar features.

            Parameters:
                    y: the audio samples (presumably the 1-D array returned by
                            librosa.load - confirm against the caller)
                    sr: the sampling rate of y, forwarded to librosa
                    id: the audio track id, stored unchanged under 'track_id'

            Returns:
                    features (dict): the mean of each librosa feature, the number
                            of trimmed samples ('sample_silence'), the mean of each
                            of the 20 MFCCs ('mfcc_0' .. 'mfcc_19'), 'tempo' and
                            'track_id'
    '''
    # Features to concatenate in the final dictionary
    features = {'chroma_sftf': None, 'rolloff': None, 'zero_crossing_rate': None, 'rmse': None,
                'flux': None, 'contrast': None, 'flatness': None}
    print(id)

    # Count silence: how many samples librosa.effects.trim removes.
    # An empty signal has nothing to trim, so it counts as zero silence instead
    # of leaving y_sound undefined.
    y_sound = y
    if 0 < len(y):
        y_sound, _ = librosa.effects.trim(y)
    features['sample_silence'] = len(y) - len(y_sound)

    # Using librosa to calculate the features.
    # The signal is always passed by keyword: librosa 0.10+ rejects it as a
    # positional argument for most of these functions.
    features['chroma_sftf'] = np.mean(librosa.feature.chroma_stft(y=y, sr=sr))
    features['rolloff'] = np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr))
    features['zero_crossing_rate'] = np.mean(librosa.feature.zero_crossing_rate(y=y))
    features['rmse'] = np.mean(librosa.feature.rms(y=y))
    features['flux'] = np.mean(librosa.onset.onset_strength(y=y, sr=sr))
    features['contrast'] = np.mean(librosa.feature.spectral_contrast(y=y, sr=sr))
    features['flatness'] = np.mean(librosa.feature.spectral_flatness(y=y))

    # MFCC treatment: one averaged value per coefficient
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
    for idx, v_mfcc in enumerate(mfcc):
        features['mfcc_{}'.format(idx)] = np.mean(v_mfcc)

    # The tempo estimate comes back as an array; keep its first element
    features['tempo'] = librosa.beat.tempo(y=y, sr=sr)[0]
    features['track_id'] = id

    return features

In [10]:
# Root folder of the music-genre dataset (presumably on a mounted Google Drive;
# the mount step is not part of this notebook)
data_path = "/content/drive/MyDrive/Module 4/Data/music-genre-dataset/"

In [None]:
# Extract the audio features of every track, then tabulate them (one row per file)
features = read_process_songs(src_dir=data_path, debug=False)
df_features = pd.DataFrame(data=features)

In [13]:
# Preview the first five rows of the extracted audio features
df_features.head(n=5)

Unnamed: 0,chroma_sftf,rolloff,zero_crossing_rate,rmse,flux,contrast,flatness,sample_silence,mfcc_0,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,mfcc_10,mfcc_11,mfcc_12,mfcc_13,mfcc_14,mfcc_15,mfcc_16,mfcc_17,mfcc_18,mfcc_19,tempo,track_id,genre
0,0.24044,5609.832745,0.197936,0.024033,1.225963,25.116579,0.01058,512,-310.556274,32.449673,-34.577312,7.906827,-12.078389,-12.289697,-24.998838,-9.762599,-16.842607,-0.584137,-3.597935,6.116321,2.501004,7.970569,-5.256515,-3.204948,-11.860776,-6.819301,-6.551328,5.704544,129.199219,99,0
1,0.362437,3052.823207,0.11847,0.012409,1.051247,24.711777,0.001478,0,-346.451477,140.577972,-84.989655,24.135998,-30.728107,-3.305699,-23.885736,-11.872183,-16.818594,-11.321119,-7.550418,-6.350888,-10.039324,-9.511783,-7.478334,-0.448974,-3.658296,-1.458033,-7.320634,-0.976182,129.199219,16,0
2,0.310384,2812.891324,0.103645,0.01872,1.043701,24.332351,0.001663,0,-325.874237,139.118607,-67.381599,11.422537,-14.829703,-0.252155,-7.146189,-5.018671,-13.117849,-7.918598,-10.458308,-6.276796,-5.203215,-5.634519,-5.501945,-2.512312,-6.415515,-3.57924,-4.306728,-2.410398,123.046875,10,0
3,0.168468,5243.018296,0.115553,0.021488,1.004327,27.502646,0.00545,0,-404.308594,25.564194,-41.426327,-9.400389,-35.06028,-5.177183,-9.786539,5.636172,-8.562539,-5.510411,-13.493268,-1.942426,-2.245079,6.757333,-6.536245,-7.276831,-11.553081,-10.180579,-14.974916,-8.946871,129.199219,1,0
4,0.319633,7310.130797,0.295759,0.016223,0.95913,24.662484,0.05178,0,-285.206696,-0.045624,-22.728594,12.827009,-15.649368,-3.868008,-17.855396,-5.60013,-20.700975,-7.606978,-7.271137,-1.96283,-10.223964,-8.84428,-12.049548,-4.828347,-9.483101,-4.844469,-11.426237,-3.66793,103.359375,11,0


# Merge audio features with emotion features

In [15]:
# Persist the extracted audio features (the row index is not written)
df_features.to_csv(path_or_buf='../data/train_new_feat.csv', index=False)

In [16]:
# Load the emotion features that accompany the audio files and preview them
emo_feat = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/Module 4/Data/music-genre-dataset/data.csv')
emo_feat.head(n=5)

Unnamed: 0,track id,genre,amazement,solemnity,tenderness,nostalgia,calmness,power,joyful_activation,tension,sadness,mood,liked,disliked,age,gender,mother tongue
0,1,classical,0,1,0,0,0,0,1,1,0,3,1,0,21,1,English
1,1,classical,0,0,0,1,0,0,0,0,0,3,0,1,41,1,Dutch
2,1,classical,0,0,0,1,0,0,0,0,1,3,0,0,24,1,English
3,1,classical,0,0,0,0,1,0,0,0,0,3,0,0,32,0,Spanish
4,1,classical,0,0,0,1,1,0,0,0,0,4,0,1,21,0,English


In [26]:
# List the column names exactly as parsed: every name after 'track id' carries a
# leading space (e.g. ' genre'), which must be included when selecting columns.
print(emo_feat.columns)

Index(['track id', ' genre', ' amazement', ' solemnity', ' tenderness',
       ' nostalgia', ' calmness', ' power', ' joyful_activation', ' tension',
       ' sadness', ' mood', ' liked', ' disliked', ' age', ' gender',
       ' mother tongue'],
      dtype='object')


In [27]:
# The genre label is already part of the audio features, so the copy in the
# emotion table is dropped (note the leading space in the parsed column name).
# Rebinding the result with errors='ignore', rather than dropping in place,
# keeps the cell safe to re-run: a second execution is a no-op, not a KeyError.
emo_feat = emo_feat.drop(columns=[' genre'], errors='ignore')
emo_feat.shape

(8407, 16)

In [28]:
# Preview the emotion features after the column removal
emo_feat.head(n=5)

Unnamed: 0,track id,amazement,solemnity,tenderness,nostalgia,calmness,power,joyful_activation,tension,sadness,mood,liked,disliked,age,gender,mother tongue
0,1,0,1,0,0,0,0,1,1,0,3,1,0,21,1,English
1,1,0,0,0,1,0,0,0,0,0,3,0,1,41,1,Dutch
2,1,0,0,0,1,0,0,0,0,1,3,0,0,24,1,English
3,1,0,0,0,0,1,0,0,0,0,3,0,0,32,0,Spanish
4,1,0,0,0,1,1,0,0,0,0,4,0,1,21,0,English


In [29]:
# Join the audio features with the emotion features of the same track
# (the key is 'track_id' on the audio side and 'track id' on the emotion side)
df_merge = df_features.merge(emo_feat, left_on='track_id', right_on='track id')
df_merge.head(n=5)

Unnamed: 0,chroma_sftf,rolloff,zero_crossing_rate,rmse,flux,contrast,flatness,sample_silence,mfcc_0,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,mfcc_10,mfcc_11,mfcc_12,mfcc_13,mfcc_14,mfcc_15,mfcc_16,mfcc_17,mfcc_18,mfcc_19,tempo,track_id,genre,track id,amazement,solemnity,tenderness,nostalgia,calmness,power,joyful_activation,tension,sadness,mood,liked,disliked,age,gender,mother tongue
0,0.24044,5609.832745,0.197936,0.024033,1.225963,25.116579,0.01058,512,-310.556274,32.449673,-34.577312,7.906827,-12.078389,-12.289697,-24.998838,-9.762599,-16.842607,-0.584137,-3.597935,6.116321,2.501004,7.970569,-5.256515,-3.204948,-11.860776,-6.819301,-6.551328,5.704544,129.199219,99,0,99,0,0,0,1,0,0,0,0,0,3,0,0,52,1,English
1,0.24044,5609.832745,0.197936,0.024033,1.225963,25.116579,0.01058,512,-310.556274,32.449673,-34.577312,7.906827,-12.078389,-12.289697,-24.998838,-9.762599,-16.842607,-0.584137,-3.597935,6.116321,2.501004,7.970569,-5.256515,-3.204948,-11.860776,-6.819301,-6.551328,5.704544,129.199219,99,0,99,0,0,0,0,1,0,0,0,0,2,1,0,28,1,Estonian
2,0.24044,5609.832745,0.197936,0.024033,1.225963,25.116579,0.01058,512,-310.556274,32.449673,-34.577312,7.906827,-12.078389,-12.289697,-24.998838,-9.762599,-16.842607,-0.584137,-3.597935,6.116321,2.501004,7.970569,-5.256515,-3.204948,-11.860776,-6.819301,-6.551328,5.704544,129.199219,99,0,99,0,0,1,0,1,0,0,0,1,4,0,0,21,1,English
3,0.24044,5609.832745,0.197936,0.024033,1.225963,25.116579,0.01058,512,-310.556274,32.449673,-34.577312,7.906827,-12.078389,-12.289697,-24.998838,-9.762599,-16.842607,-0.584137,-3.597935,6.116321,2.501004,7.970569,-5.256515,-3.204948,-11.860776,-6.819301,-6.551328,5.704544,129.199219,99,0,99,0,1,0,0,1,0,1,0,0,3,1,0,21,1,English
4,0.24044,5609.832745,0.197936,0.024033,1.225963,25.116579,0.01058,512,-310.556274,32.449673,-34.577312,7.906827,-12.078389,-12.289697,-24.998838,-9.762599,-16.842607,-0.584137,-3.597935,6.116321,2.501004,7.970569,-5.256515,-3.204948,-11.860776,-6.819301,-6.551328,5.704544,129.199219,99,0,99,0,0,1,0,1,0,1,0,0,5,1,0,18,0,Swedish


In [30]:
# Order the merged rows by ascending track id
audio_df = df_merge.sort_values(by='track id')
audio_df.head(n=5)

Unnamed: 0,chroma_sftf,rolloff,zero_crossing_rate,rmse,flux,contrast,flatness,sample_silence,mfcc_0,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,mfcc_10,mfcc_11,mfcc_12,mfcc_13,mfcc_14,mfcc_15,mfcc_16,mfcc_17,mfcc_18,mfcc_19,tempo,track_id,genre,track id,amazement,solemnity,tenderness,nostalgia,calmness,power,joyful_activation,tension,sadness,mood,liked,disliked,age,gender,mother tongue
136,0.168468,5243.018296,0.115553,0.021488,1.004327,27.502646,0.00545,0,-404.308594,25.564194,-41.426327,-9.400389,-35.06028,-5.177183,-9.786539,5.636172,-8.562539,-5.510411,-13.493268,-1.942426,-2.245079,6.757333,-6.536245,-7.276831,-11.553081,-10.180579,-14.974916,-8.946871,129.199219,1,0,1,0,0,0,0,0,0,0,0,1,3,0,0,30,0,English
133,0.168468,5243.018296,0.115553,0.021488,1.004327,27.502646,0.00545,0,-404.308594,25.564194,-41.426327,-9.400389,-35.06028,-5.177183,-9.786539,5.636172,-8.562539,-5.510411,-13.493268,-1.942426,-2.245079,6.757333,-6.536245,-7.276831,-11.553081,-10.180579,-14.974916,-8.946871,129.199219,1,0,1,0,0,1,1,0,0,0,0,1,1,1,0,33,0,Russian
134,0.168468,5243.018296,0.115553,0.021488,1.004327,27.502646,0.00545,0,-404.308594,25.564194,-41.426327,-9.400389,-35.06028,-5.177183,-9.786539,5.636172,-8.562539,-5.510411,-13.493268,-1.942426,-2.245079,6.757333,-6.536245,-7.276831,-11.553081,-10.180579,-14.974916,-8.946871,129.199219,1,0,1,0,1,0,0,1,0,0,0,1,4,0,1,51,0,English
135,0.168468,5243.018296,0.115553,0.021488,1.004327,27.502646,0.00545,0,-404.308594,25.564194,-41.426327,-9.400389,-35.06028,-5.177183,-9.786539,5.636172,-8.562539,-5.510411,-13.493268,-1.942426,-2.245079,6.757333,-6.536245,-7.276831,-11.553081,-10.180579,-14.974916,-8.946871,129.199219,1,0,1,1,1,0,0,1,0,0,0,0,3,0,0,21,1,English
137,0.168468,5243.018296,0.115553,0.021488,1.004327,27.502646,0.00545,0,-404.308594,25.564194,-41.426327,-9.400389,-35.06028,-5.177183,-9.786539,5.636172,-8.562539,-5.510411,-13.493268,-1.942426,-2.245079,6.757333,-6.536245,-7.276831,-11.553081,-10.180579,-14.974916,-8.946871,129.199219,1,0,1,0,1,0,1,0,0,0,0,1,3,0,0,60,1,English


In [31]:
# After the merge 'track_id' (audio side) duplicates 'track id' (emotion side),
# so only the latter is kept.
# Rebinding the result with errors='ignore', rather than dropping in place,
# keeps the cell safe to re-run: a second execution is a no-op, not a KeyError.
audio_df = audio_df.drop(columns=['track_id'], errors='ignore')
audio_df.head()

Unnamed: 0,chroma_sftf,rolloff,zero_crossing_rate,rmse,flux,contrast,flatness,sample_silence,mfcc_0,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,mfcc_10,mfcc_11,mfcc_12,mfcc_13,mfcc_14,mfcc_15,mfcc_16,mfcc_17,mfcc_18,mfcc_19,tempo,genre,track id,amazement,solemnity,tenderness,nostalgia,calmness,power,joyful_activation,tension,sadness,mood,liked,disliked,age,gender,mother tongue
136,0.168468,5243.018296,0.115553,0.021488,1.004327,27.502646,0.00545,0,-404.308594,25.564194,-41.426327,-9.400389,-35.06028,-5.177183,-9.786539,5.636172,-8.562539,-5.510411,-13.493268,-1.942426,-2.245079,6.757333,-6.536245,-7.276831,-11.553081,-10.180579,-14.974916,-8.946871,129.199219,0,1,0,0,0,0,0,0,0,0,1,3,0,0,30,0,English
133,0.168468,5243.018296,0.115553,0.021488,1.004327,27.502646,0.00545,0,-404.308594,25.564194,-41.426327,-9.400389,-35.06028,-5.177183,-9.786539,5.636172,-8.562539,-5.510411,-13.493268,-1.942426,-2.245079,6.757333,-6.536245,-7.276831,-11.553081,-10.180579,-14.974916,-8.946871,129.199219,0,1,0,0,1,1,0,0,0,0,1,1,1,0,33,0,Russian
134,0.168468,5243.018296,0.115553,0.021488,1.004327,27.502646,0.00545,0,-404.308594,25.564194,-41.426327,-9.400389,-35.06028,-5.177183,-9.786539,5.636172,-8.562539,-5.510411,-13.493268,-1.942426,-2.245079,6.757333,-6.536245,-7.276831,-11.553081,-10.180579,-14.974916,-8.946871,129.199219,0,1,0,1,0,0,1,0,0,0,1,4,0,1,51,0,English
135,0.168468,5243.018296,0.115553,0.021488,1.004327,27.502646,0.00545,0,-404.308594,25.564194,-41.426327,-9.400389,-35.06028,-5.177183,-9.786539,5.636172,-8.562539,-5.510411,-13.493268,-1.942426,-2.245079,6.757333,-6.536245,-7.276831,-11.553081,-10.180579,-14.974916,-8.946871,129.199219,0,1,1,1,0,0,1,0,0,0,0,3,0,0,21,1,English
137,0.168468,5243.018296,0.115553,0.021488,1.004327,27.502646,0.00545,0,-404.308594,25.564194,-41.426327,-9.400389,-35.06028,-5.177183,-9.786539,5.636172,-8.562539,-5.510411,-13.493268,-1.942426,-2.245079,6.757333,-6.536245,-7.276831,-11.553081,-10.180579,-14.974916,-8.946871,129.199219,0,1,0,1,0,1,0,0,0,0,1,3,0,0,60,1,English


In [32]:
# Move 'track id' to the front: set it aside, remove it from the frame, then
# concatenate it back as the first column
track_id = audio_df.loc[:, ['track id']]
audio_df.drop(columns=['track id'], inplace=True)
final_data = pd.concat((track_id, audio_df), axis=1)
final_data.head(n=5)

Unnamed: 0,track id,chroma_sftf,rolloff,zero_crossing_rate,rmse,flux,contrast,flatness,sample_silence,mfcc_0,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,mfcc_10,mfcc_11,mfcc_12,mfcc_13,mfcc_14,mfcc_15,mfcc_16,mfcc_17,mfcc_18,mfcc_19,tempo,genre,amazement,solemnity,tenderness,nostalgia,calmness,power,joyful_activation,tension,sadness,mood,liked,disliked,age,gender,mother tongue
136,1,0.168468,5243.018296,0.115553,0.021488,1.004327,27.502646,0.00545,0,-404.308594,25.564194,-41.426327,-9.400389,-35.06028,-5.177183,-9.786539,5.636172,-8.562539,-5.510411,-13.493268,-1.942426,-2.245079,6.757333,-6.536245,-7.276831,-11.553081,-10.180579,-14.974916,-8.946871,129.199219,0,0,0,0,0,0,0,0,0,1,3,0,0,30,0,English
133,1,0.168468,5243.018296,0.115553,0.021488,1.004327,27.502646,0.00545,0,-404.308594,25.564194,-41.426327,-9.400389,-35.06028,-5.177183,-9.786539,5.636172,-8.562539,-5.510411,-13.493268,-1.942426,-2.245079,6.757333,-6.536245,-7.276831,-11.553081,-10.180579,-14.974916,-8.946871,129.199219,0,0,0,1,1,0,0,0,0,1,1,1,0,33,0,Russian
134,1,0.168468,5243.018296,0.115553,0.021488,1.004327,27.502646,0.00545,0,-404.308594,25.564194,-41.426327,-9.400389,-35.06028,-5.177183,-9.786539,5.636172,-8.562539,-5.510411,-13.493268,-1.942426,-2.245079,6.757333,-6.536245,-7.276831,-11.553081,-10.180579,-14.974916,-8.946871,129.199219,0,0,1,0,0,1,0,0,0,1,4,0,1,51,0,English
135,1,0.168468,5243.018296,0.115553,0.021488,1.004327,27.502646,0.00545,0,-404.308594,25.564194,-41.426327,-9.400389,-35.06028,-5.177183,-9.786539,5.636172,-8.562539,-5.510411,-13.493268,-1.942426,-2.245079,6.757333,-6.536245,-7.276831,-11.553081,-10.180579,-14.974916,-8.946871,129.199219,0,1,1,0,0,1,0,0,0,0,3,0,0,21,1,English
137,1,0.168468,5243.018296,0.115553,0.021488,1.004327,27.502646,0.00545,0,-404.308594,25.564194,-41.426327,-9.400389,-35.06028,-5.177183,-9.786539,5.636172,-8.562539,-5.510411,-13.493268,-1.942426,-2.245079,6.757333,-6.536245,-7.276831,-11.553081,-10.180579,-14.974916,-8.946871,129.199219,0,0,1,0,1,0,0,0,0,1,3,0,0,60,1,English


In [33]:
# (number of rows, number of columns) of the final table
(len(final_data.index), len(final_data.columns))

(8407, 46)

# Save the final data

In [34]:
# Write the merged audio + emotion table next to the dataset (no index column)
final_data.to_csv(
    path_or_buf='/content/drive/MyDrive/Module 4/Data/music-genre-dataset/train_new_feat.csv',
    index=False,
)