In [1]:
import librosa
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import os
import csv

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler

# model
from sklearn.linear_model import LogisticRegression, LinearRegression
# from catboost import CatBoostClassifier
# from xgboost import XGBClassifier, XGBRFClassifier 
from sklearn.model_selection import KFold,StratifiedKFold
from lightgbm import LGBMRegressor, LGBMClassifier
# from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.metrics import mean_absolute_error as mae, roc_auc_score,accuracy_score 
from sklearn.multioutput import MultiOutputClassifier
# Keras
import keras

# save the model to disk
import pickle

Using TensorFlow backend.


# Read data

In [2]:
# Load the training table (audio features plus per-listener annotations).
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which looks like a
# saved index - confirm whether it should be dropped or read with index_col=0.
train = pd.read_csv('../data/train_new_feat.csv')
train.head()

Unnamed: 0,track id,chroma_sftf,rolloff,zero_crossing_rate,rmse,flux,contrast,flatness,sample_silence,mfcc_0,...,power,joyful_activation,tension,sadness,mood,liked,disliked,age,gender,mother tongue
0,1,0.168468,5243.018296,0.115553,0.021488,1.004327,27.502646,0.00545,0,-404.308594,...,0,0,0,1,3,0,0,30,0,English
1,1,0.168468,5243.018296,0.115553,0.021488,1.004327,27.502646,0.00545,0,-404.308594,...,0,0,0,1,1,1,0,33,0,Russian
2,1,0.168468,5243.018296,0.115553,0.021488,1.004327,27.502646,0.00545,0,-404.308594,...,0,0,0,1,4,0,1,51,0,English
3,1,0.168468,5243.018296,0.115553,0.021488,1.004327,27.502646,0.00545,0,-404.308594,...,0,0,0,0,3,0,0,21,1,English
4,1,0.168468,5243.018296,0.115553,0.021488,1.004327,27.502646,0.00545,0,-404.308594,...,0,0,0,1,3,0,0,60,1,English


# Process data cleaning 

In [3]:
# Target emotion columns (note the leading space in every name)
LABELS = [
    ' tenderness',
    ' calmness',
    ' power',
    ' joyful_activation',
    ' tension',
    ' sadness',
]

# Columns that are removed from the frame before training
USELESS = [
    ' amazement',
    ' solemnity',
    ' nostalgia',
    ' mother tongue',
    'genre',
    ' liked',
    'sample_silence',
    ' disliked',
]

In [5]:
# Remove unlabelled rows, i.e. rows whose target columns sum to zero.
# The row mask is computed directly, so no temporary 'sum' column has to be added
# and dropped again (the positional `axis` argument of DataFrame.drop is not
# accepted by pandas >= 2.0).
has_label = train[LABELS].sum(axis=1) > 0
train = train[has_label]
train.head()

Unnamed: 0,track id,chroma_sftf,rolloff,zero_crossing_rate,rmse,flux,contrast,flatness,sample_silence,mfcc_0,...,power,joyful_activation,tension,sadness,mood,liked,disliked,age,gender,mother tongue
0,1,0.168468,5243.018296,0.115553,0.021488,1.004327,27.502646,0.00545,0,-404.308594,...,0,0,0,1,3,0,0,30,0,English
1,1,0.168468,5243.018296,0.115553,0.021488,1.004327,27.502646,0.00545,0,-404.308594,...,0,0,0,1,1,1,0,33,0,Russian
2,1,0.168468,5243.018296,0.115553,0.021488,1.004327,27.502646,0.00545,0,-404.308594,...,0,0,0,1,4,0,1,51,0,English
3,1,0.168468,5243.018296,0.115553,0.021488,1.004327,27.502646,0.00545,0,-404.308594,...,0,0,0,0,3,0,0,21,1,English
4,1,0.168468,5243.018296,0.115553,0.021488,1.004327,27.502646,0.00545,0,-404.308594,...,0,0,0,1,3,0,0,60,1,English


In [6]:
# Discard every column listed in USELESS
train = train.drop(columns=USELESS)
train.head()

Unnamed: 0,track id,chroma_sftf,rolloff,zero_crossing_rate,rmse,flux,contrast,flatness,mfcc_0,mfcc_1,...,tempo,tenderness,calmness,power,joyful_activation,tension,sadness,mood,age,gender
0,1,0.168468,5243.018296,0.115553,0.021488,1.004327,27.502646,0.00545,-404.308594,25.564194,...,129.199219,0,0,0,0,0,1,3,30,0
1,1,0.168468,5243.018296,0.115553,0.021488,1.004327,27.502646,0.00545,-404.308594,25.564194,...,129.199219,1,0,0,0,0,1,1,33,0
2,1,0.168468,5243.018296,0.115553,0.021488,1.004327,27.502646,0.00545,-404.308594,25.564194,...,129.199219,0,1,0,0,0,1,4,51,0
3,1,0.168468,5243.018296,0.115553,0.021488,1.004327,27.502646,0.00545,-404.308594,25.564194,...,129.199219,0,1,0,0,0,0,3,21,1
4,1,0.168468,5243.018296,0.115553,0.021488,1.004327,27.502646,0.00545,-404.308594,25.564194,...,129.199219,0,0,0,0,0,1,3,60,1


In [7]:
# Drop rows that are exact duplicates across all remaining columns, so that
# repeated annotations are not counted more than once during training.
train = train.drop_duplicates()

In [None]:
# Targets: one column per emotion listed in LABELS
y_train = train[LABELS]

# Inputs: every remaining column ('track id' is still part of this frame)
X_train = train.drop(columns=LABELS)

# Outlier treatment 

In [None]:
# predict audio features outliers  
def predict_audio_feat_outlier(db):
    """
    Flag outlying rows with an IForest detector fitted on the frame itself.

    The columns 'track id', ' mood' and ' age' are excluded from the detector's
    input; every other column is used.

    Parameters:
        db (DataFrame): the dataframe to screen (it is not modified)
    Returns:
        flagged (DataFrame): a copy of db with an extra 'is_outlier' column that
            holds the detector's prediction for each row
    """
    excluded = ['track id', ' mood', ' age']
    flagged = db.copy()
    detector_input = flagged.drop(columns=excluded)

    detector = IForest()
    detector.fit(detector_input)
    flagged['is_outlier'] = detector.predict(detector_input)

    return flagged

In [None]:
#treat audio features outliers
def treat_audio_feat_outlier(db):
    """
    Replace the feature values of flagged rows with the column median.

    Rows whose 'is_outlier' value equals 1 get every feature column overwritten
    with that column's median. The columns 'track id', ' mood', ' gender' and
    ' age' are never modified.

    Parameters:
        db (DataFrame): the dataframe containing the outliers to be cleaned; it must
            contain 'track id', ' mood', ' gender', ' age' and 'is_outlier'
            (a missing column raises KeyError). The input is left untouched.
    Returns:
        new_db (DataFrame): a cleaned copy of db without the 'is_outlier' column
    """
    # Work on a copy so that the caller's frame is not silently overwritten
    cleaned = db.copy()
    features = cleaned.columns.drop(['track id', ' mood', ' gender', ' age', 'is_outlier'])
    mask = cleaned['is_outlier'] == 1

    for f in features:
        # The median is taken over the whole column, flagged rows included
        cleaned.loc[mask, f] = cleaned[f].median()
    return cleaned.drop(columns=['is_outlier'])

In [None]:
# predict and treat outliers for a given feature 
def predict_and_treat_outlier(feat,db):
    """
    Detect and treat the outliers of a single feature.

    An IForest detector is fitted on the given column only; every value it flags
    (prediction equal to 1) is replaced with the median of that column.

    Parameters:
        feat (string): the name of the feature to clean
        db (DataFrame): the dataframe containing the feature; it is left untouched
    Returns:
        db_cleaned (DataFrame): a copy of db in which the flagged values of feat
            have been replaced
    """
    # Copy first: the detector flags must not leak into the caller's frame as an
    # extra 'is_outlier' column.
    db_cleaned = db.copy()
    X = db_cleaned[[feat]]
    clf = IForest()
    clf.fit(X)
    mask = np.asarray(clf.predict(X)) == 1
    # The median is taken over the whole column, flagged values included
    db_cleaned.loc[mask, feat] = db_cleaned[feat].median()
    return db_cleaned

In [None]:
from pyod.models.iforest import IForest 
from pyod.models.knn import KNN 

# Replace the outlying ' age' values with the column median; the audio features
# are left as they are in this run.
X_train_age = predict_and_treat_outlier(' age',X_train) # author's note: this treatment scored 74% (roc_auc_score)
X_train_age.head()
# Alternative kept for reference (disabled): detect, then treat, outliers across
# the audio features.
# X_train = predict_audio_feat_outlier(X_train)
#X_train = treat_audio_feat_outlier(X_train)


In [None]:
# List the columns of the age-treated frame
X_train_age.columns

In [None]:
# (rows, columns) of X_train
X_train.shape

# Model training 

## Set the batch size and epochs for the DNN and the LSTM model

In [None]:
# Training hyper-parameters shared by the DNN and the LSTM models
batch_size=128
epochs = 5
# Append a trailing axis of length 1 so that every row becomes a
# (n_features, 1) sequence.
# NOTE(review): this rebinds X_train from a DataFrame to an ndarray, so the cell
# cannot be re-run without re-creating X_train first; the array also appears to
# still contain the 'track id' column - confirm that this is intended.
X_train = np.expand_dims(X_train.values, axis=-1)


## Import the modules needed for training

In [None]:
from keras import models
from keras import layers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation,Conv1D,Flatten
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split


## DNN model 

The DNN model scored 77% on the MFCC features and 77% with the new features, so no improvement was observed.
After outlier treatment, the model scored 75%.
Conclusion: the DNN model is not stable and its results are not reproducible.

In [None]:

# Fully connected baseline: two hidden ReLU layers with dropout.
# The former GlobalMaxPool1D call was removed: it requires a 3-D input, `inp` is
# 2-D here, and its result was overwritten by the next line anyway.
# NOTE(review): the input width (35) is hard-coded - confirm it matches the number
# of feature columns actually fed to this model.
inp = Input(shape=(35,))
x = Dense(256, activation='relu')(inp)
x = Dropout(0.5)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.5)(x)
# Multi-label head: one independent sigmoid unit per emotion in LABELS (several
# emotions can be active for the same row); pair it with a binary cross-entropy
# loss when compiling.
x = Dense(len(LABELS), activation='sigmoid')(x)
model = Model(inputs=inp, outputs=x)

## LSTM model 


The LSTM model scored 77% with the MFCC features and 77% with the new features, so no improvement was observed.
After outlier treatment, the model's score remained the same;
there was no improvement even when the training was run for 10 epochs.

In [None]:
inp = Input(shape=(32,1))
x = LSTM(50, return_sequences=True)(inp)
x = GlobalMaxPool1D()(x)
x = Dropout(0.1)(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(9, activation="softmax")(x)
model = Model(inputs=inp, outputs=x)

In [None]:
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
# Print the layer-by-layer summary of the compiled model
model.summary()

In [None]:
# Show the target shape, then train; validation_split=0.2 reserves 20% of the
# rows for validation. The returned History object is kept for plotting.
print(y_train.shape)
history = model.fit(X_train,y_train,epochs=epochs,batch_size=batch_size,validation_split=0.2)

In [None]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')

def plot_history(history):
    """ Plot the training and validation curves recorded during a fit.

        Two panels are drawn side by side in one figure: accuracy on the left,
        loss on the right, each against the epoch number (starting at 1).

        parameters:
            history : the object returned by model.fit; its `history` dict must
                      hold the keys 'accuracy', 'val_accuracy', 'loss' and 'val_loss'
        returns:
            -
    """
    logs = history.history
    train_acc, valid_acc = logs['accuracy'], logs['val_accuracy']
    train_loss, valid_loss = logs['loss'], logs['val_loss']
    epoch_axis = range(1, len(train_acc) + 1)

    # (training curve, validation curve, training label, validation label, title)
    panels = (
        (train_acc, valid_acc, 'Training acc', 'Validation acc',
         'Training and validation accuracy'),
        (train_loss, valid_loss, 'Training loss', 'Validation loss',
         'Training and validation loss'),
    )

    plt.figure(figsize=(12, 5))
    for position, panel in enumerate(panels, start=1):
        train_curve, valid_curve, train_label, valid_label, heading = panel
        plt.subplot(1, 2, position)
        plt.plot(epoch_axis, train_curve, 'b', label=train_label)
        plt.plot(epoch_axis, valid_curve, 'r', label=valid_label)
        plt.title(heading)
        plt.legend()

In [None]:
# Draw the accuracy and loss curves of the last fit
plot_history(history)

## LGBM model 

In [None]:
# Model validation
n_splits = 5  # number of cross-validation folds

# KFold validation
kf = KFold(n_splits=n_splits, shuffle=True, random_state=1997)
# Out-of-fold predicted probabilities: one row per sample, one column per label
y_oof = np.zeros([X_train_age.shape[0], len(LABELS)])

# Data normalization
# NOTE(review): the scaler is fitted on the whole frame before the folds are
# split, so validation rows influence the scaling of the training rows; the
# fitted scaler is also not stored with the model - confirm how new inputs are
# meant to be scaled at prediction time.
SCALE = True
if SCALE:
    scaler = MinMaxScaler()
    X_train_age[X_train_age.columns.drop(['track id'])] = scaler.fit_transform(
        X_train_age.drop(columns=['track id']))

# Cross-validation loop. The fold number has its own counter; the per-label loop
# below previously reused the same variable, so every fold printed the same number.
for fold, (tr_idx, val_idx) in enumerate(kf.split(X_train_age, y_train), start=1):
    X_tr, X_vl = X_train_age.iloc[tr_idx, :], X_train_age.iloc[val_idx, :]
    y_tr, y_vl = y_train.iloc[tr_idx, :], y_train.iloc[val_idx, :]
    X_tr = X_tr.drop(columns=['track id'])
    X_vl = X_vl.drop(columns=['track id'])
    model = MultiOutputClassifier(LGBMClassifier(n_estimators=10, random_state=47))
    model.fit(X_tr, y_tr)
    # predict_proba yields one array per label; column 1 is kept as the
    # probability of the positive class
    y_pred = np.zeros((X_vl.shape[0], len(LABELS)))
    for label_idx, label_proba in enumerate(model.predict_proba(X_vl)):
        y_pred[:, label_idx] = label_proba[:, 1]
    y_oof[val_idx, :] = y_pred
    fold_auc = roc_auc_score(y_vl, y_pred, multi_class='ovr')
    print(f"Fold #{fold} AUC : {round(fold_auc, 2)}")

# NOTE: `model` now refers to the estimator fitted on the last fold only
metric = roc_auc_score(y_train, y_oof, multi_class='ovr')
print(f"Full AUC  : {round(metric, 2)}")

## Model interpretation:
1- With the initial features : 
LGBM model scored 79.4% 
2- After adding the new features :
LGBM scored 79.55%

3- After both audio and age features outliers treatment:
LGBM scored 79.36% ==> the model's performance slightly decreased

4- After treating only the age feature (indeed, 'age' is the most important feature of the model):
LGBM scored 79.57% 

## Observation after testing the model: 
The model is overfitting especially with 3 of the emotion features : amazement, solemnity and nostalgia. 
Thus, we decided to work on 6 of the emotions and remove the 3 others. 
Performance: LGBM scored 74%

# Save model

In [None]:
from pure_sklearn.map import convert_estimator

# Convert the fitted estimator with pure_sklearn's convert_estimator and persist
# the converted object with pickle.
filename = '../model_saved/music_emotion_classifier_model.sav'
clf_pure_predict = convert_estimator(model)
# The context manager closes the file handle even if pickling fails
with open(filename, 'wb') as model_file:
    pickle.dump(clf_pure_predict, model_file)

# Test the model on  existing test data

In [None]:
def get_features(y, sr, id):
    """
    Extract summary audio features from a signal with librosa.

    Every frame-level feature is reduced to its mean. The returned keys are, in
    order: 'chroma_sftf', 'rolloff', 'zero_crossing_rate', 'rmse', 'flux',
    'contrast', 'flatness', 'mfcc_0' ... 'mfcc_19', 'tempo' and 'track_id'.

    Parameters:
        y : the audio signal passed to the librosa feature functions
        sr : the sampling rate of y
        id (string): the audio track id, stored under 'track_id'

    Returns:
        features (dict): the extracted audio features
    """
    # Features to concatenate in the final dictionary
    features = {'chroma_sftf': None, 'rolloff': None, 'zero_crossing_rate': None, 'rmse': None,
                'flux': None, 'contrast': None, 'flatness': None}
    print(id)

    # Using librosa to calculate the features. The signal is always passed by
    # keyword: librosa >= 0.10 no longer accepts it positionally.
    features['chroma_sftf'] = np.mean(librosa.feature.chroma_stft(y=y, sr=sr))
    features['rolloff'] = np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr))
    features['zero_crossing_rate'] = np.mean(librosa.feature.zero_crossing_rate(y=y))
    features['rmse'] = np.mean(librosa.feature.rms(y=y))
    features['flux'] = np.mean(librosa.onset.onset_strength(y=y, sr=sr))
    features['contrast'] = np.mean(librosa.feature.spectral_contrast(y=y, sr=sr))
    features['flatness'] = np.mean(librosa.feature.spectral_flatness(y=y))

    # MFCC treatment: one mean value per coefficient
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
    for idx, v_mfcc in enumerate(mfcc):
        features['mfcc_{}'.format(idx)] = np.mean(v_mfcc)

    # The tempo estimate comes back as an array; its first element is kept
    features['tempo'] = librosa.beat.tempo(y=y, sr=sr)[0]
    features['track_id'] = id

    return features

In [None]:
def read_process_songs(audiofile, debug = True):
    """
        Load one audio file and extract its features.

                Parameters:
                        audiofile (string): the audio file path
                        debug (bool): accepted but not used by this function

                Returns:
                        arr_features (list): a one-element list holding the feature
                                dictionary returned by get_features
        """
    # Only the first 30 seconds of the file are loaded
    waveform, sample_rate = librosa.load(audiofile, duration=30)

    # Pre-emphasis is applied before the features are extracted
    emphasized = librosa.effects.preemphasis(waveform)

    # The track id is the given path with every ".wav" occurrence removed
    song_id = audiofile.replace(".wav", "")

    return [get_features(emphasized, sample_rate, song_id)]

In [None]:
from pydub import AudioSegment

def convert_to_wav(src,dst):
    """
    Convert an mp3 file into wav format.

    Parameters:
        src (string): path of the mp3 file to read
        dst (string): path the wav file is written to

    Returns:
        dst (string): the path of the converted audio file
    """
    # Decode the mp3, then export it as wav
    sound = AudioSegment.from_mp3(src)
    sound.export(dst, format="wav")
    return dst

In [None]:
# Song used for the manual test (absolute path on the author's machine)
audio_file = "C:/Music/Retro/Aerosmith - I Don't Want to Miss a Thing (Official Music Video).wav"

In [None]:
# Show the path that is about to be processed
print(audio_file)

In [None]:
# Extract the audio features of the test song (a one-element list of dicts)
test_data = read_process_songs(audio_file,debug=False)

In [None]:
# One row per extracted song, one column per feature key
df_test = pd.DataFrame(test_data)
df_test.columns

In [None]:
# Listener metadata for the test song. The columns are appended in the same order
# as in the training frame (mood, age, gender) so that the feature positions
# match what the model was fitted on.
# NOTE(review): presumably the saved model consumes features by position - confirm.
df_test[' mood'] = 3
df_test[' age'] = 23
df_test[' gender'] = 1
df_test.head()

In [None]:
# Check the final column layout of the test frame
df_test.columns

In [None]:
# Reload the persisted classifier. The context manager closes the file handle.
# SECURITY: pickle.load can execute arbitrary code - only load files that were
# produced by this notebook.
with open(filename, 'rb') as model_file:
    emotion_clf = pickle.load(model_file)
def predict_proba(test, model):
    """
    Predict the probability of every music emotion in LABELS.

    Parameters:
        test (DataFrame): audio features; must contain a 'track_id' column,
            which is dropped before prediction (the input is not modified)
        model (model): the music emotion identifier; its predict_proba must
            yield one two-column probability table per label

    Returns:
        y_pred (DataFrame): one row per input row and one column per label,
            holding the probability of the positive class
    """
    features = test.drop(columns=['track_id'])
    y_pred = np.zeros((test.shape[0], len(LABELS)))
    for i, label_proba in enumerate(model.predict_proba(features)):
        # np.asarray lets the model return plain nested lists as well as arrays
        y_pred[:, i] = np.asarray(label_proba)[:, 1]
    return pd.DataFrame(y_pred, columns=LABELS)
# Score the test song with the reloaded classifier.
# NOTE(review): the training features were min-max scaled before fitting (SCALE = True),
# whereas df_test is passed here unscaled - confirm whether the same scaler should be applied.
pred = predict_proba(df_test, emotion_clf)
print(pred)

In [None]:
# Most likely emotion for the (first) test song. idxmax works on that row only,
# whereas np.argmax over the whole frame returns a flat index that can exceed
# len(LABELS) as soon as there is more than one row.
print(pred.iloc[0].idxmax())