In [1]:
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.neural_network import MLPClassifier, MLPRegressor 
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, mean_absolute_error
from sklearn.preprocessing import LabelEncoder 
import pickle 
import librosa
%matplotlib inline

# Read data

In [2]:
# Load the original MFCC-based training table; it is only used further down
# as the source of the ' genre' column.
train_old = pd.read_csv(filepath_or_buffer="../data/train_mfcc.csv")
train_old.head(5)

Unnamed: 0,track id,chroma_stft,rmse,spectral_centroid,spectral_bandwidth,rolloff,zero_crossing_rate,mfcc1,mfcc2,mfcc3,...,power,joyful_activation,tension,sadness,mood,liked,disliked,age,gender,mother tongue
0,1,0.168761,0.084875,1250.061078,1383.643005,1891.738561,0.074272,-320.786858,109.852414,-13.965514,...,0,0,0,1,3,0,0,30,0,English
1,1,0.168761,0.084875,1250.061078,1383.643005,1891.738561,0.074272,-320.786858,109.852414,-13.965514,...,0,0,0,1,1,1,0,33,0,Russian
2,1,0.168761,0.084875,1250.061078,1383.643005,1891.738561,0.074272,-320.786858,109.852414,-13.965514,...,0,0,0,1,4,0,1,51,0,English
3,1,0.168761,0.084875,1250.061078,1383.643005,1891.738561,0.074272,-320.786858,109.852414,-13.965514,...,0,0,0,0,3,0,0,21,1,English
4,1,0.168761,0.084875,1250.061078,1383.643005,1891.738561,0.074272,-320.786858,109.852414,-13.965514,...,0,0,0,1,3,0,0,60,1,English


In [3]:
# Load the re-extracted audio features and attach the genre column taken from
# the older table. The assignment aligns on the row index.
# NOTE(review): this presumes both CSVs list the same rows in the same order — confirm.
df = pd.read_csv("../data/train_new_feat.csv").assign(genre=train_old[' genre'])
df.head(5)

Unnamed: 0,track id,chroma_sftf,rolloff,zero_crossing_rate,rmse,flux,contrast,flatness,sample_silence,mfcc_0,...,power,joyful_activation,tension,sadness,mood,liked,disliked,age,gender,mother tongue
0,1,0.168468,5243.018296,0.115553,0.021488,1.004327,27.502646,0.00545,0,-404.308594,...,0,0,0,1,3,0,0,30,0,English
1,1,0.168468,5243.018296,0.115553,0.021488,1.004327,27.502646,0.00545,0,-404.308594,...,0,0,0,1,1,1,0,33,0,Russian
2,1,0.168468,5243.018296,0.115553,0.021488,1.004327,27.502646,0.00545,0,-404.308594,...,0,0,0,1,4,0,1,51,0,English
3,1,0.168468,5243.018296,0.115553,0.021488,1.004327,27.502646,0.00545,0,-404.308594,...,0,0,0,0,3,0,0,21,1,English
4,1,0.168468,5243.018296,0.115553,0.021488,1.004327,27.502646,0.00545,0,-404.308594,...,0,0,0,1,3,0,0,60,1,English


# Data cleaning 

In [4]:
# Drop the columns that are used neither as features nor as targets
useless = [' mother tongue', ' liked', ' disliked', ' amazement', ' mood',
           ' age', ' gender', ' solemnity', ' nostalgia']
# `columns=` is used because the positional axis argument of DataFrame.drop
# was removed in pandas 2.0 (`df.drop(useless, 1)` raises TypeError there).
df = df.drop(columns=useless)
# One-hot encode the genre. dtype=int keeps the indicators as 0/1 integers;
# pandas >= 2.0 would otherwise produce boolean columns.
df = pd.get_dummies(df, columns=['genre'], dtype=int)
df.head()

Unnamed: 0,track id,chroma_sftf,rolloff,zero_crossing_rate,rmse,flux,contrast,flatness,sample_silence,mfcc_0,...,tenderness,calmness,power,joyful_activation,tension,sadness,genre_classical,genre_electronic,genre_pop,genre_rock
0,1,0.168468,5243.018296,0.115553,0.021488,1.004327,27.502646,0.00545,0,-404.308594,...,0,0,0,0,0,1,1,0,0,0
1,1,0.168468,5243.018296,0.115553,0.021488,1.004327,27.502646,0.00545,0,-404.308594,...,1,0,0,0,0,1,1,0,0,0
2,1,0.168468,5243.018296,0.115553,0.021488,1.004327,27.502646,0.00545,0,-404.308594,...,0,1,0,0,0,1,1,0,0,0
3,1,0.168468,5243.018296,0.115553,0.021488,1.004327,27.502646,0.00545,0,-404.308594,...,0,1,0,0,0,0,1,0,0,0
4,1,0.168468,5243.018296,0.115553,0.021488,1.004327,27.502646,0.00545,0,-404.308594,...,0,0,0,0,0,1,1,0,0,0


In [5]:
# Target columns: six emotion indicators (note the leading space in the raw
# column names) followed by the four one-hot genre indicators.
LABELS = [
    ' tenderness',
    ' calmness',
    ' power',
    ' joyful_activation',
    ' tension',
    ' sadness',
    "genre_classical",
    "genre_electronic",
    "genre_pop",
    "genre_rock",
]

In [6]:
# Base features = every column that is neither the track identifier nor a target
FEATURES = df.columns.drop(['track id', *LABELS])
print(f"base features: {FEATURES.tolist()}")

base features: ['chroma_sftf', 'rolloff', 'zero_crossing_rate', 'rmse', 'flux', 'contrast', 'flatness', 'sample_silence', 'mfcc_0', 'mfcc_1', 'mfcc_2', 'mfcc_3', 'mfcc_4', 'mfcc_5', 'mfcc_6', 'mfcc_7', 'mfcc_8', 'mfcc_9', 'mfcc_10', 'mfcc_11', 'mfcc_12', 'mfcc_13', 'mfcc_14', 'mfcc_15', 'mfcc_16', 'mfcc_17', 'mfcc_18', 'mfcc_19', 'tempo']


In [None]:
# Drop duplicates to avoid overfitting: each listener contributes one row per
# track with identical audio features, so keep only the distinct
# (features, track id) combinations.
feature_and_id_cols = [*FEATURES, 'track id']
Xtrain = df.loc[:, feature_and_id_cols].drop_duplicates()

# Feature engineering

As previously observed during the EDA, the subjects do not all share the same feelings for a given song, and most of the subjects did not report any emotion.
Thus I decided to proceed as follows:
For a given song, if more than 30% of the subjects indicated that they felt a given emotion, then the song is labelled with that emotion (label = 1); otherwise the label is set to 0.

In [7]:

# Share of listeners that reported each label, per track
ytrain = df.groupby('track id')[LABELS].mean().reset_index()

# A label is switched on for a track when more than 30% of its listeners
# reported it. The comparison is vectorized; the previous
# `Series.apply(lambda ..., 1)` passed `1` as the deprecated `convert_dtype`
# argument and evaluated the lambda element by element.
VOTE_THRESHOLD = 0.3
ytrain[LABELS] = (ytrain[LABELS] > VOTE_THRESHOLD).astype(int)

# Attach the per-track targets to the de-duplicated feature rows
train = Xtrain.merge(ytrain, on='track id', how='left')

## Outlier treatment 

In [8]:
from pyod.models.iforest import IForest 
from pyod.models.knn import KNN 

# predict and treat outliers for a given feature
def predict_and_treat_outlier(feat, db):
    """
    Detect outliers of a single feature and replace them by the feature median.

    An ``IForest`` detector is fitted on the one-column frame ``db[[feat]]``;
    every row it predicts as ``1`` has its ``feat`` value overwritten with the
    median of the column (computed before any replacement).

    Parameters:
        feat (string): name of the column to clean
        db (DataFrame): the dataframe containing the outliers to be cleaned
    Returns:
        db_cleaned (DataFrame): a copy of ``db`` with the outliers of ``feat``
            replaced; ``db`` itself is left untouched
    """
    X = db[[feat]]
    clf = IForest()  # KNN() is an alternative detector
    clf.fit(X)
    # Keep the prediction in a local mask instead of writing a temporary
    # 'is_outlier' column into the caller's dataframe.
    mask = clf.predict(X) == 1
    db_cleaned = db.copy()
    db_cleaned.loc[mask, feat] = db_cleaned[feat].median()
    return db_cleaned

In [19]:
# predict and treat age outliers
# ' age' is listed in `useless` and dropped during data cleaning, and `train`
# is built from the audio features, 'track id' and the labels only. Guard the
# call so the notebook still runs top to bottom on a fresh kernel instead of
# raising a KeyError.
if ' age' in train.columns:
    train = predict_and_treat_outlier(' age', train)
train.head()

Unnamed: 0,chroma_sftf,rolloff,zero_crossing_rate,rmse,flux,contrast,flatness,sample_silence,mfcc_0,mfcc_1,...,tenderness,calmness,power,joyful_activation,tension,sadness,genre_classical,genre_electronic,genre_pop,genre_rock
0,0.168468,5243.018296,0.115553,0.021488,1.004327,27.502646,0.00545,0,-404.308594,25.564194,...,0,1,0,0,0,1,1,0,0,0
1,0.168468,5243.018296,0.115553,0.021488,1.004327,27.502646,0.00545,0,-404.308594,25.564194,...,0,1,0,0,0,1,1,0,0,0
2,0.168468,5243.018296,0.115553,0.021488,1.004327,27.502646,0.00545,0,-404.308594,25.564194,...,0,1,0,0,0,1,1,0,0,0
3,0.168468,5243.018296,0.115553,0.021488,1.004327,27.502646,0.00545,0,-404.308594,25.564194,...,0,1,0,0,0,1,1,0,0,0
4,0.168468,5243.018296,0.115553,0.021488,1.004327,27.502646,0.00545,0,-404.308594,25.564194,...,0,1,0,0,0,1,1,0,0,0


In [12]:
# predict audio features outliers
def predict_audio_feat_outlier(db):
    """
    Flag rows that are outliers with respect to all audio features jointly.

    Parameters:
        db (DataFrame): the dataframe holding the audio feature columns
    Returns:
        flagged (DataFrame): a copy of ``db`` with an extra ``is_outlier``
            column holding the detector's prediction for each row
    """
    spectral_cols = ['chroma_sftf', 'rolloff', 'zero_crossing_rate', 'rmse',
                     'flux', 'contrast', 'flatness', 'sample_silence']
    mfcc_cols = [f'mfcc_{k}' for k in range(20)]
    audio_cols = spectral_cols + mfcc_cols + ['tempo']

    # Work on a copy so the caller's frame does not gain the flag column
    flagged = db.copy()
    detector = IForest()
    detector.fit(flagged[audio_cols])
    flagged['is_outlier'] = detector.predict(flagged[audio_cols])

    return flagged

In [10]:
#treat audio features outliers
def treat_audio_feat_outlier(db):
    """
    Replace the audio features of flagged rows by the per-column medians.

    Rows whose ``is_outlier`` value equals 1 get every audio feature
    overwritten with the median of that column, computed over all rows
    (flagged ones included) before any replacement.

    Parameters:
        db (DataFrame): the dataframe containing the outliers to be cleaned;
            it must hold an ``is_outlier`` column and is not modified
    Returns:
        new_db (DataFrame): a cleaned copy of ``db`` without the
            ``is_outlier`` column
    """
    features = ['chroma_sftf', 'rolloff', 'zero_crossing_rate', 'rmse', 'flux', 'contrast', 'flatness',
                'sample_silence', 'mfcc_0', 'mfcc_1', 'mfcc_2', 'mfcc_3', 'mfcc_4', 'mfcc_5', 'mfcc_6',
                'mfcc_7', 'mfcc_8', 'mfcc_9', 'mfcc_10', 'mfcc_11', 'mfcc_12', 'mfcc_13', 'mfcc_14',
                'mfcc_15', 'mfcc_16', 'mfcc_17', 'mfcc_18', 'mfcc_19', 'tempo']
    # Copy first: the previous version wrote the medians into the caller's
    # dataframe, contradicting the documented "returns a new dataframe".
    new_db = db.copy()
    mask = new_db['is_outlier'] == 1
    medians = new_db[features].median()

    for f in features:
        new_db.loc[mask, f] = medians[f]
    return new_db.drop(columns=['is_outlier'])

In [13]:
# Flag rows that are outliers across the audio features, then replace the
# feature values of the flagged rows by the column medians.
train = (
    train
    .pipe(predict_audio_feat_outlier)
    .pipe(treat_audio_feat_outlier)
)
train.head()

Unnamed: 0,chroma_sftf,rolloff,zero_crossing_rate,rmse,flux,contrast,flatness,sample_silence,mfcc_0,mfcc_1,...,tenderness,calmness,power,joyful_activation,tension,sadness,genre_classical,genre_electronic,genre_pop,genre_rock
0,0.39751,6909.249887,0.26987,0.022182,1.320094,21.357228,0.073558,0.0,-277.617462,22.630903,...,0,1,0,0,0,1,1,0,0,0
1,0.198768,3432.57091,0.064848,0.015493,0.989229,22.154778,0.003597,0.0,-380.886902,108.089806,...,1,1,0,0,0,0,1,0,0,0
2,0.470191,7299.764193,0.299333,0.011899,1.121246,19.68708,0.116151,0.0,-275.277588,19.399797,...,0,0,0,1,0,0,1,0,0,0
3,0.162379,3555.286835,0.079207,0.009335,1.060743,22.654876,0.003908,0.0,-486.330688,102.48394,...,1,1,0,0,0,0,1,0,0,0
4,0.233411,1986.971318,0.061498,0.005119,0.993969,21.760514,0.001321,0.0,-499.45047,151.041504,...,1,1,0,0,0,1,1,0,0,0


# Model training : MLP model

In [14]:
# Data normalization: rescale every base feature to the [0, 1] range.
# NOTE(review): the scaler is fitted on all rows before the K-fold split used
# for validation, so validation folds influence the scaling — consider
# fitting it inside each fold if unbiased fold scores matter.
scaler = MinMaxScaler().fit(train[FEATURES])
train[FEATURES] = scaler.transform(train[FEATURES])

## Model Validation 

In [15]:
# Model validation: K-fold cross-validation with out-of-fold (OOF) predictions
n_splits = 5
epochs = n_splits  # legacy alias: this value is the number of folds, not training epochs
MLP_PARAMS = dict(hidden_layer_sizes=[300], random_state=47,
                  activation='logistic', max_iter=5000)

kf = KFold(n_splits=n_splits, shuffle=True, random_state=1997)
X_train = train[FEATURES]
y_train = train[LABELS]
# One row per sample, one column per label, filled fold by fold
y_oof = np.zeros((X_train.shape[0], len(LABELS)))

for fold, (tr_idx, val_idx) in enumerate(kf.split(X_train), start=1):
    X_tr, X_vl = X_train.iloc[tr_idx, :], X_train.iloc[val_idx, :]
    y_tr, y_vl = y_train.iloc[tr_idx, :], y_train.iloc[val_idx, :]
    fold_model = MLPClassifier(**MLP_PARAMS)
    fold_model.fit(X_tr, y_tr)
    y_pred = fold_model.predict_proba(X_vl)
    y_oof[val_idx, :] = y_pred
    # The targets are multilabel indicators, so the score is the macro average
    # of the per-label AUCs (`multi_class` only applies to multiclass targets).
    fold_auc = roc_auc_score(y_vl, y_pred, average='macro')
    print(f"Fold #{fold} AUC : {round(fold_auc, 2)}")

metric = roc_auc_score(y_train, y_oof, average='macro')
print(f"Full AUC  : {round(metric, 2)}")

# Refit on the full training set. Previously `model` was whatever the last
# fold left behind, i.e. a network trained on only (n_splits - 1) / n_splits
# of the data, and that is the object persisted afterwards.
model = MLPClassifier(**MLP_PARAMS)
model.fit(X_train, y_train)

Fold #1 AUC : 0.79
Fold #2 AUC : 0.75
Fold #3 AUC : 0.76
Fold #4 AUC : 0.78
Fold #5 AUC : 0.75
Full AUC  : 0.76


## Save model and scaler

In [16]:
import pickle
# Persist the fitted classifier. The `with` block closes the file handle even
# if pickling fails; the bare `open(...)` used before was never closed.
filename = '../model_saved/emo_genre_clf_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

# Persist the fitted scaler so inference can apply the same normalization
filename = '../model_saved/Scaler_Extractor.sav'
with open(filename, 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)