In [1]:
# Mandatorio al inicio de cada notebook
import os
import warnings
os.chdir(os.path.abspath(".."))
warnings.filterwarnings('ignore')

import json
import numpy as np
import pandas as pd
from src.traductores import emocion_ekman
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
import keras
from keras.callbacks import ReduceLROnPlateau
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout, BatchNormalization

### Input Data

In [3]:
df_input = pd.read_csv(f'data/FEATURES/OPENSMILE/opensmile_features_whisper.csv')

In [55]:
# Fix para la diferencia en como estan estos valores, debo arreglarlo a futuro en el archivo de features opensmile
df_input['Time'] = df_input['Time'].str.replace('(','[').str.replace(')',']')

### Target Data

In [4]:
# Obtenemos el json con los intervalos
with open('data/objetivos_por_voto_promedio_whisper.json', 'r') as f:
    targets_mean_vote = json.load(f)

In [57]:
# Loop principal para obtener los tiempos y target en pandas
df_ranges = pd.DataFrame()

for _key in targets_mean_vote.keys():
    print(_key)
    X = []
    for segment, target in zip(targets_mean_vote[_key]['rangos'], targets_mean_vote[_key]['targets']):
        x = []
        x.append(segment)
        x.append(target)
        X.append(x)
        
    df = pd.DataFrame(X, columns = ['Time','Target'])
    df['Audio_Name'] = _key
    df_ranges = pd.concat([df_ranges, df], ignore_index = True)

MSP-Conversation_0021.wav
MSP-Conversation_0023.wav
MSP-Conversation_0035.wav
MSP-Conversation_0047.wav
MSP-Conversation_0061.wav
MSP-Conversation_0079.wav
MSP-Conversation_0083.wav
MSP-Conversation_0087.wav
MSP-Conversation_0088.wav
MSP-Conversation_0103.wav
MSP-Conversation_0114.wav
MSP-Conversation_0125.wav
MSP-Conversation_0135.wav
MSP-Conversation_0147.wav
MSP-Conversation_0153.wav
MSP-Conversation_0160.wav
MSP-Conversation_0166.wav
MSP-Conversation_0167.wav
MSP-Conversation_0172.wav
MSP-Conversation_0180.wav
MSP-Conversation_0190.wav
MSP-Conversation_0201.wav
MSP-Conversation_0202.wav
MSP-Conversation_0218.wav
MSP-Conversation_0227.wav
MSP-Conversation_0231.wav
MSP-Conversation_0257.wav
MSP-Conversation_0269.wav
MSP-Conversation_0281.wav
MSP-Conversation_0295.wav
MSP-Conversation_0300.wav
MSP-Conversation_0361.wav
MSP-Conversation_0380.wav
MSP-Conversation_0381.wav
MSP-Conversation_0418.wav
MSP-Conversation_0422.wav
MSP-Conversation_0456.wav
MSP-Conversation_0498.wav
MSP-Conversa

In [58]:
# Join para obtener dataset final
df_ranges['Time'] = df_ranges['Time'].astype(str) 
df_final = pd.merge(df_input, df_ranges, how = 'inner', left_on = ['Time','Audio_Name'], right_on = ['Time','Audio_Name'])

In [59]:
df_final.head()

Unnamed: 0,audspec_lengthL1norm_sma_range,audspec_lengthL1norm_sma_maxPos,audspec_lengthL1norm_sma_minPos,audspec_lengthL1norm_sma_quartile1,audspec_lengthL1norm_sma_quartile2,audspec_lengthL1norm_sma_quartile3,audspec_lengthL1norm_sma_iqr1-2,audspec_lengthL1norm_sma_iqr2-3,audspec_lengthL1norm_sma_iqr1-3,audspec_lengthL1norm_sma_percentile1.0,...,mfcc_sma_de[14]_peakMeanMeanDist,mfcc_sma_de[14]_peakMeanRel,mfcc_sma_de[14]_minRangeRel,mfcc_sma_de[14]_meanRisingSlope,mfcc_sma_de[14]_stddevRisingSlope,mfcc_sma_de[14]_meanFallingSlope,mfcc_sma_de[14]_stddevFallingSlope,Time,Audio_Name,Target
0,1.498477,0.0,0.714286,0.485805,0.701391,0.962348,0.215586,0.260957,0.476543,0.36237,...,1.451927,-10.622149,0.404903,85.88666,25.21846,80.42445,24.244904,"[1187.84, 1188.7599999999998]",MSP-Conversation_2281.wav,"[10.491853582554517, 44.9523613707165, 36.2193..."
1,1.990642,0.666667,0.384615,0.777784,1.016043,1.538328,0.238259,0.522285,0.760544,0.478745,...,3.538584,-18.395329,0.557455,135.9222,44.451454,133.165,63.074097,"[1186.7, 1187.94]",MSP-Conversation_2281.wav,"[23.88179307116105, 36.23635533707865, 42.1201..."
2,1.419202,0.0,0.277372,0.964727,1.079281,1.391491,0.114554,0.31221,0.426764,0.466527,...,3.40233,19.98661,0.568858,145.35555,85.94251,81.72596,75.72732,"[1185.3600000000001, 1186.8]",MSP-Conversation_2281.wav,"[21.23491677070329, 36.74688098210571, 41.1363..."
3,1.209723,0.392157,0.993464,0.604566,0.765181,1.165415,0.160615,0.400234,0.560849,0.376036,...,3.636257,-12.23507,0.551213,133.98561,61.830128,160.21582,103.63221,"[1183.6200000000001, 1185.22]",MSP-Conversation_2281.wav,"[16.90110178970917, 39.83573266219239, 40.8024..."
4,1.559967,0.25974,0.155844,0.70533,0.954492,1.32622,0.249163,0.371728,0.62089,0.255142,...,2.807891,9.537731,0.467584,138.5861,30.043957,88.29222,59.455185,"[1182.88, 1183.72]",MSP-Conversation_2281.wav,"[21.654366952789697, 35.922482117310444, 50.54..."


### Remover Duplicados

In [64]:
df_final['Duplicated'] = df_final['Time'] + df_final['Audio_Name']
df_final = df_final.drop_duplicates(subset = 'Duplicated')
df_final = df_final.drop('Duplicated', axis = 1)

### Votación

In [65]:
df_final['Ekman'] = [ekman_emotion(i[0],i[1],i[2]) for i in df_final['Target']]

### Sacar nulos

In [66]:
df_final.isna().sum()

audspec_lengthL1norm_sma_range         85
audspec_lengthL1norm_sma_maxPos        85
audspec_lengthL1norm_sma_minPos        85
audspec_lengthL1norm_sma_quartile1     85
audspec_lengthL1norm_sma_quartile2     85
                                     ... 
mfcc_sma_de[14]_stddevFallingSlope     85
Time                                    0
Audio_Name                              0
Target                                  0
Ekman                                 760
Length: 6377, dtype: int64

In [67]:
df_final = df_final[~df_final['Ekman'].isna()]

In [68]:
df_final = df_final[~df_final['audspec_lengthL1norm_sma_range'].isna()]

In [69]:
df_final.isna().sum()

audspec_lengthL1norm_sma_range        0
audspec_lengthL1norm_sma_maxPos       0
audspec_lengthL1norm_sma_minPos       0
audspec_lengthL1norm_sma_quartile1    0
audspec_lengthL1norm_sma_quartile2    0
                                     ..
mfcc_sma_de[14]_stddevFallingSlope    0
Time                                  0
Audio_Name                            0
Target                                0
Ekman                                 0
Length: 6377, dtype: int64

### Pre procesamiento

#### Distribución de la data

In [70]:
df_count = df_final.groupby('Ekman').count().reset_index()[['Ekman','audspec_lengthL1norm_sma_range']].rename({'audspec_lengthL1norm_sma_range':'Count'}, axis = 1)

In [71]:
df_count

Unnamed: 0,Ekman,Count
0,anger,7101
1,disgust,5900
2,joy,18883
3,sadness,22
4,surprise,7484


Por un tema de investigación voy a sacar sadness

In [72]:
df_final = df_final[df_final['Ekman'] != 'sadness']

Por ahora solo voy a forzar valores, pero hay mejores formas de arreglar esto

In [73]:
df_joy = df_final[df_final['Ekman'] == 'joy'].sample(n = 5900)
df_surprise = df_final[df_final['Ekman'] == 'surprise'].sample(n = 5900)
df_disgust = df_final[df_final['Ekman'] == 'disgust'].sample(n = 5900)
df_anger = df_final[df_final['Ekman'] == 'anger'].sample(n = 5900)

In [74]:
df_final_balanced = pd.concat([df_joy, df_surprise, df_disgust, df_anger], ignore_index = True)

In [75]:
df_final_balanced.groupby('Ekman').count().reset_index()[['Ekman','audspec_lengthL1norm_sma_range']].rename({'audspec_lengthL1norm_sma_range':'Count'}, axis = 1)

Unnamed: 0,Ekman,Count
0,anger,5900
1,disgust,5900
2,joy,5900
3,surprise,5900


#### Split

In [51]:
X = df_final_balanced.iloc[:,:-4].values
Y = df_final_balanced['Ekman'].values

In [52]:
# Encoder de las emociones
encoder = OneHotEncoder()
Y = encoder.fit_transform(np.array(Y).reshape(-1,1)).toarray()

In [53]:
# split de la data
x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state=0, shuffle=True)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((17700, 6373), (17700, 4), (5900, 6373), (5900, 4))

In [54]:
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((17700, 6373), (17700, 4), (5900, 6373), (5900, 4))

In [55]:
x_train = np.expand_dims(x_train, axis=2)
x_test = np.expand_dims(x_test, axis=2)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((17700, 6373, 1), (17700, 4), (5900, 6373, 1), (5900, 4))

In [57]:
model=Sequential()
model.add(Conv1D(256, kernel_size=5, strides=1, padding='same', activation='relu', input_shape=(x_train.shape[1], 1)))
model.add(MaxPooling1D(pool_size=5, strides = 2, padding = 'same'))

model.add(Conv1D(256, kernel_size=5, strides=1, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=5, strides = 2, padding = 'same'))

model.add(Conv1D(128, kernel_size=5, strides=1, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=5, strides = 2, padding = 'same'))
model.add(Dropout(0.2))

model.add(Conv1D(64, kernel_size=5, strides=1, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=5, strides = 2, padding = 'same'))

model.add(Flatten())
model.add(Dense(units=32, activation='relu'))
model.add(Dropout(0.3))

model.add(Dense(units=len(df_final_balanced['Ekman'].unique()), activation='softmax'))
model.compile(optimizer = 'adam' , loss = 'categorical_crossentropy' , metrics = ['accuracy'])

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_4 (Conv1D)           (None, 6373, 256)         1536      
                                                                 
 max_pooling1d_4 (MaxPooling  (None, 3187, 256)        0         
 1D)                                                             
                                                                 
 conv1d_5 (Conv1D)           (None, 3187, 256)         327936    
                                                                 
 max_pooling1d_5 (MaxPooling  (None, 1594, 256)        0         
 1D)                                                             
                                                                 
 conv1d_6 (Conv1D)           (None, 1594, 128)         163968    
                                                                 
 max_pooling1d_6 (MaxPooling  (None, 797, 128)        

In [58]:
rlrp = ReduceLROnPlateau(monitor='loss', factor=0.4, verbose=0, patience=2, min_lr=0.0000001)
history = model.fit(x_train, y_train, batch_size=64, epochs=50, validation_data=(x_test, y_test), callbacks=[rlrp])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
 36/277 [==>...........................] - ETA: 9:24 - loss: 1.3863 - accuracy: 0.2435

KeyboardInterrupt: 

In [None]:
print("Accuracy of our model on test data : " , model.evaluate(x_test,y_test)[1]*100 , "%")

epochs = [i for i in range(50)]
fig , ax = plt.subplots(1,2)
train_acc = history.history['accuracy']
train_loss = history.history['loss']
test_acc = history.history['val_accuracy']
test_loss = history.history['val_loss']

fig.set_size_inches(20,6)
ax[0].plot(epochs , train_loss , label = 'Training Loss')
ax[0].plot(epochs , test_loss , label = 'Testing Loss')
ax[0].set_title('Training & Testing Loss')
ax[0].legend()
ax[0].set_xlabel("Epochs")

ax[1].plot(epochs , train_acc , label = 'Training Accuracy')
ax[1].plot(epochs , test_acc , label = 'Testing Accuracy')
ax[1].set_title('Training & Testing Accuracy')
ax[1].legend()
ax[1].set_xlabel("Epochs")
plt.show()

In [None]:
# predicting on test data.
pred_test = model.predict(x_test)
y_pred = encoder.inverse_transform(pred_test)

y_test = encoder.inverse_transform(y_test)

In [None]:
df = pd.DataFrame(columns=['Predicted Labels', 'Actual Labels'])
df['Predicted Labels'] = y_pred.flatten()
df['Actual Labels'] = y_test.flatten()

df.head(10)

In [None]:
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize = (12, 10))
cm = pd.DataFrame(cm , index = [i for i in encoder.categories_] , columns = [i for i in encoder.categories_])
sns.heatmap(cm, linecolor='white', cmap='Blues', linewidth=1, annot=True, fmt='')
plt.title('Confusion Matrix', size=20)
plt.xlabel('Predicted Labels', size=14)
plt.ylabel('Actual Labels', size=14)
plt.show()

In [None]:
print(classification_report(y_test, y_pred))