In [1]:
import pandas as pd
import numpy as np
import os

import librosa
import moviepy.editor as mp

#use this package to extract mfcc features
import python_speech_features as mfcc
from python_speech_features import delta

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.mixture import GaussianMixture

### Prepare the dataset

In [2]:
# List the audio files available for the experiment.
# The video directory and its listing are disabled below; only the WAV directory is read.
#video_path = r"C:/Users/Gauri/OneDrive/Desktop/DS3/MELD.Raw/train_splits"
audio_path = r"C:/Users/Gauri/OneDrive/Desktop/DS3/MELD.Raw/EDA/Wavs"
#files = os.listdir(video_path)
wav_files = os.listdir(audio_path)
# Files that raised an error when converting the data
error_files = ['dia125_utt3.mp4']

In [3]:
# Load the utterance table from train_sent_emo.csv.
data_df = pd.read_csv("train_sent_emo.csv")
# Remove the row whose audio gave an error (dialogue 125, utterance 3).
is_error_clip = data_df['Dialogue_ID'].eq(125) & data_df['Utterance_ID'].eq(3)
error = data_df.loc[is_error_clip].index
data_df = data_df.drop(index=error)
data_df

Unnamed: 0,Sr No.,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID,Season,Episode,StartTime,EndTime
0,1,also I was the point person on my companys tr...,Chandler,neutral,neutral,0,0,8,21,"00:16:16,059","00:16:21,731"
1,2,You mustve had your hands full.,The Interviewer,neutral,neutral,0,1,8,21,"00:16:21,940","00:16:23,442"
2,3,That I did. That I did.,Chandler,neutral,neutral,0,2,8,21,"00:16:23,442","00:16:26,389"
3,4,So lets talk a little bit about your duties.,The Interviewer,neutral,neutral,0,3,8,21,"00:16:26,820","00:16:29,572"
4,5,My duties? All right.,Chandler,surprise,positive,0,4,8,21,"00:16:34,452","00:16:40,917"
...,...,...,...,...,...,...,...,...,...,...,...
9984,10474,You or me?,Chandler,neutral,neutral,1038,13,2,3,"00:00:48,173","00:00:50,799"
9985,10475,"I got it. Uh, Joey, women don't have Adam's ap...",Ross,neutral,neutral,1038,14,2,3,"00:00:51,009","00:00:53,594"
9986,10476,"You guys are messing with me, right?",Joey,surprise,positive,1038,15,2,3,"00:01:00,518","00:01:03,520"
9987,10477,Yeah.,All,neutral,neutral,1038,16,2,3,"00:01:05,398","00:01:07,274"


### Split the Train/Test dataset

In [4]:
# Keep only utterances spoken by the six main characters, then hold out 25% of
# them for testing. The fixed random_state makes the shuffled split repeatable.
main = ['Chandler', 'Rachel', 'Ross', 'Joey', 'Monica', 'Phoebe']
is_main_speaker = data_df['Speaker'].isin(main)
main_data = data_df[is_main_speaker]
clip_columns = ['Dialogue_ID', 'Utterance_ID', 'Speaker']
X_train, X_test, y_train, y_test = train_test_split(
    main_data[clip_columns],
    main_data['Speaker'],
    test_size=0.25,
    shuffle=True,
    random_state=42,
)

### Define Helper Functions

In [5]:
def process_data(df, audio_dir=r"C:\Users\Gauri\OneDrive\Desktop\DS3\MELD.Raw\EDA\Wavs"):
    """
    Extract frame-level features from the audio files listed in a dataframe.

    Input:
        df: A dataframe with 'Dialogue_ID' and 'Utterance_ID' columns; each row
            names the clip "dia{Dialogue_ID}_utt{Utterance_ID}.wav".
        audio_dir: Directory that contains the wav files. Defaults to the
            location this notebook was originally written against.
    Output: A 2D numpy array of features extracted from audio files
        (MFCC, MFCC_delta, MFCC_delta_delta stacked side by side), with the
        frames of all clips stacked vertically. If no clip yields features, an
        empty 1D array is returned.

    Clips for which MFCC extraction raises an ordinary exception are skipped and
    counted; the count is printed before returning. Failures to load the audio
    file itself are NOT swallowed and propagate to the caller.
    """
    per_clip_features = []
    counter = 0 #used to count how many audio files does not have mfcc

    for i in range(len(df)):
        entry = df.iloc[i]
        dia = entry['Dialogue_ID']
        utt = entry['Utterance_ID']
        path = os.path.join(audio_dir, f"dia{dia}_utt{utt}.wav")
        # Read at most 2.5 s of audio starting 0.5 s into the clip, at 44.1 kHz.
        audio, sr = librosa.load(path,
                                 res_type='kaiser_fast',
                                 duration=2.5,
                                 sr=44100,
                                 offset=0.5)
        try:
            audio_mfcc = mfcc.mfcc(audio, sr, nfilt=20, nfft=1200, appendEnergy=True)
        except Exception:
            # Presumably clips too short to leave any samples after the offset;
            # TODO confirm. Catch Exception (not a bare except) so that
            # Ctrl-C / SystemExit still stop this long-running loop.
            counter = counter + 1
            continue

        audio_mfcc = preprocessing.scale(audio_mfcc)
        delta1 = delta(audio_mfcc, 2)
        delta2 = delta(delta1, 2)
        per_clip_features.append(np.hstack((audio_mfcc, delta1, delta2)))

    print(str(counter)+" has no mfccs")

    if not per_clip_features:
        return np.array([])
    # Stack once at the end instead of re-copying the growing array per clip.
    return np.vstack(per_clip_features)

In [6]:
def identify(dia, utt, m1, m2, m3, m4, m5, m6,
             audio_dir=r"C:\Users\Gauri\OneDrive\Desktop\DS3\MELD.Raw\EDA\Wavs"):
    """
    Given an audio file and six models, identify the speaker
    Input: Dialogue ID, Utterance ID, and six models(one for each speaker).
        The models must be passed in the same order as the names in the global
        list `main`, because the winning model's position is used to index it.
        audio_dir is the directory containing the wav files (defaults to the
        location this notebook was originally written against).
    Output: the speaker (the entry of `main` whose model gives the highest
        score), or None when MFCC extraction fails for the clip.
    """
    path = os.path.join(audio_dir, f"dia{dia}_utt{utt}.wav")

    # Same loading window as used for the training features:
    # at most 2.5 s of audio starting 0.5 s into the clip, at 44.1 kHz.
    audio, sr = librosa.load(path,
                             res_type='kaiser_fast',
                             duration=2.5,
                             sr=44100,
                             offset=0.5)
    try:
        audio_mfcc = mfcc.mfcc(audio, sr, nfilt=20, nfft=1200, appendEnergy=True)
    except Exception:
        # Catch Exception (not a bare except) so Ctrl-C / SystemExit still
        # interrupt a long prediction run instead of turning into a None.
        return None
    audio_mfcc = preprocessing.scale(audio_mfcc)
    delta1 = delta(audio_mfcc, 2)
    delta2 = delta(delta1, 2)
    combined = np.hstack((audio_mfcc, delta1, delta2))

    # One score per model; the best-scoring model determines the speaker.
    scores = np.array([model.score(combined) for model in (m1, m2, m3, m4, m5, m6)])
    idx = np.argmax(scores)

    return main[idx]

### Extract the features for each speaker

In [7]:
# Split the training rows by speaker.
speaker_names = ('Chandler', 'Rachel', 'Ross', 'Joey', 'Monica', 'Phoebe')
chandler, rachel, ross, joey, monica, phoebe = [
    X_train[X_train['Speaker'] == speaker_name] for speaker_name in speaker_names
]

# Extract the MFCC features for each speaker in turn, printing the speaker's
# name once that speaker's clips have been processed.
print('start extracting')
speaker_features = []
for speaker_name, speaker_rows in zip(speaker_names,
                                      (chandler, rachel, ross, joey, monica, phoebe)):
    speaker_features.append(process_data(speaker_rows))
    print(speaker_name.lower())

(chandler_mfcc, rachel_mfcc, ross_mfcc,
 joey_mfcc, monica_mfcc, phoebe_mfcc) = speaker_features

start extracting
23 has no mfccs
chandler
36 has no mfccs
rachel
30 has no mfccs
ross
49 has no mfccs
joey
24 has no mfccs
monica
32 has no mfccs
phoebe


### Create the models, one for each speaker

In [8]:
# Every speaker gets an identically configured model:
# a single-component GMM with diagonal covariance.
gmm_settings = dict(n_components=1, max_iter=200, covariance_type='diag', n_init=3)

gmm_chandler = GaussianMixture(**gmm_settings).fit(chandler_mfcc)
gmm_rachel = GaussianMixture(**gmm_settings).fit(rachel_mfcc)
gmm_ross = GaussianMixture(**gmm_settings).fit(ross_mfcc)
gmm_joey = GaussianMixture(**gmm_settings).fit(joey_mfcc)
gmm_monica = GaussianMixture(**gmm_settings).fit(monica_mfcc)
gmm_phoebe = GaussianMixture(**gmm_settings).fit(phoebe_mfcc)

# Echo the last fitted model so the cell still displays its configuration.
gmm_phoebe

GaussianMixture(covariance_type='diag', max_iter=200, n_init=3)

### Use the model to identify the speakers in the testing set

In [9]:
# Baseline (n_components = 1): label every test utterance with the speaker whose
# model scores it highest, then report the fraction labelled correctly.
speaker_models = (gmm_chandler, gmm_rachel, gmm_ross, gmm_joey, gmm_monica, gmm_phoebe)

def _predict_row(row):
    """Return the predicted speaker for one row of the test frame."""
    return identify(row['Dialogue_ID'], row['Utterance_ID'], *speaker_models)

prediction = X_test.copy()
prediction['Predictions'] = prediction.apply(_predict_row, axis=1)

correct_label = (prediction['Speaker'] == prediction['Predictions'])
accuracy = correct_label.sum() / len(prediction)

print(f"The accuracy of this model is {accuracy}")

The accuracy of this model is 0.2128069330765527


### Function to train and test the models with different parameters

In [10]:
def train_test(n, cov, random_state=None):
    """
    Fit one GaussianMixture per speaker and return the accuracy on the test set.

    n - n_components
    cov - covariance_type
    random_state - optional seed passed to every GaussianMixture so a run can
        be reproduced; the default (None) leaves the models unseeded, as before.

    The models are trained on the notebook-level feature arrays
    (chandler_mfcc, rachel_mfcc, ross_mfcc, joey_mfcc, monica_mfcc, phoebe_mfcc)
    and evaluated on the notebook-level X_test frame via identify().
    Utterances for which identify() returns None count as incorrect.

    Returns the fraction of X_test rows whose predicted speaker equals 'Speaker'.
    """
    # Order matters: identify() maps the position of the best-scoring model to a
    # speaker name, so keep Chandler, Rachel, Ross, Joey, Monica, Phoebe.
    training_sets = (chandler_mfcc, rachel_mfcc, ross_mfcc,
                     joey_mfcc, monica_mfcc, phoebe_mfcc)

    models = []
    for features in training_sets:
        model = GaussianMixture(n_components=n, max_iter=200, covariance_type=cov,
                                n_init=3, random_state=random_state)
        model.fit(features)
        models.append(model)

    prediction = X_test.copy()
    prediction['Predictions'] = prediction.apply(
        lambda row: identify(row['Dialogue_ID'], row['Utterance_ID'], *models),
        axis=1)

    correct_label = (prediction['Speaker'] == prediction['Predictions'])
    accuracy = correct_label.sum()/len(prediction)

    return accuracy

In [36]:
# n_components=1, covariance_type='diag'
accuracy = train_test(n=1, cov='diag')
print(f"The accuracy of this model is {accuracy}")

The accuracy of this model is 0.2128069330765527


0.2128069330765527

In [14]:
# n_components=8, covariance_type='diag'
accuracy = train_test(n=8, cov='diag')
print(f"The accuracy of this model is {accuracy}")

The accuracy of this model is 0.5267212325469427


In [15]:
# n_components=32, covariance_type='diag'
accuracy = train_test(n=32, cov='diag')
print(f"The accuracy of this model is {accuracy}")

The accuracy of this model is 0.5960519980741454


In [16]:
# n_components=64, covariance_type='diag'
accuracy = train_test(n=64, cov='diag')
print(f"The accuracy of this model is {accuracy}")

The accuracy of this model is 0.6172363986519018
