In [1]:
import pandas as pd
import numpy as np
import os

import librosa
import moviepy.editor as mp

#use this package to extract mfcc features
import python_speech_features as mfcc
from python_speech_features import delta

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import LabelEncoder, minmax_scale

### Prepare the dataset

In [2]:
# Locate the training audio clips (one .wav per utterance).
# NOTE(review): the clips were presumably converted from the MELD.Raw/train_splits
# videos in a step that is not part of this notebook -- confirm.
audio_path = r"wav_files/train"
# Names of everything inside the audio directory, in os.listdir order.
wav_files = os.listdir(audio_path)
# Videos that could not be converted to audio; the matching metadata rows
# (dialogue 125 / utterance 3 and dialogue 795 / utterance 0) are dropped
# from the dataframe in the next cell.
error_files = ['dia125_utt3.mp4','dia795_utt0.mp4']

In [3]:
# Utterance-level metadata of the MELD training split.
data_df = pd.read_csv("https://raw.githubusercontent.com/declare-lab/MELD/master/data/MELD/train_sent_emo.csv")

# Row labels of the two utterances whose audio could not be produced
# (dia125_utt3 and dia795_utt0).
error = data_df.loc[data_df['Dialogue_ID'].eq(125) & data_df['Utterance_ID'].eq(3)].index
error2 = data_df.loc[data_df['Dialogue_ID'].eq(795) & data_df['Utterance_ID'].eq(0)].index

# Remove both rows in one step; the remaining rows keep their original labels.
data_df = data_df.drop(index=error.union(error2))
data_df

Unnamed: 0,Sr No.,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID,Season,Episode,StartTime,EndTime
0,1,also I was the point person on my company’s tr...,Chandler,neutral,neutral,0,0,8,21,"00:16:16,059","00:16:21,731"
1,2,You must’ve had your hands full.,The Interviewer,neutral,neutral,0,1,8,21,"00:16:21,940","00:16:23,442"
2,3,That I did. That I did.,Chandler,neutral,neutral,0,2,8,21,"00:16:23,442","00:16:26,389"
3,4,So let’s talk a little bit about your duties.,The Interviewer,neutral,neutral,0,3,8,21,"00:16:26,820","00:16:29,572"
4,5,My duties? All right.,Chandler,surprise,positive,0,4,8,21,"00:16:34,452","00:16:40,917"
...,...,...,...,...,...,...,...,...,...,...,...
9984,10474,You or me?,Chandler,neutral,neutral,1038,13,2,3,"00:00:48,173","00:00:50,799"
9985,10475,"I got it. Uh, Joey, women don't have Adam's ap...",Ross,neutral,neutral,1038,14,2,3,"00:00:51,009","00:00:53,594"
9986,10476,"You guys are messing with me, right?",Joey,surprise,positive,1038,15,2,3,"00:01:00,518","00:01:03,520"
9987,10477,Yeah.,All,neutral,neutral,1038,16,2,3,"00:01:05,398","00:01:07,274"


### Split the Train/Test dataset

In [4]:
# Restrict the data to the six main characters, then hold out 25% of their
# utterances for testing. The order of `main` matters: identify() maps the
# index of the best-scoring model back to a name through this list.
main = ['Chandler', 'Rachel', 'Ross', 'Joey', 'Monica', 'Phoebe']
main_data = data_df.loc[data_df['Speaker'].isin(main)]
X_train, X_test, y_train, y_test = train_test_split(
    main_data[['Dialogue_ID', 'Utterance_ID', 'Speaker']],
    main_data['Speaker'],
    test_size=0.25,
    shuffle=True,
    random_state=42,
)

### Define Helper Functions

In [5]:
def normalize(x, axis=0):
    """
    Min-max scale an array with scikit-learn's ``minmax_scale``.
    Input: array-like ``x`` and the ``axis`` along which to scale (default 0)
    Output: the scaled array returned by ``minmax_scale`` (default feature range)
    """
    scaled = minmax_scale(x, axis=axis)
    return scaled

In [6]:
def process_data(df, audio_dir="wav_files/train"):
    """
    Extract LPC-based features from the audio clips listed in a dataframe.

    For every row the clip ``{audio_dir}/dia{Dialogue_ID}_utt{Utterance_ID}.wav``
    is loaded (2.5 s starting at offset 0.5 s, target rate 44100 Hz), min-max
    normalised, pre-emphasised and reduced to order-44 LPC coefficients. The
    coefficient vector is reshaped to a column and stacked side by side with
    its first and second deltas; the per-clip blocks are stacked vertically.

    Input: A dataframe containing ``Dialogue_ID`` and ``Utterance_ID`` columns,
           and optionally the directory holding the .wav files.
    Output: A 2D numpy array with 3 columns (LPC, delta, delta-delta), or an
            empty 1D array when no clip yielded features.

    Clips whose preprocessing/LPC step raises are skipped and counted; the
    count is printed before returning. A failure while loading the file
    itself is not caught and propagates to the caller.

    NOTE(review): rows of each block index LPC coefficients, not time frames,
    so the deltas are taken across coefficient index -- confirm this is the
    intended feature layout.
    """
    blocks = []   # one (n_coefficients, 3) array per successfully processed clip
    counter = 0   # number of clips for which no LPC features could be computed

    # Iterate over the ID columns directly: row-wise access (df.iloc[i]) can
    # upcast integer IDs to float in an all-numeric frame and corrupt the path.
    for dia, utt in zip(df['Dialogue_ID'], df['Utterance_ID']):
        path = f"{audio_dir}/dia{dia}_utt{utt}.wav"
        audio, _ = librosa.load(path, res_type='kaiser_fast',
                                duration=2.5, sr=44100, offset=0.5)

        try:
            # LPC preprocessing
            audio = normalize(audio)
            audio = librosa.effects.preemphasis(audio)
            # `order` is keyword-only in recent librosa releases; passing it
            # by name works across versions.
            audio_lpc = librosa.lpc(audio, order=44)
        except Exception:
            # e.g. empty audio after the offset, or an ill-conditioned signal.
            # Exception (not a bare except) keeps Ctrl-C working.
            counter += 1
            continue

        # delta() expects a 2D array, so turn the coefficients into a column
        audio_lpc = audio_lpc.reshape(-1, 1)
        delta1 = delta(audio_lpc, 2)
        delta2 = delta(delta1, 2)
        blocks.append(np.hstack((audio_lpc, delta1, delta2)))

    print(str(counter)+" has no lpcs")
    # Stack once at the end instead of re-allocating inside the loop.
    return np.vstack(blocks) if blocks else np.array([])

In [7]:
def identify(dia, utt, m1, m2, m3, m4, m5, m6, audio_dir="wav_files/train"):
    """
    Given an audio file and six models, identify the speaker.

    The clip ``{audio_dir}/dia{dia}_utt{utt}.wav`` goes through the same
    steps as in process_data (load 2.5 s from offset 0.5 s at 44100 Hz,
    min-max normalise, pre-emphasise, order-44 LPC, first and second deltas)
    and the resulting array is scored by every model.

    Input: Dialogue ID, Utterance ID, and six models (one for each speaker)
           exposing ``score(X)``. The models must be passed in the same order
           as the names in the module-level ``main`` list, because the index
           of the best score is looked up there. ``audio_dir`` optionally
           overrides the directory holding the .wav files.
    Output: the name of the speaker whose model scores highest, or None when
            the preprocessing/LPC step fails for this clip.
    """
    path = f"{audio_dir}/dia{dia}_utt{utt}.wav"

    audio, _ = librosa.load(path, res_type='kaiser_fast',
                            duration=2.5, sr=44100, offset=0.5)

    try:
        # LPC preprocessing
        audio = normalize(audio)
        audio = librosa.effects.preemphasis(audio)
        # `order` is keyword-only in recent librosa releases; passing it by
        # name works across versions.
        audio_lpc = librosa.lpc(audio, order=44)
    except Exception:
        # No usable features for this clip; Exception (not a bare except)
        # keeps Ctrl-C working.
        return None

    # delta() expects a 2D array, so turn the coefficients into a column
    audio_lpc = audio_lpc.reshape(-1, 1)
    delta1 = delta(audio_lpc, 2)
    delta2 = delta(delta1, 2)
    combined = np.hstack((audio_lpc, delta1, delta2))

    scores = np.array([model.score(combined)
                       for model in (m1, m2, m3, m4, m5, m6)])

    return main[np.argmax(scores)]

In [41]:
#test
# data,sr = librosa.load("wav_files/train/dia0_utt0.wav",res_type='kaiser_fast'
#                                   ,duration=2.5
#                                   ,sr=44100
#                                   ,offset=0.5 )
# audio = normalize(data)
# audio = librosa.effects.preemphasis(audio)
# audio_lpc = librosa.lpc(data,44)

In [43]:
# audio_lpc.shape

(45,)

In [45]:
# re_audio = audio_lpc.reshape(-1, 1)
# re_audio.shape

(45, 1)

In [34]:
# audiomfc, sr = librosa.load("wav_files/train/dia0_utt0.wav",res_type='kaiser_fast'
#                                   ,duration=2.5
#                                   ,sr=44100
#                                   ,offset=0.5)
# audio_mfcc = mfcc.mfcc(audiomfc, sr,nfilt=20, nfft=1200, appendEnergy=True)
# audio_mfcc = preprocessing.scale(audio_mfcc)

In [35]:
# audio_mfcc.shape

(249, 13)

In [46]:
# delta1 = delta(re_audio, 2)

### Extract the features for each speaker

In [10]:
# One slice of the training metadata per main speaker.
chandler, rachel, ross, joey, monica, phoebe = (
    X_train.loc[X_train['Speaker'].eq(name)]
    for name in ('Chandler', 'Rachel', 'Ross', 'Joey', 'Monica', 'Phoebe')
)

# Extract the LPC features speaker by speaker. Each process_data call prints
# how many clips it skipped; the speaker tag printed afterwards marks progress.
print('start extracting')
chandler_lpc = process_data(chandler)
print('chandler')
rachel_lpc = process_data(rachel)
print('rachel')
ross_lpc = process_data(ross)
print('ross')
joey_lpc = process_data(joey)
print('joey')
monica_lpc = process_data(monica)
print('monica')
phoebe_lpc = process_data(phoebe)
print('phoebe')

start extracting
25 has no lpcs
chandler
37 has no lpcs
rachel
29 has no lpcs
ross
49 has no lpcs
joey
23 has no lpcs
monica
33 has no lpcs
phoebe


### Create the models, one for each speaker

In [12]:
# Fit one single-component, diagonal-covariance GMM per speaker.
# fit() returns the estimator itself, so construction and fitting are chained.
gmm_chandler = GaussianMixture(n_components=1, max_iter=200, covariance_type='diag', n_init=3).fit(chandler_lpc)
gmm_rachel = GaussianMixture(n_components=1, max_iter=200, covariance_type='diag', n_init=3).fit(rachel_lpc)
gmm_ross = GaussianMixture(n_components=1, max_iter=200, covariance_type='diag', n_init=3).fit(ross_lpc)
gmm_joey = GaussianMixture(n_components=1, max_iter=200, covariance_type='diag', n_init=3).fit(joey_lpc)
gmm_monica = GaussianMixture(n_components=1, max_iter=200, covariance_type='diag', n_init=3).fit(monica_lpc)
gmm_phoebe = GaussianMixture(n_components=1, max_iter=200, covariance_type='diag', n_init=3).fit(phoebe_lpc)
# Show the last fitted estimator as the cell output.
gmm_phoebe

GaussianMixture(covariance_type='diag', max_iter=200, n_init=3)

### Use the model to identify the speakers in the testing set

In [13]:
# Evaluate the single-component models (n_components = 1) on the held-out rows.
prediction = X_test.copy()
prediction['Predictions'] = [
    identify(dia, utt,
             gmm_chandler, gmm_rachel, gmm_ross,
             gmm_joey, gmm_monica, gmm_phoebe)
    for dia, utt in zip(prediction['Dialogue_ID'], prediction['Utterance_ID'])
]

# Rows for which identify() returned None never match and count as misses.
correct_label = prediction['Speaker'].eq(prediction['Predictions'])
accuracy = correct_label.sum()/len(prediction)

print("The accuracy of this model is "+str(accuracy))

The accuracy of this model is 0.15221579961464354


### Function to run the train and test for different parameters of the model

In [14]:
def train_test(n, cov, random_state=42):
    """
    Fit one GaussianMixture per speaker and return the accuracy on X_test.

    n - n_components
    cov - covariance_type
    random_state - seed forwarded to every GaussianMixture so that repeated
                   calls with the same arguments fit the same models
                   (pass None to fit unseeded)

    Reads the module-level per-speaker feature arrays (``*_lpc``) and
    ``X_test``. Returns the fraction of ``X_test`` rows whose predicted
    speaker equals the ``Speaker`` column; rows for which identify() returns
    None count as incorrect.
    """
    # Order must follow the module-level `main` list, because identify()
    # translates the index of the best-scoring model through it.
    training_sets = (chandler_lpc, rachel_lpc, ross_lpc,
                     joey_lpc, monica_lpc, phoebe_lpc)
    models = [
        GaussianMixture(n_components=n, max_iter=200, covariance_type=cov,
                        n_init=3, random_state=random_state).fit(features)
        for features in training_sets
    ]

    prediction = X_test.copy()
    prediction['Predictions'] = [
        identify(dia, utt, *models)
        for dia, utt in zip(prediction['Dialogue_ID'], prediction['Utterance_ID'])
    ]

    correct_label = (prediction['Speaker'] == prediction['Predictions'])
    accuracy = correct_label.sum()/len(prediction)

    return accuracy

In [15]:
# Baseline run: one Gaussian component per speaker with diagonal covariance
# (n_components=1, covariance_type='diag').
accuracy = train_test(n=1, cov='diag')
print("The accuracy of this model is "+str(accuracy))

The accuracy of this model is 0.15221579961464354


In [16]:
# Same evaluation with eight Gaussian components per speaker, diagonal
# covariance (n_components=8, covariance_type='diag').
accuracy = train_test(n=8, cov='diag')
print("The accuracy of this model is "+str(accuracy))

The accuracy of this model is 0.16714836223506743
