In [9]:
import collections
import contextlib
import sys
import wave
import os
import pandas as pd


import webrtcvad
import librosa

In [10]:
def read_wave(path):
    """Reads a .wav file.
    Takes the path, and returns (PCM audio data, sample rate).
    """
    with contextlib.closing(wave.open(path, 'rb')) as wf:
        num_channels = wf.getnchannels()
        assert num_channels == 1
        sample_width = wf.getsampwidth()
        assert sample_width == 2
        sample_rate = wf.getframerate()
        assert sample_rate in (8000, 16000, 32000, 48000)
        pcm_data = wf.readframes(wf.getnframes())
        return pcm_data,sample_rate
        

In [11]:
audio = "/home/divyansh/Divyansh/projects/senti_random_forest/data/"
actor_folders = os.listdir(audio)
actor_folders.sort()
actor_folders[0:5]

['Actor_01', 'Actor_02', 'Actor_03', 'Actor_04', 'Actor_05']

In [12]:
emotion = []
gender = []
actor = []
file_path = []
for i in actor_folders:
    filename = os.listdir(audio + i) #iterate over Actor folders
    for f in filename: # go through files in Actor folder
        part = f.split('.')[0].split('-')
        emotion.append(int(part[2]))
        actor.append(int(part[6]))
        bg = int(part[6])
        if bg%2 == 0:
            bg = "female"
        else:
            bg = "male"
        gender.append(bg)
        file_path.append(audio + i + '/' + f)

In [13]:
audio_df = pd.DataFrame(emotion)
audio_df = audio_df.replace({1:'neutral', 2:'calm', 3:'happy', 4:'sad', 5:'angry', 6:'fear', 7:'disgust', 8:'surprise'})
audio_df = pd.concat([pd.DataFrame(gender),audio_df,pd.DataFrame(actor)],axis=1)
audio_df.columns = ['gender','emotion','actor']
audio_df = pd.concat([audio_df,pd.DataFrame(file_path, columns = ['path'])],axis=1)
audio_df

Unnamed: 0,gender,emotion,actor,path
0,male,neutral,1,/home/divyansh/Divyansh/projects/senti_random_...
1,male,fear,1,/home/divyansh/Divyansh/projects/senti_random_...
2,male,angry,1,/home/divyansh/Divyansh/projects/senti_random_...
3,male,happy,1,/home/divyansh/Divyansh/projects/senti_random_...
4,male,sad,1,/home/divyansh/Divyansh/projects/senti_random_...
...,...,...,...,...
1435,female,sad,24,/home/divyansh/Divyansh/projects/senti_random_...
1436,female,fear,24,/home/divyansh/Divyansh/projects/senti_random_...
1437,female,calm,24,/home/divyansh/Divyansh/projects/senti_random_...
1438,female,surprise,24,/home/divyansh/Divyansh/projects/senti_random_...


In [14]:
def write_wave(path, audio, sample_rate):
    """Writes a .wav file.
    Takes path, PCM audio data, and sample rate.
    """
    with contextlib.closing(wave.open(path, 'wb')) as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)
        wf.setframerate(sample_rate)
        wf.writeframes(audio)


  Generates audio frames from PCM audio data.
    Takes the desired frame duration in milliseconds, the PCM data, and
    the sample rate.
    Yields Frames of the requested duration.
  
  
Filters out non-voiced audio frames.
    Given a webrtcvad.Vad and a source of audio frames, yields only
    the voiced audio.
    Uses a padded, sliding window algorithm over the audio frames.
    When more than 90% of the frames in the window are voiced (as
    reported by the VAD), the collector triggers and begins yielding
    audio frames. Then the collector waits until 90% of the frames in
    the window are unvoiced to detrigger.
    The window is padded at the front and back to provide a small
    amount of silence or the beginnings/endings of speech around the
    voiced frames.
    Arguments:
    sample_rate - The audio sample rate, in Hz.
    frame_duration_ms - The frame duration in milliseconds.
    padding_duration_ms - The amount to pad the window, in milliseconds.
    vad - An instance of webrtcvad.Vad.
    frames - a source of audio frames (sequence or generator).
    Returns: A generator that yields PCM audio data.
    
    

In [20]:
class Frame(object):
    """Represents a "frame" of audio data."""
    def __init__(self, bytes, timestamp, duration):
        self.bytes = bytes
        self.timestamp = timestamp
        self.duration = duration
    def frame_generator(frame_duration_ms, audio, sample_rate):
  
        n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)
        offset = 0
        timestamp = 0.0
        duration = (float(n) / sample_rate) / 2.0
        while offset + n < len(audio):
            yield Frame(audio[offset:offset + n], timestamp, duration)
            timestamp += duration
            offset += n
        
    def vad_collector(sample_rate, frame_duration_ms,
                  padding_duration_ms, vad, frames):
        num_padding_frames = int(padding_duration_ms / frame_duration_ms)
        # We use a deque for our sliding window/ring buffer.
        ring_buffer = collections.deque(maxlen=num_padding_frames)
        # We have two states: TRIGGERED and NOTTRIGGERED. We start in the
        # NOTTRIGGERED state.
        triggered = False

        voiced_frames = []
        for frame in frames:
            is_speech = vad.is_speech(frame.bytes, sample_rate)

            sys.stdout.write('1' if is_speech else '0')
            if not triggered:
                ring_buffer.append((frame, is_speech))
                num_voiced = len([f for f, speech in ring_buffer if speech])
                # If we're NOTTRIGGERED and more than 90% of the frames in
                # the ring buffer are voiced frames, then enter the
                # TRIGGERED state.
                if num_voiced > 0.9 * ring_buffer.maxlen:
                    triggered = True
                    sys.stdout.write('+(%s)' % (ring_buffer[0][0].timestamp,))
                    # We want to yield all the audio we see from now until
                    # we are NOTTRIGGERED, but we have to start with the
                    # audio that's already in the ring buffer.
                    for f, s in ring_buffer:
                        voiced_frames.append(f)
                    ring_buffer.clear()
            else:
                # We're in the TRIGGERED state, so collect the audio data
                # and add it to the ring buffer.
                voiced_frames.append(frame)
                ring_buffer.append((frame, is_speech))
                num_unvoiced = len([f for f, speech in ring_buffer if not speech])
                # If more than 90% of the frames in the ring buffer are
                # unvoiced, then enter NOTTRIGGERED and yield whatever
                # audio we've collected.
                if num_unvoiced > 0.9 * ring_buffer.maxlen:
                    sys.stdout.write('-(%s)' % (frame.timestamp + frame.duration))
                    triggered = False
                    yield b''.join([f.bytes for f in voiced_frames])
                    ring_buffer.clear()
                    voiced_frames = []
        if triggered:
            sys.stdout.write('-(%s)' % (frame.timestamp + frame.duration))
        sys.stdout.write('\n')
        # If we have any leftover voiced audio when we run out of input,
        # yield it.
        if voiced_frames:
            yield b''.join([f.bytes for f in voiced_frames])



In [23]:
from sklearn import preprocessing
import numpy as np 
from sklearn.mixture import GaussianMixture
from copy import deepcopy
from sklearn.cluster import SpectralClustering

In [25]:
audio,sample_rate = read_wave('input_test.wav')

In [31]:
vad = webrtcvad.Vad(2)
frames = Frame.frame_generator(30, audio, sample_rate)
frames = list(frames)
segments = Frame.vad_collector(sample_rate,30,300, vad, frames)

In [32]:
c = 0
for i, segment in enumerate(segments):
    path = 'chunk-%002d.wav' % (i,)
    print('Writing %s' % (path,))
    write_wave(path, segment, sample_rate)
    c+=1

00001111111111+(0.12)11111111111111111111111111111111111000001111111111111111111111111111111111111111110000000000-(3.179999999999994)Writing chunk-00.wav
000000000000000000000000001111111111+(3.959999999999989)111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111101111111111111111111111111111111111111111111111111111111111111111111111111111111111000000000111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111000001111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111110000000000-(24.42000000000017)Writing chunk-01.wav
000000000001111110111100001111111111+(25.2000000000002)1111111111111111111111

In [33]:
c

15

In [40]:
sampling_rate = 8000
n_mfcc = 13
n_fft = 0.032
hop_length = 0.010
components = 16
cov_type = 'full'

test_file_path = sys.argv[1]
test_file_path
y,sr = librosa.load('input_test.wav')
print(np.shape(y))

mfcc = librosa.feature.mfcc(np.array(y),sr,hop_length=int(hop_length * sr),n_fft=int(n_fft*sr),n_mfcc=n_mfcc,dct_type=2)
mfcc_delta = librosa.feature.delta(mfcc)
mfcc_delta_second_order = librosa.feature.delta(mfcc,order=2)
temp = librosa.feature.delta(mfcc_delta)
inter = np.vstack((mfcc,mfcc_delta,mfcc_delta_second_order))
ubm_feature = inter.T


(3287920,)


In [41]:
ubm_model = GaussianMixture(n_components = components, covariance_type = cov_type)
ubm_model.fit(ubm_feature)

print(ubm_model.score(ubm_feature))
print(ubm_model.means_)



-16.83761125494483
[[-5.73234322e+02  1.95677657e+01  1.20618043e+01  7.47188334e+00
   4.94613475e+00  1.81555262e+00 -9.80393540e-01 -2.33626730e+00
  -2.95695009e+00 -3.22843485e+00 -3.28698183e+00 -3.51988896e+00
  -3.50637023e+00  3.45242023e+00  2.42263076e+00 -9.07919898e-01
  -8.75814245e-01  2.68307654e-03 -5.82442791e-01 -8.14022599e-01
  -1.34326609e-01  2.96904930e-02 -1.33940377e-01  9.76200210e-03
  -5.49876420e-03 -1.08012695e-01  3.95753943e+00  3.14547043e+00
  -3.48325608e-01 -5.91997945e-01  3.08643943e-01 -1.85498977e-01
  -7.30670363e-01 -4.09804760e-01 -1.80636320e-01 -1.80151766e-01
  -1.11510707e-01 -2.24378371e-01 -2.82944865e-01]
 [-4.17233904e+02  1.49230956e+02 -2.74850549e-01 -2.92830857e+01
   1.25382663e+01  3.78110005e+00 -3.33391447e+01 -3.02482531e+01
  -9.32482754e+00 -9.86941113e+00 -1.55324791e+01 -1.19297498e+01
  -1.11675298e+01 -1.00284537e+00 -7.65164583e-01  1.79916384e-01
   2.52981387e-01 -2.84672305e-02  3.48308751e-02  1.34854357e-01
   9.4

In [42]:
def MAP_Estimation(model,data,m_iterations):

    N = data.shape[0]
    D = data.shape[1]
    K = model.n_components


    mu_new = np.zeros((K,D))
    n_k = np.zeros((K,1))

    mu_k = model.means_
    
    pi_k = model.weights_

    old_likelihood = model.score(data)
    new_likelihood = 0
    iterations = 0
    while(iterations < m_iterations):
        iterations += 1
        old_likelihood = new_likelihood
        z_n_k = model.predict_proba(data)
        n_k = np.sum(z_n_k,axis = 0)
        n_k = n_k.reshape(np.shape(n_k)[0],1)

        mu_new = np.dot(z_n_k.T,data)
        n_k[n_k == 0] = 1e-20
        mu_new = mu_new / n_k

        adaptation_coefficient = n_k/(n_k + relevance_factor)
        I = np.ones(shape=np.shape(n_k))
        # for k in range(K):
        #     mu_k[k] = (adaptation_coefficient[k] * mu_new[k]) + ((1 - adaptation_coefficient[k]) * mu_k[k])
        mu_k = (adaptation_coefficient*mu_new) + (( I - adaptation_coefficient) * mu_k)
        model.means_ = mu_k

        log_likelihood = model.score(data)

        new_likelihood = log_likelihood

        if abs(old_likelihood - new_likelihood) < 1e-20:
            break
        print(log_likelihood)
    return model

    

In [45]:
Total = []
relevance_factor = 16

for i in range(c):
    fname='chunk-%002d.wav' % (i,)
    print('MAP adaptation for {0}'.format(fname))
    temp_y,sr_temp = librosa.load(fname,sr=None)
    
    temp_mfcc = librosa.feature.mfcc(np.array(temp_y),sr_temp, hop_length=int(hop_length * sr_temp),n_fft=int(n_fft*sr_temp),n_mfcc=n_mfcc,dct_type=2)
    temp_mfcc_delta = librosa.feature.delta(temp_mfcc)
    temp_mfcc_delta_second_order = librosa.feature.delta(temp_mfcc, order=2)
    temp_inter = np.vstack((temp_mfcc,temp_mfcc_delta, temp_mfcc_delta_second_order))
    temp_gmm_feature = temp_inter.T
    gmm = deepcopy(ubm_model)
    gmm = MAP_Estimation(gmm, temp_gmm_feature, m_iterations= 1)
    sv = gmm.means_.flatten()
    
    Total.append(sv)
    
    
    
    
    


MAP adaptation for chunk-00.wav
-184283825.73820278
MAP adaptation for chunk-01.wav
-148852740.10645866
MAP adaptation for chunk-02.wav
-238697208.71887907
MAP adaptation for chunk-03.wav
-184392302.1181167
MAP adaptation for chunk-04.wav
-220044775.64754352
MAP adaptation for chunk-05.wav
-198222039.30888522
MAP adaptation for chunk-06.wav
-131023458.81823187
MAP adaptation for chunk-07.wav
-86907425.57563713
MAP adaptation for chunk-08.wav
-200438409.99321377
MAP adaptation for chunk-09.wav
-72203313.82636423
MAP adaptation for chunk-10.wav
-89635956.11996256
MAP adaptation for chunk-11.wav
-190297094.32860363
MAP adaptation for chunk-12.wav
-272107741.94763654
MAP adaptation for chunk-13.wav
-201850385.2200475
MAP adaptation for chunk-14.wav
-133465508.074026


In [48]:
def rearrange(labels, n):
    seen = set()
    distinct = [x for x in labels if x not in seen and not seen.add(x)]
    correct = [i for i in range(n)]
    dict_ = dict(zip(distinct, correct))
    return [x if x not in dict_ else dict_[x] for x in labels]


In [49]:
N_CLUSTERS = 2
sc = SpectralClustering(n_clusters= N_CLUSTERS, affinity='cosine')

labels = sc.fit_predict(Total)
labels = rearrange(labels, N_CLUSTERS)
print(labels)


[0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1]


In [50]:
print([i for i,x in enumerate(labels) if x ==1])

[1, 2, 3, 5, 6, 9, 10, 11, 13, 14]
