# Separating_speakers

The goal of this notebook is to seperate the interviewer from an interviewee to obtain two different audio and analyze the one that corresponds to the interviewee.
Then match the timestamps from the later audio (where only the interviewee is speaking) to the timestamps of the original audio of the interview

## Importing packages

In [1]:
# Example 1: short-term feature extraction
from pyAudioAnalysis import ShortTermFeatures as aF
from pyAudioAnalysis import audioBasicIO as aIO 
from pyAudioAnalysis import MidTermFeatures as aFm
import numpy as np 
import plotly.graph_objs as go 
import plotly
import IPython

import glob
import numpy as np
import pandas as pd
import parselmouth
from parselmouth.praat import call

import sys

# librosa is a Python library for analyzing audio and music. It can be used to extract the data from the audio files we will see it later.
import librosa
import librosa.display
import seaborn as sns

import pandas as pd
import matplotlib.pyplot as plt
import random
#
import os, sklearn.cluster
from pyAudioAnalysis.MidTermFeatures import mid_feature_extraction as mT
from pyAudioAnalysis.audioBasicIO import read_audio_file, stereo_to_mono
from pyAudioAnalysis.audioSegmentation import labels_to_segments
#from pyAudioAnalysis.audioTrainTest import normalize_features
import numpy as np
import scipy.io.wavfile as wavfile
import IPython

## Helper functions

In [2]:
def processing_list(my_list):
    l = my_list.replace('[', '')
    l = l.replace(']', '')
    l = l.split("\n")
    l = [x.split(" ") for x in l]
    for i in range(len(l)):
        for j in range(len(l[i])):
            try:
                l[i].remove('')
            except:
                print("there is no '' ")
    l = [[float(l[i][0]), float(l[i][1])] for i in range(len(l))]
    return l

In [3]:
def normalize_features(features):
    """
    This function normalizes a feature set to 0-mean and 1-std.
    Used in most classifier trainning cases.
    ARGUMENTS:
        - features:    list of feature matrices (each one of them is a np
                       matrix)
    RETURNS:
        - features_norm:    list of NORMALIZED feature matrices
        - mean:        mean vector
        - std:        std vector
    """
    temp_feats = np.array([])

    for count, f in enumerate(features):
        if f.shape[0] > 0:
            if count == 0:
                temp_feats = f
            else:
                temp_feats = np.vstack((temp_feats, f))
            count += 1

    mean = np.mean(temp_feats, axis=0) + 1e-14
    std = np.std(temp_feats, axis=0) + 1e-14

    features_norm = []
    for f in features:
        ft = f.copy()
        for n_samples in range(f.shape[0]):
            ft[n_samples, :] = (ft[n_samples, :] - mean) / std
        features_norm.append(ft)
    return features_norm, mean, std

In [4]:
def seperate_speakers(audio_file):
    fs, audio_f = read_audio_file(audio_file)
    audio_f = stereo_to_mono(audio_f)
    print(fs)

    mt_size, mt_step, st_win = 2, 0.1, 0.05
    [mt_feats, st_feats, _] = mT(audio_f, fs, mt_size * fs, mt_step * fs, round(fs * st_win), round(fs * st_win * 0.5))
    (mt_feats_norm, MEAN, STD) = normalize_features([mt_feats.T])
    mt_feats_norm = mt_feats_norm[0].T

    #print(mt_feats_norm.shape)
    # perform clustering
    n_clusters = 2
    x_clusters = [np.zeros((fs, )) for i in range(n_clusters)]
    k_means = sklearn.cluster.KMeans(n_clusters=n_clusters)
    k_means.fit(mt_feats_norm.T)
    cls = k_means.labels_

    # save clusters to concatenated wav files
    segs, c = labels_to_segments(cls, mt_step)  # convert flags to segment limits
    
    #print(segs)
    #print(c)
    speakers = [[],[]]

    
    for sp in range(n_clusters):                
        count_cl = 0
        for i in range(len(c)):     # for each segment in each cluster (>2 secs long)
            if c[i] == sp and segs[i, 1]-segs[i, 0] > 2:
                count_cl += 1
                # get the signal and append it to the cluster's signal (followed by some silence)
                speakers[sp].append([segs[i, 0], segs[i, 1]])
                
                cur_x = audio_f[int(segs[i, 0] * fs): int(segs[i, 1] * fs)]
                x_clusters[sp] = np.append(x_clusters[sp], cur_x)
                x_clusters[sp] = np.append(x_clusters[sp], np.zeros((fs,)))
        # write cluster's signal into a WAV file
        #print(f'speaker {sp}: {count_cl} segments {len(x_clusters[sp])/float(fs)} sec total dur')  
        #print(x_clusters[0].shape)
    if (len(x_clusters[0])>len(x_clusters[1])):
        #wavfile.write(audio_file  + f'.wav', fs, np.int16(x_clusters[0]))
        #IPython.display.display(IPython.display.Audio(audio_file + f'.wav'))
        return speakers[0]
    else:
        #wavfile.write(audio_file  + f'.wav', fs, np.int16(x_clusters[1]))
        #IPython.display.display(IPython.display.Audio(audio_file + f'.wav'))
        return speakers[1]
    

In [5]:
def matching_timestamps(my_speaker, time_stamp):
    # my_speaker is the output of the preivous funtion 
    # time_stamp is the time stamp in the new audio (where there is only the interviewee) that you want to match to 
    # to the original audio
    # the output is the corresponding time_stamp of the original audio of the interview
    n = len(my_speaker)
    l = [(x[1]-x[0]) for x in my_speaker]
    lm = [0,l[0]]
    total = 0
    
    for i in range (0,n):
#        lm.append([total, total+l[i]+1])
        if (time_stamp>total and time_stamp<= total+l[i]+1):
            if (time_stamp>+l[i] and time_stamp<= total+l[i]+1):
                new_t = time_stamp-total + my_speaker[i][0]
            else:
                new_t = time_stamp-total + my_speaker[i][0]
            #print(total,i)
            return new_t
        total += l[i]+1
        
    print("The time stamp is longer than the audio or less than 0")
        

## Getting original timestamps

In [7]:
my_dir = 'MIT_dataset_APC/Audio/'
files = os.listdir(my_dir)
#my_speaker = seperate_speakers('MIT_dataset_APC/Audio/P1.wav')
print(files, len(files))

['PP89.wav', 'PP76.wav', 'PP62.wav', 'P67.wav', 'P73.wav', 'P8.wav', 'P72.wav', 'P66.wav', 'PP63.wav', 'PP77.wav', 'PP61.wav', 'PP49.wav', 'P70.wav', 'P64.wav', 'P58.wav', 'P59.wav', 'P65.wav', 'P71.wav', 'PP48.wav', 'PP74.wav', 'PP60.wav', 'PP58.wav', 'PP64.wav', 'PP70.wav', 'P49.wav', 'P61.wav', '.DS_Store', 'P60.wav', 'P74.wav', 'P48.wav', 'PP71.wav', 'PP65.wav', 'PP59.wav', 'PP73.wav', 'PP67.wav', 'P62.wav', 'P76.wav', 'P89.wav', 'P77.wav', 'P63.wav', 'PP66.wav', 'PP72.wav', 'PP15.wav', 'PP29.wav', 'P10.wav', 'P11.wav', 'PP14.wav', 'PP16.wav', 'P13.wav', 'P12.wav', 'PP17.wav', 'PP8.wav', 'PP13.wav', 'P16.wav', 'P17.wav', 'PP12.wav', 'PP10.wav', 'P29.wav', 'P15.wav', 'P14.wav', 'PP11.wav', 'PP34.wav', 'PP20.wav', 'PP7.wav', 'P25.wav', 'P31.wav', 'P30.wav', 'P24.wav', 'PP6.wav', 'PP21.wav', 'PP35.wav', 'PP37.wav', 'PP4.wav', 'P32.wav', 'P27.wav', 'P33.wav', 'PP5.wav', 'PP22.wav', 'PP1.wav', 'PP32.wav', 'P37.wav', 'P22.wav', 'PP33.wav', 'PP27.wav', 'PP31.wav', 'PP25.wav', 'P20.wav', '

In [10]:
matching_timestamps(my_speaker, 14)

13.400000000000002 2


22.4

In [8]:
print(my_speaker)

In [8]:
df = pd.read_csv("mit_transcription_timestamps_2.csv",index_col=0)

In [9]:
df.head()

Unnamed: 0,Participant,tokenized_sentences,tmp1
0,pp89,[' Mhmm uh so is it all right if I kind of tal...,"[[16277, 29623], [33954, 45155], [45625, 54603..."
1,pp76,"[' Im doing well thanks.', 'And yourself?', 'W...","[[10500, 21352], [10500, 21352], [10500, 21352..."
2,pp62,"[' Im good.', 'Okay so I was born in Irvine Ca...","[[2304, 9065], [2304, 9065], [9426, 13751], [1..."
3,p67,"[' Im actually doing pretty well.', 'Im just u...","[[2386, 4561], [4561, 5179], [5179, 7997], [84..."
4,p73,"[' Im good how are you?', 'Im Amanda Im a fres...","[[545, 3570], [3570, 4006], [3570, 9793], [357..."


In [14]:
def getting_original_time_stamps(df,speakers):
    list_of_timestamps = []
    for i in range(len(speakers)):
        speaker = speakers[i][:-4].lower()
        try:
            time_stamps = df.loc[df["Participant"] == speaker ,['tmp1']].values[0][0]

            time_stamps = time_stamps.replace(']]', '').replace('[[','')
            time_stamps = time_stamps.split('], [')
            time_stamps = np.array([sub.split(',') for sub in time_stamps])
            time_stamps = time_stamps.astype(float)

            new_time_stamps = np.zeros(time_stamps.shape)
            my_speaker = seperate_speakers(my_dir + speakers[i])

            for i in range(len(time_stamps)):
                new_time_stamps[i,0] = matching_timestamps(my_speaker, time_stamps[i,0]/1000)
                new_time_stamps[i,1] = matching_timestamps(my_speaker, time_stamps[i,1]/1000)
            list_of_timestamps.append(new_time_stamps)
        except :
            print("there is a problem with ",speaker)
    return list_of_timestamps


In [15]:
list_ts = getting_original_time_stamps(df,files)

48000
48000
48000
The time stamp is longer than the audio or less than 0
48000
48000
The time stamp is longer than the audio or less than 0
The time stamp is longer than the audio or less than 0
48000
48000
The time stamp is longer than the audio or less than 0
The time stamp is longer than the audio or less than 0
The time stamp is longer than the audio or less than 0
The time stamp is longer than the audio or less than 0
48000
48000
48000
48000
The time stamp is longer than the audio or less than 0
48000
48000
The time stamp is longer than the audio or less than 0
48000
48000
The time stamp is longer than the audio or less than 0
The time stamp is longer than the audio or less than 0
48000
The time stamp is longer than the audio or less than 0
48000
48000
The time stamp is longer than the audio or less than 0
The time stamp is longer than the audio or less than 0
48000
48000
48000
The time stamp is longer than the audio or less than 0
The time stamp is longer than the audio or less t

48000
48000
The time stamp is longer than the audio or less than 0
48000
The time stamp is longer than the audio or less than 0
The time stamp is longer than the audio or less than 0
The time stamp is longer than the audio or less than 0
The time stamp is longer than the audio or less than 0
The time stamp is longer than the audio or less than 0
The time stamp is longer than the audio or less than 0
48000
48000
The time stamp is longer than the audio or less than 0
The time stamp is longer than the audio or less than 0
48000
The time stamp is longer than the audio or less than 0
48000
48000
48000
48000
The time stamp is longer than the audio or less than 0
The time stamp is longer than the audio or less than 0
The time stamp is longer than the audio or less than 0
48000
48000
48000
The time stamp is longer than the audio or less than 0
48000
The time stamp is longer than the audio or less than 0
The time stamp is longer than the audio or less than 0
The time stamp is longer than the au

In [16]:
print(list_ts)

[array([[ 65.577,  85.523],
       [ 90.754, 103.255],
       [103.725, 113.503],
       [103.725, 167.584],
       [103.725, 154.854],
       [103.725, 154.854],
       [103.725, 154.854],
       [146.903, 154.854],
       [154.854, 180.57 ],
       [154.854, 186.185],
       [180.57 , 186.185],
       [146.903, 154.854],
       [154.854, 230.535],
       [154.854, 244.646],
       [154.854, 244.646],
       [154.854, 244.646],
       [154.854, 244.646],
       [230.535, 244.646],
       [164.51 , 314.761],
       [164.51 , 293.324],
       [289.416, 293.324],
       [289.416, 314.761],
       [164.51 , 314.761],
       [314.761, 358.12 ],
       [314.761, 367.611],
       [314.761, 367.611],
       [358.12 , 367.611],
       [358.12 , 367.611],
       [367.611, 385.479],
       [367.611, 397.164],
       [385.479, 397.164],
       [397.164, 411.985],
       [397.164, 417.736],
       [397.164, 417.736],
       [411.985, 417.736],
       [385.479, 397.164],
       [397.164, 447.171],


In [12]:
time_stamps = df.loc[df["Participant"] == 'pp89' ,['tmp1']].values[0][0] #.split(',')

In [26]:
x = list_ts

In [27]:
print(x)

[array([[ 65.577,  85.523],
       [ 96.465,  95.59 ],
       [103.725, 113.503],
       [103.725, 167.584],
       [167.584, 146.903],
       [146.903, 146.903],
       [146.903, 146.903],
       [146.903, 154.854],
       [154.854, 180.57 ],
       [180.57 , 180.57 ],
       [180.57 , 186.185],
       [146.903, 154.854],
       [154.854, 230.535],
       [230.535, 230.535],
       [230.535, 230.535],
       [230.535, 230.535],
       [230.535, 230.535],
       [230.535, 244.646],
       [246.396, 314.761],
       [314.761, 289.416],
       [289.416, 293.324],
       [293.324, 301.68 ],
       [301.68 , 167.584],
       [167.584, 358.12 ],
       [358.12 , 358.12 ],
       [358.12 , 358.12 ],
       [358.12 , 367.611],
       [358.12 , 367.611],
       [367.611, 385.479],
       [385.479, 385.479],
       [385.479, 397.164],
       [397.164, 411.985],
       [411.985, 411.985],
       [411.985, 411.985],
       [411.985, 417.736],
       [385.479, 397.164],
       [397.164, 447.171],


In [31]:
print(len(list_ts))

138


In [17]:
participants = [files[i][:-4].lower() for i in range(len(files)) if files[i][0]=='P']

In [19]:
print(len(participants))

138


In [20]:
df.insert(3, "new_time_stamps", np.zeros(138), True)

In [23]:
for i in range(138):
    participant = participants[i]
    my_ts = str(list_ts[i])
    #df.loc[df["Participant"] == participant ,'new_time_stamps'] = my_ts
    #df.loc[df.ID == 103, 'FirstName'] = "Matt"
    df['new_time_stamps'][df.Participant == participant] = my_ts



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [24]:
df.head()

Unnamed: 0,Participant,tokenized_sentences,tmp1,new_time_stamps
0,pp89,[' Mhmm uh so is it all right if I kind of tal...,"[[16277, 29623], [33954, 45155], [45625, 54603...",[[ 65.577 85.523]\n [ 90.754 103.255]\n [103....
1,pp76,"[' Im doing well thanks.', 'And yourself?', 'W...","[[10500, 21352], [10500, 21352], [10500, 21352...",[[ 56.1 72.352]\n [ 56.1 72.352]\n [ 56....
2,pp62,"[' Im good.', 'Okay so I was born in Irvine Ca...","[[2304, 9065], [2304, 9065], [9426, 13751], [1...",[[ 4.904 11.665]\n [ 4.904 11.665]\n [ 12....
3,p67,"[' Im actually doing pretty well.', 'Im just u...","[[2386, 4561], [4561, 5179], [5179, 7997], [84...",[[ 3.186 9.561]\n [ 9.561 10.179]\n [ 10....
4,p73,"[' Im good how are you?', 'Im Amanda Im a fres...","[[545, 3570], [3570, 4006], [3570, 9793], [357...",[[ 1.345 31.27 ]\n [ 31.27 31.706]\n [ 31....


In [73]:
ts_df = pd.read_csv("testing_ts",index_col=0)

In [63]:
new_df = pd.read_csv("new_time_stamps",index_col=0)

In [74]:
ts_df.head()

Unnamed: 0,Participant,tokenized_sentences,tmp1,new_time_stamps
0,pp89,[' Mhmm uh so is it all right if I kind of tal...,"[[16277, 29623], [38565, 37690], [45625, 54603...",[[ 65.577 85.523]\n [ 96.465 95.59 ]\n [103....
1,pp76,"[' Im doing well thanks.', 'And yourself?', 'W...","[[10500, 21352], [21352, 10500], [10500, 10500...",[[ 56.1 72.352]\n [ 72.352 56.1 ]\n [ 56....
2,pp62,"[' Im good.', 'Okay so I was born in Irvine Ca...","[[2304, 9065], [2304, 9065], [9426, 13751], [1...",[[ 4.904 11.565]\n [ 4.904 11.565]\n [ 11....
3,p67,"[' Im actually doing pretty well.', 'Im just u...","[[2386, 4561], [4561, 5179], [5179, 7997], [84...",[[ 3.186 9.561]\n [ 9.561 10.179]\n [ 10....
4,p73,"[' Im good how are you?', 'Im Amanda Im a fres...","[[545, 3570], [3570, 4006], [4006, 4006], [400...",[[ 1.345 31.27 ]\n [ 31.27 31.706]\n [ 31....


In [86]:
new_df.head()

Unnamed: 0,Participant,tokenized_sentences,tmp1,new_time_stamps
0,pp89,[' Mhmm uh so is it all right if I kind of tal...,"[[16277, 29623], [38565, 37690], [45625, 54603...","[[65.577, 85.523], [96.465, 95.59], [103.725, ..."
1,pp76,"[' Im doing well thanks.', 'And yourself?', 'W...","[[10500, 21352], [21352, 10500], [10500, 10500...","[[56.1, 72.352], [72.352, 56.1], [56.1, 56.1],..."
2,pp62,"[' Im good.', 'Okay so I was born in Irvine Ca...","[[2304, 9065], [2304, 9065], [9426, 13751], [1...","[[4.904, 11.565], [4.904, 11.565], [11.926, 15..."
3,p67,"[' Im actually doing pretty well.', 'Im just u...","[[2386, 4561], [4561, 5179], [5179, 7997], [84...","[[3.186, 9.561], [9.561, 10.179], [10.179, 15...."
4,p73,"[' Im good how are you?', 'Im Amanda Im a fres...","[[545, 3570], [3570, 4006], [4006, 4006], [400...","[[1.345, 31.27], [31.27, 31.706], [31.706, 31...."


In [65]:
l = new_df['new_time_stamps'][0]

In [27]:
for i in range(138):
    df['new_time_stamps'][i] = new_ts[i]

In [15]:
print( df.loc[df["Participant"] == 'pp89' ,['tokenized_sentences']].values[0][0] )#.split(',')

[' Mhmm uh so is it all right if I kind of talk about the same stuff as last time?', 'I was going to say I havent changed too much but um yeah so still a junior at MIT here and um about halfway through junior year and um Im a physics major and um Im from Pensacola Florida and um Im yeah so born and raised in Pensacola and I just um uh you know moved up here a couple years ago and started at MIT and I uh Im not sure as far as what Im going to go into career wise but Im looking at something applied in the physics area.', 'And um as far as extracurricular go I do a few things.', 'Im involved with a few IM sports that my fraternity does uh soccer and basketball and I enjoy juggling and unicycling as well.', 'Ive done Ive unicylced since 5th grade and Ive done that a lot and um Im involved with the juggling club here and I like to play ultimate Frisbee as well from time to time just for fun and um yeah so you know enjoy doing sports in my spare time but uh yeah physics as far as school goes

In [28]:
df.to_csv("new_time_stamps.csv")