In [1]:
import pandas as pd
import numpy as np
import librosa
from librosa import feature
from glob import glob
import os
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

In [4]:
# rootdir = '/Users/abhishekvaidyanathan/Desktop/NNDL-project/audio-files'
rootdir = '/Users/iyeng/Desktop/NTU/NTU Sem 5/CZ4042/GroupProject/audio-files'
# rootdir="audio_files"

# Collect every .wav file below rootdir.
# - Non-audio files (e.g. OS metadata files) are skipped, because later cells
#   parse fixed character positions of each path and would choke on them.
# - The list is sorted because os.walk yields entries in an arbitrary,
#   filesystem-dependent order; sorting keeps row order (and therefore every
#   later split) the same on every machine.
audio_files = sorted(
    os.path.join(subdir, file)
    for subdir, dirs, files in os.walk(rootdir)
    for file in files
    if file.lower().endswith('.wav')
)


In [5]:
len(audio_files)

1440

In [6]:
audio_files[1][-33:-25]

'Actor_01'

In [7]:
# Group the audio paths by actor folder name.
# Characters [-33:-25] of each path are the actor folder (the cell above shows
# 'Actor_01' for this slice).
dict_actors = {}
for audio_file in audio_files:
    # setdefault creates the list on first sight of an actor; this replaces the
    # former bare `except:` which would also have hidden unrelated errors.
    dict_actors.setdefault(audio_file[-33:-25], []).append(audio_file)

In [8]:
# Empty frame that fixes the column names and their order for the per-file metadata table.
# NOTE(review): 'Repetion' looks like a misspelling of 'Repetition'; it is a column
# name that ends up in the saved CSV, so it is left unchanged here.
data = pd.DataFrame(columns = ['Actor','Modality','Vocal_channel','Emotion','Emotional_intensity','Statement','Repetion','Gender','Audio_file'])

In [9]:
def get_gender(value):
    """Map an actor number to a binary gender code.

    Parameters
    ----------
    value : str or int
        Actor number; anything accepted by ``int()``.

    Returns
    -------
    int
        1 when the number is even, 0 when it is odd.
        (Presumably even = female, odd = male as in the dataset's naming
        convention - TODO confirm.)
    """
    actor_number = int(value)
    return 1 if actor_number % 2 == 0 else 0

# Build all rows first and create the frame once. Appending with
# data.loc[len(data)] re-allocates on every insert and, because it appends to
# whatever `data` already holds, duplicates every row when the cell is re-run.
records = []
for actor, actor_files in dict_actors.items():
    for path in actor_files:
        # The metadata is encoded at fixed positions counted from the end of
        # the path: two-character fields separated by one character, followed
        # by a four-character extension.
        records.append([
            actor,
            int(path[-24:-22]),          # Modality
            int(path[-21:-19]),          # Vocal_channel
            int(path[-18:-16]),          # Emotion
            int(path[-15:-13]),          # Emotional_intensity
            int(path[-12:-10]),          # Statement
            int(path[-9:-7]),            # Repetion
            get_gender(path[-6:-4]),     # Gender, derived from the actor number
            path,                        # Audio_file
        ])

# Reuse the column names/order declared for `data`, so the schema stays in one place.
data = pd.DataFrame(records, columns=data.columns)

In [10]:
data.head()

Unnamed: 0,Actor,Modality,Vocal_channel,Emotion,Emotional_intensity,Statement,Repetion,Gender,Audio_file
0,Actor_01,3,1,1,1,1,1,0,/Users/iyeng/Desktop/NTU/NTU Sem 5/CZ4042/Grou...
1,Actor_01,3,1,1,1,1,2,0,/Users/iyeng/Desktop/NTU/NTU Sem 5/CZ4042/Grou...
2,Actor_01,3,1,1,1,2,1,0,/Users/iyeng/Desktop/NTU/NTU Sem 5/CZ4042/Grou...
3,Actor_01,3,1,1,1,2,2,0,/Users/iyeng/Desktop/NTU/NTU Sem 5/CZ4042/Grou...
4,Actor_01,3,1,2,1,1,1,0,/Users/iyeng/Desktop/NTU/NTU Sem 5/CZ4042/Grou...


In [11]:
# Save the metadata table (including the Audio_file paths) as CSV in the working directory.
data.to_csv("local_audio_files.csv", index=False)

## The code cell below can be changed to include a different set of features.

#### It currently uses the mean value of each feature; this can be changed to use the raw values of each feature instead.

In [38]:
# y, sr = librosa.load(librosa.util.example_audio_file())
# temp_func = feature.spectral_bandwidth
# temp_res = temp_func(y=y, sr=sr)
# print(temp_res)
# print(temp_res.shape)
# print(np.mean(temp_res))

[[2803.66001659 1379.6672431  1562.99924373 ... 2549.85781492
  2456.23949636 2515.19654634]]
(1, 2647)
1364.8838771312614


In [12]:
# to add:
# mfcc - use n_mfcc=13 and take mean along axis 1 [13 features]
# chroma_stft - take mean along axis 1 [12 features]
# librosa.onset.onset_strength(y=y, sr=sr) - take direct mean
# zero_crossing_rate - take direct mean
# spectral_rolloff - direct mean
# librosa.piptrack - returns pitch and magnitude, take direct means of both
# melspectrogram - take direct mean
# spectral_contrast - use axis=1 [7 features]
# tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T,axis=0) [6 features]
# rms - take direct mean
# spectral_centroid - take direct mean
# spectral_bandwidth - take direct mean


def get_feature_vector(y, sr):
    """Summarise one audio signal as a flat list of averaged librosa features.

    Parameters
    ----------
    y : audio time series, as returned by ``librosa.load``.
    sr : sampling rate of ``y``.

    Returns
    -------
    list
        Feature values in this fixed order:

        1. per-row means (``axis=1``) of ``mfcc`` (``n_mfcc=13``),
           ``chroma_stft``, ``spectral_contrast``, ``tonnetz`` (computed on
           the harmonic component of ``y``) and ``melspectrogram``;
        2. overall means of ``rms`` and ``zero_crossing_rate``;
        3. overall mean of each of the two arrays returned by
           ``librosa.piptrack``;
        4. overall means of ``onset_strength``, ``spectral_rolloff``,
           ``melspectrogram``, ``spectral_centroid`` and
           ``spectral_bandwidth``.

        The outputs recorded in this notebook show 175 values per clip.
        Later cells rely on the order and length, so neither may change.
    """
    # The mel spectrogram feeds both the per-band means and the overall mean,
    # so compute it once instead of twice per clip.
    mel_spectrogram = feature.melspectrogram(y=y, sr=sr)

    feature_vector = []

    # multi-dim features: one mean per row of the feature matrix
    feature_vector.extend(np.mean(feature.mfcc(y=y, sr=sr, n_mfcc=13), axis=1))
    feature_vector.extend(np.mean(feature.chroma_stft(y=y, sr=sr), axis=1))
    feature_vector.extend(np.mean(feature.spectral_contrast(y=y, sr=sr), axis=1))
    feature_vector.extend(np.mean(feature.tonnetz(y=librosa.effects.harmonic(y), sr=sr), axis=1))
    feature_vector.extend(np.mean(mel_spectrogram, axis=1))

    # single-dim features that do not take sr
    feature_vector.append(np.mean(feature.rms(y=y)))
    feature_vector.append(np.mean(feature.zero_crossing_rate(y=y)))

    # piptrack returns two arrays (pitches, magnitudes); keep the mean of each
    feature_vector.extend([np.mean(x) for x in librosa.piptrack(y=y, sr=sr)])

    # single-dim features: overall mean, in the original order
    feature_vector.append(np.mean(librosa.onset.onset_strength(y=y, sr=sr)))
    feature_vector.append(np.mean(feature.spectral_rolloff(y=y, sr=sr)))
    feature_vector.append(np.mean(mel_spectrogram))
    feature_vector.append(np.mean(feature.spectral_centroid(y=y, sr=sr)))
    feature_vector.append(np.mean(feature.spectral_bandwidth(y=y, sr=sr)))

    return feature_vector

In [13]:
# Extract one feature vector per audio file, in the row order of `data`.
# sr=None makes librosa keep each file's native sampling rate.
audio_features = []
for i, audio_path in enumerate(data['Audio_file'].tolist()):
    y, sr = librosa.load(audio_path, sr=None)
    feature_vector = get_feature_vector(y, sr)
    audio_features.append(feature_vector)

In [14]:
# Work on a copy so that the metadata frame `data` itself is not modified.
data_features = data.copy()

In [15]:
# Derive the path-free table directly from `data` instead of from
# `data_features` itself: the self-referential drop raised a KeyError when the
# cell was run a second time, because the column was already gone.
data_features = data.drop("Audio_file", axis=1)
data_features.head()

Unnamed: 0,Actor,Modality,Vocal_channel,Emotion,Emotional_intensity,Statement,Repetion,Gender
0,Actor_01,3,1,1,1,1,1,0
1,Actor_01,3,1,1,1,1,2,0
2,Actor_01,3,1,1,1,2,1,0
3,Actor_01,3,1,1,1,2,2,0
4,Actor_01,3,1,2,1,1,1,0


In [16]:
# Attach each file's feature list as a single object column; audio_features was
# filled in the row order of `data`, so the rows line up.
data_features['librosa'] = audio_features

In [17]:
# Expand the per-row feature lists into a table with one column per feature.
feature_rows = data_features['librosa'].tolist()
features = pd.DataFrame(feature_rows)

In [18]:
features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,165,166,167,168,169,170,171,172,173,174
0,-726.217224,68.54142,3.293397,12.2053,5.510278,13.667408,-2.983829,3.098029,-3.310813,-1.564384,...,5.362133e-09,0.00212,0.050476,37.457439,0.002697,0.891978,13286.038306,0.002952,7416.379545,5551.324979
1,-719.127808,70.20224,1.169071,13.123216,7.837617,14.41195,-4.110705,4.469619,-3.53873,-3.657982,...,5.566884e-09,0.002258,0.052904,40.032833,0.002662,0.992758,13191.718251,0.003416,7135.753114,5653.712371
2,-714.994934,69.690376,3.925557,11.925324,6.423343,11.014113,-2.874456,4.514386,-4.470305,-2.665093,...,5.541867e-09,0.002707,0.046627,37.690022,0.003359,0.860653,13280.28298,0.004721,7240.619346,5640.892215
3,-710.959839,67.579193,5.783356,13.227695,6.194669,12.640195,-1.662046,5.663977,-4.953693,-3.484669,...,5.706893e-09,0.002521,0.053835,39.524185,0.003164,0.918893,13273.018037,0.004341,7009.490125,5802.602446
4,-759.917847,75.788948,6.028997,14.562723,6.459432,14.636641,-2.999552,4.625813,-5.19535,-0.702961,...,5.490418e-09,0.001579,0.045929,38.651924,0.001842,0.988106,12649.614081,0.001624,6997.114097,5518.781643


In [19]:
features.shape

(1440, 175)

In [20]:
# Target for the emotion task: the 'Emotion' code parsed from each file path.
labels = data['Emotion']

In [21]:
# Target for the speaker task: the numeric suffix of the actor folder name
# (the last two characters of values such as 'Actor_01').
actor_numbers = []
for actor_name in data['Actor']:
    actor_numbers.append(int(actor_name[-2:]))
actor_labels = pd.Series(actor_numbers, name='Actor')

In [22]:
# NumPy array form of the feature table.
# NOTE(review): not referenced by any later cell visible in this notebook - confirm before removing.
features_values = features.values

In [23]:
def minMax(x):
    """Return the smallest and largest value of ``x``.

    Parameters
    ----------
    x : pandas.Series (or any object exposing ``.min()`` and ``.max()``).

    Returns
    -------
    pandas.Series
        Two entries labelled ``'min'`` and ``'max'``, in that order.
    """
    lowest, highest = x.min(), x.max()
    return pd.Series([lowest, highest], index=['min', 'max'])

In [24]:
# Per-column minimum and maximum of the raw features (rows 'min' and 'max').
# This assignment rebinds the name `minMax` from the helper function to the
# resulting DataFrame, so calling `features.apply(minMax)` here would fail on a
# second run of the cell (a DataFrame is not callable). `agg` produces the same
# two-row table without depending on what `minMax` currently refers to.
minMax = features.agg(['min', 'max'])

In [25]:
len(minMax)

2

In [26]:
minMax

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,165,166,167,168,169,170,171,172,173,174
min,-873.24231,18.9622,-51.987183,-17.522383,-22.980276,-11.918348,-29.965725,-16.97331,-21.486086,-21.428455,...,1.692426e-09,0.000328,0.027717,2.918853,0.00027,0.774896,4998.104889,7.7e-05,2604.486405,2753.455274
max,-333.377991,115.150726,22.751472,36.544689,21.137091,28.005917,10.136993,16.415524,6.143632,7.424147,...,1.918034e-07,0.088504,0.168519,56.945759,0.118322,1.711348,14629.079312,4.081514,7654.876393,6367.981144


In [34]:
features.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,165,166,167,168,169,170,171,172,173,174
count,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,...,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0
mean,-617.140581,66.080678,-4.862992,10.20049,-0.221858,8.249207,-8.381523,0.883575,-7.188022,-5.599906,...,8.866162e-09,0.010056,0.069216,26.352842,0.012137,1.111466,10842.090014,0.138023,5560.69955,5054.138396
std,102.583014,14.814136,11.788753,8.135827,6.869832,7.183415,6.068622,5.2175,4.972387,4.351054,...,7.177196e-09,0.012293,0.018194,11.865822,0.015512,0.137765,1503.491458,0.376826,810.441517,568.791205
min,-873.24231,18.9622,-51.987183,-17.522383,-22.980276,-11.918348,-29.965725,-16.97331,-21.486086,-21.428455,...,1.692426e-09,0.000328,0.027717,2.918853,0.00027,0.774896,4998.104889,7.7e-05,2604.486405,2753.455274
25%,-693.68042,55.788986,-12.19283,4.913393,-4.880722,2.871132,-12.363138,-2.674974,-10.828884,-8.717539,...,6.588461e-09,0.002892,0.057009,15.424516,0.003377,1.012132,9785.626614,0.004462,4971.463118,4667.985285
50%,-622.874908,66.305763,-4.08565,10.293716,-0.372262,8.559745,-8.944895,0.729321,-7.731732,-5.618548,...,7.828838e-09,0.005693,0.066405,28.185102,0.006799,1.093377,11035.676077,0.017409,5576.231073,5144.064319
75%,-551.599503,76.608791,3.578638,15.05987,4.966537,13.646142,-4.219577,4.262726,-3.369817,-2.317766,...,9.151838e-09,0.011736,0.079508,36.045641,0.01389,1.185385,11987.194293,0.070814,6144.085023,5471.756522
max,-333.377991,115.150726,22.751472,36.544689,21.137091,28.005917,10.136993,16.415524,6.143632,7.424147,...,1.918034e-07,0.088504,0.168519,56.945759,0.118322,1.711348,14629.079312,4.081514,7654.876393,6367.981144


In [35]:
features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,165,166,167,168,169,170,171,172,173,174
0,-726.217224,68.54142,3.293397,12.2053,5.510278,13.667408,-2.983829,3.098029,-3.310813,-1.564384,...,5.362133e-09,0.00212,0.050476,37.457439,0.002697,0.891978,13286.038306,0.002952,7416.379545,5551.324979
1,-719.127808,70.20224,1.169071,13.123216,7.837617,14.41195,-4.110705,4.469619,-3.53873,-3.657982,...,5.566884e-09,0.002258,0.052904,40.032833,0.002662,0.992758,13191.718251,0.003416,7135.753114,5653.712371
2,-714.994934,69.690376,3.925557,11.925324,6.423343,11.014113,-2.874456,4.514386,-4.470305,-2.665093,...,5.541867e-09,0.002707,0.046627,37.690022,0.003359,0.860653,13280.28298,0.004721,7240.619346,5640.892215
3,-710.959839,67.579193,5.783356,13.227695,6.194669,12.640195,-1.662046,5.663977,-4.953693,-3.484669,...,5.706893e-09,0.002521,0.053835,39.524185,0.003164,0.918893,13273.018037,0.004341,7009.490125,5802.602446
4,-759.917847,75.788948,6.028997,14.562723,6.459432,14.636641,-2.999552,4.625813,-5.19535,-0.702961,...,5.490418e-09,0.001579,0.045929,38.651924,0.001842,0.988106,12649.614081,0.001624,6997.114097,5518.781643


In [36]:
# Min-max scale every feature column to [0, 1] using the bounds stored in `minMax`.
# Vectorised replacement for a row-wise lambda applied once per column over a
# hard-coded range(175): same arithmetic, no assumption about the column count.
col_min = minMax.loc['min'].astype('float64')
col_range = minMax.loc['max'].astype('float64') - col_min
# A constant column has max == min; divide by 1 there so it maps to 0 instead of NaN.
col_range = col_range.replace(0, 1)
# Cast to float64 first so the result has the same precision regardless of the
# dtype the raw feature columns were stored in.
features_normalised = (features.astype('float64') - col_min) / col_range
features_normalised.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,165,166,167,168,169,170,171,172,173,174
0,0.272337,0.515438,0.739652,0.54983,0.64579,0.640857,0.67282,0.601139,0.657816,0.688467,...,0.019303,0.020323,0.161635,0.639285,0.020555,0.125028,0.86055,0.000705,0.952777,0.774063
1,0.285469,0.532704,0.711228,0.566807,0.698543,0.659506,0.64472,0.642219,0.649567,0.615905,...,0.02038,0.021892,0.178884,0.686954,0.020258,0.232647,0.850756,0.000818,0.897211,0.802389
2,0.293124,0.527383,0.74811,0.544651,0.666486,0.574399,0.675547,0.643559,0.615851,0.650318,...,0.020248,0.026986,0.1343,0.64359,0.026162,0.091577,0.859952,0.001138,0.917975,0.798843
3,0.300599,0.505434,0.772967,0.568739,0.661303,0.615128,0.70578,0.67799,0.598355,0.621912,...,0.021116,0.024872,0.185497,0.677539,0.024511,0.153769,0.859198,0.001045,0.872211,0.843582
4,0.209913,0.590785,0.776254,0.593432,0.667304,0.665134,0.672428,0.646897,0.589609,0.718323,...,0.019978,0.014187,0.129345,0.661394,0.013311,0.227678,0.794469,0.000379,0.86976,0.765059


In [37]:
features_normalised.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,165,166,167,168,169,170,171,172,173,174
count,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,...,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0
mean,0.474382,0.489855,0.63052,0.51275,0.515861,0.505145,0.538223,0.534816,0.517489,0.5486,...,0.037734,0.110328,0.294735,0.433747,0.100521,0.35941,0.606791,0.033798,0.585344,0.63651
std,0.190016,0.154011,0.157733,0.150477,0.155717,0.179926,0.151327,0.156265,0.179965,0.150803,...,0.037753,0.139412,0.129216,0.219628,0.131401,0.147114,0.15611,0.092327,0.160471,0.157363
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.332606,0.38286,0.532447,0.414962,0.410259,0.370438,0.438937,0.428237,0.385715,0.440547,...,0.025754,0.029084,0.208038,0.231471,0.026313,0.253335,0.497096,0.001074,0.468672,0.529677
50%,0.46376,0.492196,0.64092,0.514474,0.512452,0.512923,0.524175,0.530196,0.49781,0.547954,...,0.032278,0.060842,0.274771,0.46766,0.055306,0.340093,0.626891,0.004247,0.588419,0.661389
75%,0.595785,0.599308,0.743468,0.602627,0.633465,0.640325,0.642005,0.636022,0.655681,0.662356,...,0.039237,0.12938,0.367831,0.613154,0.115368,0.438346,0.725689,0.017331,0.700856,0.752049
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [60]:
# 70/30 split of the normalised features against the emotion labels;
# random_state=42 fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(features_normalised, labels, test_size=0.30, random_state=42)

In [61]:
# Write the current split to SER_data/, creating the folder first:
# to_csv does not create missing directories and would raise otherwise.
os.makedirs('SER_data', exist_ok=True)
X_train.to_csv('SER_data/X_train.csv', index=False)
X_test.to_csv('SER_data/X_test.csv', index=False)
y_train.to_csv('SER_data/y_train.csv', index=False)
y_test.to_csv('SER_data/y_test.csv', index=False)

In [72]:
# Same 70/30 split with the same seed, but with the actor number as target.
# Note: this rebinds X_train / X_test / y_train / y_test, replacing the emotion split held in those names.
X_train, X_test, y_train, y_test = train_test_split(features_normalised, actor_labels, test_size=0.30, random_state=42)

In [73]:
# Write the current split to Speaker_Classification_data/, creating the folder
# first: to_csv does not create missing directories and would raise otherwise.
os.makedirs('Speaker_Classification_data', exist_ok=True)
X_train.to_csv('Speaker_Classification_data/X_train.csv', index=False)
X_test.to_csv('Speaker_Classification_data/X_test.csv', index=False)
y_train.to_csv('Speaker_Classification_data/y_train.csv', index=False)
y_test.to_csv('Speaker_Classification_data/y_test.csv', index=False)

In [27]:
# Build every unordered pair (i < j) of samples with label 1 when both come
# from the same actor and 0 otherwise.
#
# The rows and labels are converted to plain Python lists once, up front. The
# previous version extracted both rows with `.loc[...]` inside the inner loop,
# i.e. roughly two pandas row look-ups for each of the ~n^2/2 pairs.
# Both frames carry the default 0..n-1 index, so positions equal index labels.
feature_rows = features_normalised.values.tolist()
actor_ids = actor_labels.tolist()
n_samples = len(feature_rows)

# Each pair references the shared row lists instead of holding fresh copies,
# which also keeps memory use down; nothing in the visible cells mutates them.
total_pairs = [
    [feature_rows[i], feature_rows[j], 1 if actor_ids[i] == actor_ids[j] else 0]
    for i in range(n_samples - 1)
    for j in range(i + 1, n_samples)
]
pairs_df = pd.DataFrame(total_pairs, columns=['inputA', 'inputB', 'label'])

In [28]:
pairs_df.head()

Unnamed: 0,inputA,inputB,label
0,"[0.27233710455813176, 0.5154379411790018, 0.73...","[0.2854689531660164, 0.5327042404145397, 0.711...",1
1,"[0.27233710455813176, 0.5154379411790018, 0.73...","[0.2931243461802795, 0.5273827775710467, 0.748...",1
2,"[0.27233710455813176, 0.5154379411790018, 0.73...","[0.3005986227234241, 0.5054343905494958, 0.772...",1
3,"[0.27233710455813176, 0.5154379411790018, 0.73...","[0.20991285946164528, 0.5907850515999307, 0.77...",1
4,"[0.27233710455813176, 0.5154379411790018, 0.73...","[0.25605640704038835, 0.6251353793996148, 0.80...",1


In [29]:
pairs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1036080 entries, 0 to 1036079
Data columns (total 3 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   inputA  1036080 non-null  object
 1   inputB  1036080 non-null  object
 2   label   1036080 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 23.7+ MB


In [30]:
# Separate same-speaker pairs (label 1) from different-speaker pairs (label 0).
is_same_speaker = pairs_df['label'] == 1
is_other_speaker = pairs_df['label'] == 0
matched_df = pairs_df.loc[is_same_speaker].copy()
mismatched_df = pairs_df.loc[is_other_speaker].copy()

In [31]:
matched_df.describe()

Unnamed: 0,label
count,42480.0
mean,1.0
std,0.0
min,1.0
25%,1.0
50%,1.0
75%,1.0
max,1.0


In [32]:
mismatched_df.describe()

Unnamed: 0,label
count,993600.0
mean,0.0
std,0.0
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,0.0


In [33]:
# shuffling
# A fixed random_state (42, as used for train_test_split in this notebook) makes
# the shuffle - and therefore which mismatched pairs survive the downsampling
# that follows - repeatable across runs.
matched_df = matched_df.sample(frac=1, random_state=42).reset_index(drop=True)
mismatched_df = mismatched_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [34]:
# Balance the classes: keep all matched pairs plus an equally sized leading
# slice of the mismatched pairs.
n_matched = len(matched_df)
mismatched_subset = mismatched_df.iloc[:n_matched].copy()
downsampled_df = pd.concat([matched_df, mismatched_subset], ignore_index=True)

In [35]:
downsampled_df.describe()

Unnamed: 0,label
count,84960.0
mean,0.5
std,0.500003
min,0.0
25%,0.0
50%,0.5
75%,1.0
max,1.0


In [36]:
# shuffling and taking train-test split

# random_state makes the shuffle repeatable across runs.
downsampled_df = downsampled_df.sample(frac=1, random_state=42).reset_index(drop=True)
train_size = int(0.7*len(downsampled_df))
train_df = downsampled_df[:train_size].copy()
# Start the test part exactly at train_size; the former `train_size+1` silently
# dropped the row at position train_size from both sets.
test_df = downsampled_df[train_size:].copy()
# NOTE(review): pairs are split at random, so the same recording can occur in
# train pairs and in test pairs - confirm this is acceptable for evaluation.

In [37]:
# Renumber the test rows from 0 (they keep their positions from the shuffled frame otherwise).
test_df = test_df.reset_index(drop=True)

In [38]:
train_df.head()

Unnamed: 0,inputA,inputB,label
0,"[0.2548620784505655, 0.5727212983477814, 0.767...","[0.39912236520474464, 0.6317624808513673, 0.80...",0
1,"[0.447249036391409, 0.7145415232299728, 0.6546...","[0.6877563343563813, 0.2620880929700986, 0.547...",0
2,"[0.4520862706818447, 0.48994289050589107, 0.59...","[0.5978961094667046, 0.32250348370571186, 0.32...",0
3,"[0.8625960796311012, 0.36900578447635274, 0.51...","[0.42108867279274476, 0.43359682338229716, 0.5...",0
4,"[0.38979531916803545, 0.36376930989412876, 0.4...","[0.2943152830758866, 0.5540498175568138, 0.644...",1


In [39]:
test_df.head()

Unnamed: 0,inputA,inputB,label
0,"[0.5059726039291419, 0.4252726534862022, 0.776...","[0.5639246740892763, 0.4126797568153903, 0.599...",1
1,"[0.12106494223888209, 0.6208612194042755, 0.93...","[0.2244010465863897, 0.6231336540149697, 0.871...",0
2,"[0.5837704944536188, 0.42954332352960883, 0.57...","[0.5477542066335432, 0.4934326838043664, 0.532...",0
3,"[0.3857447318226952, 0.7669578251603086, 0.645...","[0.8945456695583119, 0.34242048075118997, 0.51...",1
4,"[0.318031252653294, 0.528274301655661, 0.72983...","[0.32320369938871485, 0.25070946162058216, 0.6...",0


In [40]:
# Create the output folder first: to_csv does not create missing directories.
os.makedirs('Speaker_Classification_data', exist_ok=True)
# NOTE(review): inputA / inputB hold Python lists, so the CSV stores them as
# text; the pickle files written by the following cell keep them as lists -
# presumably the CSV is for inspection only, confirm with the consumers.
train_df.to_csv('Speaker_Classification_data/train_data.csv', index=False)
test_df.to_csv('Speaker_Classification_data/test_data.csv', index=False)

In [41]:
# Create the output folder first: to_pickle does not create missing directories.
os.makedirs('Speaker_Classification_data', exist_ok=True)
# Pickle keeps the list-valued inputA / inputB columns as Python objects.
train_df.to_pickle('Speaker_Classification_data/train.df')
test_df.to_pickle('Speaker_Classification_data/test.df')

In [49]:
minMax

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,165,166,167,168,169,170,171,172,173,174
min,-873.24231,18.9622,-51.987183,-17.522383,-22.980276,-11.918348,-29.965725,-16.97331,-21.486086,-21.428455,...,1.692426e-09,0.000328,0.027717,2.918853,0.00027,0.774896,4998.104889,7.7e-05,2604.486405,2753.455274
max,-333.377991,115.150734,22.751472,36.544682,21.137091,28.005917,10.136993,16.415524,6.143632,7.424147,...,1.918034e-07,0.088504,0.168519,56.945759,0.118322,1.711348,14629.079312,4.081514,7654.876393,6367.981144


In [39]:
# Persist the per-feature min/max bounds used for normalisation, creating the
# output folder first because to_pickle does not create missing directories.
os.makedirs('Speaker_Classification_data', exist_ok=True)
minMax.to_pickle('Speaker_Classification_data/minMax.df')