In [1]:
import numpy as np
import os
import pandas as pd
from pathlib import Path
from scipy import stats
from pydub import AudioSegment
import matplotlib.pyplot as plt
from scipy.io import wavfile as wav
from sklearn import svm
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, f1_score
from sklearn.preprocessing import OneHotEncoder
from scipy.signal import spectrogram
from sklearn.neural_network import MLPClassifier


np.seterr(divide = 'ignore') 

cwd = '/Users/franky/Documents/CS573/project'
group = "cv-valid-train"
data_path = os.path.join(cwd, "data", group)
fft_path = "cv-valid-train-window-fft"
os.chdir(data_path)

np.random.seed(123)
df = pd.read_csv( group + ".csv")
# df = df.rename(columns={"Unnamed: 0":"filenum"}).reset_index(drop=True)
# df.to_csv( group + ".csv")
df = df.drop(df.columns[0], axis=1)
df

Unnamed: 0,filenum,filename,age,gender,accent
0,5,cv-valid-train/sample-000005.wav,20s,female,us
1,8,cv-valid-train/sample-000008.wav,70s,male,us
2,13,cv-valid-train/sample-000013.wav,30s,female,us
3,14,cv-valid-train/sample-000014.wav,60s,male,england
4,19,cv-valid-train/sample-000019.wav,50s,male,australia
...,...,...,...,...,...
62416,195766,cv-valid-train/sample-195766.wav,40s,male,england
62417,195767,cv-valid-train/sample-195767.wav,30s,female,us
62418,195770,cv-valid-train/sample-195770.wav,20s,male,england
62419,195771,cv-valid-train/sample-195771.wav,30s,male,england


In [2]:
n_sample = 20000
df_sample = df.sample(n=n_sample, random_state=33).sort_index().reset_index(drop=True)
df_sample

Unnamed: 0,filenum,filename,age,gender,accent
0,13,cv-valid-train/sample-000013.wav,30s,female,us
1,14,cv-valid-train/sample-000014.wav,60s,male,england
2,19,cv-valid-train/sample-000019.wav,50s,male,australia
3,20,cv-valid-train/sample-000020.wav,30s,male,england
4,26,cv-valid-train/sample-000026.wav,20s,female,australia
...,...,...,...,...,...
19995,195756,cv-valid-train/sample-195756.wav,30s,male,us
19996,195762,cv-valid-train/sample-195762.wav,20s,male,england
19997,195766,cv-valid-train/sample-195766.wav,40s,male,england
19998,195767,cv-valid-train/sample-195767.wav,30s,female,us


In [3]:
for file in df_sample["filename"]:
    filename = os.path.join(fft_path, Path(file).stem)+".npz"
    if not os.path.exists(filename):
        print(file, "not exist")
        df_sample = df_sample[df_sample.filename != file]

# for file in df_sample_fm["filename"]:
#     filename = os.path.join(fft_path, Path(file).stem)+".npz"
#     if not os.path.exists(filename):
#         print(file, "not exist")
#         # print(df.index[df["filename"]==file].tolist())
#         idx = df_sample_fm.index[df["filename"]==file].tolist()
#         df_sample_fm = df_sample_fm.drop(idx)


In [4]:
df_sample

Unnamed: 0,filenum,filename,age,gender,accent
0,13,cv-valid-train/sample-000013.wav,30s,female,us
1,14,cv-valid-train/sample-000014.wav,60s,male,england
2,19,cv-valid-train/sample-000019.wav,50s,male,australia
3,20,cv-valid-train/sample-000020.wav,30s,male,england
4,26,cv-valid-train/sample-000026.wav,20s,female,australia
...,...,...,...,...,...
19995,195756,cv-valid-train/sample-195756.wav,30s,male,us
19996,195762,cv-valid-train/sample-195762.wav,20s,male,england
19997,195766,cv-valid-train/sample-195766.wav,40s,male,england
19998,195767,cv-valid-train/sample-195767.wav,30s,female,us


In [5]:
data = np.load('cv-valid-train-window-fft/sample-000013.npz')
f_dim = data["f"].shape[0]

feature_df = np.zeros((df_sample.shape[0], f_dim))
for i, file in enumerate(df_sample["filename"]):
    file = Path(file).stem + ".npz"
    data = np.load(os.path.join(fft_path,file))
    f = data["f"]
    t = data["t"]
    Sxx = data["Sxx"]
    # Sxx_trunc = Sxx[:, Sxx.sum(axis=0)>1]
    Sxx_sum = 10*np.log10(Sxx.sum(axis=1)+1e-10)
    # Sxx_normalized = Sxx_sum/Sxx_sum.sum() 
    Sxx_normalized = Sxx_sum/t.shape[0]
    feature_df[i,:] = Sxx_normalized

In [44]:
X = pd.DataFrame(feature_df)
Y = df_sample["gender"]
X_train, X_test, y_train, y_test = train_test_split(
X, Y, test_size=0.2, random_state=33)

clf = svm.SVC(kernel='rbf') # Linear Kernel
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print(cm)
print(cm/cm.sum())
print(accuracy_score(y_test, y_pred))
print(precision_score(y_test, y_pred, pos_label='male'))
print(precision_score(y_test, y_pred, pos_label='female'))

print(f1_score(y_test, y_pred, pos_label='male'))
print(f1_score(y_test, y_pred, pos_label='female'))


[[ 823  218]
 [ 102 2857]]
[[0.20575 0.0545 ]
 [0.0255  0.71425]]
0.92
0.9291056910569105
0.8897297297297297
0.9469671859463042
0.8372329603255341


In [47]:
X = pd.DataFrame(feature_df)
Y = df_sample["gender"]
X_train, X_test, y_train, y_test = train_test_split(
X, Y, test_size=0.2, random_state=33)

clf_ann = MLPClassifier(solver="adam", hidden_layer_sizes=( 128, 64, 32,16))
clf_ann.fit(X_train, y_train)


y_pred = clf_ann.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print(cm)
print(cm/cm.sum())
print(accuracy_score(y_test, y_pred))
print(precision_score(y_test, y_pred, pos_label='male'))
print(precision_score(y_test, y_pred, pos_label='female'))

print(f1_score(y_test, y_pred, pos_label='male'))
print(f1_score(y_test, y_pred, pos_label='female'))


[[ 862  179]
 [  88 2871]]
[[0.2155  0.04475]
 [0.022   0.71775]]
0.93325
0.9413114754098361
0.9073684210526316
0.9555666500249625
0.8658965344048216


In [6]:
df_sample["age01"] = 0
df_sample

Unnamed: 0,filenum,filename,age,gender,accent,age01
0,13,cv-valid-train/sample-000013.wav,30s,female,us,0
1,14,cv-valid-train/sample-000014.wav,60s,male,england,0
2,19,cv-valid-train/sample-000019.wav,50s,male,australia,0
3,20,cv-valid-train/sample-000020.wav,30s,male,england,0
4,26,cv-valid-train/sample-000026.wav,20s,female,australia,0
...,...,...,...,...,...,...
19995,195756,cv-valid-train/sample-195756.wav,30s,male,us,0
19996,195762,cv-valid-train/sample-195762.wav,20s,male,england,0
19997,195766,cv-valid-train/sample-195766.wav,40s,male,england,0
19998,195767,cv-valid-train/sample-195767.wav,30s,female,us,0


In [7]:
df_sample.loc[df_sample['age'] == '10s', 'age01'] = 'Young'
df_sample.loc[df_sample['age'] == '20s', 'age01'] = 'Young'
df_sample.loc[df_sample['age'] == '30s', 'age01'] = 'Young'
df_sample.loc[df_sample['age'] == '40s', 'age01'] = 'Old'
df_sample.loc[df_sample['age'] == '50s', 'age01'] = 'Old'
df_sample.loc[df_sample['age'] == '60s', 'age01'] = 'Old'
df_sample.loc[df_sample['age'] == '70s', 'age01'] = 'Old'
df_sample.loc[df_sample['age'] == '80s', 'age01'] = 'Old'
print(np.mean(df_sample["age01"]=="Young"))
df_sample

0.6066


Unnamed: 0,filenum,filename,age,gender,accent,age01
0,13,cv-valid-train/sample-000013.wav,30s,female,us,Young
1,14,cv-valid-train/sample-000014.wav,60s,male,england,Old
2,19,cv-valid-train/sample-000019.wav,50s,male,australia,Old
3,20,cv-valid-train/sample-000020.wav,30s,male,england,Young
4,26,cv-valid-train/sample-000026.wav,20s,female,australia,Young
...,...,...,...,...,...,...
19995,195756,cv-valid-train/sample-195756.wav,30s,male,us,Young
19996,195762,cv-valid-train/sample-195762.wav,20s,male,england,Young
19997,195766,cv-valid-train/sample-195766.wav,40s,male,england,Old
19998,195767,cv-valid-train/sample-195767.wav,30s,female,us,Young


In [54]:
X = pd.DataFrame(feature_df)
Y = df_sample["age01"]
X_train, X_test, y_train, y_test = train_test_split(
X, Y, test_size=0.2, random_state=33)
clf_ann = MLPClassifier(solver="adam", hidden_layer_sizes=( 128, 64, 32,16))
clf_ann.fit(X_train, y_train)

y_pred = clf_ann.predict(X_test)
print(confusion_matrix(y_test, y_pred))

print(accuracy_score(y_test, y_pred))
print(precision_score(y_test, y_pred, pos_label='Young'))
print(precision_score(y_test, y_pred, pos_label='Old'))

print(f1_score(y_test, y_pred, pos_label='Young'))
print(f1_score(y_test, y_pred, pos_label='Old'))

[[ 937  674]
 [ 413 1976]]
0.72825
0.7456603773584906
0.6940740740740741
0.7842825957531256
0.6328942924687606




In [58]:
X = pd.DataFrame(feature_df)
Y = df_sample["age01"]
X_train, X_test, y_train, y_test = train_test_split(
X, Y, test_size=0.2, random_state=33)
clf_ann = MLPClassifier(solver="adam", hidden_layer_sizes=( 128, 128, 64, 32,16))
clf_ann.fit(X_train, y_train)

y_pred = clf_ann.predict(X_test)
print(confusion_matrix(y_test, y_pred))

print(accuracy_score(y_test, y_pred))
print(precision_score(y_test, y_pred, pos_label='Young'))
print(precision_score(y_test, y_pred, pos_label='Old'))

print(f1_score(y_test, y_pred, pos_label='Young'))
print(f1_score(y_test, y_pred, pos_label='Old'))

[[ 966  645]
 [ 444 1945]]
0.72775
0.750965250965251
0.6851063829787234
0.7812813818035751
0.6395233366434956




In [8]:
X = pd.DataFrame(feature_df)
Y = df_sample["age01"]
X_train, X_test, y_train, y_test = train_test_split(
X, Y, test_size=0.2, random_state=33)

clf_ann = MLPClassifier(solver="adam", hidden_layer_sizes=( 128, 128, 64, 32,16), random_state=33)
clf_ann.fit(X_train, y_train)

y_pred = clf_ann.predict(X_test)
print(confusion_matrix(y_test, y_pred))

print(accuracy_score(y_test, y_pred))
print(precision_score(y_test, y_pred, pos_label='Young'))
print(precision_score(y_test, y_pred, pos_label='Old'))

print(f1_score(y_test, y_pred, pos_label='Young'))
print(f1_score(y_test, y_pred, pos_label='Old'))

[[1044  567]
 [ 566 1823]]
0.71675
0.7627615062761506
0.6484472049689441
0.762921113203599
0.6482458863706924




In [56]:
X = pd.DataFrame(feature_df)
Y = df_sample["age01"]
X_train, X_test, y_train, y_test = train_test_split(
X, Y, test_size=0.2, random_state=33)

clf = svm.SVC(kernel='rbf') # Linear Kernel
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)


confusion_matrix(y_test, y_pred)
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))

print(accuracy_score(y_test, y_pred))
print(precision_score(y_test, y_pred, pos_label='Young'))
print(precision_score(y_test, y_pred, pos_label='Old'))

print(f1_score(y_test, y_pred, pos_label='Young'))
print(f1_score(y_test, y_pred, pos_label='Old'))

[[ 269 1272]
 [ 142 2317]]
0.6465
0.6455837280579548
0.6545012165450121
0.7662037037037037
0.27561475409836067


In [9]:
enc = OneHotEncoder(handle_unknown="ignore")
age_gender = df_sample.iloc[:, [5,3]]
enc.fit(age_gender)

OneHotEncoder(handle_unknown='ignore')

In [10]:
enc.categories_

[array(['Old', 'Young'], dtype=object),
 array(['female', 'male'], dtype=object)]

In [11]:
enc_mat = enc.inverse_transform(enc.transform(age_gender).toarray())
df_sample["gender-age"] = enc_mat[:,0] + "-" + enc_mat[:,1]
np.unique(df_sample["gender-age"])


array(['Old-female', 'Old-male', 'Young-female', 'Young-male'],
      dtype=object)

In [16]:
X = pd.DataFrame(feature_df)
Y = df_sample["gender-age"]
X_train, X_test, y_train, y_test = train_test_split(
X, Y, test_size=0.2)

clf_ann = MLPClassifier(solver="adam", hidden_layer_sizes=( 128, 128, 64, 32,16))
clf_ann.fit(X_train, y_train)

y_pred = clf_ann.predict(X_test)
print(confusion_matrix(y_test, y_pred))

print(accuracy_score(y_test, y_pred))
# print(precision_score(y_test, y_pred, pos_label='Young'))
# print(precision_score(y_test, y_pred, pos_label='Old'))

# print(f1_score(y_test, y_pred, pos_label='Young'))
# print(f1_score(y_test, y_pred, pos_label='Old'))

[[ 246   47  121   61]
 [  19  630   12  400]
 [  64   28  407   46]
 [  34  346   35 1504]]
0.69675




In [78]:
print(confusion_matrix(y_test, y_pred, labels=['Old-female', 'Young-female', 'Old-male', 'Young-male']))
# np.unique(df_sample["gender-age"])

[[ 278  112   35   57]
 [  63  441   24   53]
 [  22   20  631  450]
 [  30   29  317 1438]]


In [76]:

print(confusion_matrix(y_test, y_pred, labels=['Old-female', 'Young-female', 'Old-male', 'Young-male']))
# np.unique(df_sample["gender-age"])

[[ 281  108   41   54]
 [  76  428   14   39]
 [  26   25  618  458]
 [  32   45  284 1471]]


In [12]:
track = AudioSegment.from_file("my_voice/test.m4a")
wav_path = "my_voice/test.wav"
file_handle = track.export(wav_path, format="wav")

In [13]:
nperseg = 4800
rate, data = wav.read(wav_path)
if data.shape[0]>=10*nperseg:
    f, t, Sxx = spectrogram(x=data, fs=rate, nperseg=nperseg, window="hamming", noverlap= int(nperseg/2))
    f_trunc = f[f<=1000]
    Sxx_trunc = Sxx[f<=1000]

    dst = os.path.join( "my_voice/test.npz")
    np.savez(dst, f = f_trunc, t = t, Sxx = Sxx_trunc)
else:
    print(file, "is too short!")

In [14]:
data_test = np.load("my_voice/test.npz")
t = data_test["t"]
Sxx = data_test["Sxx"]
Sxx_sum = 10*np.log10(Sxx.sum(axis=1)+1e-10)
Sxx_test = Sxx_sum/t.shape[0]

In [17]:
clf_ann.predict(Sxx_test.reshape(1,-1))

array(['Young-female'], dtype='<U12')