# Training a linear SVM on the SCMC i-vectors with scikit-learn

In [1]:
import scipy.io as sio
import numpy as np

## Working directory

/homes/bc305/myphd/stage1/experiments/ASVspoof2017/scripts/afterInterspeech/repeat/individual_systems/Experiment4/SCMC/ivector_setup/

In [2]:
# Absolute, machine-specific root of the experiment tree.
base='/homes/bc305/myphd/stage1/experiments/ASVspoof2017/scripts/afterInterspeech/repeat/individual_systems/'
# SCMC i-vector setup directory. `base` already ends with '/', so this
# concatenation produces a doubled '//' inside the resulting path.
scmc=base+'/Experiment4/SCMC/ivector_setup/'

## Load the i-vectors

In [3]:
# Training i-vectors are stored under the 'train_ivs' key of the .mat file;
# transpose so that each row is one utterance (see the shape printed below).
train_ivec = sio.loadmat(scmc+'/ivectors/train_ivectors.mat')['train_ivs'].T

In [4]:
# Sanity check: rows are utterances, columns are the i-vector dimensions.
print(train_ivec.shape)

(3014, 200)


In [187]:
# Development i-vectors are stored under the 'dev_ivs' key of the .mat file;
# transpose so that each row is one utterance (see the shape printed below).
dev_ivec = sio.loadmat(scmc+'/ivectors/dev_ivectors.mat')['dev_ivs'].T

In [188]:
# Sanity check: rows are utterances, columns are the i-vector dimensions.
print(dev_ivec.shape)

(1710, 200)


In [189]:
# Evaluation i-vectors are stored under the 'eval_ivs' key of the .mat file;
# transpose so that each row is one utterance (see the shape printed below).
eval_ivec = sio.loadmat(scmc+'/ivectors/eval_ivectors.mat')['eval_ivs'].T

In [190]:
# Sanity check: rows are utterances, columns are the i-vector dimensions.
print(eval_ivec.shape)

(13306, 200)


In [191]:
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler

### Normalize data to have zero mean and unit variance

In [192]:
# Estimate the per-dimension mean and variance from the training i-vectors
# only; the same fitted scaler is reused for every other set.
scaler = StandardScaler()
scaler.fit(train_ivec)
print(scaler)

StandardScaler(copy=True, with_mean=True, with_std=True)


In [193]:
# Standardise all three sets with the scaler fitted on the training
# i-vectors, so dev and eval are normalised with training-set statistics.
n_train_ivec = scaler.transform(train_ivec)
n_dev_ivec = scaler.transform(dev_ivec)
n_eval_ivec = scaler.transform(eval_ivec)

In [194]:
# Scaling should leave the shapes unchanged.
print(n_train_ivec.shape)
print(n_dev_ivec.shape)
print(n_eval_ivec.shape)

(3014, 200)
(1710, 200)
(13306, 200)


In [195]:
# Training labels for the linear SVM.
# NOTE(review): assumes the rows of n_train_ivec are ordered with the 1507
# genuine utterances first, followed by 1507 spoofed ones — confirm against
# the i-vector extraction scripts.
gen_labels = np.ones(1507)   # genuine class -> +1
spf_labels = -gen_labels     # spoofed class -> -1

train_data = n_train_ivec
train_labels = np.concatenate((gen_labels, spf_labels))

# Linear SVM classifier

In [196]:
# Linear SVM with default hyper-parameters; random_state is fixed at 0.
svm = LinearSVC(random_state=0)
# Train on the standardised training i-vectors. The fitted estimator is the
# value of this last expression and therefore the cell's displayed output.
svm.fit(train_data, train_labels)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
     verbose=0)

### Score using the trained SVM

What have we done so far? A brief recap.

We trained an i-vector setup on 40-dimensional SCMC features using the MSR Identity Toolbox. The path to this setup is: /homes/bc305/myphd/stage1/experiments/ASVspoof2017/scripts/afterInterspeech/repeat/individual_systems/Experiment4/SCMC/ivector_setup. Having done that, we extracted i-vectors for the training, development and evaluation data. We used 200-dimensional i-vectors and a 256-mixture UBM.

Then, using the scikit-learn linear SVM package (`LinearSVC`), we trained a linear SVM. We now have one 200-dimensional i-vector for each audio file. We use +1 for the genuine class and -1 for the spoofed class. We also standardize the data to have zero mean and unit variance, estimating the mean and variance from the training data. Having trained the SVM, we use it to score the development, training and evaluation data and obtain a score for each test file. We will then use these scores in our EER setup to compute the EER.

In [197]:
# Score every set with the SVM trained on the training data: one real-valued
# score per i-vector (the positive side corresponds to the +1 / genuine label).
dev_scores = svm.decision_function(n_dev_ivec)
train_scores = svm.decision_function(train_data)
eval_scores = svm.decision_function(n_eval_ivec)

In [198]:
# Save these scores to compute EER.
def save_scores(scores, filename):
    """Write one score per line to a text file.

    Parameters
    ----------
    scores : iterable
        Score values; each is written as ``str(score)`` followed by a newline.
    filename : str
        Path of the output file. It is opened in ``'w'`` mode, so an existing
        file is overwritten. A missing parent directory (e.g. ``scores/``)
        is created first instead of letting ``open`` fail.
    """
    import os  # local import: the notebook's import cells do not provide os

    out_dir = os.path.dirname(filename)
    # dirname is '' for a bare file name; nothing to create in that case.
    if out_dir and not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    with open(filename, 'w') as f:
        f.writelines(str(score) + '\n' for score in scores)

In [199]:
# Write the baseline scores (SVM trained on the training set), one file per
# split. The paths are relative, so they resolve against the notebook's
# current working directory.
save_scores(train_scores, 'scores/train_scores.txt')
save_scores(dev_scores, 'scores/dev_scores.txt')
save_scores(eval_scores, 'scores/eval_scores.txt')

## Scripts in matlab injecting 60ms and extracting i-vectors

/homes/bc305/myphd/stage1/experiments/ASVspoof2017/scripts/afterInterspeech/repeat/individual_systems/
Experiment4/SCMC/ivector_setup/extract_ivectors_injecting60ms

# Test performance on new ivectors

In [200]:
# Load the i-vectors from the '*_injected.mat' files. They use the same keys
# as the original files and are transposed the same way (rows = utterances).
new_train_ivec = sio.loadmat(scmc+'/ivectors/train_ivectors_injected.mat')['train_ivs'].T
new_dev_ivec = sio.loadmat(scmc+'/ivectors/dev_ivectors_injected.mat')['dev_ivs'].T
new_eval_ivec = sio.loadmat(scmc+'/ivectors/eval_ivectors_injected.mat')['eval_ivs'].T

In [201]:
# Sanity check: the injected sets should have the same shapes as the originals.
print(new_eval_ivec.shape)
print(new_train_ivec.shape)
print(new_dev_ivec.shape)

(13306, 200)
(3014, 200)
(1710, 200)


In [202]:
# Normalise the injected i-vectors with the scaler that was fitted on the
# original training i-vectors (it is reused here, not refitted).
nn_train_ivec = scaler.transform(new_train_ivec)
nn_dev_ivec = scaler.transform(new_dev_ivec)
nn_eval_ivec = scaler.transform(new_eval_ivec)

In [203]:
# Score the injected i-vectors with the SVM trained on the original
# (non-injected) training i-vectors; the model is not retrained.
new_train_scores = svm.decision_function(nn_train_ivec)
new_dev_scores = svm.decision_function(nn_dev_ivec)
new_eval_scores = svm.decision_function(nn_eval_ivec)

In [204]:
# Save the score files for the injected i-vectors, for computing EER in matlab.
# Paths are relative to the notebook's current working directory.
save_scores(new_train_scores, 'scores/train_scores_injected.txt')
save_scores(new_dev_scores, 'scores/dev_scores_injected.txt')
save_scores(new_eval_scores, 'scores/eval_scores_injected.txt')

# Results: EER before and after adding 60ms signature to test files

    BEFORE:

    Testing on training set
    EER on train = 0.564
    ----------------------
    Testing on development set
    EER on dev = 21.889
    ----------------------
    Testing on evaluation set
    EER on eval = 20.901
    
    
Next, we will see whether injecting the 60ms signature influences the predictions of the i-vector–SVM setup. We therefore append it to every test file (dev and eval) and extract the i-vectors. We then use these new i-vectors to test and obtain a new set of scores, on which we compute the EER.

    AFTER:

    Testing on training set
    EER on train = 0.863
    ----------------------
    Testing on development set
    EER on dev = 21.817
    ----------------------
    Testing on evaluation set
    EER on eval = 20.530

# Training SVM on Pooled data

In [205]:
# Build the pooled (training + development) set used to train a second SVM.
# NOTE(review): the slices assume both sets are stored genuine-first,
# spoofed-second with the class counts below — confirm against the
# i-vector extraction scripts.
NUM_GEN_TRAIN = 1507   # genuine training i-vectors (spoofed count is the same)
NUM_GEN_DEV = 760      # genuine development i-vectors
NUM_SPF_DEV = 950      # spoofed development i-vectors

# Fail fast if the arrays no longer match the hard-coded class counts;
# otherwise data slices and labels would go out of step without any error
# being raised in this cell.
assert n_train_ivec.shape[0] == 2 * NUM_GEN_TRAIN, n_train_ivec.shape
assert n_dev_ivec.shape[0] == NUM_GEN_DEV + NUM_SPF_DEV, n_dev_ivec.shape

# Training i-vectors and labels
gen_labels = np.ones(NUM_GEN_TRAIN)    # +1 labels from genuine class
spf_labels = -gen_labels               # -1 labels from spoofed class
gen_data = n_train_ivec[:NUM_GEN_TRAIN, :]
spf_data = n_train_ivec[NUM_GEN_TRAIN:, :]

# Development i-vectors and labels
gen_labels_dev = np.ones(NUM_GEN_DEV)
spf_labels_dev = -np.ones(NUM_SPF_DEV)
gen_data_dev = n_dev_ivec[:NUM_GEN_DEV, :]
spf_data_dev = n_dev_ivec[NUM_GEN_DEV:, :]

# Merge training+dev data: all genuine rows first, then all spoofed rows
pooled_gen_data = np.vstack((gen_data, gen_data_dev))
pooled_spf_data = np.vstack((spf_data, spf_data_dev))
pooled_data = np.vstack((pooled_gen_data, pooled_spf_data))

# Merge labels in the same order as the data
pooled_gen_labels = np.hstack((gen_labels, gen_labels_dev))
pooled_spf_labels = np.hstack((spf_labels, spf_labels_dev))
pooled_labels = np.hstack((pooled_gen_labels, pooled_spf_labels))

# Every pooled row must have exactly one label.
assert pooled_data.shape[0] == pooled_labels.shape[0]

In [206]:
# Sanity check: each data matrix must have as many rows as its label vector.
# Genuine part of the pooled set.
print(pooled_gen_data.shape)
print(pooled_gen_labels.shape)

# Spoofed part of the pooled set.
print(pooled_spf_data.shape)
print(pooled_spf_labels.shape)

# Full pooled set (genuine rows followed by spoofed rows).
print(pooled_data.shape)
print(pooled_labels.shape)

(2267, 200)
(2267,)
(2457, 200)
(2457,)
(4724, 200)
(4724,)


# Train SVM on pooled data

In [207]:
# Second linear SVM, same settings as the first, trained on train+dev pooled.
svm_pooled = LinearSVC(random_state=0)
# The fitted estimator is the value of this last expression and therefore
# the cell's displayed output.
svm_pooled.fit(pooled_data, pooled_labels)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
     verbose=0)

In [208]:
# Score using the new pooled svm model.
# Note: the training and development sets were both part of this model's
# training data, so only the evaluation scores come from unseen data.

dev_scores1 = svm_pooled.decision_function(n_dev_ivec)
train_scores1 = svm_pooled.decision_function(train_data)
eval_scores1 = svm_pooled.decision_function(n_eval_ivec)

In [209]:
# Save the scores of the pooled (train+dev) SVM, one file per split.
# Paths are relative to the notebook's current working directory.

save_scores(train_scores1, 'scores/train_scores_pooled.txt')
save_scores(dev_scores1, 'scores/dev_scores_pooled.txt')
save_scores(eval_scores1, 'scores/eval_scores_pooled.txt')

# Results on pooled SVM model

    Testing on training set
    EER on train = 2.415
    --------------------------------------------
    Testing on development set
    EER on dev = 4.621
    --------------------------------------------
    Testing on evaluation set
    EER on eval = 18.310
    
We are not going to look into injecting 60ms for this model, as the EER on eval is pretty much the same as the one we had with the model trained on only the training data. But we can look into a model trained on only the dev data.

# Training SVM on Development data

In [210]:
# Labels and data for training an SVM on the development set alone.
# NOTE(review): assumes the rows of n_dev_ivec are ordered with the 760
# genuine utterances first, followed by the 950 spoofed ones — confirm
# against the i-vector extraction scripts.
gen_labels_dev = np.ones(760)     # genuine class -> +1
spf_labels_dev = -np.ones(950)    # spoofed class -> -1

dev_data = n_dev_ivec
dev_labels = np.concatenate((gen_labels_dev, spf_labels_dev))

In [211]:
# Fit a third linear SVM, same settings as before, on the development set only.
svm_onDev = LinearSVC(random_state=0)
# The fitted estimator is the value of this last expression and therefore
# the cell's displayed output.
svm_onDev.fit(dev_data, dev_labels)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
     verbose=0)

In [212]:
# Score using the new dev-data svm model.
# Note: the development set is this model's training data, so its scores
# are not from unseen data.
dev_scores2 = svm_onDev.decision_function(n_dev_ivec)
train_scores2 = svm_onDev.decision_function(train_data)
eval_scores2 = svm_onDev.decision_function(n_eval_ivec)

In [213]:
# Save the scores of the SVM trained on the development data only
# (svm_onDev), one file per split.
save_scores(train_scores2, 'scores/train_scores_dev.txt')
save_scores(dev_scores2, 'scores/dev_scores_dev.txt')
save_scores(eval_scores2, 'scores/eval_scores_dev.txt')

# Results on dev-data trained SVM model


    Testing on training set
    EER on train = 15.261
    --------------------------------------------
    Testing on development set
    EER on dev = 1.520
    --------------------------------------------
    Testing on evaluation set
    EER on eval = 22.668