### Load libraries

In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import StepLR
from RNN_trainer import train
from datasets import ReplySpoofDataset, collate_fn_pad
from config import LSTM_NUM_LAYERS, HIDDEN_SIZE, BATCH_SIZE, INPUT_SIZE, LINEAR_SIZE, OUTPUT_SIZE, LR, N_EPOCHS, \
    TRAIN_INDEX, VAL_INDEX, MODELS_DIR, MODEL_NAME, OUTPUT_DIR, TEST_RAW_DIR, SCALER_PATH, SCHEDULER_STEP_SIZE, \
    SCHEDULER_GAMMA
from utilities.basic_utils import make_valid_path, get_accelerator, export_incorrect_samples_to_csv
from utilities.model_utils import ModelManager, Metrics
from utilities.disply_utils import plot_losses, info
from RNN_tester import test, test_sample
from models import AntiSpoofingRNN
import IPython.display as ipd
import pickle
import pandas as pd
import csv
import os

In [2]:
torch.manual_seed(2021)

<torch._C.Generator at 0x2bf4596fc70>

### Define datasets

In [3]:
%%time
train_dataset= ReplySpoofDataset(TRAIN_INDEX)
val_dataset = ReplySpoofDataset(VAL_INDEX)

[2021-11-04 01:24:27.866504] -- Loading data (40001 files)...
[2021-11-04 01:26:29.873367] -- Done...
[2021-11-04 01:26:29.882920] -- Loading data (9999 files)...
[2021-11-04 01:27:32.765530] -- Done...
Wall time: 3min 4s


### Define model and data loaders

In [4]:
net = AntiSpoofingRNN(INPUT_SIZE, HIDDEN_SIZE, LSTM_NUM_LAYERS, LINEAR_SIZE, OUTPUT_SIZE)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE, collate_fn=collate_fn_pad)
val_loader = DataLoader(val_dataset, shuffle=True, batch_size=BATCH_SIZE, collate_fn=collate_fn_pad)

### Set loss function and optimizer

In [None]:
%%time
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=LR)
scheduler = StepLR(optimizer, step_size=SCHEDULER_STEP_SIZE, gamma=SCHEDULER_GAMMA)
model_manager = ModelManager(MODEL_NAME, make_valid_path(MODELS_DIR, is_dir=True, exist_ok=True))
model, model_manager = train(net, model_manager, train_loader, val_loader, N_EPOCHS, loss_fn, optimizer, scheduler,
                             get_accelerator('cuda'))


[2021-11-04 01:27:32.805064] -- Training
[2021-11-04 01:27:36.186132] -- Current lr = [0.0005]
[2021-11-04 01:28:22.282983] -- Training:   batch 150/626 from epoch 01/60 -- Loss = 0.49967899918556213
[2021-11-04 01:28:53.240267] -- Training:   batch 300/626 from epoch 01/60 -- Loss = 0.3753916919231415
[2021-11-04 01:29:24.569645] -- Training:   batch 450/626 from epoch 01/60 -- Loss = 0.46179118752479553
[2021-11-04 01:29:55.905288] -- Training:   batch 600/626 from epoch 01/60 -- Loss = 0.5969678163528442
[2021-11-04 01:30:01.115374] -- Training:   batch 626/626 from epoch 01/60 -- Loss = 0.20117299258708954
[2021-11-04 01:30:04.802873] -- Validating: batch 050/157 from epoch 01/60 -- Loss = 0.5535688400268555
[2021-11-04 01:30:08.451172] -- Validating: batch 100/157 from epoch 01/60 -- Loss = 0.5063414573669434
[2021-11-04 01:30:12.121405] -- Validating: batch 150/157 from epoch 01/60 -- Loss = 0.5063385963439941
[2021-11-04 01:30:12.596545] -- Validating: batch 157/157 from epoch 0

[2021-11-04 01:44:48.504905] -- Training:   batch 150/626 from epoch 08/60 -- Loss = 0.44398587942123413
[2021-11-04 01:45:20.491653] -- Training:   batch 300/626 from epoch 08/60 -- Loss = 0.6115612983703613
[2021-11-04 01:45:51.830327] -- Training:   batch 450/626 from epoch 08/60 -- Loss = 0.5954432487487793
[2021-11-04 01:46:23.386654] -- Training:   batch 600/626 from epoch 08/60 -- Loss = 0.5476207137107849
[2021-11-04 01:46:28.854385] -- Training:   batch 626/626 from epoch 08/60 -- Loss = 0.23809656500816345
[2021-11-04 01:46:32.537923] -- Validating: batch 050/157 from epoch 08/60 -- Loss = 0.5462130308151245
[2021-11-04 01:46:36.195502] -- Validating: batch 100/157 from epoch 08/60 -- Loss = 0.5255399346351624
[2021-11-04 01:46:39.860022] -- Validating: batch 150/157 from epoch 08/60 -- Loss = 0.670250415802002
[2021-11-04 01:46:40.368643] -- Validating: batch 157/157 from epoch 08/60 -- Loss = 0.5007554292678833
[2021-11-04 01:46:40.368643] -- Epoch 08/60 -- Train loss = 0.5

### Dump model manager

In [None]:
with open(os.path.join(MODELS_DIR, MODEL_NAME+'.mgr'), 'wb') as handler:
    pickle.dump(model_manager, handler)

### Plot losses

In [None]:
plot_losses("Losses", model_manager.epochs_losses_train, model_manager.epochs_losses_val)

In [None]:
print(model_manager.last_updated)

### Validating model performance

In [None]:
# Used saved model to ensure that weights were successfully saved
info("Validating model")
model = AntiSpoofingRNN(INPUT_SIZE, HIDDEN_SIZE, LSTM_NUM_LAYERS, LINEAR_SIZE, OUTPUT_SIZE)
model_manager = ModelManager(MODEL_NAME, make_valid_path(MODELS_DIR, is_dir=True, exist_ok=True))
model, _ = model_manager.load_checkpoint(MODEL_NAME + '.pkl', model)

In [None]:
# First find train metrics
info("Validating performance on train data")
files, y_true, y_pred, y_prob, threshold = test(model, train_loader, pred_threshold='auto',
                                                device=get_accelerator('cuda'))
export_incorrect_samples_to_csv(files, y_pred, y_true, os.path.join(OUTPUT_DIR, 'train_incorrect.csv'))

info(f"Classification threshold = {threshold}")
info("Calculating train metrics")
train_metrics, train_metrics_str = Metrics.calculate_metrics(y_true, y_pred, y_prob)
info(f"Train accuracy = {train_metrics['Accuracy']:0.4f}")

# Second, find validation metrics
info("Validating performance on validation data")
val_dataset = ReplySpoofDataset(VAL_INDEX)
val_loader = DataLoader(val_dataset, shuffle=True, batch_size=BATCH_SIZE, collate_fn=collate_fn_pad)
files, y_true, y_pred, y_prob, threshold = test(model, val_loader, pred_threshold=threshold,
                                                device=get_accelerator('cuda'))

export_incorrect_samples_to_csv(files, y_pred, y_true, os.path.join(OUTPUT_DIR, 'val_incorrect.csv'))

info("Calculating validation metrics")
val_metrics, val_metrics_str = Metrics.calculate_metrics(y_true, y_pred, y_prob)
info(f"Validation accuracy = {val_metrics['Accuracy']:0.4f}")

# Save metrics to file
with open(os.path.join(OUTPUT_DIR, 'metrics.csv'), 'w') as f:
    train_metrics['#'] = 'Train'
    val_metrics['#'] = 'Validation'
    w = csv.DictWriter(f, sorted(val_metrics.keys()))
    w.writeheader()
    w.writerow(train_metrics)
    w.writerow(val_metrics)

info("Validating model done!")


### Generating labels of test set

In [None]:
with open(SCALER_PATH, "rb") as f:
    scaler = pickle.load(f)

In [None]:
test_files = []
test_preds = []
for file_id, wav_file in enumerate(os.listdir(TEST_RAW_DIR)):
    audio_path = os.path.join(TEST_RAW_DIR, wav_file)
    out = test_sample(model, audio_path, scaler, pred_threshold=threshold) #0.2630763351917267
    test_files.append(wav_file)
    test_preds.append(out)
    if (file_id + 1) % 500 ==0:
        info(f"{file_id+1:4d} file(s) processed")
    

In [None]:
out_file = os.path.join(OUTPUT_DIR, 'test_predictions.csv')
pd.DataFrame(zip(test_files, test_preds)).to_csv(out_file, index=False, header=['Files', 'Predictions'])

### Listen and Test some sample file

In [None]:
sample_audio = 'E:/Datasets/ID R&D/data/raw/Testing_Data/sample_0000.wav'
ipd.Audio(sample_audio)

In [None]:
out = test_sample(model, sample_audio, scaler, return_code=False, pred_threshold=threshold)
info(f"The sample audio '{os.path.basename(sample_audio)}' is: {out}")