# Inference

**Predict the timestamps for possible subtitles of your input audio**

In [17]:
import os
import warnings

import librosa
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
from catalyst.dl import SupervisedRunner, CallbackOrder, Callback, CheckpointCallback
from fastprogress import progress_bar

In [18]:
from utils import check_type, extract_audio, mono_load, get_duration
from config import InferenceConfig as IC
from csrc.configurations import DatasetConfig as DC
from csrc.configurations import ModelConfig as MC

In [19]:
# For better debugging: make CUDA kernel launches synchronous so that an error
# is reported at the call that caused it instead of at some later point.
# The variable is read when CUDA is initialised, so this cell has to run
# before the first CUDA call.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

## User configurations

In [20]:
# Key tunable: length of each audio clip fed to the model, in seconds.
# It is multiplied by SR to get the clip size in samples.
# Values noted by the author: 10, 5, 3, 2, 1.
PERIOD = 5

# Key tunable: a frame is treated as detected when its framewise score is
# greater than or equal to this value. Other values noted: 0.5, 0.9.
THRESHOLD = 0.8

In [21]:
# Values below come straight from the project configuration and are not
# meant to be edited per run.

# Sample rate of the audio, taken from the dataset configuration. DO NOT change.
SR = DC.dataset_sample_rate

# Lookup from output class index to tag, taken from the inference
# configuration. DO NOT change.
CODING = IC.coding_map

In [22]:
# Choose the input file, the output names and the model checkpoint.

### Target file for inferencing.
TARGET_FILE_PATH = "./src/src-test/src-test.wav"

# When check_type() flags the input, replace the path with whatever
# extract_audio() returns; otherwise the path is used unchanged.
# NOTE(review): presumably check_type() is true for non-audio containers
# (e.g. video) — confirm against utils.
if check_type(TARGET_FILE_PATH):
    TARGET_FILE_PATH = extract_audio(TARGET_FILE_PATH)

### Output csv file name.
OUTPUT_FILE_NAME = "test"

# OUTPUT_FILE receives only the rows tagged "speech"; OUTPUT_SOURCE_FILE
# receives every predicted row.
OUTPUT_SOURCE_FILE = f"./inf/{OUTPUT_FILE_NAME}-all.csv"
OUTPUT_FILE = f"./inf/{OUTPUT_FILE_NAME}.csv"
output_already_there = os.path.exists(OUTPUT_FILE)
if output_already_there:
    print("!Warning: Output file already exists, are you sure to rewrite the file?")

# Checkpoint used for this prediction. It must correspond to the model
# configuration that was used during training.

### Set the model file path. The file path must be valid
MODEL_PATH = "./train/logs/sp2-32000hz/checkpoints/best.pth"

print(f"USING MODEL: {MODEL_PATH}")

USING MODEL: ./train/logs/sp2-32000hz/checkpoints/best.pth


## Dataset

In [23]:
from csrc.dataset import PANNsDataset

## Model

In [24]:
from csrc.models import PANNsCNN14Att, AttBlock

## Inference Settings

In [25]:
# Run on the first GPU when CUDA is usable, otherwise fall back to the CPU so
# the notebook still executes on machines without a GPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [26]:
# Build the network from the project's model configuration.
model = PANNsCNN14Att(**MC.sed_model_config)
# Swap in a fresh attention block before loading the weights.
# NOTE(review): presumably (2048, 2) must match the head the checkpoint was
# trained with — confirm against the training notebook.
model.att_block = AttBlock(2048, 2, activation='sigmoid')
model.att_block.init_weights()
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# on a GPU also loads on a CPU-only machine or on a different GPU index.
checkpoint = torch.load(MODEL_PATH, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
model.to(device)
# Switch to evaluation mode for inference.
model.eval()

PANNsCNN14Att(
  (spectrogram_extractor): Spectrogram(
    (stft): STFT(
      (conv_real): Conv1d(1, 513, kernel_size=(1024,), stride=(320,), bias=False)
      (conv_imag): Conv1d(1, 513, kernel_size=(1024,), stride=(320,), bias=False)
    )
  )
  (logmel_extractor): LogmelFilterBank()
  (spec_augmenter): SpecAugmentation(
    (time_dropper): DropStripes()
    (freq_dropper): DropStripes()
  )
  (bn0): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv_block1): ConvBlock(
    (conv1): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (conv_block2): ConvBlock(
    (conv1): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False

In [27]:
# Load the waveform of the target file; the second value returned by
# mono_load() is discarded.
# NOTE(review): presumably the waveform is mono and sampled at SR — confirm
# against utils.mono_load.
y, _ = mono_load(TARGET_FILE_PATH)

Loading file: ./src/src-test/src-test.wav


In [12]:
# Empty placeholder frame.
# NOTE(review): `dataframe` is not referenced by any other cell visible in
# this notebook — confirm whether this cell can be removed.
dataframe = pd.DataFrame()

## Run the prediction

In [13]:
# Duration of the input audio; it is used further down to cap the end time of
# the predicted events. `get_duration` is already imported together with the
# other `utils` helpers near the top of the notebook, so it is not re-imported
# here.
audio_duration = get_duration(audio_file_path=TARGET_FILE_PATH, y=y, sr=SR)

print(f"Audio file duration: {audio_duration}s")

There is inconsistency between the audio waveform and header metadata.This could be ignored.
Audio file duration: 7462.54934375s


In [14]:
# Split the waveform into consecutive clips of PERIOD seconds each.
clip_len = PERIOD * SR  # clip size in samples
len_y = len(y)

audios = []
start = 0
end = clip_len
for start in range(0, len_y, clip_len):
    end = start + clip_len
    y_batch = y[start:end].astype(np.float32)
    if len(y_batch) < clip_len:
        # Last, shorter clip: zero-pad it to the full clip length.
        y_pad = np.zeros(clip_len, dtype=np.float32)
        y_pad[:len(y_batch)] = y_batch
        y_batch = y_pad
    audios.append(y_batch)
# Iterating over range() means no clip is produced once the audio is
# exhausted. The previous `while True` loop appended an extra all-zero clip
# whenever len(y) was an exact multiple of the clip length (or zero).

# Get tensors. The reshape keeps the array 2-D, (n_clips, clip_len), even when
# there are no clips at all.
arrays = np.asarray(audios, dtype=np.float32).reshape(-1, clip_len)
tensors = torch.from_numpy(arrays)

# Seconds represented by one row of the model's framewise output.
# NOTE(review): 0.01 s assumes 100 output frames per second (e.g. a hop of 320
# samples at 32 kHz) — confirm against the model/dataset configuration.
FRAME_SECONDS = 0.01

# Column layout of the result frame; passing it explicitly keeps the columns
# present even when no event is detected.
EVENT_COLUMNS = [
    "speech_recognition",
    "start",
    "end",
    "max_confidence",
    "mean_confidence",
]

estimated_event_list = []
global_time = 0.0  # start time, in seconds, of the clip being processed
for image in progress_bar(tensors):
    # Add a leading batch dimension: (samples,) -> (1, samples).
    image = image.view(1, image.size(0))
    image = image.to(device)

    with torch.no_grad():
        prediction = model(image)
        framewise_outputs = prediction["framewise_output"].detach(
            ).cpu().numpy()[0]

    # Boolean mask, one row per frame and one column per class.
    thresholded = framewise_outputs >= THRESHOLD

    for target_idx in range(thresholded.shape[1]):
        # Frame indices at which this class reaches the threshold.
        detected = np.flatnonzero(thresholded[:, target_idx])
        if detected.size == 0:
            continue

        # Cut the index list wherever two neighbours are not consecutive;
        # every resulting piece is one uninterrupted event.
        run_breaks = np.flatnonzero(np.diff(detected) != 1) + 1
        for run in np.split(detected, run_breaks):
            onset_idx = run[0]
            offset_idx = run[-1]
            # The slice includes the last detected frame. The previous
            # `onset_idx:offset_idx` slice dropped it and was empty for a
            # single-frame event, which made `.max()` raise ValueError.
            scores = framewise_outputs[onset_idx:offset_idx + 1, target_idx]
            estimated_event_list.append({
                "speech_recognition": CODING[target_idx],
                "start": FRAME_SECONDS * onset_idx + global_time,
                "end": FRAME_SECONDS * offset_idx + global_time,
                "max_confidence": scores.max(),
                "mean_confidence": scores.mean(),
            })
    global_time += PERIOD

prediction_df = pd.DataFrame(estimated_event_list, columns=EVENT_COLUMNS)

## Post process

In [15]:
# Secure output file offset: no event may end after the audio itself ends
# (the zero-padded tail of the last clip can push an end time past it).
#
# Every row is checked, not only the last one: rows are appended per clip and
# then per class, so the last row is not necessarily the one with the largest
# end. The write goes through `.loc`, because assigning to
# `prediction_df.iloc[-1].end` only modifies a temporary copy of the row.
if prediction_df.empty:
    # Nothing was detected, so there is nothing to clamp.
    max_offset = 0.0
else:
    max_offset = prediction_df["end"].max()
    if max_offset > audio_duration:
        prediction_df.loc[prediction_df["end"] > audio_duration, "end"] = audio_duration

In [16]:
# Write the results: OUTPUT_FILE gets only the rows tagged "speech",
# OUTPUT_SOURCE_FILE gets every predicted row.

# to_csv() does not create missing folders, so make sure they exist first.
for csv_path in (OUTPUT_FILE, OUTPUT_SOURCE_FILE):
    os.makedirs(os.path.dirname(csv_path) or ".", exist_ok=True)

# A frame built from zero events may have no columns at all; in that case
# there is nothing to filter and the (empty) frame is written as is.
if "speech_recognition" in prediction_df.columns:
    speech_df = prediction_df[prediction_df["speech_recognition"] == "speech"]
else:
    speech_df = prediction_df

speech_df.to_csv(OUTPUT_FILE, index=False)
prediction_df.to_csv(OUTPUT_SOURCE_FILE, index=False)