In [25]:
import pandas as pd
import tensorflow as tf

# Automatically reload imported modules that are changed outside this notebook
# More pixels in figures
import matplotlib.pyplot as plt
plt.rcParams["figure.dpi"] = 200


In [32]:
import numpy as np
np_rng = np.random.default_rng(1)

tf.random.set_seed(np_rng.integers(0, tf.int64.max))



import urllib.parse
from IPython.display import display, Markdown

import os

from lidbox.meta import (
    common_voice,
    generate_label2target,
    verify_integrity,
    read_audio_durations,
    random_oversampling_on_split
)

tf.random.set_seed(np_rng.integers(0, tf.int64.max))

train = pd.read_csv("train.tsv", sep="\t")
test = pd.read_csv("test.tsv", sep="\t")
dev = pd.read_csv("dev.tsv", sep="\t")

train["path"] = train["path"].apply(lambda x: x[:-3] + "mp3")
test["path"] = test["path"].apply(lambda x: x[:-3] + "mp3")
dev["path"] = dev["path"].apply(lambda x: x[:-3] + "mp3")

train["split"] = "train"
test["split"] = "test"
dev["split"] = "dev"
meta = pd.concat([train, test, dev])


In [33]:
meta.loc[meta["locale"] != "kz", "path"] = "/tf/datasets/data_untar/cv-corpus-6.1-2020-12-11/" +  meta.loc[meta["locale"] != "kz"]["locale"] + "/clips/" + meta.loc[meta["locale"] != "kz"]["path"]
targets = {"kz": 0, "ru": 1, "en":2, "other":3}
meta["target"] = meta["locale"]
meta.loc[(meta["locale"] != "kz") & (meta["locale"] != "ru") & (meta["locale"]!="en"), "target"] = "other"
meta = meta.loc[meta["path"] != "/tf/datasets/data_untar/cv-corpus-6.1-2020-12-11/kz/clips/5f590a130a73c.mp3"]
meta = meta.loc[meta["path"] != "/tf/datasets/data_untar/cv-corpus-6.1-2020-12-11/kz/clips/5ef9bd9ba7029.mp3"]

meta["id"] = meta["Unnamed: 0"].apply(str)
meta["target"] = meta["target"].map(targets)

workdir = "/tf/datasets/gru"




In [34]:
import scipy.signal

from lidbox.features import audio, cmvn
import lidbox.data.steps as ds_steps


TF_AUTOTUNE = tf.data.experimental.AUTOTUNE


def metadata_to_dataset_input(meta):
    return {
        "id": tf.constant(meta.id, tf.string),
        "path": tf.constant(meta.path, tf.string),
        "target": tf.constant(meta.target, tf.int32),
        "split": tf.constant(meta.split, tf.string),
    }

def read_mp3(x):
    s, r = audio.read_mp3(x["path"])
    out_rate = 16000
    s = audio.resample(s, r, out_rate)
    s = audio.peak_normalize(s, dBFS=-3.0)
    s = audio.remove_silence(s, out_rate)
    return dict(x, signal=s, sample_rate=out_rate)


def random_filter(x):
    def scipy_filter(s, N=10):
        b = np_rng.normal(0, 1, N)
        return scipy.signal.lfilter(b, 1.0, s).astype(np.float32), b
    s, _ = tf.numpy_function(
        scipy_filter,
        [x["signal"]],
        [tf.float32, tf.float64],
        name="np_random_filter")
    s = tf.cast(s, tf.float32)
    s = audio.peak_normalize(s, dBFS=-3.0)
    return dict(x, signal=s)


def random_speed_change(ds):
    return ds_steps.random_signal_speed_change(ds, min=0.9, max=1.1, flag=None)


def create_signal_chunks(ds):
    ds = ds_steps.repeat_too_short_signals(ds, 3200)
    ds = ds_steps.create_signal_chunks(ds, 3200, 800)
    return ds


def batch_extract_features(x):
    with tf.device("GPU"):
        signals, rates = x["signal"], x["sample_rate"]
        S = audio.spectrograms(signals, rates[0])
        S = audio.linear_to_mel(S, rates[0])
        S = tf.math.log(S + 1e-6)
        S = cmvn(S, normalize_variance=False)
    return dict(x, logmelspec=S)

def pipeline_from_meta(data, split):
    if split == "train":
        data = data.sample(frac=1, random_state=np_rng.bit_generator)

    ds = (tf.data.Dataset
            .from_tensor_slices(metadata_to_dataset_input(data))
            .map(read_mp3, num_parallel_calls=TF_AUTOTUNE))

    if split == "train":
        return (ds
            .apply(random_speed_change)
            .prefetch(1)
            .map(random_filter, num_parallel_calls=TF_AUTOTUNE)
            .apply(create_signal_chunks)
            .batch(1)
            .map(batch_extract_features, num_parallel_calls=TF_AUTOTUNE)
            .unbatch())
    else:
        return (ds
            .apply(create_signal_chunks)
            .batch(1)
            .map(batch_extract_features, num_parallel_calls=TF_AUTOTUNE)
            .unbatch()
            .prefetch(1))


cachedir = os.path.join(workdir, "cache")

split2ds = {split: pipeline_from_meta(meta[meta["split"]==split], split)
            for split in meta.split.unique()}

2021-06-04 12:42:20.243 I lidbox.data.steps: Applying random resampling to signals with a random speed ratio chosen uniformly at random from [0.900, 1.100]
2021-06-04 12:42:20.273 I lidbox.data.steps: Repeating all signals until they are at least 3200 ms
2021-06-04 12:42:20.289 I lidbox.data.steps: Dividing every signal in the dataset into new signals by creating signal chunks of length 3200 ms and offset 800 ms. Maximum amount of padding allowed in the last chunk is 0 ms.
2021-06-04 12:42:20.706 I lidbox.data.steps: Repeating all signals until they are at least 3200 ms
2021-06-04 12:42:20.722 I lidbox.data.steps: Dividing every signal in the dataset into new signals by creating signal chunks of length 3200 ms and offset 800 ms. Maximum amount of padding allowed in the last chunk is 0 ms.
2021-06-04 12:42:21.115 I lidbox.data.steps: Repeating all signals until they are at least 3200 ms
2021-06-04 12:42:21.131 I lidbox.data.steps: Dividing every signal in the dataset into new signals by

In [35]:
meta = meta.set_index('id')

In [5]:
import lidbox.models.bi_gru as bi_gru
model_input_type = "logmelspec"

def create_model(num_freq_bins, num_labels):
    model = bi_gru.create([None, num_freq_bins], num_labels, channel_dropout_rate=0.8)
    model.compile(
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4), 
        metrics=tf.keras.metrics.sparse_categorical_accuracy)
    return model


model = create_model(
    num_freq_bins=20 if model_input_type == "mfcc" else 40,
    num_labels=4)
model.summary()

Model: "BGRU"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, None, 40)]        0         
_________________________________________________________________
channel_dropout_0.80 (Spatia (None, None, 40)          0         
_________________________________________________________________
BGRU_1 (Bidirectional)       (None, None, 1024)        1701888   
_________________________________________________________________
BGRU_2 (Bidirectional)       (None, 1024)              4724736   
_________________________________________________________________
BGRU_2_bn (BatchNormalizatio (None, 1024)              4096      
_________________________________________________________________
fc_relu_1 (Dense)            (None, 1024)              1049600   
_________________________________________________________________
fc_relu_1_bn (BatchNormaliza (None, 1024)              4096   

In [6]:
callbacks = [
    tf.keras.callbacks.TensorBoard(
        log_dir=os.path.join(cachedir, "tensorboard", model.name),
        update_freq="epoch",
        write_images=True,
        profile_batch=0,
    ),
    tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=5,
    ),
    tf.keras.callbacks.ModelCheckpoint(
        os.path.join(cachedir, "model", model.name),
        monitor='val_loss',
        save_weights_only=True,
        save_best_only=True,
        verbose=1,
    ),
]


def as_model_input(x):
    return x["logmelspec"], x["target"]



In [8]:
print("preparing datasets")

train_ds = split2ds["train"].map(as_model_input)
dev_ds = split2ds["dev"].map(as_model_input)

 
print("start training")    
with tf.device("GPU"):
    history = model.fit(
        train_ds.batch(32).repeat(100),
        steps_per_epoch=2576,
        validation_data=dev_ds.batch(32).repeat(100),
        validation_steps=961,
        callbacks=callbacks,
        verbose=1,
        epochs=100)

tf.keras.models.save_model(
    model, "gru.h5", overwrite=True, include_optimizer=True, save_format=None,
    signatures=None, options=None, save_traces=True
)

preparing datasets
start training
Epoch 1/100

Epoch 00001: val_loss did not improve from 1.42752
Epoch 2/100

Epoch 00002: val_loss did not improve from 1.42752
Epoch 3/100

Epoch 00003: val_loss improved from 1.42752 to 1.40748, saving model to /tf/datasets/gru/cache/model/BGRU
Epoch 4/100

Epoch 00004: val_loss improved from 1.40748 to 1.35796, saving model to /tf/datasets/gru/cache/model/BGRU
Epoch 5/100

Epoch 00005: val_loss did not improve from 1.35796
Epoch 6/100

Epoch 00006: val_loss improved from 1.35796 to 1.11465, saving model to /tf/datasets/gru/cache/model/BGRU
Epoch 7/100

Epoch 00007: val_loss improved from 1.11465 to 0.81236, saving model to /tf/datasets/gru/cache/model/BGRU
Epoch 8/100

Epoch 00008: val_loss did not improve from 0.81236
Epoch 9/100

Epoch 00009: val_loss did not improve from 0.81236
Epoch 10/100

Epoch 00010: val_loss did not improve from 0.81236
Epoch 11/100

Epoch 00011: val_loss improved from 0.81236 to 0.73672, saving model to /tf/datasets/gru/ca

In [13]:

def predictions_to_dataframe(ids, predictions):
    return (pd.DataFrame.from_dict({"id": ids, "prediction": predictions})
            #.set_index("id", drop=True, verify_integrity=True)
            #.sort_index()
           )

def predict_with_model(model, ds, predict_fn=None):
    """
    Map callable model over all batches in ds, predicting values for each element at key 'input'.
    """
    if predict_fn is None:
        def predict_fn(x):
            with tf.device("GPU"):
                return x["id"], model(x["input"], training=False)

    ids = []
    predictions = []
    for id, pred in ds.map(predict_fn, num_parallel_calls=TF_AUTOTUNE).unbatch().as_numpy_iterator():
        ids.append(id.decode("utf-8"))
        predictions.append(pred)

    return predictions_to_dataframe(ids, predictions)

In [14]:
chunk2pred = predict_with_model(
    model=model,
    ds=split2ds["test"].map(lambda x: dict(x, input=x["logmelspec"])).batch(32),
    #predict_fn=predict_with_ap_loss
    )



In [20]:
chunk2pred

Unnamed: 0_level_0,prediction
id,Unnamed: 1_level_1
71684-000001,"[-6.6333766, -1.7371142, -1.8038024, -0.418573]"
71684-000002,"[-11.755868, -2.9275784, -0.9396695, -0.5875112]"
71684-000003,"[-13.905518, -1.6065432, -0.9924134, -0.8469087]"
71684-000004,"[-11.246083, -2.9696379, -0.882289, -0.62580544]"
88574-000001,"[-3.2370036, -1.7754825, -3.876099, -0.2606049]"
...,...
259-000003,"[-15.581085, -0.47045177, -8.218401, -0.980802]"
12245-000001,"[-12.51034, -3.3660367, -2.6351688, -0.112314366]"
12245-000002,"[-13.431554, -2.9179392, -2.675808, -0.13113156]"
12245-000003,"[-12.88188, -3.1726012, -3.3085423, -0.08171614]"


In [16]:
chunk2pred = chunk2pred.set_index("id")

In [21]:
from lidbox.util import merge_chunk_predictions


utt2pred = merge_chunk_predictions(chunk2pred)
utt2pred

Unnamed: 0_level_0,prediction
id,Unnamed: 1_level_1
0,"[-9.260615, -2.600781, -3.7446494, -0.2280062]"
1,"[-10.804147, -2.717781, -3.5896728, -0.28567746]"
100,"[-14.39197, -0.8050288, -3.9227448, -0.8455879]"
1000,"[-9.586703, -1.5442146, -4.1786733, -0.64503604]"
10000,"[-15.005293, -4.760601, -2.2678008, -0.118896626]"
...,...
99974,"[-0.6190233, -1.7051761, -7.9411693, -1.3518697]"
99983,"[-1.4948798, -0.9733737, -8.4881315, -1.2547462]"
99986,"[-0.4074719, -3.2042804, -5.582465, -1.2688036]"
99987,"[-0.8040076, -1.6110811, -7.738201, -1.083901]"


In [36]:
from sklearn.metrics import classification_report

test_meta = meta[meta["split"]=="test"].join(utt2pred, how="outer")
assert not test_meta.isna().any(axis=None), "failed to join predictions"

true_sparse = test_meta.target.to_numpy(np.int32)
pred_dense = np.stack(test_meta.prediction.apply(np.argmax))

report = classification_report(true_sparse, pred_dense, target_names=list(targets.keys()), labels=range(4))
print(report)



              precision    recall  f1-score   support

          kz       0.99      0.64      0.77     17341
          ru       0.74      0.46      0.57     10379
          en       0.98      0.35      0.51     12964
       other       0.41      0.92      0.57     15084

    accuracy                           0.61     55768
   macro avg       0.78      0.59      0.61     55768
weighted avg       0.78      0.61      0.62     55768



In [37]:
test_meta

Unnamed: 0_level_0,Unnamed: 0,path,locale,split,target,prediction
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0,/tf/datasets/data_untar/cv-corpus-6.1-2020-12-...,ru,test,1,"[-9.260615, -2.600781, -3.7446494, -0.2280062]"
0,0,/tf/datasets/data_untar/cv-corpus-6.1-2020-12-...,ky,test,3,"[-9.260615, -2.600781, -3.7446494, -0.2280062]"
0,0,/tf/datasets/data_untar/cv-corpus-6.1-2020-12-...,zh-CN,test,3,"[-9.260615, -2.600781, -3.7446494, -0.2280062]"
1,1,/tf/datasets/data_untar/cv-corpus-6.1-2020-12-...,ta,test,3,"[-10.804147, -2.717781, -3.5896728, -0.28567746]"
1,1,/tf/datasets/data_untar/cv-corpus-6.1-2020-12-...,ky,test,3,"[-10.804147, -2.717781, -3.5896728, -0.28567746]"
...,...,...,...,...,...,...
99974,99974,/tf/datasets/data_untar/cv-corpus-6.1-2020-12-...,kz,test,0,"[-0.6190233, -1.7051761, -7.9411693, -1.3518697]"
99983,99983,/tf/datasets/data_untar/cv-corpus-6.1-2020-12-...,kz,test,0,"[-1.4948798, -0.9733737, -8.4881315, -1.2547462]"
99986,99986,/tf/datasets/data_untar/cv-corpus-6.1-2020-12-...,kz,test,0,"[-0.4074719, -3.2042804, -5.582465, -1.2688036]"
99987,99987,/tf/datasets/data_untar/cv-corpus-6.1-2020-12-...,kz,test,0,"[-0.8040076, -1.6110811, -7.738201, -1.083901]"
