In [81]:
import pandas as pd
import tensorflow as tf

# Automatically reload imported modules that are changed outside this notebook
# More pixels in figures
import matplotlib.pyplot as plt
plt.rcParams["figure.dpi"] = 200


In [82]:
import numpy as np
np_rng = np.random.default_rng(1)

tf.random.set_seed(np_rng.integers(0, tf.int64.max))



import urllib.parse
from IPython.display import display, Markdown

import os

from lidbox.meta import (
    common_voice,
    generate_label2target,
    verify_integrity,
    read_audio_durations,
    random_oversampling_on_split
)

tf.random.set_seed(np_rng.integers(0, tf.int64.max))

train = pd.read_csv("train.tsv", sep="\t")
test = pd.read_csv("test.tsv", sep="\t")
dev = pd.read_csv("dev.tsv", sep="\t")

train["path"] = train["path"].apply(lambda x: x[:-3] + "mp3")
test["path"] = test["path"].apply(lambda x: x[:-3] + "mp3")
dev["path"] = dev["path"].apply(lambda x: x[:-3] + "mp3")

train["split"] = "train"
test["split"] = "test"
dev["split"] = "dev"
meta = pd.concat([train, test, dev])


In [83]:
# some preprocessing to make sure that the path is correct
meta.loc[meta["locale"] != "kz", "path"] = "/tf/datasets/data_untar/cv-corpus-6.1-2020-12-11/" +  meta.loc[meta["locale"] != "kz"]["locale"] + "/clips/" + meta.loc[meta["locale"] != "kz"]["path"]
targets = {"kz": 0, "ru": 1, "en":2, "other":3}
meta["target"] = meta["locale"]
meta.loc[(meta["locale"] != "kz") & (meta["locale"] != "ru") & (meta["locale"]!="en"), "target"] = "other"
meta = meta.loc[meta["path"] != "/tf/datasets/data_untar/cv-corpus-6.1-2020-12-11/kz/clips/5f590a130a73c.mp3"]
meta = meta.loc[meta["path"] != "/tf/datasets/data_untar/cv-corpus-6.1-2020-12-11/kz/clips/5ef9bd9ba7029.mp3"]

meta["id"] = meta["Unnamed: 0"].apply(str)
meta["target"] = meta["target"].map(targets)

workdir = "/tf/datasets/augmentexCLSTM/"


In [84]:
meta["id"] = meta["path"]

In [85]:
meta.loc[meta["split"]=="test"]

Unnamed: 0.1,Unnamed: 0,path,locale,split,target,id
0,71684,/tf/datasets/data_untar/cv-corpus-6.1-2020-12-...,en,test,2,/tf/datasets/data_untar/cv-corpus-6.1-2020-12-...
1,88574,/tf/datasets/data_untar/cv-corpus-6.1-2020-12-...,kz,test,0,/tf/datasets/data_untar/cv-corpus-6.1-2020-12-...
2,17681,/tf/datasets/data_untar/cv-corpus-6.1-2020-12-...,kz,test,0,/tf/datasets/data_untar/cv-corpus-6.1-2020-12-...
3,544,/tf/datasets/data_untar/cv-corpus-6.1-2020-12-...,ta,test,3,/tf/datasets/data_untar/cv-corpus-6.1-2020-12-...
4,96896,/tf/datasets/data_untar/cv-corpus-6.1-2020-12-...,kz,test,0,/tf/datasets/data_untar/cv-corpus-6.1-2020-12-...
...,...,...,...,...,...,...
55763,6225,/tf/datasets/data_untar/cv-corpus-6.1-2020-12-...,ru,test,1,/tf/datasets/data_untar/cv-corpus-6.1-2020-12-...
55764,563649,/tf/datasets/data_untar/cv-corpus-6.1-2020-12-...,en,test,2,/tf/datasets/data_untar/cv-corpus-6.1-2020-12-...
55765,12538,/tf/datasets/data_untar/cv-corpus-6.1-2020-12-...,kz,test,0,/tf/datasets/data_untar/cv-corpus-6.1-2020-12-...
55766,259,/tf/datasets/data_untar/cv-corpus-6.1-2020-12-...,ru,test,1,/tf/datasets/data_untar/cv-corpus-6.1-2020-12-...


In [86]:
meta.iloc[0]["path"]

'/tf/datasets/data_untar/cv-corpus-6.1-2020-12-11/ru/clips/common_voice_ru_19559882.mp3'

In [87]:
import tensorflow_io as tfio

In [88]:
import scipy.signal

from lidbox.features import audio, cmvn
import lidbox.data.steps as ds_steps

# preprocessing of audios

TF_AUTOTUNE = tf.data.experimental.AUTOTUNE


def metadata_to_dataset_input(meta):
    return {
        "id": tf.constant(meta.id, tf.string),
        "path": tf.constant(meta.path, tf.string),
        "target": tf.constant(meta.target, tf.int32),
        "split": tf.constant(meta.split, tf.string),
    }

# reading and normalizing data
def read_mp3(x):
    s, r = audio.read_mp3(x["path"])
    out_rate = 16000
    s = audio.resample(s, r, out_rate)
    s = audio.peak_normalize(s, dBFS=-3.0)
    s = audio.remove_silence(s, out_rate)
    return dict(x, signal=s, sample_rate=out_rate)

# augmentations using random filtering
def random_filter(x):
    def scipy_filter(s, N=10):
        b = np_rng.normal(0, 1, N)
        return scipy.signal.lfilter(b, 1.0, s).astype(np.float32), b
    s, _ = tf.numpy_function(
        scipy_filter,
        [x["signal"]],
        [tf.float32, tf.float64],
        name="np_random_filter")
    s = tf.cast(s, tf.float32)
    s = audio.peak_normalize(s, dBFS=-3.0)
    return dict(x, signal=s)

# significant speed change
def random_speed_change(ds):
    return ds_steps.random_signal_speed_change(ds, min=0.5, max=1.5, flag=None)

# spliting the audio on small chunks
def create_signal_chunks(ds):
    ds = ds_steps.repeat_too_short_signals(ds, 3200)
    ds = ds_steps.create_signal_chunks(ds, 3200, 800)
    return ds


def batch_extract_features(x):
    with tf.device("GPU"):
        signals, rates = x["signal"], x["sample_rate"]
        S = audio.spectrograms(signals, rates[0])
        S = audio.linear_to_mel(S, rates[0])
        S = tf.math.log(S + 1e-6)
        mfccs = tf.signal.mfccs_from_log_mel_spectrograms(S)
        mfccs = mfccs[...,1:21]
        S = cmvn(S, normalize_variance=False)
        mfccs_cmvn = cmvn(mfccs)

        #S = tfio.audio.freq_mask(S, param=10)
        #S = tfio.audio.time_mask(S, param=10)
    return dict(x, logmelspec=S, mfccs=mfccs)


def pipeline_from_meta(data, split):
    if split == "train":
        data = data.sample(frac=1, random_state=np_rng.bit_generator)

    ds = (tf.data.Dataset
            .from_tensor_slices(metadata_to_dataset_input(data))
            .map(read_mp3, num_parallel_calls=TF_AUTOTUNE))

    if split == "train":
        return (ds
            .apply(random_speed_change)
           #.cache(os.path.join(cachedir, "data", split))
            .prefetch(32)
            .map(random_filter, num_parallel_calls=TF_AUTOTUNE)
            .apply(create_signal_chunks)
            .batch(32)
            .map(batch_extract_features, num_parallel_calls=TF_AUTOTUNE)
            .unbatch())
    else:
        return (ds
            .apply(create_signal_chunks)
            .batch(32)
            .map(batch_extract_features, num_parallel_calls=TF_AUTOTUNE)
            .unbatch()
            #.cache(os.path.join(cachedir, "data", split))
            .prefetch(1))


cachedir = os.path.join(workdir, "cache")

split2ds = {split: pipeline_from_meta(meta[meta["split"]==split], split)
            for split in meta.split.unique()}


2021-06-14 08:37:40.069 I lidbox.data.steps: Applying random resampling to signals with a random speed ratio chosen uniformly at random from [0.500, 1.500]
2021-06-14 08:37:40.094 I lidbox.data.steps: Repeating all signals until they are at least 3200 ms
2021-06-14 08:37:40.107 I lidbox.data.steps: Dividing every signal in the dataset into new signals by creating signal chunks of length 3200 ms and offset 800 ms. Maximum amount of padding allowed in the last chunk is 0 ms.
2021-06-14 08:37:40.596 I lidbox.data.steps: Repeating all signals until they are at least 3200 ms
2021-06-14 08:37:40.609 I lidbox.data.steps: Dividing every signal in the dataset into new signals by creating signal chunks of length 3200 ms and offset 800 ms. Maximum amount of padding allowed in the last chunk is 0 ms.
2021-06-14 08:37:40.978 I lidbox.data.steps: Repeating all signals until they are at least 3200 ms
2021-06-14 08:37:40.991 I lidbox.data.steps: Dividing every signal in the dataset into new signals by

In [89]:
# Here are all blocks used to build the model. Retrieved from: https://github.com/py-lidbox/lidbox 
from tensorflow.keras.layers import (
    Activation,
    BatchNormalization,
    Conv1D,
    Dense,
    Dropout,
    Input,
    Layer,
    SpatialDropout1D,
)
from tensorflow.keras.models import Model
import tensorflow as tf

def frame_layer(filters, kernel_size, strides, padding="causal", activation="relu", name="frame"):
    return Conv1D(filters, kernel_size, strides, padding=padding, activation=activation, name=name)


def segment_layer(units, activation="relu", name="segment"):
    return Dense(units, activation=activation, name=name)
class GlobalMeanStddevPooling1D(Layer):
    """
    Compute arithmetic mean and standard deviation of the inputs along the time steps dimension,
    then output the concatenation of the computed stats.
    """
    def call(self, inputs):
        means = tf.math.reduce_mean(inputs, axis=TIME_AXIS, keepdims=True)
        variances = tf.math.reduce_mean(tf.math.square(inputs - means), axis=TIME_AXIS)
        means = tf.squeeze(means, TIME_AXIS)
        stddevs = tf.math.sqrt(tf.clip_by_value(variances, STDDEV_SQRT_MIN_CLIP, variances.dtype.max))
        return tf.concat((means, stddevs), axis=TIME_AXIS)

def as_embedding_extractor(m):
    l = m.get_layer(name="segment1")
    l.activation = None
    return Model(inputs=m.inputs, outputs=l.output)

def frequency_attention(H, d_a=64, d_f=16):
    assert not H.shape[2] % d_f, "amount of frequency channels ({}) must be evenly divisible by the amount of frequency attention bins (d_f={})".format(H.shape[2], d_f)
    # Note, we assume that H.shape = (batch_size, T, d_h), but the paper assumes the timesteps come last
    x = Dense(d_a, activation="relu", use_bias=False, name="Wf_1")(H)
    F_A = Dense(d_f, activation="softmax", use_bias=False, name="Wf_2")(x)
    # Apply frequency attention on d_f bins
    F_A = Reshape((F_A.shape[1] or -1, F_A.shape[2], 1), name="expand_bin_weight_dim")(F_A)
    H_bins = Reshape((H.shape[1] or -1, d_f, H.shape[2] // d_f), name="partition_freq_bins")(H)
    H_bins = Multiply(name="freq_attention")([F_A, H_bins])
    # Merge weighted frequency bins
    H_weighted = Reshape((H.shape[1] or -1, H.shape[2]), name="merge_weighted_bins")(H_bins)
    return H_weighted

In [1]:
from tensorflow.keras.layers import (
    Activation,
    BatchNormalization,
    Conv1D,
    Conv2D,
    Dropout,
    Dense,
    GaussianNoise,
    Input,
    Layer,
    LSTM,
    Multiply,
    Reshape,
)
# Assuming spectral features (Batch, Time, Channels), where freq. channels are always last
TIME_AXIS = 1
STDDEV_SQRT_MIN_CLIP = 1e-10
def frequency_attention(H, d_a=64, d_f=16):
    assert not H.shape[2] % d_f, "amount of frequency channels ({}) must be evenly divisible by the amount of frequency attention bins (d_f={})".format(H.shape[2], d_f)
    # Note, we assume that H.shape = (batch_size, T, d_h), but the paper assumes the timesteps come last
    x = Dense(d_a, activation="relu", use_bias=False, name="Wf_1")(H)
    F_A = Dense(d_f, activation="softmax", use_bias=False, name="Wf_2")(x)
    # Apply frequency attention on d_f bins
    F_A = Reshape((F_A.shape[1] or -1, F_A.shape[2], 1), name="expand_bin_weight_dim")(F_A)
    H_bins = Reshape((H.shape[1] or -1, d_f, H.shape[2] // d_f), name="partition_freq_bins")(H)
    H_bins = Multiply(name="freq_attention")([F_A, H_bins])
    # Merge weighted frequency bins
    H_weighted = Reshape((H.shape[1] or -1, H.shape[2]), name="merge_weighted_bins")(H_bins)
    return H_weighted


def create(input_shape, num_outputs, output_activation="log_softmax", use_attention=False, use_conv2d=False, use_lstm=False):
    inputs = Input(shape=input_shape, name="input")
    x = inputs
    x = GaussianNoise(stddev=0.15, name="input_noise")(x)
    x = SpatialDropout1D(0.8, name="channel_dropout")(x)

    if use_conv2d:
        x = Reshape((input_shape[0] or -1, input_shape[1], 1), name="reshape_to_image")(x)
        x = Conv2D(128, (3, 9), (1, 6), activation=None, padding="same", name="conv2d_1")(x)
        x = BatchNormalization(name="conv2d_1_bn")(x)
        x = Activation("relu", name="conv2d_1_relu")(x)
        x = Conv2D(256, (3, 9), (1, 6), activation=None, padding="same", name="conv2d_2")(x)
        x = BatchNormalization(name="conv2d_2_bn")(x)
        x = Activation("relu", name="conv2d_2_relu")(x)
        # x = Reshape((x.shape[1] or -1, x.shape[2] * x.shape[3]), name="flatten_image_channels")(x)
        x = tf.math.reduce_max(x, axis=2, name="maxpool_image_channels")
    
    x = Dropout(rate=0.7, name="dropout1")(x)
    x = frame_layer(512, 5, 1, name="frame1")(x)
    x = frame_layer(512, 3, 2, name="frame2")(x)
    x = frame_layer(512, 3, 3, name="frame3")(x)
    x = Dropout(rate=0.6, name="dropout2")(x)

    if use_lstm:
        x = LSTM(512, name="lstm", return_sequences=True)(x)
    
    x = frame_layer(512, 1, 1, name="frame4")(x)
    x = frame_layer(1500, 1, 1, name="frame5")(x)
    if use_attention:
        x = frequency_attention(x, d_f=60)

    x = GlobalMeanStddevPooling1D(name="stats_pooling")(x)
    x = Dropout(rate=0.5, name="dropout3")(x)

    x = segment_layer(512, name="segment1")(x)
    x = segment_layer(512, name="segment2")(x)
    outputs = Dense(num_outputs, name="output", activation=None)(x)

    if output_activation:
        outputs = Activation(getattr(tf.nn, output_activation), name=str(output_activation))(outputs)
    return Model(inputs=inputs, outputs=outputs, name="CLSTM")

In [2]:
def create_model(num_freq_bins=40, num_labels=len(np.unique(meta.target))):
    m = create(
        input_shape=[None, num_freq_bins],
        use_attention=True, use_conv2d=True, use_lstm=True,
        num_outputs=num_labels)
    m.compile(
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        optimizer=tf.keras.optimizers.Adam(learning_rate=3e-4),
            metrics=tf.keras.metrics.sparse_categorical_accuracy)
    return m

with tf.device("GPU"):
    model = create_model()
    model.summary()
   

callbacks = [
    tf.keras.callbacks.TensorBoard(
        log_dir=os.path.join(cachedir, "tensorboard", model.name),
        update_freq="epoch",
        write_images=True,
        profile_batch=0,
    ),
    tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=5,
    ),
    tf.keras.callbacks.ModelCheckpoint(
        os.path.join(cachedir, "model", model.name),
        monitor='val_loss',
        save_weights_only=True,
        save_best_only=True,
        verbose=1,
    ),
]


def as_model_input(x):
    return x["logmelspec"], x["target"]

NameError: name 'np' is not defined

In [None]:
print("preparing datasets")

train_ds = split2ds["train"].map(as_model_input)
dev_ds = split2ds["dev"].map(as_model_input)

# training
print("start training")    
with tf.device("GPU"):
    history = model.fit(
        train_ds.batch(32).repeat(100),
        steps_per_epoch=2576,
        validation_data=dev_ds.batch(32).repeat(100),
        validation_steps=961,
        callbacks=callbacks,
        verbose=1,
        epochs=100)

tf.keras.models.save_model(
    model, "augmentedCLSTM.h5", overwrite=True, include_optimizer=True, save_format=None,
    signatures=None, options=None, save_traces=True
)

preparing datasets
start training




Epoch 1/100

Epoch 00001: val_loss improved from inf to 1.61568, saving model to /tf/datasets/augmentexCLSTM/cache/model/CLSTM
Epoch 2/100

Epoch 00002: val_loss did not improve from 1.61568
Epoch 3/100

Epoch 00003: val_loss did not improve from 1.61568
Epoch 4/100

Epoch 00004: val_loss improved from 1.61568 to 1.45143, saving model to /tf/datasets/augmentexCLSTM/cache/model/CLSTM
Epoch 5/100

Epoch 00005: val_loss did not improve from 1.45143
Epoch 6/100

Epoch 00006: val_loss did not improve from 1.45143
Epoch 7/100

Epoch 00007: val_loss did not improve from 1.45143
Epoch 8/100

Epoch 00008: val_loss improved from 1.45143 to 1.19717, saving model to /tf/datasets/augmentexCLSTM/cache/model/CLSTM
Epoch 9/100

Epoch 00009: val_loss did not improve from 1.19717
Epoch 10/100

Epoch 00010: val_loss did not improve from 1.19717
Epoch 11/100

Epoch 00011: val_loss did not improve from 1.19717
Epoch 12/100

Epoch 00012: val_loss improved from 1.19717 to 1.08145, saving model to /tf/dataset

In [61]:
import pandas as pd

from lidbox.util import predict_with_model, classification_report
from lidbox.visualize import draw_confusion_matrix


def load_trained_model():
    model = create_model()
    print(os.path.join(cachedir, "model", model.name))
    model.load_weights(os.path.join(cachedir, "model", model.name))
    return model


def display_classification_report(report):
    for m in ("avg_detection_cost", "avg_equal_error_rate", "accuracy"):
        print("{}: {:.3f}".format(m, report[m]))

    lang_metrics = pd.DataFrame.from_dict(
        {k: v for k, v in report.items() if k in lang2target})
    lang_metrics["mean"] = lang_metrics.mean(axis=1)
    display(lang_metrics.T)

    fig, ax = draw_confusion_matrix(report["confusion_matrix"], lang2target)

model = load_trained_model()

def predict_with_ap_loss(x):
    with tf.device("GPU"):
        # Generate language vector for input spectra
        language_vector = model(x["input"], training=False)
        print(language_vector)
        # Predict languages by computing distances to reference directions
        return x["id"], model.loss.predict(language_vector)






/tf/datasets/augmentexCLSTM/cache/model/CLSTM


In [62]:
meta = meta.set_index("id")

In [63]:
# evaluation on the source domain
chunk2pred = predict_with_model(
    model=model,
    ds=split2ds["test"].map(lambda x: dict(x, input=x["logmelspec"])).batch(32),
    #predict_fn=predict_with_ap_loss
    )



In [64]:
chunk2pred


Unnamed: 0_level_0,prediction
id,Unnamed: 1_level_1
/tf/datasets/data_untar/cv-corpus-6.1-2020-12-11/cs/clips/common_voice_cs_20424383.mp3-000001,"[-6.447608, -0.86175203, -2.5532148, -0.6968273]"
/tf/datasets/data_untar/cv-corpus-6.1-2020-12-11/cs/clips/common_voice_cs_20424383.mp3-000002,"[-5.3443165, -0.9268003, -2.1176183, -0.73586935]"
/tf/datasets/data_untar/cv-corpus-6.1-2020-12-11/cs/clips/common_voice_cs_20424383.mp3-000003,"[-5.570038, -1.1016388, -2.1237652, -0.60828584]"
/tf/datasets/data_untar/cv-corpus-6.1-2020-12-11/cs/clips/common_voice_cs_20424383.mp3-000004,"[-5.282734, -1.0610301, -2.595479, -0.5547562]"
/tf/datasets/data_untar/cv-corpus-6.1-2020-12-11/cs/clips/common_voice_cs_20424555.mp3-000001,"[-4.649946, -1.0795896, -2.7285242, -0.5354816]"
...,...
/tf/datasets/data_untar/cv-corpus-6.1-2020-12-11/zh-CN/clips/common_voice_zh-CN_22242819.mp3-000004,"[-1.519991, -2.1595426, -1.8823164, -0.66617185]"
/tf/datasets/data_untar/cv-corpus-6.1-2020-12-11/zh-CN/clips/common_voice_zh-CN_22243431.mp3-000001,"[-3.2410185, -2.0392914, -1.8907969, -0.38595134]"
/tf/datasets/data_untar/cv-corpus-6.1-2020-12-11/zh-CN/clips/common_voice_zh-CN_22243666.mp3-000001,"[-3.3413835, -1.7297513, -1.8790188, -0.4548493]"
/tf/datasets/data_untar/cv-corpus-6.1-2020-12-11/zh-CN/clips/common_voice_zh-CN_22243666.mp3-000002,"[-3.681122, -1.5264689, -2.3076472, -0.4185373]"


In [65]:
from lidbox.util import merge_chunk_predictions


utt2pred = merge_chunk_predictions(chunk2pred)
utt2pred

Unnamed: 0_level_0,prediction
id,Unnamed: 1_level_1
/tf/datasets/data_untar/cv-corpus-6.1-2020-12-11/cs/clips/common_voice_cs_20424383.mp3,"[-5.661174, -0.9878053, -2.3475194, -0.64893466]"
/tf/datasets/data_untar/cv-corpus-6.1-2020-12-11/cs/clips/common_voice_cs_20424555.mp3,"[-5.1115446, -0.9818246, -2.9906769, -0.57174474]"
/tf/datasets/data_untar/cv-corpus-6.1-2020-12-11/cs/clips/common_voice_cs_20424567.mp3,"[-3.223493, -1.5270684, -2.5562506, -0.40734017]"
/tf/datasets/data_untar/cv-corpus-6.1-2020-12-11/cs/clips/common_voice_cs_20424609.mp3,"[-3.3473876, -2.0355728, -1.8754183, -0.3842996]"
/tf/datasets/data_untar/cv-corpus-6.1-2020-12-11/cs/clips/common_voice_cs_20424636.mp3,"[-3.9393935, -1.7496147, -1.5359318, -0.5489157]"
...,...
/tf/datasets/data_untar/cv-corpus-6.1-2020-12-11/zh-CN/clips/common_voice_zh-CN_22242585.mp3,"[-2.5344598, -2.723149, -1.8907971, -0.35086846]"
/tf/datasets/data_untar/cv-corpus-6.1-2020-12-11/zh-CN/clips/common_voice_zh-CN_22242819.mp3,"[-1.7739997, -2.8186498, -1.3505759, -0.71904624]"
/tf/datasets/data_untar/cv-corpus-6.1-2020-12-11/zh-CN/clips/common_voice_zh-CN_22243431.mp3,"[-3.2410185, -2.0392914, -1.8907969, -0.38595134]"
/tf/datasets/data_untar/cv-corpus-6.1-2020-12-11/zh-CN/clips/common_voice_zh-CN_22243666.mp3,"[-3.5112529, -1.6281102, -2.093333, -0.4366933]"


In [66]:
from sklearn.metrics import classification_report

test_meta = meta[meta["split"]=="test"].join(utt2pred, how="outer")
assert not test_meta.isna().any(axis=None), "failed to join predictions"

true_sparse = test_meta.target.to_numpy(np.int32)
pred_dense = np.stack(test_meta.prediction.apply(np.argmax))

report = classification_report(true_sparse, pred_dense, target_names=list(targets.keys()), labels=range(4))
print(report)



              precision    recall  f1-score   support

          kz       0.99      0.74      0.84     17341
          ru       0.66      0.39      0.49     10379
          en       0.83      0.32      0.47     12964
       other       0.42      0.88      0.57     15084

    accuracy                           0.62     55768
   macro avg       0.72      0.58      0.59     55768
weighted avg       0.74      0.62      0.62     55768



## VOX data

In [67]:
# Here the code is similar to the one used before, but it is used to load the target domain
import numpy as np
np_rng = np.random.default_rng(1)

tf.random.set_seed(np_rng.integers(0, tf.int64.max))



import urllib.parse
from IPython.display import display, Markdown

import os

from lidbox.meta import (
    common_voice,
    generate_label2target,
    verify_integrity,
    read_audio_durations,
    random_oversampling_on_split
)

tf.random.set_seed(np_rng.integers(0, tf.int64.max))

train = pd.read_csv("train.tsv", sep="\t")
test = pd.read_csv("new_test.tsv", sep="\t")
dev = pd.read_csv("new_dev.tsv", sep="\t")

train["path"] = train["path"].apply(lambda x: x[:-3] + "mp3")
test["path"] = test["path"].apply(lambda x: x[:-3] + "mp3")
dev["path"] = dev["path"].apply(lambda x: x[:-3] + "mp3")

train["split"] = "train"
test["split"] = "test"
dev["split"] = "dev"
meta = pd.concat([train, test, dev])


In [68]:
meta.loc[((meta["locale"] != "kz") & ~(((meta["split"] == "dev") | (meta["split"] == "test")) & ((meta["locale"] == "ru") | (meta["locale"] == "kz") | (meta["locale"] == "en")))), "path"] = "/tf/datasets/data_untar/cv-corpus-6.1-2020-12-11/" + meta.loc[((meta["locale"] != "kz") & ~(((meta["split"] == "dev") | (meta["split"] == "test")) & ((meta["locale"] == "ru") | (meta["locale"] == "kz") | (meta["locale"] == "en"))))]["locale"]  + "/clips/" + meta.loc[((meta["locale"] != "kz") & ~(((meta["split"] == "dev") | (meta["split"] == "test")) & ((meta["locale"] == "ru") | (meta["locale"] == "kz") | (meta["locale"] == "en"))))]["path"]
targets = {"kz": 0, "ru": 1, "en":2, "other":3}
meta["target"] = meta["locale"]
meta.loc[(meta["locale"] != "kz") & (meta["locale"] != "ru") & (meta["locale"]!="en"), "target"] = "other"
meta = meta.loc[meta["path"] != "/tf/datasets/data_untar/cv-corpus-6.1-2020-12-11/kz/clips/5f590a130a73c.mp3"]
meta = meta.loc[meta["path"] != "/tf/datasets/data_untar/cv-corpus-6.1-2020-12-11/kz/clips/5ef9bd9ba7029.mp3"]

meta["id"] = str(meta["Unnamed: 0"])
meta["target"] = meta["target"].map(targets)




In [69]:
meta.loc[(meta["split"] == "test") & (meta["locale"] == "ru"), "path"] = meta.loc[(meta["split"] == "test") & (meta["locale"] == "ru")]["path"].apply(lambda x: f"/tf/datasets/vox/ru_test/{x}")
meta.loc[(meta["split"] == "test") & (meta["locale"] == "ru"), "path"]
meta.loc[(meta["split"] == "test") & (meta["locale"] == "kz"), "path"] = meta.loc[(meta["split"] == "test") & (meta["locale"] == "kz")]["path"].apply(lambda x: f"/tf/datasets/vox/kz_test/{x}")
meta.loc[(meta["split"] == "test") & (meta["locale"] == "kz"), "path"] 
meta.loc[(meta["split"] == "test") & (meta["locale"] == "en"), "path"] = meta.loc[(meta["split"] == "test") & (meta["locale"] == "en")]["path"].apply(lambda x: f"/tf/datasets/vox/en_test/{x}")
meta.loc[(meta["split"] == "test") & (meta["locale"] == "en"), "path"] 

0       /tf/datasets/vox/en_test/shrDRhToGpY__U__S133-...
1       /tf/datasets/vox/en_test/mzfg0RGJnV8__U__S123-...
2       /tf/datasets/vox/en_test/-_PPCH3y0eE__U__S1---...
3       /tf/datasets/vox/en_test/DQMxvGYyu6Q__U__S0---...
4       /tf/datasets/vox/en_test/x4lfSc7PrB0__U__S0---...
                              ...                        
9995    /tf/datasets/vox/en_test/KLiy94kfZI4__U__S133-...
9996    /tf/datasets/vox/en_test/YTlliEr5LOA__U__S113-...
9997    /tf/datasets/vox/en_test/bSs0gNq6Kkc__U__S0---...
9998    /tf/datasets/vox/en_test/Da7c-BY6MDA__U__S2---...
9999    /tf/datasets/vox/en_test/VWvPndMo1F8__U__S24--...
Name: path, Length: 10000, dtype: object

In [70]:
meta.loc[meta["split"]=="test", "Unnamed: 0"] = meta.loc[meta["split"]=="test"]["path"]

In [71]:
meta["id"] = meta["Unnamed: 0"].apply(str)

In [72]:
meta.loc[meta["split"] == "test", "id"] = meta.loc[meta["split"] == "test"]["path"]

In [73]:
meta = meta.set_index("Unnamed: 0")
meta.loc[meta["split"]=="test"]

Unnamed: 0_level_0,path,locale,split,target,id
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
/tf/datasets/vox/en_test/shrDRhToGpY__U__S133---0944.430-0958.260.mp3,/tf/datasets/vox/en_test/shrDRhToGpY__U__S133-...,en,test,2,/tf/datasets/vox/en_test/shrDRhToGpY__U__S133-...
/tf/datasets/vox/en_test/mzfg0RGJnV8__U__S123---0427.020-0444.670.mp3,/tf/datasets/vox/en_test/mzfg0RGJnV8__U__S123-...,en,test,2,/tf/datasets/vox/en_test/mzfg0RGJnV8__U__S123-...
/tf/datasets/vox/en_test/-_PPCH3y0eE__U__S1---0398.760-0403.940.mp3,/tf/datasets/vox/en_test/-_PPCH3y0eE__U__S1---...,en,test,2,/tf/datasets/vox/en_test/-_PPCH3y0eE__U__S1---...
/tf/datasets/vox/en_test/DQMxvGYyu6Q__U__S0---1473.480-1485.720.mp3,/tf/datasets/vox/en_test/DQMxvGYyu6Q__U__S0---...,en,test,2,/tf/datasets/vox/en_test/DQMxvGYyu6Q__U__S0---...
/tf/datasets/vox/en_test/x4lfSc7PrB0__U__S0---0125.230-0140.900.mp3,/tf/datasets/vox/en_test/x4lfSc7PrB0__U__S0---...,en,test,2,/tf/datasets/vox/en_test/x4lfSc7PrB0__U__S0---...
...,...,...,...,...,...
/tf/datasets/data_untar/cv-corpus-6.1-2020-12-11/it/clips/common_voice_it_20015623.mp3,/tf/datasets/data_untar/cv-corpus-6.1-2020-12-...,it,test,3,/tf/datasets/data_untar/cv-corpus-6.1-2020-12-...
/tf/datasets/data_untar/cv-corpus-6.1-2020-12-11/uk/clips/common_voice_uk_23554602.mp3,/tf/datasets/data_untar/cv-corpus-6.1-2020-12-...,uk,test,3,/tf/datasets/data_untar/cv-corpus-6.1-2020-12-...
/tf/datasets/data_untar/cv-corpus-6.1-2020-12-11/tr/clips/common_voice_tr_20416266.mp3,/tf/datasets/data_untar/cv-corpus-6.1-2020-12-...,tr,test,3,/tf/datasets/data_untar/cv-corpus-6.1-2020-12-...
/tf/datasets/data_untar/cv-corpus-6.1-2020-12-11/it/clips/common_voice_it_20263173.mp3,/tf/datasets/data_untar/cv-corpus-6.1-2020-12-...,it,test,3,/tf/datasets/data_untar/cv-corpus-6.1-2020-12-...


In [74]:
meta.loc[meta["split"] == "test"] = meta.loc[(meta["split"] == "test") & (meta["target"] != 3)] 

In [75]:
import scipy.signal

from lidbox.features import audio, cmvn
import lidbox.data.steps as ds_steps


TF_AUTOTUNE = tf.data.experimental.AUTOTUNE


def metadata_to_dataset_input(meta):
    return {
        "id": tf.constant(meta.id, tf.string),
        "path": tf.constant(meta.path, tf.string),
        "target": tf.constant(meta.target, tf.int32),
        "split": tf.constant(meta.split, tf.string),
    }

def read_mp3(x):
    s, r = audio.read_mp3(x["path"])
    out_rate = 16000
    s = audio.resample(s, r, out_rate)
    s = audio.peak_normalize(s, dBFS=-3.0)
    s = audio.remove_silence(s, out_rate)
    return dict(x, signal=s, sample_rate=out_rate)


def random_filter(x):
    def scipy_filter(s, N=10):
        b = np_rng.normal(0, 1, N)
        return scipy.signal.lfilter(b, 1.0, s).astype(np.float32), b
    s, _ = tf.numpy_function(
        scipy_filter,
        [x["signal"]],
        [tf.float32, tf.float64],
        name="np_random_filter")
    s = tf.cast(s, tf.float32)
    s = audio.peak_normalize(s, dBFS=-3.0)
    return dict(x, signal=s)


def random_speed_change(ds):
    return ds_steps.random_signal_speed_change(ds, min=0.9, max=1.1, flag=None)


def create_signal_chunks(ds):
    ds = ds_steps.repeat_too_short_signals(ds, 3200)
    ds = ds_steps.create_signal_chunks(ds, 3200, 800)
    return ds


def batch_extract_features(x):
    with tf.device("GPU"):
        signals, rates = x["signal"], x["sample_rate"]
        S = audio.spectrograms(signals, rates[0])
        S = audio.linear_to_mel(S, rates[0])
        S = tf.math.log(S + 1e-6)
        S = cmvn(S, normalize_variance=False)
    return dict(x, logmelspec=S)

def pipeline_from_meta(data, split):
    if split == "train":
        data = data.sample(frac=1, random_state=np_rng.bit_generator)

    ds = (tf.data.Dataset
            .from_tensor_slices(metadata_to_dataset_input(data))
            .map(read_mp3, num_parallel_calls=TF_AUTOTUNE))

    if split == "train":
        return (ds
            .apply(random_speed_change)
           #.cache(os.path.join(cachedir, "data", split))
            .prefetch(1)
            .map(random_filter, num_parallel_calls=TF_AUTOTUNE)
            .apply(create_signal_chunks)
            .batch(1)
            .map(batch_extract_features, num_parallel_calls=TF_AUTOTUNE)
            .unbatch())
    else:
        return (ds
            .apply(create_signal_chunks)
            .batch(1)
            .map(batch_extract_features, num_parallel_calls=TF_AUTOTUNE)
            .unbatch()
            #.cache(os.path.join(cachedir, "data", split))
            .prefetch(1))


cachedir = os.path.join(workdir, "cache")

split2ds = {split: pipeline_from_meta(meta[meta["split"]==split], split)
            for split in meta.split.unique()}

2021-06-14 08:16:01.183 I lidbox.data.steps: Applying random resampling to signals with a random speed ratio chosen uniformly at random from [0.900, 1.100]
2021-06-14 08:16:01.308 I lidbox.data.steps: Repeating all signals until they are at least 3200 ms
2021-06-14 08:16:01.321 I lidbox.data.steps: Dividing every signal in the dataset into new signals by creating signal chunks of length 3200 ms and offset 800 ms. Maximum amount of padding allowed in the last chunk is 0 ms.
2021-06-14 08:16:01.747 I lidbox.data.steps: Repeating all signals until they are at least 3200 ms
2021-06-14 08:16:01.760 I lidbox.data.steps: Dividing every signal in the dataset into new signals by creating signal chunks of length 3200 ms and offset 800 ms. Maximum amount of padding allowed in the last chunk is 0 ms.
2021-06-14 08:16:02.071 I lidbox.data.steps: Repeating all signals until they are at least 3200 ms
2021-06-14 08:16:02.083 I lidbox.data.steps: Dividing every signal in the dataset into new signals by

In [76]:
meta = meta.set_index("id")

In [77]:
# testing on target domain
chunk2pred = predict_with_model(
    model=model,
    ds=split2ds["test"].map(lambda x: dict(x, input=x["logmelspec"])).batch(32),
    #predict_fn=predict_with_ap_loss
    )



In [78]:
chunk2pred

Unnamed: 0_level_0,prediction
id,Unnamed: 1_level_1
/tf/datasets/vox/en_test/-BwrRlUdfEs__U__S0---0003.940-0020.570.mp3-000001,"[-0.9396108, -2.60894, -1.6099796, -1.09149]"
/tf/datasets/vox/en_test/-BwrRlUdfEs__U__S0---0003.940-0020.570.mp3-000002,"[-1.4211943, -2.865948, -1.5590718, -0.7106717]"
/tf/datasets/vox/en_test/-BwrRlUdfEs__U__S0---0003.940-0020.570.mp3-000003,"[-1.3829262, -2.940194, -1.6502106, -0.68460065]"
/tf/datasets/vox/en_test/-BwrRlUdfEs__U__S0---0003.940-0020.570.mp3-000004,"[-1.1061298, -2.8974087, -1.758347, -0.81719756]"
/tf/datasets/vox/en_test/-BwrRlUdfEs__U__S0---0003.940-0020.570.mp3-000005,"[-0.87291104, -3.4695358, -1.551116, -1.0813907]"
...,...
/tf/datasets/vox/ru_test/ztSbqN-mPtM__U__S20---0277.000-0287.980.mp3-000010,"[-0.33672518, -2.9591024, -3.0301414, -1.6835115]"
/tf/datasets/vox/ru_test/ztSbqN-mPtM__U__S52---0301.200-0306.760.mp3-000001,"[-0.9360744, -2.0881443, -1.4914705, -1.3514011]"
/tf/datasets/vox/ru_test/ztSbqN-mPtM__U__S52---0301.200-0306.760.mp3-000002,"[-1.7620763, -2.266904, -1.3255211, -0.7786726]"
/tf/datasets/vox/ru_test/ztSbqN-mPtM__U__S52---0301.200-0306.760.mp3-000003,"[-2.0707474, -2.0226097, -1.1285616, -0.87203324]"


In [79]:
from lidbox.util import merge_chunk_predictions


utt2pred = merge_chunk_predictions(chunk2pred)
utt2pred

Unnamed: 0_level_0,prediction
id,Unnamed: 1_level_1
/tf/datasets/vox/en_test/-BwrRlUdfEs__U__S0---0003.940-0020.570.mp3,"[-1.2432518, -2.7782032, -1.5437062, -0.98101324]"
/tf/datasets/vox/en_test/-BwrRlUdfEs__U__S0---0743.730-0757.100.mp3,"[-1.3749871, -3.0530126, -1.3759043, -0.8816973]"
/tf/datasets/vox/en_test/-BwrRlUdfEs__U__S100---0650.920-0661.560.mp3,"[-1.144896, -2.6072266, -1.4575762, -1.047575]"
/tf/datasets/vox/en_test/-BwrRlUdfEs__U__S100---0692.790-0704.510.mp3,"[-1.05083, -2.7411377, -1.45535, -1.2267107]"
/tf/datasets/vox/en_test/-BwrRlUdfEs__U__S100---0705.010-0711.610.mp3,"[-2.0527196, -1.8869089, -1.5922439, -0.7132508]"
...,...
/tf/datasets/vox/ru_test/ztSbqN-mPtM__U__S20---0219.180-0230.690.mp3,"[-1.1549723, -2.1252553, -2.3519063, -0.9764026]"
/tf/datasets/vox/ru_test/ztSbqN-mPtM__U__S20---0230.690-0247.370.mp3,"[-1.7505469, -1.7156585, -2.5320637, -0.85430783]"
/tf/datasets/vox/ru_test/ztSbqN-mPtM__U__S20---0247.370-0257.750.mp3,"[-1.9622586, -2.3761766, -2.0075784, -0.92366135]"
/tf/datasets/vox/ru_test/ztSbqN-mPtM__U__S20---0277.000-0287.980.mp3,"[-1.7208687, -1.7513126, -2.4534397, -0.88329303]"


In [80]:
from sklearn.metrics import classification_report

test_meta = meta[meta["split"]=="test"].join(utt2pred, how="outer")
assert not test_meta.isna().any(axis=None), "failed to join predictions"

true_sparse = test_meta.target.to_numpy(np.int32)
pred_dense = np.stack(test_meta.prediction.apply(np.argmax))

report = classification_report(true_sparse, pred_dense, target_names=list(targets.keys()), labels=range(4))
print(report)



              precision    recall  f1-score   support

          kz       0.39      0.34      0.36     13946
          ru       0.85      0.03      0.06     12107
          en       0.77      0.04      0.08     10000
       other       0.00      0.00      0.00         0

    accuracy                           0.16     36053
   macro avg       0.50      0.11      0.13     36053
weighted avg       0.65      0.16      0.19     36053



  _warn_prf(average, modifier, msg_start, len(result))
