In [16]:
import pandas as pd
import tensorflow as tf

# Automatically reload imported modules that are changed outside this notebook
# More pixels in figures
import matplotlib.pyplot as plt
plt.rcParams["figure.dpi"] = 200


In [17]:
import numpy as np
np_rng = np.random.default_rng(1)

tf.random.set_seed(np_rng.integers(0, tf.int64.max))



import urllib.parse
from IPython.display import display, Markdown

import os

from lidbox.meta import (
    common_voice,
    generate_label2target,
    verify_integrity,
    read_audio_durations,
    random_oversampling_on_split
)

tf.random.set_seed(np_rng.integers(0, tf.int64.max))

train = pd.read_csv("train.tsv", sep="\t")
test = pd.read_csv("test.tsv", sep="\t")
dev = pd.read_csv("dev.tsv", sep="\t")

train["path"] = train["path"].apply(lambda x: x[:-3] + "mp3")
test["path"] = test["path"].apply(lambda x: x[:-3] + "mp3")
dev["path"] = dev["path"].apply(lambda x: x[:-3] + "mp3")

train["split"] = "train"
test["split"] = "test"
dev["split"] = "dev"
#test = test.sample(30000, replace=False)
meta = pd.concat([train, test, dev])


In [18]:
# some preprocessing to make sure that the path is correct
meta.loc[meta["locale"] != "kz", "path"] = "/tf/datasets/data_untar/cv-corpus-6.1-2020-12-11/" +  meta.loc[meta["locale"] != "kz"]["locale"] + "/clips/" + meta.loc[meta["locale"] != "kz"]["path"]
targets = {"kz": 0, "ru": 1, "en":2, "other":3}
meta["target"] = meta["locale"]
meta.loc[(meta["locale"] != "kz") & (meta["locale"] != "ru") & (meta["locale"]!="en"), "target"] = "other"
meta = meta.loc[meta["path"] != "/tf/datasets/data_untar/cv-corpus-6.1-2020-12-11/kz/clips/5f590a130a73c.mp3"]
meta = meta.loc[meta["path"] != "/tf/datasets/data_untar/cv-corpus-6.1-2020-12-11/kz/clips/5ef9bd9ba7029.mp3"]

meta["id"] = meta["Unnamed: 0"].apply(str)
meta["target"] = meta["target"].map(targets)



In [19]:
workdir = "/tf/datasets/augmentedXvector/"


In [20]:
meta["id"] = meta["path"]

In [21]:
import tensorflow_io as tfio

In [22]:
import scipy.signal

from lidbox.features import audio, cmvn
import lidbox.data.steps as ds_steps


TF_AUTOTUNE = tf.data.experimental.AUTOTUNE

# preprocessing of audios


def metadata_to_dataset_input(meta):
    return {
        "id": tf.constant(meta.id, tf.string),
        "path": tf.constant(meta.path, tf.string),
        "target": tf.constant(meta.target, tf.int32),
        "split": tf.constant(meta.split, tf.string),
    }


# reading and normalizing data

def read_mp3(x):
    s, r = audio.read_mp3(x["path"])
    out_rate = 16000
    s = audio.resample(s, r, out_rate)
    s = audio.peak_normalize(s, dBFS=-3.0)
    s = audio.remove_silence(s, out_rate)
    return dict(x, signal=s, sample_rate=out_rate)



# augmentations using random filtering
def random_filter(x):
    def scipy_filter(s, N=10):
        b = np_rng.normal(0, 1, N)
        return scipy.signal.lfilter(b, 1.0, s).astype(np.float32), b
    s, _ = tf.numpy_function(
        scipy_filter,
        [x["signal"]],
        [tf.float32, tf.float64],
        name="np_random_filter")
    s = tf.cast(s, tf.float32)
    s = audio.peak_normalize(s, dBFS=-3.0)
    return dict(x, signal=s)


# significant speed change
def random_speed_change(ds):
    return ds_steps.random_signal_speed_change(ds, min=0.5, max=1.5, flag=None)


def create_signal_chunks(ds):
    ds = ds_steps.repeat_too_short_signals(ds, 3200)
    ds = ds_steps.create_signal_chunks(ds, 3200, 800)
    return ds


def batch_extract_features(x):
    with tf.device("GPU"):
        signals, rates = x["signal"], x["sample_rate"]
        S = audio.spectrograms(signals, rates[0])
        S = audio.linear_to_mel(S, rates[0])
        S = tf.math.log(S + 1e-6)
        mfccs = tf.signal.mfccs_from_log_mel_spectrograms(S)
        mfccs = mfccs[...,1:21]
        S = cmvn(S, normalize_variance=False)
        mfccs_cmvn = cmvn(mfccs)

        #S = tfio.audio.freq_mask(S, param=10)
        #S = tfio.audio.time_mask(S, param=10)
    return dict(x, logmelspec=S, mfccs=mfccs)


def pipeline_from_meta(data, split):
    if split == "train":
        data = data.sample(frac=1, random_state=np_rng.bit_generator)

    ds = (tf.data.Dataset
            .from_tensor_slices(metadata_to_dataset_input(data))
            .map(read_mp3, num_parallel_calls=TF_AUTOTUNE))

    if split == "train":
        return (ds
            .apply(random_speed_change)
           #.cache(os.path.join(cachedir, "data", split))
            .prefetch(32)
            .map(random_filter, num_parallel_calls=TF_AUTOTUNE)
            .apply(create_signal_chunks)
            .batch(32)
            .map(batch_extract_features, num_parallel_calls=TF_AUTOTUNE)
            .unbatch())
    else:
        return (ds
            .apply(create_signal_chunks)
            .batch(32)
            .map(batch_extract_features, num_parallel_calls=TF_AUTOTUNE)
            .unbatch()
            #.cache(os.path.join(cachedir, "data", split))
            .prefetch(1))


cachedir = os.path.join(workdir, "cache")



In [23]:
val_data = {split: pipeline_from_meta(meta[meta["split"]==split], split)
            for split in meta.split.unique()}
val_data = val_data['dev']

2021-06-27 11:18:18.170 I lidbox.data.steps: Applying random resampling to signals with a random speed ratio chosen uniformly at random from [0.500, 1.500]
2021-06-27 11:18:18.194 I lidbox.data.steps: Repeating all signals until they are at least 3200 ms
2021-06-27 11:18:18.206 I lidbox.data.steps: Dividing every signal in the dataset into new signals by creating signal chunks of length 3200 ms and offset 800 ms. Maximum amount of padding allowed in the last chunk is 0 ms.
2021-06-27 11:18:18.577 I lidbox.data.steps: Repeating all signals until they are at least 3200 ms
2021-06-27 11:18:18.590 I lidbox.data.steps: Dividing every signal in the dataset into new signals by creating signal chunks of length 3200 ms and offset 800 ms. Maximum amount of padding allowed in the last chunk is 0 ms.
2021-06-27 11:18:18.946 I lidbox.data.steps: Repeating all signals until they are at least 3200 ms
2021-06-27 11:18:18.959 I lidbox.data.steps: Dividing every signal in the dataset into new signals by

In [24]:
split2ds = {split: pipeline_from_meta(meta[meta["split"]==split], split)
            for split in meta.split.unique()}


2021-06-27 11:18:19.549 I lidbox.data.steps: Applying random resampling to signals with a random speed ratio chosen uniformly at random from [0.500, 1.500]
2021-06-27 11:18:19.572 I lidbox.data.steps: Repeating all signals until they are at least 3200 ms
2021-06-27 11:18:19.584 I lidbox.data.steps: Dividing every signal in the dataset into new signals by creating signal chunks of length 3200 ms and offset 800 ms. Maximum amount of padding allowed in the last chunk is 0 ms.
2021-06-27 11:18:19.944 I lidbox.data.steps: Repeating all signals until they are at least 3200 ms
2021-06-27 11:18:19.957 I lidbox.data.steps: Dividing every signal in the dataset into new signals by creating signal chunks of length 3200 ms and offset 800 ms. Maximum amount of padding allowed in the last chunk is 0 ms.
2021-06-27 11:18:20.307 I lidbox.data.steps: Repeating all signals until they are at least 3200 ms
2021-06-27 11:18:20.320 I lidbox.data.steps: Dividing every signal in the dataset into new signals by

In [97]:
# Here are all blocks used to build the model. Retrieved from: https://github.com/py-lidbox/lidbox 

from tensorflow.keras.layers import (
    Activation,
    BatchNormalization,
    Conv1D,
 
   Dense,
    Dropout,
    Input,
    Layer,
    SpatialDropout1D,
)
from tensorflow.keras.models import Model
import tensorflow as tf

# Assuming spectral features (Batch, Time, Channels), where freq. channels are always last
TIME_AXIS = 1
STDDEV_SQRT_MIN_CLIP = 1e-10


class GlobalMeanStddevPooling1D(Layer):
    """
    Compute arithmetic mean and standard deviation of the inputs along the time steps dimension,
    then output the concatenation of the computed stats.
    """
    def call(self, inputs):
        means = tf.math.reduce_mean(inputs, axis=TIME_AXIS, keepdims=True)
        variances = tf.math.reduce_mean(tf.math.square(inputs - means), axis=TIME_AXIS)
        means = tf.squeeze(means, TIME_AXIS)
        stddevs = tf.math.sqrt(tf.clip_by_value(variances, STDDEV_SQRT_MIN_CLIP, variances.dtype.max))
        return tf.concat((means, stddevs), axis=TIME_AXIS)


def frame_layer(filters, kernel_size, strides, padding="causal", activation="relu", name="frame"):
    return Conv1D(filters, kernel_size, strides, padding=padding, activation=activation, name=name)


def segment_layer(units, activation="relu", name="segment"):
    return Dense(units, activation=activation, name=name)


from tensorflow.keras.layers import (
    Activation,
    BatchNormalization,
    Conv1D,
    Conv2D,
    Dropout,
    Dense,
    GaussianNoise,
    Input,
    Layer,
    LSTM,
    Multiply,
    Reshape,
)

from tensorflow.keras.models import Model
import tensorflow as tf


def frequency_attention(H, d_a=64, d_f=16):
    assert not H.shape[2] % d_f, "amount of frequency channels ({}) must be evenly divisible by the amount of frequency attention bins (d_f={})".format(H.shape[2], d_f)
    # Note, we assume that H.shape = (batch_size, T, d_h), but the paper assumes the timesteps come last
    x = Dense(d_a, activation="relu", use_bias=False, name="Wf_1")(H)
    F_A = Dense(d_f, activation="softmax", use_bias=False, name="Wf_2")(x)
    # Apply frequency attention on d_f bins
    F_A = Reshape((F_A.shape[1] or -1, F_A.shape[2], 1), name="expand_bin_weight_dim")(F_A)
    H_bins = Reshape((H.shape[1] or -1, d_f, H.shape[2] // d_f), name="partition_freq_bins")(H)
    H_bins = Multiply(name="freq_attention")([F_A, H_bins])
    # Merge weighted frequency bins
    H_weighted = Reshape((H.shape[1] or -1, H.shape[2]), name="merge_weighted_bins")(H_bins)
    return H_weighted



from tensorflow.keras.layers import (
    Activation,
    Dense,
    Input,
)

from tensorflow.keras.models import Model
import tensorflow as tf



def create(input_shape, num_outputs, output_activation="log_softmax", freq_attention_bins=60):
    inputs = Input(shape=input_shape, name="input")

    x = frame_layer(512, 5, 1, name="frame1")(inputs)
    x = frame_layer(512, 3, 2, name="frame2")(x)
    x = frame_layer(512, 3, 3, name="frame3")(x)
    x = frame_layer(512, 1, 1, name="frame4")(x)
    x = frame_layer(1500, 1, 1, name="frame5")(x)

    x = frequency_attention(x, d_f=freq_attention_bins)

    x = GlobalMeanStddevPooling1D(name="stats_pooling")(x)

    x = segment_layer(512, name="segment1")(x)
    x = segment_layer(512, name="segment2")(x)

    
    return Model(inputs=inputs, outputs=x, name="x-vector-frequency-attention")

In [98]:
def create_model(num_freq_bins=40, num_labels=len(np.unique(meta.target))):
    m = create(
        input_shape=[None, num_freq_bins],
        num_outputs=num_labels)
    """
    m.compile(
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        optimizer=tf.keras.optimizers.Adam(learning_rate=3e-4),
            metrics=tf.keras.metrics.sparse_categorical_accuracy)
    """
    return m

with tf.device("GPU"):
    model = create_model()
    model.summary()
   

Model: "x-vector-frequency-attention"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input (InputLayer)              [(None, None, 40)]   0                                            
__________________________________________________________________________________________________
frame1 (Conv1D)                 (None, None, 512)    102912      input[0][0]                      
__________________________________________________________________________________________________
frame2 (Conv1D)                 (None, None, 512)    786944      frame1[0][0]                     
__________________________________________________________________________________________________
frame3 (Conv1D)                 (None, None, 512)    786944      frame2[0][0]                     
_______________________________________________________________________

In [110]:
# the Xvector-based model is used as an encoder. That is why two classifiers will use its outputs
model.trainable = False
x = model.layers[-1].output
x = Dense(128, activation = "relu")(x)
predictions = Dense(4, activation = "softmax")(x)
clf1 = Model(inputs = model.input, outputs = predictions, name="clf1")

In [111]:
clf1.summary()

Model: "clf1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input (InputLayer)              [(None, None, 40)]   0                                            
__________________________________________________________________________________________________
frame1 (Conv1D)                 (None, None, 512)    102912      input[0][0]                      
__________________________________________________________________________________________________
frame2 (Conv1D)                 (None, None, 512)    786944      frame1[0][0]                     
__________________________________________________________________________________________________
frame3 (Conv1D)                 (None, None, 512)    786944      frame2[0][0]                     
_______________________________________________________________________________________________

In [112]:
for l in clf1.layers:
    if l.trainable:
        print(l.name)

dense_24
dense_25


In [113]:
clf1.compile(
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
            metrics=tf.keras.metrics.sparse_categorical_accuracy)

In [114]:
model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        optimizer=tf.keras.optimizers.Adam(learning_rate=3e-4),
            metrics=tf.keras.metrics.sparse_categorical_accuracy)



In [104]:
for l in model.layers:
    if l.trainable:
        print(l)
model.summary()

Model: "x-vector-frequency-attention"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input (InputLayer)              [(None, None, 40)]   0                                            
__________________________________________________________________________________________________
frame1 (Conv1D)                 (None, None, 512)    102912      input[0][0]                      
__________________________________________________________________________________________________
frame2 (Conv1D)                 (None, None, 512)    786944      frame1[0][0]                     
__________________________________________________________________________________________________
frame3 (Conv1D)                 (None, None, 512)    786944      frame2[0][0]                     
_______________________________________________________________________

In [115]:
model.trainable = False
x = model.layers[-1].output
x = Dense(128, activation = "relu")(x)
predictions = Dense(4, activation = "softmax")(x)
clf2 = Model(inputs = model.input, outputs = predictions, name="clf2")

In [116]:
clf2.compile(
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
            metrics=tf.keras.metrics.sparse_categorical_accuracy)

In [117]:
clf2.trainable = True
model.trainable = False
for l in clf2.layers:
    if l.trainable:
        print(l.name)

dense_26
dense_27


## Training

In [118]:
# training with domain adaptation. It was retrieved from: https://github.com/mil-tokyo/MCD_DA
from tqdm import tqdm
EPOCHS = 100 
dev_iterator = iter(split2ds["dev"].batch(32).repeat(1000))
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-4)
optimizer2 = tf.keras.optimizers.Adam(learning_rate=1e-4)

loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
train_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()
val_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()
best_acc = 0
counter = 0
MAX_PATIENCE = 5
c = 0

def discrepancy(out1, out2):
    return tf.reduce_mean(tf.abs(out1 - out2))

for epoch in range(EPOCHS):
    print(f"It is epoch number: {epoch}")
    avg_loss = 0
    for i in tqdm(split2ds["train"].batch(32)):
        c += 1
        clf1.trainable = True
        clf2.trainable = True
        model.trainable = True
        target_data = next(dev_iterator)
        s_specs = i['logmelspec']
        t_specs = target_data['logmelspec']
        # Train for classification
        with tf.GradientTape() as tape:
            pred1 = clf1(s_specs)
            pred2 = clf2(s_specs)
            loss_classification_1 = loss_fn(i['target'], pred1)
            loss_classification_2 = loss_fn(i['target'], pred2)
            total_classification_loss = loss_classification_1 + loss_classification_2
            avg_loss += total_classification_loss
            
        train_acc_metric.update_state(i['target'], pred1)
        grads = tape.gradient(total_classification_loss, [clf1.trainable_weights, clf2.trainable_weights, model.trainable_weights])
        optimizer.apply_gradients(zip(grads[0], clf1.trainable_weights))
        optimizer.apply_gradients(zip(grads[1], clf2.trainable_weights))
        optimizer.apply_gradients(zip(grads[2], model.trainable_weights))
        # Train for discrepancy increase
        model.trainable = False
        with tf.GradientTape() as tape2:
            pred1 = clf1(s_specs)
            pred2 = clf2(s_specs)
            loss_classification_1 = loss_fn(i['target'], pred1)
            loss_classification_2 = loss_fn(i['target'], pred2)
            total_classification_loss = loss_classification_1 + loss_classification_2
            pred1 = clf1(t_specs)
            pred2 = clf2(t_specs)
            discrepancy_loss = discrepancy(pred1, pred2)
            dl1 = discrepancy_loss
            loss = total_classification_loss - discrepancy_loss

        grads = tape2.gradient(loss, [clf1.trainable_weights, clf2.trainable_weights])
        optimizer.apply_gradients(zip(grads[0], clf1.trainable_weights))
        optimizer.apply_gradients(zip(grads[1], clf2.trainable_weights))
        # Train for discrepancy decrease
        clf1.trainable = False
        clf2.trainable = False
        model.trainable = True
        dloss = 0
        for k in range(3):
            with tf.GradientTape() as tape3:
                pred1 = clf1(t_specs)
                pred2 = clf2(t_specs)
                discrepancy_loss = discrepancy(pred1, pred2)
                dloss = discrepancy_loss
            grads = tape3.gradient(discrepancy_loss, model.trainable_weights)            
            optimizer2.apply_gradients(zip(grads, model.trainable_weights))
        if c % 1000 == 0:
            tqdm.write(f"loss: {total_classification_loss}, discrepancy before: {dl1}, discrepancy loss after: {dloss}")
            c = 0
    for batch in tqdm(val_data.batch(32)):
        val_preds = clf1(batch['logmelspec'])
        true_vals = batch['target']  
        val_acc_metric.update_state(true_vals, val_preds)
    new_acc = 0
    val_acc = val_acc_metric.result()
    new_acc += val_acc
    val_acc_metric.reset_states()
    train_acc = train_acc_metric.result()
    train_acc_metric.reset_states()
    print(f"Val acc: {val_acc}, Train acc: {train_acc}. Train loss (clf1): {avg_loss/9783}")
    for batch in tqdm(val_data.batch(32)):
        val_preds = clf2(batch['logmelspec'])
        true_vals = batch['target']  
        val_acc_metric.update_state(true_vals, val_preds)

    val_acc = val_acc_metric.result()
    val_acc_metric.reset_states()
    train_acc = train_acc_metric.result()
    train_acc_metric.reset_states()
    print(f"Val acc (clf2): {val_acc}.")
    new_acc += val_acc
    if new_acc / 2 > best_acc:
        print(f"Here is an improvement from {best_acc} to {new_acc / 2}\nSaving results")
        best_acc = new_acc / 2
        counter = 0
        clf1.save("clf1_3/model")
        clf2.save("clf2_3/model")
        model.save("extractor/model")
    else:
        print("No improvements")
        counter += 1
    if counter >= MAX_PATIENCE:
        break

0it [00:00, ?it/s]

It is epoch number: 0


1000it [04:31,  3.72it/s]

loss: 1.6472115516662598, discrepancy before: 0.056961655616760254, discrepancy loss after: 0.057231366634368896


2000it [08:58,  3.80it/s]

loss: 3.13413667678833, discrepancy before: 0.05077614635229111, discrepancy loss after: 0.051046330481767654


3000it [13:25,  3.73it/s]

loss: 2.919963836669922, discrepancy before: 0.044328853487968445, discrepancy loss after: 0.042545855045318604


4000it [17:51,  3.77it/s]

loss: 1.8670859336853027, discrepancy before: 0.04103769361972809, discrepancy loss after: 0.041010163724422455


5000it [22:17,  3.67it/s]

loss: 1.5211219787597656, discrepancy before: 0.011340435594320297, discrepancy loss after: 0.011625243350863457


6000it [26:42,  3.60it/s]

loss: 2.6412782669067383, discrepancy before: 0.034960709512233734, discrepancy loss after: 0.034523457288742065


7000it [31:08,  3.75it/s]

loss: 2.044222831726074, discrepancy before: 0.006423259153962135, discrepancy loss after: 0.0063832481391727924


8000it [35:34,  3.74it/s]

loss: 1.3240962028503418, discrepancy before: 0.004808356985449791, discrepancy loss after: 0.0041030957363545895


9000it [39:59,  3.72it/s]

loss: 2.1586060523986816, discrepancy before: 0.005310078151524067, discrepancy loss after: 0.005273367743939161


9778it [43:23,  3.76it/s]
2991it [01:20, 37.27it/s]
0it [00:00, ?it/s]

Val acc: 0.586567223072052, Train acc: 0.513098955154419. Train loss (clf1): 2.1604323387145996


2991it [01:19, 37.57it/s]


Val acc (clf2): 0.5862955451011658.
Here is an improvement from 0 to 0.5864313840866089
Saving results
INFO:tensorflow:Assets written to: clf1_3/model/assets
2021-06-27 18:38:03.501 I tensorflow: Assets written to: clf1_3/model/assets




INFO:tensorflow:Assets written to: clf2_3/model/assets
2021-06-27 18:38:05.228 I tensorflow: Assets written to: clf2_3/model/assets




INFO:tensorflow:Assets written to: extractor/model/assets
2021-06-27 18:38:06.786 I tensorflow: Assets written to: extractor/model/assets


0it [00:00, ?it/s]

It is epoch number: 1


222it [01:03,  3.81it/s]

loss: 1.4493460655212402, discrepancy before: 0.0064748008735477924, discrepancy loss after: 0.006274324841797352


1222it [05:31,  3.68it/s]

loss: 0.9953786134719849, discrepancy before: 0.0002067375462502241, discrepancy loss after: 0.00019329233327880502


2222it [09:57,  3.69it/s]

loss: 1.5613136291503906, discrepancy before: 0.002906474284827709, discrepancy loss after: 0.0026488087605684996


3222it [14:23,  3.75it/s]

loss: 1.5251617431640625, discrepancy before: 0.0015362376580014825, discrepancy loss after: 0.0013950061984360218


4222it [18:50,  3.76it/s]

loss: 1.2038086652755737, discrepancy before: 0.008787691593170166, discrepancy loss after: 0.007556402124464512


5222it [23:16,  3.80it/s]

loss: 0.8245667219161987, discrepancy before: 0.0002649087691679597, discrepancy loss after: 0.00020591109932865947


6222it [27:42,  3.84it/s]

loss: 0.8843764662742615, discrepancy before: 0.0018520738231018186, discrepancy loss after: 0.0017619709251448512


7222it [32:08,  3.76it/s]

loss: 1.863968849182129, discrepancy before: 0.003809521673247218, discrepancy loss after: 0.0038248091004788876


8222it [36:34,  3.79it/s]

loss: 1.0268434286117554, discrepancy before: 0.0034442367032170296, discrepancy loss after: 0.003364184405654669


9222it [41:00,  3.74it/s]

loss: 2.627358913421631, discrepancy before: 0.005002018064260483, discrepancy loss after: 0.004480269737541676


9776it [43:25,  3.75it/s]
2991it [01:19, 37.55it/s]
0it [00:00, ?it/s]

Val acc: 0.6673771739006042, Train acc: 0.6325052976608276. Train loss (clf1): 1.4697000980377197


2991it [01:19, 37.45it/s]


Val acc (clf2): 0.666907012462616.
Here is an improvement from 0.5864313840866089 to 0.6671420931816101
Saving results
INFO:tensorflow:Assets written to: clf1_3/model/assets
2021-06-27 19:24:13.595 I tensorflow: Assets written to: clf1_3/model/assets




INFO:tensorflow:Assets written to: clf2_3/model/assets
2021-06-27 19:24:15.533 I tensorflow: Assets written to: clf2_3/model/assets




INFO:tensorflow:Assets written to: extractor/model/assets
2021-06-27 19:24:17.095 I tensorflow: Assets written to: extractor/model/assets


0it [00:00, ?it/s]

It is epoch number: 2


446it [02:02,  3.86it/s]

loss: 1.9978622198104858, discrepancy before: 2.3635129764443263e-05, discrepancy loss after: 2.488081190676894e-05


1446it [06:29,  3.79it/s]

loss: 1.1384044885635376, discrepancy before: 0.0031542563810944557, discrepancy loss after: 0.003146658418700099


2446it [10:55,  3.74it/s]

loss: 0.9339596629142761, discrepancy before: 3.12356036147321e-07, discrepancy loss after: 2.8410627805897093e-07


3446it [15:20,  3.75it/s]

loss: 1.7237623929977417, discrepancy before: 0.003573259338736534, discrepancy loss after: 0.0035681030713021755


4446it [19:46,  3.81it/s]

loss: 0.8444103002548218, discrepancy before: 0.002823175862431526, discrepancy loss after: 0.0025392004754394293


4564it [20:18,  3.75it/s]


KeyboardInterrupt: 

## Testing on common voice

In [108]:
meta = meta.set_index('id')

In [109]:
clf1 = tf.keras.models.load_model('clf1_2/model')

clf2 = tf.keras.models.load_model('clf2_2/model')

In [61]:

def predictions_to_dataframe(ids, predictions):
    return (pd.DataFrame.from_dict({"id": ids, "prediction": predictions})
            #.set_index("id", drop=True, verify_integrity=True)
            #.sort_index()
           )

def predict_with_model(model, ds, predict_fn=None):
    """
    Map callable model over all batches in ds, predicting values for each element at key 'input'.
    """
    if predict_fn is None:
        def predict_fn(x):
            with tf.device("GPU"):
                return x["id"], model(x["input"], training=False)

    ids = []
    predictions = []
    for id, pred in ds.map(predict_fn, num_parallel_calls=TF_AUTOTUNE).unbatch().as_numpy_iterator():
        ids.append(id.decode("utf-8"))
        predictions.append(pred)

    return predictions_to_dataframe(ids, predictions)

In [31]:
chunk2pred = predict_with_model(
    model=clf2,
    ds=split2ds["test"].map(lambda x: dict(x, input=x["logmelspec"])).batch(32),
    #predict_fn=predict_with_ap_loss
    )



In [32]:
chunk2pred = chunk2pred.set_index("id")

In [33]:
from lidbox.util import merge_chunk_predictions


utt2pred = merge_chunk_predictions(chunk2pred)
utt2pred

Unnamed: 0_level_0,prediction
id,Unnamed: 1_level_1
/tf/datasets/data_untar/cv-corpus-6.1-2020-12-11/cs/clips/common_voice_cs_20424383.mp3,"[8.308059e-09, 0.0003079729, 6.093462e-05, 0.9..."
/tf/datasets/data_untar/cv-corpus-6.1-2020-12-11/cs/clips/common_voice_cs_20424555.mp3,"[2.471661e-07, 0.005254525, 0.00096489495, 0.9..."
/tf/datasets/data_untar/cv-corpus-6.1-2020-12-11/cs/clips/common_voice_cs_20424567.mp3,"[1.87723e-07, 0.80234873, 5.8355516e-05, 0.197..."
/tf/datasets/data_untar/cv-corpus-6.1-2020-12-11/cs/clips/common_voice_cs_20424609.mp3,"[1.9055422e-05, 0.7478529, 1.27799485e-05, 0.2..."
/tf/datasets/data_untar/cv-corpus-6.1-2020-12-11/cs/clips/common_voice_cs_20424636.mp3,"[2.5810184e-06, 0.056320507, 0.001680822, 0.94..."
...,...
/tf/datasets/data_untar/cv-corpus-6.1-2020-12-11/zh-CN/clips/common_voice_zh-CN_22242585.mp3,"[3.313049e-08, 8.7356224e-05, 0.0008840677, 0...."
/tf/datasets/data_untar/cv-corpus-6.1-2020-12-11/zh-CN/clips/common_voice_zh-CN_22242819.mp3,"[0.00059783406, 0.08054378, 0.7682407, 0.1506177]"
/tf/datasets/data_untar/cv-corpus-6.1-2020-12-11/zh-CN/clips/common_voice_zh-CN_22243431.mp3,"[1.5969928e-05, 0.0027630564, 0.12403288, 0.87..."
/tf/datasets/data_untar/cv-corpus-6.1-2020-12-11/zh-CN/clips/common_voice_zh-CN_22243666.mp3,"[9.9802e-09, 8.252818e-06, 1.6951846e-05, 0.99..."


### Results of clf2

In [34]:


from sklearn.metrics import classification_report

test_meta = meta[meta["split"]=="test"].join(utt2pred, how="outer")
assert not test_meta.isna().any(axis=None), "failed to join predictions"

true_sparse = test_meta.target.to_numpy(np.int32)
pred_dense = np.stack(test_meta.prediction.apply(np.argmax))

report = classification_report(true_sparse, pred_dense, target_names=list(targets.keys()), labels=range(4))
print(report)



              precision    recall  f1-score   support

          kz       1.00      1.00      1.00     17341
          ru       0.88      0.75      0.81     10379
          en       0.87      0.87      0.87     12964
       other       0.77      0.84      0.80     15084

    accuracy                           0.88     55768
   macro avg       0.88      0.86      0.87     55768
weighted avg       0.88      0.88      0.88     55768



Since the results vary, it is reasonable to conduct the experiment further

## Testing on vox

In [128]:
chunk2pred = predict_with_model(
    model=clf2,
    ds=split2ds["test"].map(lambda x: dict(x, input=x["logmelspec"])).batch(32),
    #predict_fn=predict_with_ap_loss
    )



In [129]:
chunk2pred = chunk2pred.set_index("id")

In [130]:
meta

Unnamed: 0_level_0,path,locale,split,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1486,/tf/datasets/data_untar/cv-corpus-6.1-2020-12-...,ru,train,1.0
56701,/tf/datasets/data_untar/cv-corpus-6.1-2020-12-...,kz,train,0.0
3364,/tf/datasets/data_untar/cv-corpus-6.1-2020-12-...,ru,train,1.0
110475,/tf/datasets/data_untar/cv-corpus-6.1-2020-12-...,rw,train,3.0
45384,/tf/datasets/data_untar/cv-corpus-6.1-2020-12-...,en,train,2.0
...,...,...,...,...
/tf/datasets/vox/ru_dev/BH8c4SbgXss__U__S251---1604.030-1609.420.mp3,/tf/datasets/vox/ru_dev/BH8c4SbgXss__U__S251--...,ru,dev,1.0
/tf/datasets/vox/ru_dev/--RxvUW3u7M__U__S0---0552.770-0565.180.mp3,/tf/datasets/vox/ru_dev/--RxvUW3u7M__U__S0---0...,ru,dev,1.0
/tf/datasets/vox/ru_dev/kZ8LKE26cl0__U__S1---0100.470-0114.760.mp3,/tf/datasets/vox/ru_dev/kZ8LKE26cl0__U__S1---0...,ru,dev,1.0
/tf/datasets/vox/ru_dev/ya9uyy12vvM__U__S208---1384.770-1393.130.mp3,/tf/datasets/vox/ru_dev/ya9uyy12vvM__U__S208--...,ru,dev,1.0


In [131]:
from lidbox.util import merge_chunk_predictions


utt2pred = merge_chunk_predictions(chunk2pred)
utt2pred

Unnamed: 0_level_0,prediction
id,Unnamed: 1_level_1
/tf/datasets/vox/en_test/-BwrRlUdfEs__U__S0---0003.940-0020.570.mp3,"[0.82074064, 0.036507558, 0.05600509, 0.086746..."
/tf/datasets/vox/en_test/-BwrRlUdfEs__U__S0---0743.730-0757.100.mp3,"[0.7609731, 0.050528564, 0.07821112, 0.11028719]"
/tf/datasets/vox/en_test/-BwrRlUdfEs__U__S100---0650.920-0661.560.mp3,"[0.99982405, 1.17218315e-05, 0.00012380008, 4...."
/tf/datasets/vox/en_test/-BwrRlUdfEs__U__S100---0692.790-0704.510.mp3,"[0.99906653, 9.269332e-05, 0.0005545609, 0.000..."
/tf/datasets/vox/en_test/-BwrRlUdfEs__U__S100---0705.010-0711.610.mp3,"[0.9915779, 0.0011870286, 0.0039456706, 0.0032..."
...,...
/tf/datasets/vox/ru_test/ztSbqN-mPtM__U__S20---0219.180-0230.690.mp3,"[0.9704744, 0.0048984527, 0.01188823, 0.012738..."
/tf/datasets/vox/ru_test/ztSbqN-mPtM__U__S20---0230.690-0247.370.mp3,"[0.9708305, 0.005451752, 0.0102613745, 0.01345..."
/tf/datasets/vox/ru_test/ztSbqN-mPtM__U__S20---0247.370-0257.750.mp3,"[0.9644639, 0.006060426, 0.01391795, 0.015557733]"
/tf/datasets/vox/ru_test/ztSbqN-mPtM__U__S20---0277.000-0287.980.mp3,"[0.80379754, 0.04677399, 0.054464877, 0.09496365]"


In [132]:


from sklearn.metrics import classification_report

test_meta = meta[meta["split"]=="test"].join(utt2pred, how="outer")
assert not test_meta.isna().any(axis=None), "failed to join predictions"

true_sparse = test_meta.target.to_numpy(np.int32)
pred_dense = np.stack(test_meta.prediction.apply(np.argmax))

report = classification_report(true_sparse, pred_dense, target_names=list(targets.keys()), labels=range(4))
print(report)



              precision    recall  f1-score   support

          kz       0.39      0.99      0.55     13946
          ru       0.00      0.00      0.00     12107
          en       0.47      0.00      0.00     10000
       other       0.00      0.00      0.00         0

    accuracy                           0.38     36053
   macro avg       0.21      0.25      0.14     36053
weighted avg       0.28      0.38      0.22     36053



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
