##### Copyright 2019 Google LLC.

Licensed under the Apache License, Version 2.0 (the "License");

In [1]:
#@title Default title text
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

##### Full-flow evaluation

A colab for testing the full flow of calculating embeddings and training/evaluating sklearn models. Since this notebook doesn't parallelize (like the Apache Beam tools do) and computing embeddings is computationally expensive, **please use the multi-step Beam-based tools if** you'd like to eval a large dataset, eval a custom dataset, or train a Keras model.

Please be sure to use a **Python 3** kernel. **Running on GPU** significantly speeds up the process as well.

Conceptual overview of this colab:

1. Read `TensorFlow Dataset` data as numpy
1. Convert audio to float and resample
1. Convert audio to embeddings
1. Train and eval sklearn model

In [2]:
# Name of the TensorFlow Datasets dataset that the loading cell passes to
# `tfds.load`.
tfds_dataset_name = 'crema_d'  #@param
# Sample rate (Hz) the audio is expected to have before embedding; clips whose
# dataset sample rate differs from this value are resampled.
REQUIRED_SAMPLE_RATE_ = 16000

In [6]:
# Read the data into numpy arrays.
import collections
# Contents of one dataset split: the audio clips, their labels, and the
# speaker id of each clip.
SingleSplit = collections.namedtuple(
    'SingleSplit', ['audio', 'labels', 'speaker_id'])
# Groups the three splits that the rest of the notebook works with.
Data = collections.namedtuple(
    'Data', ['train', 'validation', 'test'])

import tensorflow.compat.v2 as tf
tf.enable_v2_behavior()
# Fail fast if eager execution is not active.
assert tf.executing_eagerly()
import tensorflow_datasets as tfds
def _dat_from_split(split):
  """Loads one split of `tfds_dataset_name` fully into memory.

  Args:
    split: Name of the split to load; forwarded to `tfds.load`.

  Returns:
    A `(audio, labels, speaker_id)` triple: `audio` is a tuple holding the
    'audio' feature of every example, `labels` is a 1-D int16 numpy array, and
    `speaker_id` is a 1-D numpy array with one entry per example.
  """
  import numpy as np

  dataset = tfds.load(tfds_dataset_name, split=split)
  examples = []
  for example in tfds.as_numpy(dataset):
    examples.append(
        (example['audio'], example['label'], example['speaker_id']))
  # Transpose the list of per-example triples into three parallel sequences.
  audio, labels, speaker_id = zip(*examples)

  labels = np.array(labels, dtype=np.int16)
  speaker_id = np.array(speaker_id)

  # Every example must contribute exactly one label and one speaker id.
  num_examples = len(audio)
  assert num_examples == labels.size and labels.size == speaker_id.size
  assert labels.ndim == speaker_id.ndim and speaker_id.ndim == 1
  print(f'Finished {split}')
  return audio, labels, speaker_id

# Load every split named in `Data`, in field order (train, validation, test).
all_data = Data(**{
    split_name: SingleSplit(*_dat_from_split(split_name))
    for split_name in Data._fields})

2022-11-04 10:14:37.272147: W tensorflow/core/platform/cloud/google_auth_provider.cc:184] All attempts to get a Google authentication bearer token failed, returning an empty token. Retrieving token from files failed with "NOT_FOUND: Could not locate the credentials file.". Retrieving token from GCE failed with "FAILED_PRECONDITION: Error executing an HTTP request: libcurl code 6 meaning 'Couldn't resolve host name', error details: Could not resolve host: metadata".


[1mDownloading and preparing dataset 579.25 MiB (download: 579.25 MiB, generated: 1.65 GiB, total: 2.21 GiB) to /Users/morgan/tensorflow_datasets/crema_d/1.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/5144 [00:00<?, ? examples/s]

Shuffling /Users/morgan/tensorflow_datasets/crema_d/1.0.0.incomplete3W380S/crema_d-train.tfrecord*...:   0%|  …

Generating validation examples...:   0%|          | 0/738 [00:00<?, ? examples/s]

Shuffling /Users/morgan/tensorflow_datasets/crema_d/1.0.0.incomplete3W380S/crema_d-validation.tfrecord*...:   …

Generating test examples...:   0%|          | 0/1556 [00:00<?, ? examples/s]

Shuffling /Users/morgan/tensorflow_datasets/crema_d/1.0.0.incomplete3W380S/crema_d-test.tfrecord*...:   0%|   …

[1mDataset crema_d downloaded and prepared to /Users/morgan/tensorflow_datasets/crema_d/1.0.0. Subsequent calls will reuse this data.[0m


2022-11-04 10:17:02.987654: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Finished train
Finished validation
Finished test


In [7]:
# Make the audio floats, and resample the audio if necessary.
import collections
import librosa
import numpy as np
# Holds the converted float audio for each of the three splits.
FloatData = collections.namedtuple('FloatData', ['train', 'validation', 'test'])

# Sample rate recorded for the dataset's 'audio' feature in the TFDS builder
# metadata; compared against REQUIRED_SAMPLE_RATE_ to decide whether to resample.
sample_rate = tfds.builder(tfds_dataset_name).info.features['audio'].sample_rate
def _int_to_float(audio_int16, split_name):
  """Scales integer audio to float32 and resamples it to `REQUIRED_SAMPLE_RATE_`.

  Args:
    audio_int16: Sequence of numpy arrays of integer audio samples. Samples are
      divided by the int16 maximum, so they are presumably within int16 range
      (TODO confirm for datasets other than the default).
    split_name: Name of the split; used only in progress messages.

  Returns:
    A list with one float32 array per input clip. Clips are resampled only when
    the dataset's `sample_rate` differs from `REQUIRED_SAMPLE_RATE_`.
  """
  # Loop invariants, computed once.
  int16_max = np.iinfo(np.int16).max
  num_clips = len(audio_int16)
  needs_resampling = sample_rate != REQUIRED_SAMPLE_RATE_

  float_audio_16k = []
  for i, samples in enumerate(audio_int16):
    float_audio = samples.astype(np.float32) / int16_max
    if needs_resampling:
      # Resample to the same constant used in the check above, so the target
      # rate cannot drift from the required rate.
      float_audio = librosa.core.resample(
          float_audio, orig_sr=sample_rate, target_sr=REQUIRED_SAMPLE_RATE_,
          res_type='kaiser_best')
    float_audio_16k.append(float_audio)
    if i % 50 == 0:
      print(f'Finished resampling {i} / {num_clips} for {split_name}')
  print(f'Finished {split_name}')
  return float_audio_16k


# Convert the audio of every split, in field order (train, validation, test).
float_audio_16k = FloatData(**{
    split_name: _int_to_float(getattr(all_data, split_name).audio, split_name)
    for split_name in FloatData._fields})

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  method='lar', copy_X=True, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  method='lar', copy_X=True, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_Gram=True, verbose=0,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes

Finished resampling 0 / 5144 for train
Finished resampling 50 / 5144 for train
Finished resampling 100 / 5144 for train
Finished resampling 150 / 5144 for train
Finished resampling 200 / 5144 for train
Finished resampling 250 / 5144 for train
Finished resampling 300 / 5144 for train
Finished resampling 350 / 5144 for train
Finished resampling 400 / 5144 for train
Finished resampling 450 / 5144 for train
Finished resampling 500 / 5144 for train
Finished resampling 550 / 5144 for train
Finished resampling 600 / 5144 for train
Finished resampling 650 / 5144 for train
Finished resampling 700 / 5144 for train
Finished resampling 750 / 5144 for train
Finished resampling 800 / 5144 for train
Finished resampling 850 / 5144 for train
Finished resampling 900 / 5144 for train
Finished resampling 950 / 5144 for train
Finished resampling 1000 / 5144 for train
Finished resampling 1050 / 5144 for train
Finished resampling 1100 / 5144 for train
Finished resampling 1150 / 5144 for train
Finished resamp

In [8]:
# TF Hub handle of the embedding model; passed to `hub.load`.
tfhub_model_name = 'https://tfhub.dev/google/nonsemantic-speech-benchmark/trill-distilled/1'  #@param
# Key used to pick the embedding out of the model's output.
output_key = 'embedding'  #@param

In [9]:
# Convert the audio to embeddings. Preaverage the embeddings across time.
import tensorflow_hub as hub
# Load the model named by `tfhub_model_name`; `_calc_embeddings` calls it once
# per audio clip.
model = hub.load(tfhub_model_name)

In [10]:
import collections
# Holds the per-clip embeddings computed for each of the three splits.
Embeddings = collections.namedtuple(
    'Embeddings', ['train', 'validation', 'test'])

def _calc_embeddings(cur_float_audio, split_name):
  """Computes one time-averaged embedding per audio clip.

  Args:
    cur_float_audio: Sequence of float audio clips, presumably already at
      `REQUIRED_SAMPLE_RATE_` Hz (that rate is what is reported to the model).
    split_name: Name of the split; used only in progress messages.

  Returns:
    A float32 numpy array holding one averaged embedding per clip.
  """
  # The sample rate is identical for every clip, so build the tensor once and
  # take it from the shared constant instead of a literal.
  sample_rate_tensor = tf.constant(REQUIRED_SAMPLE_RATE_, tf.int32)
  num_clips = len(cur_float_audio)

  cur_embeddings = []
  for i, float_samples in enumerate(cur_float_audio):
    tf_out = model(tf.constant(float_samples, tf.float32), sample_rate_tensor)
    embedding_2d = tf_out[output_key]
    assert embedding_2d.ndim == 2
    # Collapse the first axis so each clip yields a single vector.
    embedding_1d = np.mean(embedding_2d, axis=0)
    cur_embeddings.append(embedding_1d)
    if i % 50 == 0:
      print(f'Finished embedding {i} / {num_clips} for {split_name}')
  print(f'Finished {split_name}')
  cur_embeddings = np.array(cur_embeddings, dtype=np.float32)
  return cur_embeddings

# Embed every split, in field order (train, validation, test).
embeddings = Embeddings(**{
    split_name: _calc_embeddings(
        getattr(float_audio_16k, split_name), split_name)
    for split_name in Embeddings._fields})

# All splits must share the same embedding dimensionality.
assert embeddings.train.shape[1] == embeddings.validation.shape[1]
assert embeddings.validation.shape[1] == embeddings.test.shape[1]
# Each split needs exactly one embedding per label and per speaker id.
assert all(
    split_embeddings.shape[0] == split_data.labels.shape[0]
    and split_data.labels.shape[0] == split_data.speaker_id.shape[0]
    for split_embeddings, split_data in zip(embeddings, all_data))
# No embedding may contain NaNs.
assert not any(
    np.isnan(split_embeddings).any() for split_embeddings in embeddings)

Finished embedding 0 / 5144 for train
Finished embedding 50 / 5144 for train
Finished embedding 100 / 5144 for train
Finished embedding 150 / 5144 for train
Finished embedding 200 / 5144 for train
Finished embedding 250 / 5144 for train
Finished embedding 300 / 5144 for train
Finished embedding 350 / 5144 for train
Finished embedding 400 / 5144 for train
Finished embedding 450 / 5144 for train
Finished embedding 500 / 5144 for train
Finished embedding 550 / 5144 for train
Finished embedding 600 / 5144 for train
Finished embedding 650 / 5144 for train
Finished embedding 700 / 5144 for train
Finished embedding 750 / 5144 for train
Finished embedding 800 / 5144 for train
Finished embedding 850 / 5144 for train
Finished embedding 900 / 5144 for train
Finished embedding 950 / 5144 for train
Finished embedding 1000 / 5144 for train
Finished embedding 1050 / 5144 for train
Finished embedding 1100 / 5144 for train
Finished embedding 1150 / 5144 for train
Finished embedding 1200 / 5144 for trai

In [16]:
# Selects the classifier built by `get_sklearn_model`; must be one of its keys
# ('LogisticRegression' or 'LogisticRegression_balanced').
model_name = 'LogisticRegression_balanced'  #@param

In [17]:
from sklearn import linear_model

def get_sklearn_model(model_name):
  """Builds a fresh, unfitted sklearn classifier for `model_name`.

  Args:
    model_name: 'LogisticRegression' or 'LogisticRegression_balanced'.

  Returns:
    A new `linear_model.LogisticRegression` instance.

  Raises:
    KeyError: If `model_name` is not one of the supported names.
  """
  # Settings that differ between the supported variants.
  extra_kwargs_by_name = {
      'LogisticRegression': {},
      'LogisticRegression_balanced': {'class_weight': 'balanced'},
  }
  extra_kwargs = extra_kwargs_by_name[model_name]
  return linear_model.LogisticRegression(
      C=1e5, solver='lbfgs', multi_class='multinomial', **extra_kwargs)

def _speaker_normalization(embedding_np, speaker_id_np):
  """Normalize embedding features by per-speaker statistics."""
  all_speaker_ids = np.unique(speaker_id_np)
  for speaker in all_speaker_ids:
    cur_i = speaker_id_np == speaker
    embedding_np[cur_i] -= embedding_np[cur_i].mean(axis=0)
    stds = embedding_np[cur_i].std(axis=0)
    stds[stds == 0] = 1
    embedding_np[cur_i] /= stds

  return embedding_np

# Fit the selected classifier on speaker-normalized training embeddings.
normalized_train = _speaker_normalization(
    embeddings.train, all_data.train.speaker_id)
d = get_sklearn_model(model_name)
d.fit(X=normalized_train, y=all_data.train.labels)

# Score the fitted classifier on the validation split.
normalized_validation = _speaker_normalization(
    embeddings.validation, all_data.validation.speaker_id)
eval_score = d.score(X=normalized_validation, y=all_data.validation.labels)
print(f'{model_name} eval score: {eval_score}')

# Score the fitted classifier on the held-out test split.
normalized_test = _speaker_normalization(
    embeddings.test, all_data.test.speaker_id)
test_score = d.score(X=normalized_test, y=all_data.test.labels)
print(f'{model_name} test score: {test_score}')

LogisticRegression_balanced eval score: 0.6355013550135501
LogisticRegression_balanced test score: 0.6240359897172236


