[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/rbg-research/AI-Training/blob/main/voice-analytics/classification/Tutorial-2.ipynb)

In [1]:
import random
# Seed Python's built-in `random` module so its draws are repeatable.
# NOTE(review): this does not seed NumPy, scikit-learn, PyTorch or TensorFlow.
random.seed(7)

# End-to-End Audio Classification - Common Voice Corpus - Gender Prediction

In [2]:
!pip install speechbrain

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/usr/bin/python3.8 -m pip install --upgrade pip' command.[0m


In [3]:
!pip install torch

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/usr/bin/python3.8 -m pip install --upgrade pip' command.[0m


## 1. Data Preparation

In [4]:
import datasets
import pandas as pd

In [5]:
# Load the Tamil ("ta") configuration of the Common Voice corpus.
dataset = datasets.load_dataset('common_voice', 'ta')

# Keep direct handles on the train and test splits.
train, test = dataset["train"], dataset["test"]

Reusing dataset common_voice (/home/ubuntu/.cache/huggingface/datasets/common_voice/ta/6.1.0/078d412587e9efeb0ae2e574da99c31e18844c496008d53dc5c60f4159ed639b)


  0%|          | 0/5 [00:00<?, ?it/s]

In [6]:
# Gender values that cannot be used as a male/female target: the empty string
# (no gender recorded) and "other". The train filter previously compared against
# "others", which never matches, so it was inconsistent with the test filter.
EXCLUDED_GENDERS = ["", "other"]


def _labelled_samples(split):
    """Return a DataFrame with the `path` and `label` (gender) of every usable clip.

    Rows whose gender is listed in EXCLUDED_GENDERS are dropped. Working on a
    DataFrame keeps the filtering vectorised instead of looping over the split.
    """
    frame = pd.DataFrame.from_dict({"path": split["path"], "label": split["gender"]})
    return frame[~frame["label"].isin(EXCLUDED_GENDERS)]


# Audio files and ground-truth labels of the training corpus.
train_df = _labelled_samples(dataset["train"])
train_files, train_labels = list(train_df["path"]), list(train_df["label"])

# Same steps, same filter, for the test corpus.
test_df = _labelled_samples(dataset["test"])
test_files, test_labels = list(test_df["path"]), list(test_df["label"])


In [7]:
# Number of training samples available for each class.
for gender in ("male", "female"):
    print(f"number of train samples per class '{gender}':", train_labels.count(gender))

number of train samples per class 'male': 885
number of train samples per class 'female': 770


In [8]:
# Number of test samples available for each class.
for gender in ("male", "female"):
    print(f"number of test samples per class '{gender}':", test_labels.count(gender))

number of test samples per class 'male': 1073
number of test samples per class 'female': 166


### 2. Train, Validation and Test sets 

In [9]:
import os
import json
import librosa
import soundfile as sf
from sklearn.model_selection import train_test_split


In [10]:
# Hold out 20% of the training clips; this held-out part (X_test / y_test) is
# used as the validation set further down.
# random_state pins the shuffle so the split is identical on every run;
# random.seed() alone does not influence scikit-learn.
X_train, X_test, y_train, y_test = train_test_split(
    train_files,
    train_labels,
    test_size=0.2,
    random_state=7,
)

```json
{
  "id-1": {
    "wav": "{data_root}/audio_file_path",
    "length": 14.335,
    "label": "class_0"
  },
  "id-2": {
    "wav": "{data_root}/audio_file_path",
    "length": 12.01,
    "label": "class_1"
  },
  "id-3": {
    "wav": "{data_root}/audio_file_path",
    "length": 11.965,
    "label": "class_0"
  }
}
```

In [11]:
# Peek at the path of the first training clip.
train_files[0]

'/home/ubuntu/.cache/huggingface/datasets/downloads/extracted/bbc7e5ef7e1baed3b86a20ce9e146e069a26e1bebd1feadc09a550c84ff03cf0/cv-corpus-6.1-2020-12-11/ta/clips/common_voice_ta_19093432.mp3'

In [12]:
def prepare_data(files, labels, data_dir="data/", i=0):
    """Convert audio clips to 16 kHz 16-bit PCM WAV files and build a manifest.

    Every clip is decoded and resampled with librosa, written into `data_dir`
    under its original base name with a ``.wav`` extension, and described by
    one manifest entry keyed by a running integer id (stored as a string).

    Args:
        files: iterable of paths to the source audio clips.
        labels: iterable of class labels aligned with `files`.
        data_dir: directory receiving the WAV files; created when missing.
        i: first id to use, so several manifests can share one id space.

    Returns:
        A tuple ``(dictionary, i)``: the manifest mapping each id to its
        ``"wav"`` path template, ``"length"`` in seconds and ``"label"``,
        and the next unused id.
    """
    # Make the function usable on a fresh checkout without a prior mkdir.
    os.makedirs(data_dir, exist_ok=True)
    dictionary = dict()
    for file, label in zip(files, labels):
        # Swap whatever extension the clip has for .wav (not only ".mp3").
        stem, _ = os.path.splitext(os.path.basename(file))
        wave_file = stem + ".wav"
        write_file = os.path.join(data_dir, wave_file)
        x, sr = librosa.load(file, sr=16000)
        duration = librosa.get_duration(y=x, sr=sr)
        sf.write(write_file, x, sr, 'PCM_16')
        dictionary[str(i)] = {
            # "{data_root}" is kept as a literal placeholder in the manifest.
            "wav": "{data_root}/" + wave_file,
            "length": duration,
            "label": str(label),
        }
        i += 1
    return dictionary, i

In [13]:
# exist_ok=True keeps this cell re-runnable when data/ already exists.
os.makedirs("data/", exist_ok=True)
train_dict, counter = prepare_data(X_train, y_train)





































































In [14]:
# X_test / y_test are the part held out of the training files by train_test_split;
# they become the validation manifest. Ids continue from `counter`.
valid_dict, counter = prepare_data(X_test, y_test, i=counter)



















In [15]:
# Convert the clips of the test corpus; ids continue from `counter`.
test_dict, counter = prepare_data(test_files, test_labels, i=counter)

































































In [16]:
# Write the three manifests to disk. `with` closes each file even when
# json.dump raises, which the manual open()/close() pairs did not guarantee.
for manifest_path, manifest in (
    ("train.json", train_dict),
    ("valid.json", valid_dict),
    ("test.json", test_dict),
):
    with open(manifest_path, "w") as f:
        json.dump(manifest, f)

### 3. Training

Network Architecture             |  Residual Block
:-------------------------:|:-------------------------:
![](src/ecapa-diagram.pbm)  |  ![](src/ecapa-residual.pbm)

Dilated CNN
![](src/dialted.pbm)

[ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation in TDNN Based Speaker Verification](https://arxiv.org/pdf/2005.07143.pdf)

In [17]:
# Run the training script with the hyperparameters in train.yaml
# (per the log below, the experiment folder is ./results/7).
!python3.8 train.py train.yaml

Downloading http://www.openslr.org/resources/28/rirs_noises.zip to ./data/rirs_noises.zip
rirs_noises.zip: 1.31GB [01:40, 13.0MB/s]                                       
Extracting ./data/rirs_noises.zip to ./data
speechbrain.core - Beginning experiment!
speechbrain.core - Experiment folder: ./results/7
speechbrain.dataio.encoder - Load called, but CategoricalEncoder is not empty. Loaded data will overwrite everything. This is normal if there is e.g. an unk label defined at init.
speechbrain.core - Info: ckpt_interval_minutes arg from hparam file is used
speechbrain.core - 4.5M trainable parameters in Classifier
speechbrain.utils.checkpoints - Would load a checkpoint here, but none found yet.
speechbrain.utils.epoch_loop - Going into epoch 1
100%|█████████████████████████| 83/83 [00:34<00:00,  2.43it/s, train_loss=0.272]
100%|███████████████████████████████████████████| 21/21 [00:00<00:00, 29.65it/s]
speechbrain.nnet.schedulers - Changing lr from 0.001 to 0.00097
speechbrain.utils.tra

speechbrain.utils.checkpoints - Saved an end-of-epoch checkpoint in results/7/save/CKPT+2021-10-26+02-25-50+00
speechbrain.utils.epoch_loop - Going into epoch 14
100%|████████████████████████| 83/83 [00:34<00:00,  2.40it/s, train_loss=0.0907]
100%|███████████████████████████████████████████| 21/21 [00:00<00:00, 29.12it/s]
speechbrain.nnet.schedulers - Changing lr from 0.00066 to 0.00063
speechbrain.utils.train_logger - Epoch: 14, lr: 6.56e-04 - train loss: 9.07e-02 - valid loss: 1.22e-02, valid error: 3.02e-03
speechbrain.utils.checkpoints - Saved an end-of-epoch checkpoint in results/7/save/CKPT+2021-10-26+02-26-25+00
speechbrain.utils.checkpoints - Deleted checkpoint in results/7/save/CKPT+2021-10-26+02-25-15+00
speechbrain.utils.checkpoints - Deleted checkpoint in results/7/save/CKPT+2021-10-26+02-25-50+00
speechbrain.utils.epoch_loop - Going into epoch 15
100%|████████████████████████| 83/83 [00:34<00:00,  2.40it/s, train_loss=0.0765]
100%|██████████████████████████████████████████

100%|████████████████████████| 83/83 [00:34<00:00,  2.43it/s, train_loss=0.0481]
100%|███████████████████████████████████████████| 21/21 [00:00<00:00, 29.13it/s]
speechbrain.nnet.schedulers - Changing lr from 0.00031 to 0.00029
speechbrain.utils.train_logger - Epoch: 27, lr: 3.12e-04 - train loss: 4.81e-02 - valid loss: 1.68e-02, valid error: 3.02e-03
speechbrain.utils.checkpoints - Saved an end-of-epoch checkpoint in results/7/save/CKPT+2021-10-26+02-34-04+00
speechbrain.utils.checkpoints - Deleted checkpoint in results/7/save/CKPT+2021-10-26+02-33-29+00
speechbrain.utils.epoch_loop - Going into epoch 28
100%|████████████████████████| 83/83 [00:34<00:00,  2.40it/s, train_loss=0.0526]
100%|███████████████████████████████████████████| 21/21 [00:00<00:00, 29.32it/s]
speechbrain.nnet.schedulers - Changing lr from 0.00029 to 0.00026
speechbrain.utils.train_logger - Epoch: 28, lr: 2.85e-04 - train loss: 5.26e-02 - valid loss: 1.41e-02, valid error: 3.02e-03
speechbrain.utils.checkpoints - S

### 4. Testing

In [18]:
import torchaudio
from speechbrain.pretrained import EncoderClassifier

In [19]:
!mkdir best_model/
!cp results/7/save/label_encoder.txt best_model/


In [20]:
# `ls -td` lists the checkpoint directories newest first, so `tail -1` selects the
# OLDEST checkpoint still on disk; its files are copied into best_model/.
# NOTE(review): presumably the oldest surviving checkpoint is the best-scoring one
# kept by the checkpointer — confirm against train.py / train.yaml.
!cp "`ls -td results/7/save/CKPT* | tail -1`"/* best_model/

In [21]:
# Place the inference hyperparameter file next to the copied checkpoint files.
!cp -r "test.yaml" best_model/

In [22]:
# Build the inference interface from the files gathered in best_model/.
classifier = EncoderClassifier.from_hparams(
    source="best_model/",
    hparams_file='test.yaml',
    savedir="best_model/",
)

In [23]:
# Classify every test clip, collecting ground truth and prediction as plain strings.
true_labels = list()
pred_labels = list()
for item in test_dict:
    true_labels.append(test_dict[item]["label"])
    # The manifest stores a "{data_root}" placeholder; the WAV files live in data/.
    audio_file = test_dict[item]["wav"].replace("{data_root}", "data")
    signal, fs = torchaudio.load(audio_file)
    output_probs, score, index, text_lab = classifier.classify_batch(signal)
    # text_lab holds one label per batch element. One clip is classified at a
    # time, so keep the single string instead of appending the whole list
    # (which made pred_labels a list of one-element lists).
    pred_labels.append(text_lab[0])

### 5. Performance Observation 

In [24]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

In [25]:
y_pred = pred_labels
y_true = true_labels
# Precision, recall and F-score are macro-averaged: every class counts equally,
# whatever its number of test samples.
print(f'Test Set Accuracy score =  {100*accuracy_score(y_true, y_pred):.3f}%')
print(f'Test Set Precision score =  {100*precision_score(y_true, y_pred, average="macro"):.3f}%')
print(f'Test Set Recall score =  {100*recall_score(y_true, y_pred, average="macro"):.3f}%')
# ":.3f" (three decimals) matches the other metrics; ":.3" meant three significant digits.
print(f'Test Set F-score score =  {100*f1_score(y_true, y_pred, average="macro"):.3f}%')

Test Set Accuracy score =  82.647%
Test Set Precision score =  66.410%
Test Set Recall score =  72.159%
Test Set F-score score =  68.4%


### 6. Transfer Learning for Classification  - Same Task

In [26]:
import numpy as np
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.preprocessing import LabelEncoder

In [27]:
# Read the train and validation manifests back from disk.
loaded_manifests = []
for manifest_path in ("train.json", "valid.json"):
    with open(manifest_path) as json_file:
        loaded_manifests.append(json.load(json_file))
train_dict, valid_dict = loaded_manifests

In [28]:
# Pool the train and validation entries and shuffle them with a fixed seed.
random.seed(10)
new_dict = dict(train_dict)
new_dict.update(valid_dict)
l = [*new_dict.items()]
random.shuffle(l)

In [29]:
# First 80% of the shuffled entries for training, the remainder for validation.
split_at = int(0.8*len(l))
train_dict = dict(l[:split_at])
valid_dict = dict(l[split_at:])

In [30]:

def get_feature_embeddings(temp_dict, data_root="data"):
    """Encode every clip of a manifest with the trained classifier.

    Args:
        temp_dict: manifest mapping an id to an entry with a ``"wav"`` path
            template (containing ``{data_root}``) and a ``"label"``.
        data_root: directory substituted for ``{data_root}`` in the path
            templates; defaults to the ``data`` folder used by this notebook.

    Returns:
        A tuple ``(feature_matrix, true_labels)``: a NumPy array with one
        embedding per manifest entry, in manifest order, and the list of the
        matching labels.
    """
    true_labels = list()
    features = list()
    for entry in temp_dict.values():
        true_labels.append(entry["label"])
        audio_file = entry["wav"].replace("{data_root}", data_root)
        signal, _ = torchaudio.load(audio_file)
        embeddings = classifier.encode_batch(signal)
        # Index [0][0] extracts the embedding vector of the single encoded clip.
        features.append(embeddings.tolist()[0][0])

    feature_matrix = np.asarray(features)
    return feature_matrix, true_labels

In [31]:
# Encode the three splits in order: train, validation, test.
(train_mat, train_labels), (valid_mat, valid_labels), (test_mat, test_labels) = (
    get_feature_embeddings(split_manifest)
    for split_manifest in (train_dict, valid_dict, test_dict)
)

In [32]:
# Shape of the training embedding matrix (one row per training clip).
train_mat.shape

(1324, 512)

In [33]:
# Shape of the validation embedding matrix (one row per validation clip).
valid_mat.shape

(331, 512)

In [34]:
# Shape of the test embedding matrix (one row per test clip).
test_mat.shape

(1239, 512)

In [35]:
# Fit one encoder on the labels of all splits so they share the same integer codes.
encoder = LabelEncoder().fit(train_labels + valid_labels + test_labels)

encoded_train_labels, encoded_valid_labels, encoded_test_labels = (
    encoder.transform(split_labels)
    for split_labels in (train_labels, valid_labels, test_labels)
)

In [36]:
def residual_block(x, filters, conv_num=3, activation="relu"):
    """Apply a 1-D residual block followed by max pooling.

    The main path stacks `conv_num` kernel-3 convolutions, each but the last
    followed by `activation`. The skip path is a kernel-1 convolution of the
    input. Both are added, activated and pooled with size 2 / stride 2.

    Args:
        x: input tensor of the block.
        filters: number of filters of every convolution in the block.
        conv_num: number of kernel-3 convolutions on the main path.
        activation: activation passed to the Activation layers.

    Returns:
        The tensor produced by the final MaxPool1D layer.
    """
    layers = keras.layers

    # Skip path is created first, as in the original layer order.
    shortcut = layers.Conv1D(filters, 1, padding="same")(x)

    main = x
    for _ in range(conv_num - 1):
        main = layers.Conv1D(filters, 3, padding="same")(main)
        main = layers.Activation(activation)(main)
    main = layers.Conv1D(filters, 3, padding="same")(main)

    merged = layers.Add()([main, shortcut])
    activated = layers.Activation(activation)(merged)
    return layers.MaxPool1D(pool_size=2, strides=2)(activated)


def build_model(input_shape, num_classes):
    """Assemble the residual 1-D CNN classifier.

    Five residual blocks are followed by average pooling, flattening, two
    ReLU dense layers (256 and 128 units) and a sigmoid output layer with
    `num_classes` units.

    Args:
        input_shape: shape of one input sample, passed to the Input layer.
        num_classes: number of units of the sigmoid output layer.

    Returns:
        The keras Model mapping the input layer to the output layer.
    """
    inputs = keras.layers.Input(shape=input_shape, name="input")

    # (filters, conv_num) of the residual stages, applied in order.
    stage_specs = ((16, 2), (32, 2), (64, 3), (128, 3), (128, 3))
    features = inputs
    for filters, conv_num in stage_specs:
        features = residual_block(features, filters, conv_num)

    pooled = keras.layers.AveragePooling1D(pool_size=3, strides=3)(features)
    hidden = keras.layers.Flatten()(pooled)
    for units in (256, 128):
        hidden = keras.layers.Dense(units, activation="relu")(hidden)

    outputs = keras.layers.Dense(num_classes, activation="sigmoid", name="output")(hidden)

    return keras.models.Model(inputs=inputs, outputs=outputs)

In [37]:
# Each sample is a sequence of 512 values with one channel; one sigmoid output unit.
model = build_model(input_shape=(512, 1), num_classes=1)
model.summary()

2021-10-26 02:39:42.578378: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2021-10-26 02:39:42.603132: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-10-26 02:39:42.603645: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1561] Found device 0 with properties: 
pciBusID: 0000:00:1e.0 name: Tesla T4 computeCapability: 7.5
coreClock: 1.59GHz coreCount: 40 deviceMemorySize: 14.75GiB deviceMemoryBandwidth: 298.08GiB/s
2021-10-26 02:39:42.605002: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
2021-10-26 02:39:42.607596: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10
2021-10-26 02:39:42.609288: I tensorflow/stream_executor/platform/default/d

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input (InputLayer)              [(None, 512, 1)]     0                                            
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 512, 16)      64          input[0][0]                      
__________________________________________________________________________________________________
activation (Activation)         (None, 512, 16)      0           conv1d_1[0][0]                   
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 512, 16)      784         activation[0][0]                 
______________________________________________________________________________________________

In [38]:
# Single sigmoid output with 0/1 targets -> binary cross-entropy.
model.compile(
    optimizer="Adam", loss="binary_crossentropy", metrics=["accuracy"]
)

model_save_filename = "model.h5"

# patience must be smaller than the number of training epochs (10 in this
# notebook) for early stopping to be able to fire; with patience=10 the callback
# could never trigger. It watches the callback's default metric.
earlystopping_cb = keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)
# Keep on disk only the weights of the epoch with the best validation accuracy.
mdlcheckpoint_cb = keras.callbacks.ModelCheckpoint(
    model_save_filename, monitor="val_accuracy", save_best_only=True
)

In [39]:
# Train on the embedding matrix, validating on the held-out embeddings.
model.fit(
    train_mat,
    encoded_train_labels,
    validation_data=(valid_mat, encoded_valid_labels),
    epochs=10,
    batch_size=16,
    callbacks=[earlystopping_cb, mdlcheckpoint_cb],
    verbose=1,
)

Epoch 1/10


2021-10-26 02:39:44.983748: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10
2021-10-26 02:39:45.228583: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f2c3e071b80>

In [40]:
y_true = encoded_test_labels
y_pred = model.predict(test_mat)
# Threshold the sigmoid outputs at 0.5 and flatten to one class id per clip.
y_pred = np.where(y_pred >= 0.5, 1, 0).flatten()
# Precision, recall and F-score are macro-averaged: every class counts equally.
print(f'Test Set Accuracy score =  {100*accuracy_score(y_true, y_pred):.3f}%')
print(f'Test Set Precision score =  {100*precision_score(y_true, y_pred, average="macro"):.3f}%')
print(f'Test Set Recall score =  {100*recall_score(y_true, y_pred, average="macro"):.3f}%')
# ":.3f" (three decimals) matches the other metrics; ":.3" meant three significant digits.
print(f'Test Set F-score score =  {100*f1_score(y_true, y_pred, average="macro"):.3f}%')

Test Set Accuracy score =  74.980%
Test Set Precision score =  60.076%
Test Set Recall score =  66.968%
Test Set F-score score =  60.9%


### 7. Transfer Learning for Classification  - Different Task : Speaker Identification

In [41]:
import torch

In [43]:
# Common Voice exposes the clip location as "path" (as used in the data-preparation
# cell) and the speaker as "client_id"; it has no "file" / "speaker_id" columns.
# NOTE(review): execution count In [42] is missing — if a deleted cell rebound
# `dataset` to another corpus, restore that cell instead.
train_files, train_labels = dataset["train"]["path"], dataset["train"]["client_id"]

In [44]:
# sorted() gives the unique speaker ids a stable order; the iteration order of a
# set of strings can change from one kernel session to the next.
speakers = sorted(set(train_labels))

In [46]:
# Encode two sample recordings with the trained classifier's embedding model.
speaker_embeddings = []
for sample_path in (
    "src/samples_audio_samples_example1.wav",
    "src/samples_audio_samples_example2.flac",
):
    signal, fs = torchaudio.load(sample_path)
    speaker_embeddings.append(classifier.encode_batch(signal))
speaker_1_embeddings, speaker_2_embeddings = speaker_embeddings


In [47]:
# Distance between the two recordings' embeddings as computed by torch.cdist
# (p-norm distance; Euclidean with the default p).
print(torch.cdist(speaker_1_embeddings, speaker_2_embeddings))

tensor([[[47.9479]]])
