# New keywords

### Create dataset from current audio folder

In [2]:
import os
import tqdm
from pathlib import Path
import pandas as pd
import torchaudio
import librosa
import IPython.display as ipd
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import load_dataset, load_metric

PATH_TO_AUDIO = "google_speech_recognition_v2"

source: https://colab.research.google.com/github/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb#scrollTo=-gh7fQ1XEpC7

In [3]:
data = []

for subdir, dirs, files in os.walk(PATH_TO_AUDIO):
    for file in files:
        if file.endswith(".wav"):
            name = file.split(".")[0]
            label = subdir.split("\\")[-1]
            path = os.path.join(subdir, file)
            data.append({
                "name": name,
                "path": path,
                "keyword": label,
            })

In [4]:
df = pd.DataFrame(data)
print("Labels: ", df["keyword"].unique())
print()
#df.groupby("keyword").count()[["path"]]

Labels:  ['backward' 'bed' 'bird' 'cat' 'dog' 'down' 'eight' 'five' 'follow'
 'forward' 'four' 'go' 'happy' 'house' 'learn' 'left' 'marvin' 'nine' 'no'
 'off' 'on' 'one' 'right' 'seven' 'sheila' 'six' 'stop' 'three' 'tree'
 'two' 'up' 'visual' 'wow' 'yes' 'zero' '_background_noise_']



In [5]:
# we only choose the keywords we want
desired_keywords = ["follow", "go", "happy", "marvin", "stop", "down"]
df = df[df["keyword"].isin(desired_keywords)]
print("Labels: ", df["keyword"].unique())
print()
df.groupby("keyword").count()[["path"]]

Labels:  ['down' 'follow' 'go' 'happy' 'marvin' 'stop']



Unnamed: 0_level_0,path
keyword,Unnamed: 1_level_1
down,3917
follow,1579
go,3880
happy,2054
marvin,2100
stop,3872


In [6]:
idx = np.random.randint(0, len(df))
sample = df.iloc[idx]
path = sample["path"]
label = sample["keyword"]


print(f"ID Location: {idx}")
print(f"      Label: {label}")
print()

speech, sr = torchaudio.load(path)
speech = speech[0].numpy().squeeze()
# speech = librosa.resample(np.asarray(speech), sr, 16_000) # audio is already at 16kHz
ipd.Audio(data=np.asarray(speech), autoplay=True, rate=16000)

ID Location: 9197
      Label: go



In [7]:
# create csv files to be used to load data
save_path = "gsr_v2_cleaned"

train_df, test_df = train_test_split(df, test_size=0.2, random_state=101, stratify=df["keyword"])

train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

train_df.to_csv(f"{save_path}/train.csv", sep="\t", encoding="utf-8", index=False)
test_df.to_csv(f"{save_path}/test.csv", sep="\t", encoding="utf-8", index=False)


print(train_df.shape)
print(test_df.shape)

(13921, 3)
(3481, 3)


In [36]:
# Loading the created dataset using datasets

data_files = {
    "train": save_path+"/train.csv", 
    "validation": save_path+"/test.csv",
}

dataset = load_dataset("csv", data_files=data_files, delimiter="\t", )
train_dataset = dataset["train"]
eval_dataset = dataset["validation"]

print(train_dataset)
print(eval_dataset)

Found cached dataset csv (C:/Users/eliot/.cache/huggingface/datasets/csv/default-6b6058aad72b9e5b/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/2 [00:00<?, ?it/s]

Dataset({
    features: ['name', 'path', 'keyword'],
    num_rows: 13921
})
Dataset({
    features: ['name', 'path', 'keyword'],
    num_rows: 3481
})


In [14]:
labels = df["keyword"].unique()
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label] = str(i)
    id2label[str(i)] = label

id2label["0"]

'down'

In [38]:
from transformers import AutoFeatureExtractor


model_checkpoint = "facebook/wav2vec2-base"
feature_extractor = AutoFeatureExtractor.from_pretrained(model_checkpoint)



Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": false,
  "sampling_rate": 16000
}

In [40]:
input_column = "path"
output_column = "keyword"
max_duration = 1.0  # seconds
target_sampling_rate = feature_extractor.sampling_rate

def speech_file_to_array(path):
    speech_array, sampling_rate = torchaudio.load(path)
    resampler = torchaudio.transforms.Resample(sampling_rate, target_sampling_rate)
    speech = resampler(speech_array).squeeze().numpy()
    return speech

def label_to_id(label, label_list):

    label_list = list(label_list)
    if len(label_list) > 0:
        return label_list.index(label) if label in label_list else -1
    return label

def preprocess_function(examples):
    audio_arrays = [speech_file_to_array(path) for path in examples[input_column]]
    target_list = [label_to_id(label, labels) for label in examples[output_column]]

    result = feature_extractor(
        audio_arrays, 
        sampling_rate=feature_extractor.sampling_rate, 
        max_length=int(feature_extractor.sampling_rate * max_duration), 
        truncation=True,
        padding=True 
    )

    # print(result)
    result["label"] = list(target_list)

    return result

In [41]:
encoded_dataset = dataset.map(preprocess_function, remove_columns=["path", "keyword", "name"], batched=True)
encoded_dataset

Map:   0%|          | 0/13921 [00:00<?, ? examples/s]

Map:   0%|          | 0/3481 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_values', 'label'],
        num_rows: 13921
    })
    validation: Dataset({
        features: ['input_values', 'label'],
        num_rows: 3481
    })
})

In [42]:
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

num_labels = len(id2label)
model = AutoModelForAudioClassification.from_pretrained(
    model_checkpoint, 
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)


Some weights of the model checkpoint at facebook/wav2vec2-base were not used when initializing Wav2Vec2ForSequenceClassification: ['project_hid.bias', 'quantizer.codevectors', 'quantizer.weight_proj.weight', 'quantizer.weight_proj.bias', 'project_q.bias', 'project_hid.weight', 'project_q.weight']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'projector.weight', 'classifier

In [43]:
batch_size = 32

In [44]:
model_name = model_checkpoint.split("/")[-1]

args = TrainingArguments(
    f"{model_name}-finetuned-ks",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=False,
)

In [None]:
metric = load_metric("accuracy")

In [46]:
import numpy as np

def compute_metrics(eval_pred):
    """Computes accuracy on a batch of predictions"""
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return metric.compute(predictions=predictions, references=eval_pred.label_ids)

  metric = load_metric("accuracy")


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

In [47]:
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics
)

In [48]:
trainer.train()



  0%|          | 0/545 [00:00<?, ?it/s]

RuntimeError: "nll_loss_forward_reduce_cuda_kernel_2d_index" not implemented for 'Int'