In [4]:
!pip install transformers datasets evaluate jiwer
!pip install transformers[torch]
!pip install accelerate -U

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m51.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m37.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting jiwer
  Downloading jiwer-3.0.2-py3-none-any.whl (21 kB)
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloadi

In [5]:
from huggingface_hub import notebook_login

# Authenticate against the Hugging Face Hub through the interactive widget.
# The training cell further down is configured with push_to_hub=True.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
from datasets import load_dataset, Audio

# Load only the first 100 rows of the train split of the "en-US" configuration
# to keep this walkthrough small.
minds = load_dataset("PolyAI/minds14", name="en-US", split="train[:100]")

Downloading builder script:   0%|          | 0.00/5.95k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/5.29k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/471M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [7]:
# Hold out 20% of the rows as a test split. The seed is fixed so that the
# train/test membership is the same on every Restart & Run All.
minds = minds.train_test_split(test_size=0.2, seed=42)

In [8]:
# Display the DatasetDict to check the split sizes and the available columns.
minds

DatasetDict({
    train: Dataset({
        features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
        num_rows: 80
    })
    test: Dataset({
        features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
        num_rows: 20
    })
})

In [9]:
# Drop the columns that are not used below; "path", "audio" and
# "transcription" remain.
# NOTE(review): this cell is not idempotent - running it a second time will
# presumably fail because the columns are already gone.
minds = minds.remove_columns(["english_transcription", "intent_class", "lang_id"])

In [10]:
from transformers import Wav2Vec2Processor, Data2VecAudioForCTC

# Load the processor published alongside the pretrained checkpoint; it is used
# below both to encode audio/text and to decode predictions.
processor = Wav2Vec2Processor.from_pretrained("facebook/data2vec-audio-large-960h")

Downloading (…)rocessor_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/565 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

In [11]:
# Cast the "audio" column to an Audio feature with a 16 kHz sampling rate
# (presumably the rate the pretrained checkpoint expects - confirm on the model card).
minds = minds.cast_column("audio", Audio(sampling_rate=16_000))

In [12]:
def uppercase(example):
    """Return a one-key mapping holding the upper-cased transcription.

    Args:
        example: Mapping with a ``"transcription"`` string entry.

    Returns:
        dict: ``{"transcription": <upper-cased text>}``.
    """
    upper_text = example["transcription"].upper()
    return dict(transcription=upper_text)


# Upper-case every transcription in both splits
# (presumably to match the casing of the tokenizer vocabulary - TODO confirm).
minds = minds.map(uppercase)

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [13]:
def prepare_dataset(batch):
    """Encode one example with the module-level ``processor``.

    Args:
        batch: Mapping with an ``"audio"`` entry (providing ``"array"`` and
            ``"sampling_rate"``) and a ``"transcription"`` entry.

    Returns:
        Whatever ``processor`` returns for the audio/text pair, with an extra
        ``"input_length"`` entry holding the length of the first element of
        ``"input_values"``.
    """
    waveform = batch["audio"]
    encoded = processor(
        waveform["array"],
        sampling_rate=waveform["sampling_rate"],
        text=batch["transcription"],
    )
    first_input = encoded["input_values"][0]
    encoded["input_length"] = len(first_input)
    return encoded

In [14]:
# Encode every example of both splits with prepare_dataset and drop the
# original columns (the list of names is taken from the train split).
encoded_minds = minds.map(prepare_dataset, remove_columns=minds.column_names["train"], num_proc=1)

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [12]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union


@dataclass
class DataCollatorCTCWithPadding:
    """Pad a list of encoded examples into a single batch for CTC training.

    Inputs and labels are padded separately through ``processor.pad``; padded
    label positions are set to -100.
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = "longest"

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Build the padded batch.

        Args:
            features: Examples, each with ``"input_values"`` (whose first
                element is used) and ``"labels"``.

        Returns:
            The padded input batch with a ``"labels"`` entry added.
        """
        # Inputs and labels differ in length, so they are collected and
        # padded independently of each other.
        audio_items = []
        text_items = []
        for item in features:
            audio_items.append({"input_values": item["input_values"][0]})
            text_items.append({"input_ids": item["labels"]})

        batch = self.processor.pad(audio_items, padding=self.padding, return_tensors="pt")
        padded_labels = self.processor.pad(labels=text_items, padding=self.padding, return_tensors="pt")

        # Positions whose attention mask is not 1 are padding; mark them with
        # -100 so that they are ignored by the loss.
        is_padding = padded_labels.attention_mask.ne(1)
        batch["labels"] = padded_labels["input_ids"].masked_fill(is_padding, -100)

        return batch

In [13]:
# Collator instance handed to the Trainer below; pads each batch to its longest item.
data_collator = DataCollatorCTCWithPadding(processor=processor, padding="longest")

In [14]:
import evaluate

# Load the "wer" metric; compute_metrics below reads this module-level object.
wer = evaluate.load("wer")

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

In [15]:
import numpy as np


def compute_metrics(pred):
    """Compute the word error rate of a set of CTC predictions.

    Args:
        pred: Object exposing ``predictions`` (logits, class axis last) and
            ``label_ids`` (token ids where -100 marks positions to ignore).

    Returns:
        dict: ``{"wer": <score returned by the module-level ``wer`` metric>}``.
    """
    pred_ids = np.argmax(pred.predictions, axis=-1)

    # Replace the -100 sentinel with the pad id so the labels can be decoded.
    # np.where builds a new array, so the caller's label_ids are left intact.
    label_ids = np.where(
        pred.label_ids == -100,
        processor.tokenizer.pad_token_id,
        pred.label_ids,
    )

    pred_str = processor.batch_decode(pred_ids)
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    # The result must not be bound to the name `wer`: assigning to it inside
    # this function would make `wer` local and the call on the right-hand side
    # would raise UnboundLocalError.
    wer_score = wer.compute(predictions=pred_str, references=label_str)

    return {"wer": wer_score}

In [16]:
from transformers import AutoModelForCTC, TrainingArguments, Trainer


# Load the pretrained CTC checkpoint that will be fine-tuned.
# The CTC loss reduction is set to "mean" and the model's pad token id is
# taken from the processor's tokenizer so both agree on the padding id.
model = Data2VecAudioForCTC.from_pretrained(
    "facebook/data2vec-audio-large-960h",
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id
    )

Downloading (…)lve/main/config.json:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.25G [00:00<?, ?B/s]

In [20]:
# NOTE(review): max_steps=2 is far below warmup_steps (500), eval_steps /
# save_steps (1000) and logging_steps (25), so this run looks like a smoke
# test: presumably no evaluation, checkpoint or log line is produced and
# compute_metrics is never invoked. Raise max_steps for a real fine-tune.
training_args = TrainingArguments(
    # Output location and Hub upload
    output_dir="my_awesome_asr_mind_model2",
    push_to_hub=True,
    # Batching
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    group_by_length=True,
    # Optimisation schedule
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=2,
    # Memory / precision
    gradient_checkpointing=True,
    fp16=True,
    # Evaluation, checkpointing and logging cadence
    evaluation_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
    logging_steps=25,
    # Best-model selection: a lower "wer" is better
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=encoded_minds["train"],
    eval_dataset=encoded_minds["test"],
    compute_metrics=compute_metrics,
    tokenizer=processor,
)

# Left as the last expression so the returned TrainOutput is displayed.
trainer.train()

/content/my_awesome_asr_mind_model2 is already a clone of https://huggingface.co/FarziBuilder/my_awesome_asr_mind_model2. Make sure you pull the latest changes with `repo.git_pull()`.


Step,Training Loss,Validation Loss


TrainOutput(global_step=2, training_loss=1.659719705581665, metrics={'train_runtime': 11.1451, 'train_samples_per_second': 2.871, 'train_steps_per_second': 0.179, 'total_flos': 1.0266536441677824e+16, 'train_loss': 1.659719705581665, 'epoch': 0.4})

In [3]:
from datasets import load_dataset, Audio

# Reload the full en-US train split with its audio column cast to 16 kHz.
dataset = load_dataset("PolyAI/minds14", "en-US", split="train").cast_column(
    "audio", Audio(sampling_rate=16000)
)

# Sampling rate declared on the audio feature, and the file path of the first example.
audio_feature = dataset.features["audio"]
sampling_rate = audio_feature.sampling_rate
first_example = dataset[0]
audio_file = first_example["audio"]["path"]

Downloading builder script:   0%|          | 0.00/5.95k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/5.29k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/471M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [21]:
# Rebind `model` to the checkpoint stored on the Hub under the fine-tuned repo name.
# NOTE(review): this cell's execution count (21) is lower than that of the
# push_to_hub cell (24) - confirm the downloaded weights are the intended ones.
model = Data2VecAudioForCTC.from_pretrained("FarziBuilder/my_awesome_asr_mind_model2")

Downloading (…)lve/main/config.json:   0%|          | 0.00/2.14k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.25G [00:00<?, ?B/s]

In [24]:
# Upload the trainer's model and training artifacts to the Hub repository;
# the call's return value is displayed as the cell output.
trainer.push_to_hub()

Upload file pytorch_model.bin:   0%|          | 1.00/1.17G [00:00<?, ?B/s]

Upload file runs/Aug14_12-02-12_a2c107c2cbc2/events.out.tfevents.1692014543.a2c107c2cbc2.3430.1:   0%|        …

Upload file training_args.bin:   0%|          | 1.00/3.87k [00:00<?, ?B/s]

Upload file runs/Aug14_12-01-33_a2c107c2cbc2/events.out.tfevents.1692014506.a2c107c2cbc2.3430.0:   0%|        …

To https://huggingface.co/FarziBuilder/my_awesome_asr_mind_model2
   b8c1474..49b3288  main -> main

   b8c1474..49b3288  main -> main

To https://huggingface.co/FarziBuilder/my_awesome_asr_mind_model2
   49b3288..6eac8ca  main -> main

   49b3288..6eac8ca  main -> main



'https://huggingface.co/FarziBuilder/my_awesome_asr_mind_model2/commit/49b3288c8a7b3f6ab6bc4e7925a0208c1425214c'

In [28]:
from transformers import AutoTokenizer
import torch

model_name = "FarziBuilder/my_awesome_asr_mind_model2"
# NOTE(review): the tokenizer comes from the fine-tuned repo while the
# processor comes from the base checkpoint - confirm both share one vocabulary.
tokenizer = AutoTokenizer.from_pretrained(model_name)
processor = Wav2Vec2Processor.from_pretrained("facebook/data2vec-audio-large-960h")
ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")

# Pass the sample's own sampling rate so the processor can check it against
# the rate it was configured for instead of silently assuming a match.
sample_audio = ds[0]["audio"]
input_values = processor(
    sample_audio["array"],
    sampling_rate=sample_audio["sampling_rate"],
    return_tensors="pt",
    padding="longest",
).input_values  # Batch size 1

# Inference with the model; no gradients are needed, so skip building the
# autograd graph.
with torch.no_grad():
    logits = model(input_values).logits

# Decode the predicted IDs
predicted_ids = torch.argmax(logits, dim=-1)
transcription = tokenizer.batch_decode(predicted_ids)


In [29]:
# Display the decoded text for the single sample transcribed above.
transcription

['A MAN SAID TO THE UNIVERSE SIR I EXIST']