In [5]:
from datasets import load_from_disk
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Trainer, TrainingArguments
from datasets import load_dataset
import os

In [30]:
ds = load_from_disk("/kaggle/input/mathbridge-audio-names/mathbridge_speech")
ds

Dataset({
    features: ['context_before', 'equation', 'context_after', 'spoken_English', 'speech_file'],
    num_rows: 100000
})

In [31]:
ds[0]

{'context_before': 'are modeled by a stochastic control process with variance',
 'equation': '$ \\sigma^2_t $',
 'context_after': 'controlled by the agent and with a mean of zero . This models potential effect of actions centered around the null action . To compute various quantities of interest ,',
 'spoken_English': 'sigma squared sub t.',
 'speech_file': 'speech/tts_0.mp3'}

In [21]:
# Load pre-trained model and processor
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60-self").to("cuda")

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
# load dummy dataset and read soundfiles
ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")

# tokenize
input_values = processor(ds[0]["audio"]["array"], return_tensors="pt", padding="longest").input_values.to("cuda")

# retrieve logits
logits = model(input_values).logits

# take argmax and decode
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(predicted_ids)

It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


In [29]:
ds[0]

{'file': '/root/.cache/huggingface/datasets/downloads/extracted/dfbece23564f422bc5794f3090902cd16d52d86767b746125ebc2ff3ea5f89ef/dev_clean/1272/135031/1272-135031-0000.flac',
 'audio': {'path': '/root/.cache/huggingface/datasets/downloads/extracted/dfbece23564f422bc5794f3090902cd16d52d86767b746125ebc2ff3ea5f89ef/dev_clean/1272/135031/1272-135031-0000.flac',
  'array': array([-0.00018311, -0.00033569, -0.00021362, ..., -0.00323486,
         -0.00402832, -0.00393677]),
  'sampling_rate': 16000},
 'text': 'BECAUSE YOU WERE SLEEPING INSTEAD OF CONQUERING THE LOVELY ROSE PRINCESS HAS BECOME A FIDDLE WITHOUT A BOW WHILE POOR SHAGGY SITS THERE A COOING DOVE',
 'speaker_id': 1272,
 'chapter_id': 135031,
 'id': '1272-135031-0000'}

In [None]:
# Preprocess function
def prepare_dataset(batch):
    audio = batch["audio"]
    inputs = processor(audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt", padding=True)
    batch["input_values"] = inputs.input_values[0]
    batch["labels"] = processor.tokenizer.encode(batch["latex"], return_tensors="pt")[0]
    return batch

In [None]:

# Load and preprocess dataset
dataset = dataset.map(prepare_dataset, remove_columns=["audio", "latex"])

# Set training arguments
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    evaluation_strategy="epoch",
    learning_rate=1e-4,
    num_train_epochs=5,
    save_steps=500,
    logging_dir='./logs'
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=processor.feature_extractor
)

# Fine-tune the model
trainer.train()
