In [1]:
from datasets import load_from_disk
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Trainer, TrainingArguments
from datasets import load_dataset, Audio
import os
from scipy.io import wavfile
import numpy as np
import torch
from pydub import AudioSegment

In [2]:
# Load the audio dataset and split it by position: the first N_TRAIN rows are
# used for training and every remaining row is held out for evaluation.
ds_arrays = load_from_disk("/kaggle/input/mathbridge-audio-arrays/mathbridge_audio_arrays")

N_TRAIN = 80_000
ds_arrays_train = ds_arrays.select(range(N_TRAIN))
# The end of the held-out range comes from the dataset itself instead of a
# hard-coded row count, so the split stays valid if the dataset size changes.
ds_arrays_test = ds_arrays.select(range(N_TRAIN, len(ds_arrays)))
ds_arrays_train, ds_arrays_test

Loading dataset from disk:   0%|          | 0/33 [00:00<?, ?it/s]

(Dataset({
     features: ['audio'],
     num_rows: 80000
 }),
 Dataset({
     features: ['audio'],
     num_rows: 19986
 }))

In [3]:
# Peek at the first example of each split (file path, waveform array, sampling rate).
ds_arrays_train[0], ds_arrays_test[0]

({'audio': {'path': 'tts_0.mp3',
   'array': array([ 0.03302002,  0.0010376 , -0.02728271, ...,  0.        ,
           0.        ,  0.        ]),
   'sampling_rate': 22050}},
 {'audio': {'path': 'tts_8201.mp3',
   'array': array([ 0.01837158,  0.00418091, -0.00866699, ...,  0.        ,
           0.        ,  0.        ]),
   'sampling_rate': 22050}})

In [4]:
# Transcript records; only the first 100000 rows are kept.
ds = load_from_disk(
    "/kaggle/input/mathbridge-filtered/mathbridge_filtered"
).select(range(100000))
ds

Dataset({
    features: ['context_before', 'equation', 'context_after', 'spoken_English'],
    num_rows: 100000
})

In [5]:
# Show one transcript record: surrounding context, the LaTeX equation, and its spoken-English form.
ds[0]

{'context_before': 'are modeled by a stochastic control process with variance',
 'equation': '$ \\sigma^2_t $',
 'context_after': 'controlled by the agent and with a mean of zero . This models potential effect of actions centered around the null action . To compute various quantities of interest ,',
 'spoken_English': 'sigma squared sub t.'}

In [6]:
# Disabled: load the pre-trained wav2vec2 processor and model (the next cell loads w2v-bert-2.0 instead).
# processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")
# model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")

In [7]:
from transformers import (
    SeamlessM4TFeatureExtractor,
    Wav2Vec2BertForCTC,
    Wav2Vec2BertProcessor,
    Wav2Vec2CTCTokenizer,
)

# CTC tokenizer loaded from the vocabulary directory, with explicit special tokens.
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(
    "/kaggle/input/latex-vocab",
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)

# Feature extractor published with the w2v-bert-2.0 checkpoint.
feature_extractor = SeamlessM4TFeatureExtractor.from_pretrained("facebook/w2v-bert-2.0")

# Bundle both so a single `processor` handles audio inputs and text labels.
processor = Wav2Vec2BertProcessor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
)

# Pre-trained encoder with a CTC head sized to the tokenizer's vocabulary.
# All dropout / masking options are set to zero, and an adapter is added.
model = Wav2Vec2BertForCTC.from_pretrained(
    "facebook/w2v-bert-2.0",
    attention_dropout=0.0,
    hidden_dropout=0.0,
    feat_proj_dropout=0.0,
    mask_time_prob=0.0,
    layerdrop=0.0,
    ctc_loss_reduction="mean",
    add_adapter=True,
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer),
)



preprocessor_config.json:   0%|          | 0.00/275 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.87k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.32G [00:00<?, ?B/s]

Some weights of Wav2Vec2BertForCTC were not initialized from the model checkpoint at facebook/w2v-bert-2.0 and are newly initialized: ['adapter.layers.0.ffn.intermediate_dense.bias', 'adapter.layers.0.ffn.intermediate_dense.weight', 'adapter.layers.0.ffn.output_dense.bias', 'adapter.layers.0.ffn.output_dense.weight', 'adapter.layers.0.ffn_layer_norm.bias', 'adapter.layers.0.ffn_layer_norm.weight', 'adapter.layers.0.residual_conv.bias', 'adapter.layers.0.residual_conv.weight', 'adapter.layers.0.residual_layer_norm.bias', 'adapter.layers.0.residual_layer_norm.weight', 'adapter.layers.0.self_attn.linear_k.bias', 'adapter.layers.0.self_attn.linear_k.weight', 'adapter.layers.0.self_attn.linear_out.bias', 'adapter.layers.0.self_attn.linear_out.weight', 'adapter.layers.0.self_attn.linear_q.bias', 'adapter.layers.0.self_attn.linear_q.weight', 'adapter.layers.0.self_attn.linear_v.bias', 'adapter.layers.0.self_attn.linear_v.weight', 'adapter.layers.0.self_attn_conv.bias', 'adapter.layers.0.self_

In [8]:
# Cast the audio column of both splits to a 16 kHz sampling rate
# (the stored clips report 22050 Hz).
ds_arrays_train, ds_arrays_test = (
    split.cast_column("audio", Audio(sampling_rate=16_000))
    for split in (ds_arrays_train, ds_arrays_test)
)

In [9]:
# Re-inspect the first example of each split after the cast to 16 kHz.
ds_arrays_train[0], ds_arrays_test[0]

({'audio': {'path': 'tts_0.mp3',
   'array': array([ 0.02826029, -0.01515469, -0.00925331, ...,  0.        ,
           0.        ,  0.        ]),
   'sampling_rate': 16000}},
 {'audio': {'path': 'tts_8201.mp3',
   'array': array([ 0.01642108, -0.0040214 , -0.00188781, ...,  0.        ,
           0.        ,  0.        ]),
   'sampling_rate': 16000}})

In [10]:
def prepare_dataset(batch):
    """Turn one audio example into model inputs and CTC labels.

    Relies on two notebook-level globals: ``processor`` (called once on the
    waveform and once on the label text) and ``ds`` (the transcript dataset).
    The transcript row is selected by the integer embedded in the audio file
    name, e.g. ``tts_8201.mp3`` -> ``ds[8201]``.

    Args:
        batch: A single example whose ``"audio"`` entry is a mapping with
            ``"path"``, ``"array"`` and ``"sampling_rate"``.

    Returns:
        The same ``batch`` object, updated in place with ``"input_features"``,
        ``"input_length"``, ``"labels"`` and ``"path"``.

    Raises:
        ValueError: If no non-negative integer row index can be parsed from
            the audio path.
    """
    audio = batch["audio"]
    path = audio["path"]

    # Take the text between the last "_" and the following "." as the row index.
    stem = path.split("_")[-1].split(".")[0]
    try:
        row_index = int(stem)
    except ValueError as err:
        raise ValueError(
            f"cannot parse a transcript row index from audio path {path!r}"
        ) from err
    if row_index < 0:
        # A negative index would silently wrap around to the end of `ds`
        # and pair the audio with the wrong transcript.
        raise ValueError(
            f"cannot parse a transcript row index from audio path {path!r}"
        )

    features = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]
    batch["input_features"] = features
    batch["input_length"] = len(features)
    # The label is the tokenized LaTeX equation of the matching transcript row.
    batch["labels"] = processor(text=ds[row_index]["equation"]).input_ids
    batch["path"] = path
    return batch

In [11]:
# Featurise the training split first, then the held-out split. Each call drops
# the original columns so only the fields written by prepare_dataset remain.
ds_arrays_train = ds_arrays_train.map(
    prepare_dataset,
    remove_columns=ds_arrays_train.column_names,
    keep_in_memory=True,
)
ds_arrays_test = ds_arrays_test.map(
    prepare_dataset,
    remove_columns=ds_arrays_test.column_names,
    keep_in_memory=True,
)

Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

Map:   0%|          | 0/19986 [00:00<?, ? examples/s]

In [12]:
# Persist both processed splits to the working directory, train first.
for _out_dir, _split in (("train_data", ds_arrays_train), ("test_data", ds_arrays_test)):
    _split.save_to_disk(_out_dir)

Saving the dataset (0/20 shards):   0%|          | 0/80000 [00:00<?, ? examples/s]

Saving the dataset (0/5 shards):   0%|          | 0/19986 [00:00<?, ? examples/s]