In [1]:
from datasets import load_from_disk
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Trainer, TrainingArguments
from datasets import load_dataset, Audio
import os
from scipy.io import wavfile
import numpy as np
import torch
from pydub import AudioSegment

In [20]:
# Load the pre-decoded audio dataset, then split it by row position:
# rows [0, 80000) become the training split and rows [80000, 99986) the test split.
# NOTE(review): the captured output below reports 80 training rows, which does not
# match range(80000) — the cell was presumably edited after its last run; re-run to confirm.
ds_arrays = load_from_disk("/kaggle/input/mathbridge-audio-arrays/mathbridge_audio_arrays")
ds_arrays_train, ds_arrays_test = (
    ds_arrays.select(range(first_row, end_row))
    for first_row, end_row in ((0, 80000), (80000, 99986))
)
(ds_arrays_train, ds_arrays_test)

Loading dataset from disk:   0%|          | 0/33 [00:00<?, ?it/s]

(Dataset({
     features: ['audio'],
     num_rows: 80
 }),
 Dataset({
     features: ['audio'],
     num_rows: 19986
 }))

In [3]:
# Show the first example of each split (path, decoded samples, sampling rate).
(ds_arrays_train[0], ds_arrays_test[0])

({'audio': {'path': 'tts_0.mp3',
   'array': array([ 0.03302002,  0.0010376 , -0.02728271, ...,  0.        ,
           0.        ,  0.        ]),
   'sampling_rate': 22050}},
 {'audio': {'path': 'tts_8201.mp3',
   'array': array([ 0.01837158,  0.00418091, -0.00866699, ...,  0.        ,
           0.        ,  0.        ]),
   'sampling_rate': 22050}})

In [4]:
# Text side of the corpus: keep only the first 100000 rows of the filtered dataset.
ds = (
    load_from_disk("/kaggle/input/mathbridge-filtered/mathbridge_filtered")
    .select(range(100000))
)
ds

Dataset({
    features: ['context_before', 'equation', 'context_after', 'spoken_English'],
    num_rows: 100000
})

In [5]:
# Inspect one text record: context_before, equation, context_after, spoken_English.
ds[0]

{'context_before': 'are modeled by a stochastic control process with variance',
 'equation': '$ \\sigma^2_t $',
 'context_after': 'controlled by the agent and with a mean of zero . This models potential effect of actions centered around the null action . To compute various quantities of interest ,',
 'spoken_English': 'sigma squared sub t.'}

In [None]:
# # Load pre-trained model and processor
# processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")
# model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")

In [None]:
# from transformers import Wav2Vec2CTCTokenizer
# from transformers import SeamlessM4TFeatureExtractor
# from transformers import Wav2Vec2BertProcessor
# from transformers import Wav2Vec2BertForCTC

# tokenizer = Wav2Vec2CTCTokenizer.from_pretrained("/kaggle/input/latex-vocab", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")
# feature_extractor = SeamlessM4TFeatureExtractor.from_pretrained("facebook/w2v-bert-2.0")
# processor = Wav2Vec2BertProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)
# model = Wav2Vec2BertForCTC.from_pretrained(
#     "facebook/w2v-bert-2.0",
#     attention_dropout=0.0,
#     hidden_dropout=0.0,
#     feat_proj_dropout=0.0,
#     mask_time_prob=0.0,
#     layerdrop=0.0,
#     ctc_loss_reduction="mean",
#     add_adapter=True,
#     pad_token_id=processor.tokenizer.pad_token_id,
#     vocab_size=len(processor.tokenizer),
# )

In [6]:
# Re-declare the "audio" column at 16 kHz on both splits; extract_embeddings
# assumes 16 kHz input by default, while the stored clips are 22.05 kHz.
ds_arrays_train, ds_arrays_test = (
    split.cast_column("audio", Audio(sampling_rate=16_000))
    for split in (ds_arrays_train, ds_arrays_test)
)

In [7]:
# Re-inspect the first example of each split after the cast to check the sampling rate.
(ds_arrays_train[0], ds_arrays_test[0])

({'audio': {'path': 'tts_0.mp3',
   'array': array([ 0.02826029, -0.01515469, -0.00925331, ...,  0.        ,
           0.        ,  0.        ]),
   'sampling_rate': 16000}},
 {'audio': {'path': 'tts_8201.mp3',
   'array': array([ 0.01642108, -0.0040214 , -0.00188781, ...,  0.        ,
           0.        ,  0.        ]),
   'sampling_rate': 16000}})

In [15]:
from transformers import Wav2Vec2Model, Wav2Vec2Processor
import torch

# A single checkpoint name feeds both loaders so the processor and encoder always match.
model_name = "facebook/wav2vec2-base"
# Wav2Vec2Model (not the CTC variant imported at the top) is loaded because the
# hidden states themselves are what extract_embeddings returns.
wav2vec_model = Wav2Vec2Model.from_pretrained(model_name)
# The processor prepares raw waveforms as model inputs.
processor = Wav2Vec2Processor.from_pretrained(model_name)

def extract_embeddings(audio_array, sample_rate=16000):
    """Encode one waveform into frame-level Wav2Vec 2.0 hidden states.

    Parameters
    ----------
    audio_array : array-like of float
        Waveform samples for a single clip (e.g. the ``"array"`` field of a
        decoded audio example).
    sample_rate : int, default 16000
        Sampling rate of ``audio_array`` in Hz. Any other rate is resampled to
        16 kHz before the clip is handed to the processor.

    Returns
    -------
    torch.Tensor
        ``last_hidden_state`` of ``wav2vec_model`` with the leading batch axis
        squeezed out, i.e. ``(frames, 768)`` for the base checkpoint.
    """
    target_rate = 16000

    # Ensure the audio is sampled at 16 kHz
    if sample_rate != target_rate:
        import torchaudio

        # Resample operates on torch tensors, whereas the dataset yields NumPy
        # arrays; convert first, then go back to NumPy for the processor.
        waveform = torch.as_tensor(audio_array, dtype=torch.float32)
        resampler = torchaudio.transforms.Resample(orig_freq=int(sample_rate), new_freq=target_rate)
        audio_array = resampler(waveform).numpy()

    # Preprocess the audio
    inputs = processor(audio_array, sampling_rate=target_rate, return_tensors="pt", padding=True)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        outputs = wav2vec_model(**inputs)
        # Drop the batch axis (a single clip is processed per call).
        embeddings = outputs.last_hidden_state.squeeze(0)

    return embeddings

preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.84k [00:00<?, ?B/s]



vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/380M [00:00<?, ?B/s]

In [18]:
# Smoke-test the extractor on one training clip; only the resulting tensor shape is displayed.
extract_embeddings(ds_arrays_train[234]["audio"]["array"]).shape

torch.Size([246, 768])

In [19]:
# def prepare_dataset(batch):
#     audio = batch["audio"]
#     path = audio["path"]
#     id = int(path.split("_")[-1].split(".")[0])
#     batch["input_features"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]
#     batch["input_length"] = len(batch["input_features"])
#     batch["labels"] = processor(text=ds[id]["equation"]).input_ids
#     batch["path"] = path
#     return batch

def prepare_dataset(batch):
    """Turn one audio example into an (embeddings, equation, path) record.

    Intended for ``Dataset.map`` without batching, so ``batch`` is a single
    example dict despite its name.

    Parameters
    ----------
    batch : dict
        Example with an ``"audio"`` entry holding ``"path"``, ``"array"`` and
        ``"sampling_rate"``.

    Returns
    -------
    dict
        The same dict with ``"speech_embeddings"``, ``"equation"`` and
        ``"path"`` added.
    """
    audio = batch["audio"]
    path = audio["path"]
    # File names look like "tts_<row>.mp3"; the numeric part is the row index into `ds`.
    row_idx = int(path.split("_")[-1].split(".")[0])
    # Forward the clip's own rate so the extractor never mistakes un-resampled audio for 16 kHz.
    batch["speech_embeddings"] = extract_embeddings(audio["array"], sample_rate=audio["sampling_rate"])
    batch["equation"] = ds[row_idx]["equation"]
    batch["path"] = path
    return batch

In [21]:
# Replace the raw audio column with the fields produced by prepare_dataset,
# keeping the mapped result in memory.
ds_arrays_train = ds_arrays_train.map(
    prepare_dataset,
    remove_columns=ds_arrays_train.column_names,
    keep_in_memory=True,
)
# ds_arrays_test = ds_arrays_test.map(prepare_dataset, remove_columns=ds_arrays_test.column_names, keep_in_memory=True)



Map:   0%|          | 0/80 [00:00<?, ? examples/s]

In [23]:
# Display the first mapped record. The embeddings come back as nested Python
# lists, so the rendered output is very long.
ds_arrays_train[0]

{'speech_embeddings': [[0.3075016736984253,
   0.3070793151855469,
   0.27560603618621826,
   -0.05633200332522392,
   0.12426012754440308,
   -0.16360528767108917,
   -0.019604193046689034,
   -0.10977634787559509,
   -0.07042849808931351,
   -0.1333235651254654,
   -0.06872347742319107,
   -0.32993778586387634,
   0.21851910650730133,
   0.10042694956064224,
   0.3380042016506195,
   -0.6348705291748047,
   0.343980073928833,
   -0.015872061252593994,
   -0.09951513260602951,
   0.07870731502771378,
   -0.059691086411476135,
   -0.08172813057899475,
   0.4594075083732605,
   0.4710924029350281,
   -0.7833915948867798,
   -0.07434593886137009,
   0.8026136159896851,
   -0.1337631642818451,
   0.054989125579595566,
   0.03487474098801613,
   0.08549459278583527,
   -0.05243844911456108,
   -0.3915528357028961,
   -0.07515531778335571,
   0.5506677627563477,
   -0.3703076243400574,
   -0.17497499287128448,
   -0.0420432984828949,
   0.365499883890152,
   0.06684933602809906,
   -0.32754

In [None]:
# Persist both splits as dataset directories relative to the working directory.
# NOTE(review): the test-split map call above is commented out, so as written
# ds_arrays_test still holds the audio column rather than embeddings — confirm
# this is intended before relying on "test_data".
ds_arrays_train.save_to_disk(dataset_path="train_data")
ds_arrays_test.save_to_disk(dataset_path="test_data")