In [1]:
# !pip install evaluate

In [2]:
from datasets import load_from_disk
# from evaluate import load
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Trainer, TrainingArguments
import gc

In [3]:
# # Load pre-trained model and processor
# processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")
# model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")
# model.config.ctc_zero_infinity = True

In [4]:
from transformers import Wav2Vec2CTCTokenizer
from transformers import SeamlessM4TFeatureExtractor
from transformers import Wav2Vec2BertProcessor
from transformers import Wav2Vec2BertForCTC

# Special tokens expected by the custom LaTeX vocabulary.
special_tokens = {
    "unk_token": "[UNK]",
    "pad_token": "[PAD]",
    "word_delimiter_token": "|",
}
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained("/kaggle/input/latex-vocab", **special_tokens)
feature_extractor = SeamlessM4TFeatureExtractor.from_pretrained("facebook/w2v-bert-2.0")

# Bundle audio feature extraction and text tokenization behind one object.
processor = Wav2Vec2BertProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)

# Every dropout / masking probability is switched off for this run.
regularization = {
    "attention_dropout": 0.0,
    "hidden_dropout": 0.0,
    "feat_proj_dropout": 0.0,
    "mask_time_prob": 0.0,
    "layerdrop": 0.0,
}
# The CTC head is sized to the custom vocabulary and told which id is padding.
ctc_head = {
    "ctc_loss_reduction": "mean",
    "add_adapter": True,
    "pad_token_id": processor.tokenizer.pad_token_id,
    "vocab_size": len(processor.tokenizer),
}
model = Wav2Vec2BertForCTC.from_pretrained("facebook/w2v-bert-2.0", **regularization, **ctc_head)

# Set on the config after loading, exactly as before.
model.config.ctc_zero_infinity = True



preprocessor_config.json:   0%|          | 0.00/275 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.87k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.32G [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """Collate a list of CTC training examples into one padded batch.

    Audio inputs and label ids have different lengths and need different
    padding, so each is padded by its own ``processor.pad`` call.

    Attributes:
        processor: Object exposing ``pad(inputs, padding=..., return_tensors=...)``
            and ``pad(labels=..., padding=..., return_tensors=...)``, e.g. a
            ``Wav2Vec2Processor`` or a ``Wav2Vec2BertProcessor``. Annotated as
            ``Any`` because either kind may be passed in.
        padding: Forwarded unchanged to both ``processor.pad`` calls.
    """

    processor: Any
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad ``features`` and return the batch with a ``"labels"`` entry added.

        Args:
            features: Examples, each holding the audio under ``"input_values"``
                or ``"input_features"`` and the target ids under ``"labels"``.

        Returns:
            Whatever ``processor.pad`` returns for the audio inputs, plus
            ``"labels"`` where padded positions are set to -100.

        Raises:
            ValueError: If ``features`` is empty.
        """
        if not features:
            raise ValueError("DataCollatorCTCWithPadding received an empty batch")

        # Raw-waveform datasets keep audio under "input_values"; datasets built
        # with the SeamlessM4T feature extractor (W2v-BERT) use "input_features".
        # "input_values" wins when present so existing datasets behave as before.
        input_key = "input_values" if "input_values" in features[0] else "input_features"

        # Split inputs and labels since they have different lengths and need
        # different padding methods.
        audio_features = [{input_key: feature[input_key]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            audio_features,
            padding=self.padding,
            return_tensors="pt",
        )
        labels_batch = self.processor.pad(
            labels=label_features,
            padding=self.padding,
            return_tensors="pt",
        )

        # Replace label padding with -100 so those positions are ignored by the loss.
        batch["labels"] = labels_batch["input_ids"].masked_fill(
            labels_batch.attention_mask.ne(1), -100
        )

        # No explicit del / gc.collect(): the temporaries are released when this
        # call returns, and a full collection on every batch only slows training.
        return batch

# Collator instance handed to the Trainer; padding is enabled for every batch.
data_collator = DataCollatorCTCWithPadding(
    padding=True,
    processor=processor,
)

In [None]:
# Load the pre-processed train / test splits saved with `datasets`.
train_set, test_set = (
    load_from_disk(split_path)
    for split_path in (
        "/kaggle/input/train-test-data-for-w2v2/train_data",
        "/kaggle/input/train-test-data-for-w2v2/test_data",
    )
)
# Display both datasets; for a quick smoke run, append .select(range(10**2)) to a split.
train_set, test_set

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    # --- bookkeeping ---
    run_name="w2v2 10000",
    output_dir='./results',
    logging_dir='./logs',
    save_total_limit=2,
    # --- schedule: evaluate, checkpoint and log once per epoch ---
    num_train_epochs=5,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # --- batching: 4 samples x 8 accumulation steps per optimizer update ---
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    # --- optimization (5e-5 was the alternative learning rate tried) ---
    learning_rate=1e-3,
    warmup_steps=500,
    weight_decay=0.01,
    # --- precision (bf16=True is the alternative on hardware that supports it) ---
    fp16=True,
    # Left at their defaults: group_by_length, gradient_checkpointing.
)

In [None]:
# import numpy as np

# bleu = load("bleu")

# def compute_metrics(pred):
#     try:
#         pred_logits = pred.predictions
#         pred_ids = np.argmax(pred_logits, axis=-1)
    
#         pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id
    
#         pred_str = processor.batch_decode(pred_ids)
#         # we do not want to group tokens when computing the metrics
#         label_str = processor.batch_decode(pred.label_ids, group_tokens=False)
    
#         bleu_val = bleu.compute(predictions=pred_str, references=label_str)
    
#         return {"bleu": bleu_val}
#     except:
#         return {"bleu": 0}

In [None]:
from transformers import Trainer

# Wire model, data and hyper-parameters together. No compute_metrics is passed.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=test_set,
    data_collator=data_collator,
    # The feature extractor is passed in the `tokenizer` slot.
    tokenizer=processor.feature_extractor,
)

In [None]:
import os

import wandb

# SECURITY: never commit an API key to a notebook. The key that used to be
# hard-coded here must be treated as leaked and revoked in the W&B settings.
# Provide the key through the WANDB_API_KEY environment variable instead
# (e.g. populated from the platform's secret store). When the variable is
# unset, key=None lets wandb fall back to its own credential lookup.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

In [None]:
# Run fine-tuning with the Trainer built in the earlier cell. Kept as a bare
# expression so the notebook displays the value returned by train().
trainer.train()

In [None]:
# Persist the fine-tuned weights and the processor side by side so both can be
# reloaded from the same directory.
FINETUNED_DIR = "./w2v2_finetuned"
model.save_pretrained(FINETUNED_DIR)
processor.save_pretrained(FINETUNED_DIR)