In [1]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [2]:
from datasets import load_from_disk
from evaluate import load
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Trainer, TrainingArguments
import gc

In [3]:
# Load pre-trained model and processor
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")

preprocessor_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/162 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:

    # processor: Wav2Vec2BertProcessor
    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lenghts and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt",
        )

        labels_batch = self.processor.pad(
            labels=label_features,
            padding=self.padding,
            return_tensors="pt",
        )
        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        del input_features, label_features
        gc.collect()

        return batch

data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [5]:
train_set = load_from_disk("/kaggle/input/train-test-data-for-w2v2/train_data").select(range(100))
test_set = load_from_disk("/kaggle/input/train-test-data-for-w2v2/test_data").select(range(100))
train_set, test_set

(Dataset({
     features: ['input_values', 'input_length', 'labels', 'path'],
     num_rows: 100
 }),
 Dataset({
     features: ['input_values', 'input_length', 'labels', 'path'],
     num_rows: 100
 }))

In [6]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    run_name="w2v2 10000",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    output_dir='./results',
    logging_dir='./logs',
    num_train_epochs=5,
    fp16=True,
    learning_rate=1e-3,
    # learning_rate=5e-5,
    warmup_steps=500,
    # group_by_length=False,
    # gradient_checkpointing=True,
    # save_total_limit=2,
)

In [7]:
import numpy as np

bleu = load("bleu")

def compute_metrics(pred):
    try:
        pred_logits = pred.predictions
        pred_ids = np.argmax(pred_logits, axis=-1)
    
        pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id
    
        pred_str = processor.batch_decode(pred_ids)
        # we do not want to group tokens when computing the metrics
        label_str = processor.batch_decode(pred.label_ids, group_tokens=False)
    
        bleu_val = bleu.compute(predictions=pred_str, references=label_str)
    
        return {"bleu": bleu_val}
    except:
        return {"bleu": 0}

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

In [8]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    # compute_metrics=compute_metrics,
    train_dataset=train_set,
    eval_dataset=test_set,
    tokenizer=processor.feature_extractor,
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [9]:
import wandb
wandb.login(key="91a0db028dce6f175361702b5140fa9c941bf8ff")

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [10]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mdeeptanshumalu[0m ([33mdeeptanshu-malu[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.18.3
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241119_165200-6wnojmkc[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mw2v2 10000[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/deeptanshu-malu/huggingface[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/deeptanshu-malu/huggingface/runs/6wnojmkc[0m
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss
0,3979.3108,6433.422363
1,1985.3035,6383.703125
2,3930.4539,6334.327637
3,217.7325,6269.504395


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


TrainOutput(global_step=5, training_loss=2419.620812988281, metrics={'train_runtime': 150.4924, 'train_samples_per_second': 3.322, 'train_steps_per_second': 0.033, 'total_flos': 6.203129621375232e+16, 'train_loss': 2419.620812988281, 'epoch': 3.076923076923077})

In [11]:
model.save_pretrained("./w2v2_finetuned")
processor.save_pretrained("./w2v2_finetuned")

[]