In [1]:
import soundfile as sf

from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration

from transformers import Trainer, TrainingArguments

from torch.utils.data import Dataset
import librosa
import json
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Set, Union

import torch

In [3]:
# Single source of truth for the checkpoint, so the model weights and the
# processor (feature extractor + tokenizer) are always loaded from the same place.
MODEL_NAME = "facebook/s2t-small-librispeech-asr"

model = Speech2TextForConditionalGeneration.from_pretrained(MODEL_NAME)
processor = Speech2TextProcessor.from_pretrained(MODEL_NAME)

Downloading:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/118M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/242 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/235k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/417k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456 [00:00<?, ?B/s]

In [11]:
class LibriSpeechDataset(Dataset):
	"""Map-style dataset that turns a JSON manifest into Speech2Text training samples.

	The manifest must be a JSON object with a ``"data"`` list whose entries carry
	a ``"file"`` (audio path) and a ``"text"`` (transcript) key.

	Args:
		json_path: Path to the JSON manifest.
		processor: Object exposing ``feature_extractor`` and ``tokenizer``
			callables (e.g. a ``Speech2TextProcessor``).
		skip: Number of leading manifest entries to drop. Defaults to 100,
			which reproduces the previously hard-coded ``[100:]`` slice.
	"""

	# Sampling rate (Hz) used both for resampling the audio and for the
	# value declared to the feature extractor.
	SAMPLING_RATE = 16000

	def __init__(self, json_path, processor, skip=100):
		self.json_path = json_path
		self.data = self.load_data_from_json(json_path, skip=skip)
		self.processor = processor

	def load_data_from_json(self, json_path, skip=100):
		"""Read the manifest and return its ``"data"`` entries after the first ``skip``."""
		# Explicit encoding: do not depend on the platform's locale default.
		with open(json_path, "r", encoding="utf-8") as f:
			manifest = json.load(f)

		return manifest["data"][skip:]

	def __getitem__(self, idx):
		"""Load one utterance and return ``{"input_values": features, "labels": token_ids}``."""
		entry = self.data[idx]

		# `sr` has to be passed by keyword: it is keyword-only in librosa >= 0.10,
		# so the positional form raises TypeError there.
		audio, _ = librosa.load(entry["file"], sr=self.SAMPLING_RATE)
		input_value = self.processor.feature_extractor(
			audio, sampling_rate=self.SAMPLING_RATE
		)

		# Do some text preprocessing here
		text = entry["text"]
		# Call the tokenizer directly; this is what `processor(text)` resolved to
		# inside the deprecated `as_target_processor()` context.
		label = self.processor.tokenizer(text).input_ids

		return {
			# The extractor returns a batch of one; unwrap it to a single example.
			"input_values": input_value["input_features"][0],
			"labels": label,
		}

	def __len__(self):
		"""Number of usable manifest entries (after skipping)."""
		return len(self.data)

In [12]:
# Build the dataset from the local LibriSpeech manifest, sharing the processor
# loaded alongside the model.
train_dataset = LibriSpeechDataset(
    json_path="/root/develop/KIWI-module/code/wav2byte-pipeline/data/en-librispeech-test-clean-pure-99.0-local-wav.json",
    processor=processor,
)

In [13]:
# Sanity check: fetch the first sample and display its "input_values" / "labels".
train_dataset[0]

{'input_values': array([[-1.9798467 , -1.4037551 , -1.1210709 , ..., -1.4489332 ,
         -1.1271648 , -1.0265579 ],
        [-2.4735565 , -2.4872112 , -1.4730271 , ..., -1.3878232 ,
         -1.3205291 , -1.1077921 ],
        [-0.79646975, -0.79973185, -2.0664742 , ..., -1.1663765 ,
         -1.0828664 , -1.2300966 ],
        ...,
        [-3.1509316 , -2.595523  , -1.366736  , ..., -1.0187485 ,
         -0.9631583 , -0.9974444 ],
        [-1.9022068 , -1.9541186 , -1.3448086 , ..., -1.381081  ,
         -1.1054289 , -1.1910446 ],
        [-2.021996  , -1.1396611 , -0.7968165 , ..., -1.2714534 ,
         -0.90984946, -1.1341885 ]], dtype=float32),
 'labels': [4,
  1388,
  5735,
  7,
  21,
  4,
  1036,
  236,
  8,
  2990,
  429,
  18,
  158,
  1711,
  16,
  6738,
  7,
  163,
  2816,
  25,
  8,
  679,
  1489,
  35,
  428,
  836,
  6,
  625,
  2839,
  2]}

In [25]:
@dataclass
class DataCollatorWithPadding:
    """
    Data collator that dynamically pads speech features and label ids into one batch.

    Each incoming feature dict must provide ``"input_values"`` (the per-utterance
    speech features) and ``"labels"`` (the token ids of the transcript).

    Args:
        processor (:class:`~transformers.Speech2TextProcessor`)
            The processor used for processing the data. Its ``feature_extractor``
            pads the speech features and its ``tokenizer`` pads the labels.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Padding strategy applied to both the inputs and the labels, among:
            * :obj:`True` or :obj:`'longest'` (default): Pad to the longest sequence in the batch (or no padding
              if only a single sequence is provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'`: No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the speech features of the returned batch and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the speech features to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
        pad_to_multiple_of_labels (:obj:`int`, `optional`):
            If set will pad the ``labels`` to a multiple of the provided value.
    """

    processor: Speech2TextProcessor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(
        self, features: List[Dict[str, Union[List[int], torch.Tensor]]]
    ) -> Dict[str, torch.Tensor]:
        """Pad a list of samples and return the padded batch with a ``"labels"`` entry.

        Label positions that are padding (attention mask != 1) are set to -100.
        """
        # Split inputs and labels since they have to be of different lengths and
        # need different padding methods. The dataset's "input_values" key is
        # renamed to "input_features" for the feature extractor's pad().
        input_features = [
            {"input_features": feature["input_values"]} for feature in features
        ]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        # The feature extractor is addressed explicitly because, per the original
        # author's note, the processor itself does not expose a matching `pad`
        # (the feature extractor does).
        batch = self.processor.feature_extractor.pad(
            input_features,
            padding=self.padding,  # honor the configured strategy (was hard-coded True)
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Same reasoning for the tokenizer: `pad` lives on the tokenizer, not on
        # the processor. Calling it directly needs no `as_target_processor()`
        # context, which only affects calls made through the processor.
        labels_batch = self.processor.tokenizer.pad(
            label_features,
            padding=self.padding,
            max_length=self.max_length_labels,
            pad_to_multiple_of=self.pad_to_multiple_of_labels,
            return_tensors="pt",
        )

        # Replace padding with -100 to ignore loss correctly.
        labels = labels_batch["input_ids"].masked_fill(
            labels_batch["attention_mask"].ne(1), -100
        )

        batch["labels"] = labels
        return batch

In [26]:
# Collator that pads every batch to its longest sample.
data_collator = DataCollatorWithPadding(
    processor=processor,
    padding=True,
)


In [27]:
# No TrainingArguments are supplied, so the Trainer falls back to its defaults.
trainer = Trainer(
    model=model,
    train_dataset=train_dataset,
    data_collator=data_collator,
    tokenizer=processor.feature_extractor,
)

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [28]:
# Start the training loop on `train_dataset` using the trainer configured above.
trainer.train()

***** Running training *****
  Num examples = 2520
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 474


@ @ start
$ $ End


KeyboardInterrupt: 