In [1]:
from transformers import SpeechEncoderDecoderModel, Speech2Text2Processor, Speech2TextProcessor
import soundfile as sf

from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

from transformers import Trainer, TrainingArguments

from torch.utils.data import Dataset
import librosa
import json
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Set, Union

import torch


In [2]:
processor = Speech2Text2Processor.from_pretrained('facebook/s2t-wav2vec2-large-en-de')
model = SpeechEncoderDecoderModel.from_pretrained('facebook/s2t-wav2vec2-large-en-de')


In [3]:
wav2vec2_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-xlsr-53")
wav2vec2_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")

Some weights of the model checkpoint at facebook/wav2vec2-large-xlsr-53 were not used when initializing Wav2Vec2ForCTC: ['quantizer.codevectors', 'quantizer.weight_proj.weight', 'project_hid.bias', 'project_hid.weight', 'quantizer.weight_proj.bias', 'project_q.weight', 'project_q.bias']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to u

In [4]:
def from_file(file):
    speech, _ = sf.read(file)
    return speech

In [5]:
filepath = "/root/develop/KIWI-module/code/asr-model-zoo/sample/84-121123-0000.wav"
input_features = processor(
    from_file(filepath), sampling_rate=16_000, return_tensors="pt"
).input_values  # Batch size 1


In [6]:
processor(
    from_file(filepath), sampling_rate=16_000, return_tensors="pt"
)


{'input_values': tensor([[1.9481e-05, 1.9481e-05, 1.9481e-05,  ..., 1.9481e-05, 1.9481e-05,
         1.9481e-05]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]])}

In [7]:
wav2vec2_processor(
	from_file(filepath), sampling_rate=16_000, return_tensors="pt"	
)

{'input_values': tensor([[1.9481e-05, 1.9481e-05, 1.9481e-05,  ..., 1.9481e-05, 1.9481e-05,
         1.9481e-05]])}

In [8]:
processor.__dict__

{'feature_extractor': Wav2Vec2FeatureExtractor {
   "do_normalize": true,
   "feature_extractor_type": "Wav2Vec2FeatureExtractor",
   "feature_size": 1,
   "padding_side": "right",
   "padding_value": 0,
   "return_attention_mask": true,
   "sampling_rate": 16000
 },
 'tokenizer': PreTrainedTokenizer(name_or_path='facebook/s2t-wav2vec2-large-en-de', vocab_size=10224, model_max_len=1024, is_fast=False, padding_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}),
 'current_processor': Wav2Vec2FeatureExtractor {
   "do_normalize": true,
   "feature_extractor_type": "Wav2Vec2FeatureExtractor",
   "feature_size": 1,
   "padding_side": "right",
   "padding_value": 0,
   "return_attention_mask": true,
   "sampling_rate": 16000
 }}

In [9]:
wav2vec2_processor.__dict__

{'feature_extractor': Wav2Vec2FeatureExtractor {
   "do_normalize": true,
   "feature_extractor_type": "Wav2Vec2FeatureExtractor",
   "feature_size": 1,
   "padding_side": "right",
   "padding_value": 0.0,
   "return_attention_mask": false,
   "sampling_rate": 16000
 },
 'tokenizer': PreTrainedTokenizer(name_or_path='facebook/wav2vec2-base', vocab_size=32, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}),
 'current_processor': Wav2Vec2FeatureExtractor {
   "do_normalize": true,
   "feature_extractor_type": "Wav2Vec2FeatureExtractor",
   "feature_size": 1,
   "padding_side": "right",
   "padding_value": 0.0,
   "return_attention_mask": false,
   "sampling_rate": 16000
 }}

In [10]:
processor.tokenizer._tokenize

<bound method Speech2Text2Tokenizer._tokenize of PreTrainedTokenizer(name_or_path='facebook/s2t-wav2vec2-large-en-de', vocab_size=10224, model_max_len=1024, is_fast=False, padding_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'})>

In [11]:
s2t_processor = Speech2TextProcessor.from_pretrained("facebook/s2t-medium-librispeech-asr")


In [12]:
# text = "<s> I LOVE CAT </s>"
text = "I LOVE CAT"

with s2t_processor.as_target_processor():
	label = s2t_processor(text).input_ids
label

[12, 190, 918, 2]

In [13]:
s2t_processor.decode(label) 

'i love cat'

In [14]:
s2t_processor.tokenizer.sp_model

<sentencepiece.SentencePieceProcessor; proxy of <Swig Object of type 'sentencepiece::SentencePieceProcessor *' at 0x7f3e38d5f300> >

In [15]:
s2t_processor.tokenizer.get_vocab()

{'<s>': 0,
 '<pad>': 1,
 '</s>': 2,
 '<unk>': 3,
 '▁the': 4,
 's': 5,
 '▁and': 6,
 'ed': 7,
 '▁of': 8,
 '▁to': 9,
 '▁a': 10,
 '▁in': 11,
 '▁i': 12,
 '▁he': 13,
 '▁that': 14,
 'ly': 15,
 '▁it': 16,
 '▁was': 17,
 'ing': 18,
 "'": 19,
 '▁his': 20,
 '▁with': 21,
 '▁you': 22,
 '▁on': 23,
 '▁for': 24,
 'y': 25,
 '▁her': 26,
 '▁had': 27,
 '▁be': 28,
 '▁as': 29,
 '▁is': 30,
 '▁but': 31,
 '▁not': 32,
 '▁she': 33,
 'e': 34,
 't': 35,
 'd': 36,
 '▁at': 37,
 '▁him': 38,
 '▁my': 39,
 '▁have': 40,
 '▁by': 41,
 '▁they': 42,
 '▁all': 43,
 '▁this': 44,
 '▁which': 45,
 '▁so': 46,
 '▁said': 47,
 '▁from': 48,
 '▁me': 49,
 '▁one': 50,
 '▁were': 51,
 '▁we': 52,
 'n': 53,
 '▁there': 54,
 '▁no': 55,
 '▁when': 56,
 '▁or': 57,
 '▁who': 58,
 '▁an': 59,
 '▁are': 60,
 '▁their': 61,
 '▁would': 62,
 '▁if': 63,
 '▁them': 64,
 '▁what': 65,
 'er': 66,
 '▁will': 67,
 '▁out': 68,
 '▁been': 69,
 '▁up': 70,
 '▁do': 71,
 '▁man': 72,
 '▁then': 73,
 '▁more': 74,
 '▁your': 75,
 '▁could': 76,
 '▁into': 77,
 '▁now': 78,
 '▁some'

In [16]:
with s2t_processor.as_target_processor():
	s2t_processor.pad??

Object `s2t_processor.pad` not found.


In [17]:
s2t_processor.tokenizer.pad??

[0;31mSignature:[0m
[0ms2t_processor[0m[0;34m.[0m[0mtokenizer[0m[0;34m.[0m[0mpad[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mencoded_inputs[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mtransformers[0m[0;34m.[0m[0mtokenization_utils_base[0m[0;34m.[0m[0mBatchEncoding[0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mtransformers[0m[0;34m.[0m[0mtokenization_utils_base[0m[0;34m.[0m[0mBatchEncoding[0m[0;34m][0m[0;34m,[0m [0mDict[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mint[0m[0;34m][0m[0;34m][0m[0;34m,[0m [0mDict[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mList[0m[0;34m[[0m[0mint[0m[0;34m][0m[0;34m][0m[0;34m][0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mDict[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mint[0m[0;34m][0m[0;34m][0m[0;34m][0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpadding[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mbool[0m[0;34m,[0m [0m

In [18]:
s2t_processor.tokenizer.pad??

[0;31mSignature:[0m
[0ms2t_processor[0m[0;34m.[0m[0mtokenizer[0m[0;34m.[0m[0mpad[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mencoded_inputs[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mtransformers[0m[0;34m.[0m[0mtokenization_utils_base[0m[0;34m.[0m[0mBatchEncoding[0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mtransformers[0m[0;34m.[0m[0mtokenization_utils_base[0m[0;34m.[0m[0mBatchEncoding[0m[0;34m][0m[0;34m,[0m [0mDict[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mint[0m[0;34m][0m[0;34m][0m[0;34m,[0m [0mDict[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mList[0m[0;34m[[0m[0mint[0m[0;34m][0m[0;34m][0m[0;34m][0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mDict[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mint[0m[0;34m][0m[0;34m][0m[0;34m][0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpadding[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mbool[0m[0;34m,[0m [0m

In [19]:
wav2vec2_processor.feature_extractor.pad??

[0;31mSignature:[0m
[0mwav2vec2_processor[0m[0;34m.[0m[0mfeature_extractor[0m[0;34m.[0m[0mpad[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mprocessed_features[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mtransformers[0m[0;34m.[0m[0mfeature_extraction_utils[0m[0;34m.[0m[0mBatchFeature[0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mtransformers[0m[0;34m.[0m[0mfeature_extraction_utils[0m[0;34m.[0m[0mBatchFeature[0m[0;34m][0m[0;34m,[0m [0mDict[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mtransformers[0m[0;34m.[0m[0mfeature_extraction_utils[0m[0;34m.[0m[0mBatchFeature[0m[0;34m][0m[0;34m,[0m [0mDict[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mtransformers[0m[0;34m.[0m[0mfeature_extraction_utils[0m[0;34m.[0m[0mBatchFeature[0m[0;34m][0m[0;34m][0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mDict[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mtransformers[0m[0;34m.[0m[0mfeature_extraction_utils[0m[0;34m.[0m[0mBatchFeat

In [20]:
wav2vec2_processor.tokenizer.pad??

[0;31mSignature:[0m
[0mwav2vec2_processor[0m[0;34m.[0m[0mtokenizer[0m[0;34m.[0m[0mpad[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mencoded_inputs[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mtransformers[0m[0;34m.[0m[0mtokenization_utils_base[0m[0;34m.[0m[0mBatchEncoding[0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mtransformers[0m[0;34m.[0m[0mtokenization_utils_base[0m[0;34m.[0m[0mBatchEncoding[0m[0;34m][0m[0;34m,[0m [0mDict[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mint[0m[0;34m][0m[0;34m][0m[0;34m,[0m [0mDict[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mList[0m[0;34m[[0m[0mint[0m[0;34m][0m[0;34m][0m[0;34m][0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mDict[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mint[0m[0;34m][0m[0;34m][0m[0;34m][0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpadding[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mbool[0m[0;34m,[0m

In [56]:
class LibriSpeechDataset(Dataset):
	def __init__(self, json_path, tokenizer, feature_extractor):
		self.json_path = json_path
		self.data = self.load_data_from_json(json_path)
		self.tokenizer = tokenizer
		self.feature_extractor = feature_extractor

	def load_data_from_json(self, json_path):
		with open(json_path, "r") as f:
			data = json.load(f)

		data = data["data"][100:]
		return data

	def __getitem__(self, idx):
		audio, _ = librosa.load(self.data[idx]["file"], 16000)
		input_value = self.feature_extractor.feature_extractor(audio, sampling_rate=16000)
		# Do some text preprocessing here
		text = self.data[idx]["text"]
		with self.tokenizer.as_target_processor():
			label = self.tokenizer(text).input_ids

		# print(input_value)
		sample = {
			"input_values": input_value["input_values"][0],
			"labels": label
		}
		return sample

	def __len__(self):
		return len(self.data)

In [57]:
train_dataset = LibriSpeechDataset("/root/develop/KIWI-module/code/wav2byte-pipeline/data/en-librispeech-test-clean-pure-99.0-local-wav.json", 
	tokenizer=s2t_processor,
	feature_extractor=processor)	


In [58]:
train_dataset[0]

{'input_values': array([-0.00974366,  0.00624674,  0.01276135, ...,  0.00269332,
         0.00387779,  0.00150884], dtype=float32),
 'labels': [4,
  1388,
  5735,
  7,
  21,
  4,
  1036,
  236,
  8,
  2990,
  429,
  18,
  158,
  1711,
  16,
  6738,
  7,
  163,
  2816,
  25,
  8,
  679,
  1489,
  35,
  428,
  836,
  6,
  625,
  2839,
  2]}

In [59]:
features = [train_dataset[0], train_dataset[1]]



input_features = [
	{"input_values": feature["input_values"]} for feature in features
]
label_features = [{"input_ids": feature["labels"]} for feature in features]

In [60]:
input_features

[{'input_values': array([-0.00974366,  0.00624674,  0.01276135, ...,  0.00269332,
          0.00387779,  0.00150884], dtype=float32)},
 {'input_values': array([-0.02276293, -0.028083  , -0.0204829 , ...,  0.00307741,
          0.00383742, -0.01136278], dtype=float32)}]

In [61]:
processor.feature_extractor.pad(
	input_features,
	padding=True,
	max_length=1024,
	return_tensors="pt"
)

{'input_values': tensor([[-0.0097,  0.0062,  0.0128,  ...,  0.0027,  0.0039,  0.0015],
        [-0.0228, -0.0281, -0.0205,  ...,  0.0000,  0.0000,  0.0000]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [62]:
wav2vec2_processor.feature_extractor.pad(
	input_features,
	padding=True,
	max_length=1024,
	return_tensors="pt"
)

{'input_values': tensor([[-0.0097,  0.0062,  0.0128,  ...,  0.0027,  0.0039,  0.0015],
        [-0.0228, -0.0281, -0.0205,  ...,  0.0000,  0.0000,  0.0000]])}

In [63]:
@dataclass
class DataCollatorWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
    """

    processor: Speech2TextProcessor
    feature_extractor: Speech2Text2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(
        self, features: List[Dict[str, Union[List[int], torch.Tensor]]]
    ) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lenghts and need
        # different padding methods
        input_features = [
            {"input_values": feature["input_values"]} for feature in features
        ]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        # lang_features = [{"lang": feature['lang'] for feature in features}]

        print("@ @ start")
        # feature_extractor 를 명시해준것은, 현재 processor 구현에 pad를 매칭이 안됨 (feature_extractor 에는 있음)
        batch = self.feature_extractor.feature_extractor.pad(
            input_features,
            padding=True,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        print("$ $ End")

        with self.processor.as_target_processor():
            labels_batch = self.processor.tokenizer.pad( # tokenizer를 명시해준것은, 현재 processor 구현에 pad를 매칭이 안됨 (tokenizer에는 있음)
                label_features,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(
            labels_batch.attention_mask.ne(1), -100
        )

        batch["labels"] = labels


        return batch

In [64]:
data_collator = DataCollatorWithPadding(processor=s2t_processor,feature_extractor=processor, padding=True)


In [65]:
trainer = Trainer(model=model, train_dataset=train_dataset, data_collator=data_collator, tokenizer=s2t_processor.feature_extractor)

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [66]:
trainer.train()

***** Running training *****
  Num examples = 2520
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 474


@ @ start
$ $ End
