In [1]:
from transformers import WhisperTokenizer, WhisperFeatureExtractor
from transformers import GenerationConfig
from transformers import WhisperForConditionalGeneration

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def expand_tokenizer_model(tokenizer, model, new_tokens_list):
    date_info = new_tokens_list
    tokenizer.add_special_tokens(dict(additional_special_tokens=date_info))
    model.resize_token_embeddings(len(tokenizer))
    return tokenizer, model

In [3]:
DATE_TO_INDEX = {'t12.2022.04.28': 0,
                't12.2022.05.05': 1,
                't12.2022.05.17': 2,
                't12.2022.05.19': 3,
                't12.2022.05.24': 4,
                't12.2022.05.26': 5,
                't12.2022.06.02': 6,
                't12.2022.06.07': 7,
                't12.2022.06.14': 8,
                't12.2022.06.16': 9,
                't12.2022.06.21': 10,
                't12.2022.06.23': 11,
                't12.2022.06.28': 12,
                't12.2022.07.05': 13,
                't12.2022.07.14': 14,
                't12.2022.07.21': 15,
                't12.2022.07.27': 16,
                't12.2022.07.29': 17,
                't12.2022.08.02': 18,
                't12.2022.08.11': 19,
                't12.2022.08.13': 20,
                't12.2022.08.18': 21,
                't12.2022.08.23': 22,
                't12.2022.08.25': 23}

In [4]:
""" LOAD PRETRAINED MODEL COMPONENTS """

WHISPER_MODEL_NAME = "openai/whisper-tiny"

# load feature/label processing engines|
feature_extractor = WhisperFeatureExtractor.from_pretrained(WHISPER_MODEL_NAME)
tokenizer = WhisperTokenizer.from_pretrained(WHISPER_MODEL_NAME, task="transcribe")
model = WhisperForConditionalGeneration.from_pretrained(WHISPER_MODEL_NAME)

if ".en" not in WHISPER_MODEL_NAME:
    model.generation_config.language = "english"
    model.generation_config.task = "transcribe"
    model.generation_config.forced_decoder_ids = None


tokenizer, model = expand_tokenizer_model(tokenizer, model, list(DATE_TO_INDEX.keys()))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
txt = 'i love you'
date = 't12.2022.04.28'

txt = date + txt

labels = tokenizer(txt, return_tensors="pt").input_ids.squeeze()
print(labels)
print(tokenizer.decode(labels))

print(len(tokenizer), model.get_input_embeddings())

tensor([50258, 50359, 50363, 51865,    72,   959,   291, 50257])
<|startoftranscript|><|transcribe|><|notimestamps|>t12.2022.04.28i love you<|endoftext|>
51889 Embedding(51889, 384)


In [6]:
tokenizer_old = WhisperTokenizer.from_pretrained(WHISPER_MODEL_NAME, task="transcribe")
tokenizer_new = WhisperTokenizer.from_pretrained(WHISPER_MODEL_NAME, task="transcribe")

date_info = list(DATE_TO_INDEX.keys())
tokenizer_new.add_special_tokens(dict(additional_special_tokens=date_info))

print(len(tokenizer_old), len(tokenizer_new))

model.resize_token_embeddings(len(tokenizer_new))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


51865 51889


Embedding(51889, 384)

In [7]:
## For generation
date = 't12.2022.04.28'

input_indices = tokenizer_new.encode(date)[:-1]
input_indices
# txt_to_generate = "<|startoftranscript|><|transcribe|><|notimestamps|>" + date

# indices_start = tokenizer_new.decode([txt_to_generate]) 



[50258, 50359, 50363, 51865]

In [8]:
Expand model 

SyntaxError: invalid syntax. Perhaps you forgot a comma? (1077441864.py, line 1)

In [None]:
list(model.get_input_embeddings().parameters())[0][100][:10]

tensor([-0.0034,  0.0085,  0.0040, -0.0011,  0.0164,  0.0069,  0.0199,  0.0485,
         0.0426, -0.0036], grad_fn=<SliceBackward0>)

In [None]:
list(model.get_input_embeddings().parameters())[0][100][:10]

tensor([-0.0034,  0.0085,  0.0040, -0.0011,  0.0164,  0.0069,  0.0199,  0.0485,
         0.0426, -0.0036], grad_fn=<SliceBackward0>)

tokenizerocab_size = 50258

In [None]:
txt = ' i love you'

# indices = tokenizer.encode(txt)
# tokenizer.decode(indices)

labels = tokenizer(txt, return_tensors="pt").input_ids.squeeze()
print(tokenizer.decode(labels))

labels_batch = tokenizer.pad({'input_ids': labels}, return_tensors="pt")
labels_batch

<|startoftranscript|><|transcribe|><|notimestamps|> i love you<|endoftext|>


{'input_ids': tensor([50258, 50359, 50363,   741,   959,   291, 50257]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1])}

In [None]:
txt = ' i love you'

indices = tokenizer.encode(txt)
tokenizer.decode(indices)

'<|startoftranscript|><|transcribe|><|notimestamps|> i love you<|endoftext|>'