In [1]:
import os
import sys
from pathlib import Path
sys.path.insert(1, os.path.realpath(os.path.pardir))

import pickle
import numpy as np
import torch
import torch.nn.functional as F
from torch import nn

import safetensors
from accelerate import notebook_launcher

import einops

from utils.data_utils import BrainDataset, get_tokenizer, pad_token_list
from utils.train_utils import TrainConfig, run_train_model, count_parameters, load_model_weights

# from transformers import GPT2Tokenizer

# from models.bert import BrainBert, BertConfig
from models.vq_brain_per_channel import SoundStream, VAEConfig

from dataclasses import dataclass
from simple_parsing.helpers import Serializable

from safetensors.torch import load_model
import albumentations as A

import evaluate
from torchsummary import summary


  from .autonotebook import tqdm as notebook_tqdm
INFO:datasets:PyTorch version 2.3.0+cu121 available.


In [59]:
# Build the VQ-VAE (SoundStream) and load its trained weights.
# NOTE(review): this cell carries execution count 59 while its neighbours are 1 and 2,
# i.e. it was run out of order; the weights path is also an absolute, machine-specific path.
vq_weights = r'C:\Users\peter\alvi\brain2text\weights\vq_vae_v1\step_78000_loss_0.0275.safetensors'

vae_config = VAEConfig(C=256, levels=(8, 8, 6, 5))
vq_vae = load_model_weights(SoundStream(**vae_config.to_dict()), vq_weights)

self.codebook_size 1920
self.downsample 8
load compiled weights


In [2]:
# --- Load pretrained model components ---
# Checkpoint used for the tokenizer, the feature extractor and the model weights.
# Alternative checkpoint: "openai/whisper-large-v3".
WHISPER_MODEL_NAME = "openai/whisper-medium"

from transformers import WhisperTokenizer, WhisperFeatureExtractor, WhisperConfig
from transformers import GenerationConfig
from transformers import WhisperForConditionalGeneration
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# import LORA modules
# from peft import LoraConfig, AdaLoraConfig
# from peft import LoraModel, AdaLoraModel, PeftModel, get_peft_model

from audiomentations import Compose, AddGaussianNoise, TimeStretch, LowPassFilter

# Load the feature/label processing components for the selected checkpoint.
feature_extractor = WhisperFeatureExtractor.from_pretrained(WHISPER_MODEL_NAME)
tokenizer = WhisperTokenizer.from_pretrained(WHISPER_MODEL_NAME, task="transcribe")

# Load the pretrained model; its weights are re-used below in a model that is
# configured with a shorter encoder context.
model = WhisperForConditionalGeneration.from_pretrained(WHISPER_MODEL_NAME)

# [PCH] Reduce the encoder context length, following
# https://github.com/sanchit-gandhi/codesnippets/blob/main/whisper-reduce-context.ipynb
# Only the first MAX_SOURCE_POSITIONS rows of the encoder positional embedding are kept.
MAX_SOURCE_POSITIONS = 96 // 2
state_dict = model.state_dict()
state_dict["model.encoder.embed_positions.weight"] = (
    state_dict["model.encoder.embed_positions.weight"][:MAX_SOURCE_POSITIONS, :]
)

# Build the config for the shortened model. The language/task overrides are only
# applied to multilingual checkpoints, but `config` is now always defined, so
# English-only (".en") checkpoints no longer hit a NameError below.
config_overrides = {"max_source_positions": MAX_SOURCE_POSITIONS}
if ".en" not in WHISPER_MODEL_NAME:
    config_overrides.update(
        language="english",
        task="transcribe",
        forced_decoder_ids=None,
    )
config = WhisperConfig.from_pretrained(WHISPER_MODEL_NAME, **config_overrides)

# Re-create the model with the shortened context and load the sliced weights into it.
model = WhisperForConditionalGeneration(config)

model.load_state_dict(state_dict)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


<All keys matched successfully>

In [None]:
# Sanity check: round-trip a single word through the tokenizer to see which
# special tokens it wraps around the text.
ids = tokenizer('love')['input_ids']
tokenizer.decode(ids)

'<|startoftranscript|><|transcribe|><|notimestamps|>love<|endoftext|>'

In [3]:

# Word-error-rate metric loaded through the `evaluate` library.
metric = evaluate.load("wer")

In [None]:
# # window_size = 512 # [PCH] - this choice is arbitrary - may need to be changed
# window_size = 768 
# n_electrodes = 256


# train_transform = A.Compose([
    
#     # A.CoarseDropout(fill_value=0, p=0.5),
#     # A.MultiplicativeNoise(multiplier=(0.9, 1.1), p=0.5),
#     # A.GaussNoise(var_limit=0.005, mean=0, p=0.5),

#     A.PadIfNeeded(min_height=window_size, min_width=n_electrodes, position='top_left', 
#                   border_mode=0, value=0, always_apply=True),
#     # A.RandomCrop(height=window_size, width=n_electrodes, always_apply=True),
#     A.Crop(x_min=0, x_max=n_electrodes, y_min=0, y_max=window_size, always_apply=True),

# ])

# test_transform = A.Compose([
#     A.PadIfNeeded(min_height=window_size, min_width=n_electrodes, position='top_left', 
#                   border_mode=0, value=0, always_apply=True),
#     A.Crop(x_min=0, x_max=n_electrodes, y_min=0, y_max=window_size, always_apply=True)
# ])

# data_path = Path(r"C:\Users\peter\alvi\brain2text\competitionData")
# # tokenizer - whisper tokenizer
# # tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# MAX_TOKENS = 26
# tokenize_func = lambda text: tokenizer(text)['input_ids'] 

# train_dataset = BrainDataset(data_path / 'train', tokenize_function=tokenize_func, transform=train_transform, max_tokens=MAX_TOKENS)
# test_dataset = BrainDataset(data_path / 'test', tokenize_function=tokenize_func, transform=test_transform, max_tokens=MAX_TOKENS)

Runed processing of the  C:\Users\peter\alvi\brain2text\competitionData\train
len of the dataset: 8780
max input len 906
median len 297.0
Runed processing of the  C:\Users\peter\alvi\brain2text\competitionData\test
len of the dataset: 880
max input len 919
median len 285.0


In [4]:
from torch.utils.data import Dataset
import numpy as np
from tqdm import tqdm

def check_padding(act):
    """Return the index of the last row of ``act`` that holds a non-zero value.

    Rows that are entirely zero are treated as padding, so for an array that is
    zero-padded at the end the result is the index of the last non-padded row.

    Args:
        act: 2-D tensor (or anything ``torch.as_tensor`` accepts) of shape (t, c).

    Returns:
        int: index of the last row with at least one non-zero entry, or ``0``
        when no such row exists (every row is zero, or ``t == 0``).
    """
    act = torch.as_tensor(act)
    # One boolean per row: True when the row contains any non-zero value.
    has_signal = (act != 0.0).any(dim=1)
    # Vectorised lookup; unlike a backwards scan that stops at index 1, this
    # also inspects row 0 and is well-defined for single-row and empty inputs.
    signal_rows = torch.nonzero(has_signal)
    if signal_rows.numel() == 0:
        return 0
    return int(signal_rows[-1].item())

class VQBrainDataset(Dataset):
    """Dataset of VQ-VAE code indices pre-computed from another dataset.

    At construction time every item of ``braindataset`` is passed through
    ``vq_vae.get_indices`` and the result, divided by the codebook size, is
    stored as a NumPy array. Targets are stored alongside, together with the
    value ``check_padding`` returns for each element of every input.
    """

    def __init__(self, braindataset, vq_vae, device):
        """Encode ``braindataset`` with ``vq_vae`` on ``device`` and cache the result.

        Args:
            braindataset: iterable yielding ``(x, y, dt)`` triples; ``dt`` is not used.
                Presumably ``x`` is a batched tensor whose first axis indexes samples
                of shape (t, c) -- TODO confirm against the producer.
            vq_vae: model exposing ``codebook_size`` and ``get_indices``; it is moved
                to ``device``.
            device: device the encoding runs on.
        """
        self.braindataset = braindataset
        vq_vae = vq_vae.to(device)
        self.codebook_size = vq_vae.codebook_size

        encoded_inputs = []
        collected_targets = []
        pad_positions = []
        with torch.no_grad():
            for x, y, dt in tqdm(braindataset):
                # Code indices scaled by the codebook size.
                codes = vq_vae.get_indices(x.to(device)) / self.codebook_size
                encoded_inputs.append(codes)
                collected_targets.append(y)
                # One padding index per element along the first axis of x.
                pad_positions.extend(check_padding(el) for el in x)

        self.inputs = torch.cat(encoded_inputs, dim=0).detach().cpu().numpy()
        self.targets = torch.cat(collected_targets, 0).detach().cpu().numpy()
        self.pad_ind = np.asarray(pad_positions, dtype=np.int32)

        lens = [sample.shape[0] for sample in self.inputs]

        print('len of the dataset:', len(self))
        print('max input len', np.max(lens))
        print('median len', np.median(lens))

    def __len__(self) -> int:
        """Number of encoded samples."""
        return len(self.inputs)

    def __getitem__(self, idx: int):
        """Return sample ``idx`` as a dict of tensors.

        Returns:
            dict with
                "input_features": float32 tensor built from ``self.inputs[idx]``
                "labels": tensor built from ``self.targets[idx]``
        """
        features = self.inputs[idx].astype(np.float32)
        labels = self.targets[idx]
        return {
            "input_features": torch.tensor(features),
            "labels": torch.tensor(labels),
        }
    
    
# Load the cached train/test datasets from pickle files in the working directory.
# NOTE(review): pickle.load can execute arbitrary code contained in the file --
# only load pickles produced by this project.
# Presumably these files hold VQBrainDataset instances, in which case the class
# must be defined before unpickling -- TODO confirm.
with open('preproc_dataset.pickle', "rb") as f:
    train_dataset = pickle.load(f)   
with open('preproc_test_dataset.pickle', "rb") as f:
    test_dataset = pickle.load(f)      

In [5]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate already-preprocessed samples into a batch for a seq2seq model.

    Each sample is a dict with "input_features" of shape (time, channels) and
    "labels" (a 1-D token sequence). All samples in a batch must already share
    the same shapes: they are stacked, not padded.

    Attributes:
        device: kept for interface compatibility; not read by ``__call__``.
        decoder_start_token_id: if every label sequence in the batch begins with
            this id, that first position is stripped from the labels.
    """

    device : str
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Stack ``features`` into ``{"input_features", "labels"}`` tensors.

        Returns:
            dict with "input_features" of shape (batch, channels, time) and
            "labels" of shape (batch, n_tokens), or (batch, n_tokens - 1) when the
            leading start token was stripped.
        """
        # as_tensor passes existing tensors through unchanged, which avoids the
        # per-batch "copy construct from a tensor" UserWarning that
        # torch.tensor(tensor) emits; torch.stack allocates the batch anyway.
        input_features = torch.stack(
            [torch.as_tensor(feature["input_features"]) for feature in features], 0
        )
        # (batch, time, channels) -> (batch, channels, time)
        batch = {"input_features": input_features.permute(0, 2, 1)}

        labels = torch.stack(
            [torch.as_tensor(feature["labels"]) for feature in features], 0
        )
        # NOTE(review): no -100 masking is applied here; this assumes padded label
        # positions are already handled upstream -- TODO confirm.

        # If the start token was added during tokenization, cut it here because
        # it is added again later. The width guard covers zero-length labels.
        if labels.shape[1] > 0 and (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch
    
# Collator instance used to build batches for the trainer.
# NOTE: `device` is stored on the collator but is not read by its __call__.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    # tokenizer=tokenizer,
    # feature_extractor=feature_extractor,
    device = 'cuda',
    decoder_start_token_id=model.config.decoder_start_token_id,
)

def compute_metrics(pred):
    """Compute the word error rate (in percent) for a batch of predictions.

    Args:
        pred: object with ``predictions`` (generated token ids) and ``label_ids``
            (reference token ids); positions equal to -100 are treated as padding.

    Returns:
        dict: ``{"wer": <WER * 100>}``.
    """
    pad_id = tokenizer.pad_token_id

    # Replace -100 with the pad token id so the sequences can be decoded.
    # np.where builds new arrays, so the caller's arrays are left untouched, and
    # predictions are masked as well because they can also carry -100 padding.
    pred_ids = np.asarray(pred.predictions)
    pred_ids = np.where(pred_ids == -100, pad_id, pred_ids)
    label_ids = np.asarray(pred.label_ids)
    label_ids = np.where(label_ids == -100, pad_id, label_ids)

    # Special tokens (including the padding inserted above) are dropped on decode.
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments



In [6]:
# set a new convolutional layer to accept different number of input channels
# Swap the encoder's first convolution for a new layer that accepts 256 input
# channels while keeping the original number of output channels.
new_conv = nn.Conv1d(
    in_channels=256,
    out_channels=model.model.encoder.conv1.out_channels,
    kernel_size=(3,),
    stride=(1,),
    padding=(1,),
)
model.model.encoder.conv1 = new_conv

# In the decoder, only parameters whose name contains 'encoder_attn'
# (cross-attention) stay trainable; every other decoder parameter is frozen.
for pn, p in model.model.decoder.named_parameters():
    p.requires_grad = 'encoder_attn' in pn

summary(model)

Layer (type:depth-idx)                             Param #
├─WhisperModel: 1-1                                --
|    └─WhisperEncoder: 2-1                         --
|    |    └─Conv1d: 3-1                            787,456
|    |    └─Conv1d: 3-2                            3,146,752
|    |    └─Embedding: 3-3                         (49,152)
|    |    └─ModuleList: 3-4                        302,284,800
|    |    └─LayerNorm: 3-5                         2,048
|    └─WhisperDecoder: 2-2                         --
|    |    └─Embedding: 3-6                         (53,109,760)
|    |    └─WhisperPositionalEmbedding: 3-7        (458,752)
|    |    └─ModuleList: 3-8                        (403,070,976)
|    |    └─LayerNorm: 3-9                         (2,048)
├─Linear: 1-2                                      (53,109,760)
Total params: 816,021,504
Trainable params: 306,221,056
Non-trainable params: 509,800,448


Layer (type:depth-idx)                             Param #
├─WhisperModel: 1-1                                --
|    └─WhisperEncoder: 2-1                         --
|    |    └─Conv1d: 3-1                            787,456
|    |    └─Conv1d: 3-2                            3,146,752
|    |    └─Embedding: 3-3                         (49,152)
|    |    └─ModuleList: 3-4                        302,284,800
|    |    └─LayerNorm: 3-5                         2,048
|    └─WhisperDecoder: 2-2                         --
|    |    └─Embedding: 3-6                         (53,109,760)
|    |    └─WhisperPositionalEmbedding: 3-7        (458,752)
|    |    └─ModuleList: 3-8                        (403,070,976)
|    |    └─LayerNorm: 3-9                         (2,048)
├─Linear: 1-2                                      (53,109,760)
Total params: 816,021,504
Trainable params: 306,221,056
Non-trainable params: 509,800,448

In [7]:
import wandb


In [7]:
# wandb.init()
# Directory that receives the trainer's output (passed as output_dir below).
experiment_path = Path('logs/whisper/petr')
experiment_path.mkdir(parents=True, exist_ok=True)

batch_size = 256
# Number of whole batches in the training set (fraction truncated); used below
# for the warm-up length and the eval/save cadence.
# NOTE(review): the trainer's own steps-per-epoch may be one higher when the
# dataset size is not a multiple of batch_size -- confirm if exact alignment matters.
epoch_length = int(len(train_dataset) / batch_size)

training_args = Seq2SeqTrainingArguments(
    output_dir=experiment_path,  # change to a repo name of your choice
    per_device_train_batch_size=batch_size,
    # gradient_accumulation_steps=int(16 / batch_size),  # increase by 2x for every 2x decrease in batch size
    learning_rate=2.5e-5,
    num_train_epochs = 10000,
    warmup_steps=epoch_length,
    gradient_checkpointing=True,
    fp16=False,
    evaluation_strategy="steps",
    per_device_eval_batch_size=batch_size,
    predict_with_generate=True,
    generation_max_length=32,
    # Save and evaluate on the same cadence: every half of epoch_length steps.
    # NOTE(review): no save_total_limit is passed here; consider setting one so
    # checkpoints do not accumulate over a long run.
    save_steps=int(epoch_length / 2),
    eval_steps=int(epoch_length / 2),
    logging_steps=25,
    report_to=["wandb"],
    load_best_model_at_end=True,
    # Lower WER is better, hence greater_is_better=False.
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=False,
    use_cpu= False
)

trainer = Seq2SeqTrainer(
    args=training_args,
    # NOTE(review): the device is hardcoded to 'cuda'; this cell needs a GPU.
    model=model.to('cuda'),
    # model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # NOTE(review): the feature extractor, not the text tokenizer, is passed as
    # `tokenizer` -- presumably so it is saved with checkpoints; confirm.
    tokenizer=feature_extractor,
    # pin_memory = False,

)
trainer.train()

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mpeter_chizhov[0m. Use [1m`wandb login --relogin`[0m to force relogin


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  attn_output = torch.nn.functional.scaled_dot_product_attention(
`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...
                                                       
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 4.49925422668457, 'eval_wer': 116.31837179720152, 'eval_runtime': 39.8602, 'eval_samples_per_second': 22.077, 'eval_steps_per_second': 0.1, 'epoch': 0.49}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 25/350000 [01:52<365:11:12,  3.76s/it] 

{'loss': 7.4011, 'grad_norm': 12.772808074951172, 'learning_rate': 1.8382352941176472e-05, 'epoch': 0.71}


                                                       
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.301051616668701, 'eval_wer': 102.85298927857531, 'eval_runtime': 39.0715, 'eval_samples_per_second': 22.523, 'eval_steps_per_second': 0.102, 'epoch': 0.97}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 50/350000 [03:41<262:44:09,  2.70s/it] 

{'loss': 3.3663, 'grad_norm': 57.035404205322266, 'learning_rate': 2.499885703182595e-05, 'epoch': 1.43}


                                                       
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.1844725608825684, 'eval_wer': 139.9418499000545, 'eval_runtime': 40.4562, 'eval_samples_per_second': 21.752, 'eval_steps_per_second': 0.099, 'epoch': 1.46}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
                                                        
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.1157736778259277, 'eval_wer': 116.7908413592586, 'eval_runtime': 40.194, 'eval_samples_per_second': 21.894, 'eval_steps_per_second': 0.1, 'epoch': 1.94}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 75/350000 [06:18<405:20:55,  4.17s/it] 

{'loss': 3.206, 'grad_norm': 2.382356882095337, 'learning_rate': 2.4997071144053998e-05, 'epoch': 2.14}


                                                       
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.0913472175598145, 'eval_wer': 117.22696710884972, 'eval_runtime': 39.3842, 'eval_samples_per_second': 22.344, 'eval_steps_per_second': 0.102, 'epoch': 2.43}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 100/350000 [08:09<265:43:24,  2.73s/it]

{'loss': 3.1315, 'grad_norm': 1.5008519887924194, 'learning_rate': 2.499528525628204e-05, 'epoch': 2.86}


                                                        
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.064462423324585, 'eval_wer': 119.48028348173723, 'eval_runtime': 39.8635, 'eval_samples_per_second': 22.075, 'eval_steps_per_second': 0.1, 'epoch': 2.91}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
                                                         
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.0781376361846924, 'eval_wer': 111.19389423950572, 'eval_runtime': 39.4984, 'eval_samples_per_second': 22.279, 'eval_steps_per_second': 0.101, 'epoch': 3.4}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 125/350000 [10:45<478:54:54,  4.93s/it] 

{'loss': 3.0729, 'grad_norm': 2.4670090675354004, 'learning_rate': 2.4993499368510084e-05, 'epoch': 3.57}


                                                        
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.0530638694763184, 'eval_wer': 113.15646011266583, 'eval_runtime': 39.6215, 'eval_samples_per_second': 22.21, 'eval_steps_per_second': 0.101, 'epoch': 3.89}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 150/350000 [12:35<267:56:18,  2.76s/it] 

{'loss': 3.0605, 'grad_norm': 1.1895320415496826, 'learning_rate': 2.499171348073813e-05, 'epoch': 4.29}


                                                        
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.0593481063842773, 'eval_wer': 112.08431764492094, 'eval_runtime': 39.757, 'eval_samples_per_second': 22.134, 'eval_steps_per_second': 0.101, 'epoch': 4.37}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
                                                         
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.045309543609619, 'eval_wer': 113.28366345629657, 'eval_runtime': 39.8285, 'eval_samples_per_second': 22.095, 'eval_steps_per_second': 0.1, 'epoch': 4.86}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 175/350000 [15:11<527:36:21,  5.43s/it] 

{'loss': 3.0182, 'grad_norm': 3.016472578048706, 'learning_rate': 2.4989927592966174e-05, 'epoch': 5.0}


                                                        
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.060825824737549, 'eval_wer': 118.78975104488461, 'eval_runtime': 39.841, 'eval_samples_per_second': 22.088, 'eval_steps_per_second': 0.1, 'epoch': 5.34}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 200/350000 [17:02<274:59:24,  2.83s/it] 

{'loss': 2.9494, 'grad_norm': 2.8484935760498047, 'learning_rate': 2.4988141705194222e-05, 'epoch': 5.71}


                                                        
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.067617416381836, 'eval_wer': 107.1415591495548, 'eval_runtime': 39.5488, 'eval_samples_per_second': 22.251, 'eval_steps_per_second': 0.101, 'epoch': 5.83}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
                                                         
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.0691299438476562, 'eval_wer': 114.22860258041068, 'eval_runtime': 39.8222, 'eval_samples_per_second': 22.098, 'eval_steps_per_second': 0.1, 'epoch': 6.31}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 225/350000 [19:38<717:43:56,  7.39s/it] 

{'loss': 2.9058, 'grad_norm': 1.961161732673645, 'learning_rate': 2.4986355817422267e-05, 'epoch': 6.43}


                                                        
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.0843582153320312, 'eval_wer': 110.99400327094313, 'eval_runtime': 39.5629, 'eval_samples_per_second': 22.243, 'eval_steps_per_second': 0.101, 'epoch': 6.8}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 250/350000 [21:29<274:25:49,  2.82s/it] 

{'loss': 2.8581, 'grad_norm': 4.098196506500244, 'learning_rate': 2.4984569929650308e-05, 'epoch': 7.14}


                                                        
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.0929455757141113, 'eval_wer': 118.04470288933308, 'eval_runtime': 40.2002, 'eval_samples_per_second': 21.89, 'eval_steps_per_second': 0.1, 'epoch': 7.29}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
                                                         
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.093876838684082, 'eval_wer': 110.83045611484646, 'eval_runtime': 39.6922, 'eval_samples_per_second': 22.171, 'eval_steps_per_second': 0.101, 'epoch': 7.77}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 275/350000 [24:09<928:13:56,  9.56s/it] 

{'loss': 2.7855, 'grad_norm': 1.8846482038497925, 'learning_rate': 2.4982784041878353e-05, 'epoch': 7.86}


                                                        
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.134267807006836, 'eval_wer': 111.9571143012902, 'eval_runtime': 40.03, 'eval_samples_per_second': 21.984, 'eval_steps_per_second': 0.1, 'epoch': 8.26}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 300/350000 [26:00<295:01:08,  3.04s/it] 

{'loss': 2.7236, 'grad_norm': 2.2525856494903564, 'learning_rate': 2.49809981541064e-05, 'epoch': 8.57}


                                                        
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.132274627685547, 'eval_wer': 116.33654370343449, 'eval_runtime': 40.0682, 'eval_samples_per_second': 21.963, 'eval_steps_per_second': 0.1, 'epoch': 8.74}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
                                                         
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.171712875366211, 'eval_wer': 115.31891695438851, 'eval_runtime': 39.9167, 'eval_samples_per_second': 22.046, 'eval_steps_per_second': 0.1, 'epoch': 9.23}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 325/350000 [28:39<1215:19:12, 12.51s/it]

{'loss': 2.6475, 'grad_norm': 3.0228281021118164, 'learning_rate': 2.4979212266334446e-05, 'epoch': 9.29}


                                                         
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.179736614227295, 'eval_wer': 121.4610212611303, 'eval_runtime': 40.2041, 'eval_samples_per_second': 21.888, 'eval_steps_per_second': 0.099, 'epoch': 9.71}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 350/350000 [30:30<259:43:35,  2.67s/it] 

{'loss': 2.5879, 'grad_norm': 5.465426921844482, 'learning_rate': 2.497742637856249e-05, 'epoch': 10.0}


                                                        
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.2367851734161377, 'eval_wer': 111.72087952026169, 'eval_runtime': 39.7772, 'eval_samples_per_second': 22.123, 'eval_steps_per_second': 0.101, 'epoch': 10.2}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
                                                         
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.2433199882507324, 'eval_wer': 116.37288751590043, 'eval_runtime': 39.8885, 'eval_samples_per_second': 22.062, 'eval_steps_per_second': 0.1, 'epoch': 10.69}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 375/350000 [33:10<1628:38:18, 16.77s/it]

{'loss': 2.4594, 'grad_norm': 3.4652249813079834, 'learning_rate': 2.4975640490790535e-05, 'epoch': 10.71}


                                                         
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.290276527404785, 'eval_wer': 110.46701799018717, 'eval_runtime': 40.0245, 'eval_samples_per_second': 21.987, 'eval_steps_per_second': 0.1, 'epoch': 11.17}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 400/350000 [35:01<334:45:59,  3.45s/it] 

{'loss': 2.4, 'grad_norm': 3.3927371501922607, 'learning_rate': 2.497385460301858e-05, 'epoch': 11.43}


                                                        
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.297116756439209, 'eval_wer': 116.08213701617301, 'eval_runtime': 39.8637, 'eval_samples_per_second': 22.075, 'eval_steps_per_second': 0.1, 'epoch': 11.66}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 425/350000 [36:53<252:55:09,  2.60s/it] 

{'loss': 2.3369, 'grad_norm': 3.872251510620117, 'learning_rate': 2.4972068715246625e-05, 'epoch': 12.14}


                                                        
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.359389305114746, 'eval_wer': 111.79356714519353, 'eval_runtime': 39.6016, 'eval_samples_per_second': 22.221, 'eval_steps_per_second': 0.101, 'epoch': 12.14}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
                                                         
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.3586254119873047, 'eval_wer': 108.66799927312375, 'eval_runtime': 40.4773, 'eval_samples_per_second': 21.741, 'eval_steps_per_second': 0.099, 'epoch': 12.63}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 450/350000 [39:35<370:33:28,  3.82s/it] 

{'loss': 2.2472, 'grad_norm': 3.8991379737854004, 'learning_rate': 2.497028282747467e-05, 'epoch': 12.86}


                                                        
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.437408924102783, 'eval_wer': 118.6988915137198, 'eval_runtime': 40.0275, 'eval_samples_per_second': 21.985, 'eval_steps_per_second': 0.1, 'epoch': 13.11}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 475/350000 [41:26<263:07:46,  2.71s/it] 

{'loss': 2.1584, 'grad_norm': 4.287566661834717, 'learning_rate': 2.4968496939702715e-05, 'epoch': 13.57}


                                                        
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.437037229537964, 'eval_wer': 108.03198255497001, 'eval_runtime': 39.5158, 'eval_samples_per_second': 22.27, 'eval_steps_per_second': 0.101, 'epoch': 13.6}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
                                                         
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.5436644554138184, 'eval_wer': 113.22914773759767, 'eval_runtime': 39.6052, 'eval_samples_per_second': 22.219, 'eval_steps_per_second': 0.101, 'epoch': 14.09}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 500/350000 [44:04<414:28:33,  4.27s/it] 

{'loss': 2.081, 'grad_norm': 4.522780895233154, 'learning_rate': 2.496671105193076e-05, 'epoch': 14.29}


                                                        
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.5098488330841064, 'eval_wer': 114.75558786116665, 'eval_runtime': 39.8086, 'eval_samples_per_second': 22.106, 'eval_steps_per_second': 0.1, 'epoch': 14.57}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 525/350000 [45:56<214:09:28,  2.21s/it] 

{'loss': 2.0381, 'grad_norm': 6.4750657081604, 'learning_rate': 2.4964925164158808e-05, 'epoch': 15.0}


                                                        
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.5583226680755615, 'eval_wer': 112.15700526985282, 'eval_runtime': 39.7094, 'eval_samples_per_second': 22.161, 'eval_steps_per_second': 0.101, 'epoch': 15.06}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
                                                         
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.547335386276245, 'eval_wer': 116.0276212974741, 'eval_runtime': 39.8997, 'eval_samples_per_second': 22.055, 'eval_steps_per_second': 0.1, 'epoch': 15.54}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 550/350000 [48:37<492:13:00,  5.07s/it] 

{'loss': 1.9364, 'grad_norm': 4.5945611000061035, 'learning_rate': 2.4963139276386852e-05, 'epoch': 15.71}


                                                        
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.5805468559265137, 'eval_wer': 110.73959658368162, 'eval_runtime': 39.7018, 'eval_samples_per_second': 22.165, 'eval_steps_per_second': 0.101, 'epoch': 16.03}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 575/350000 [50:28<269:03:20,  2.77s/it] 

{'loss': 1.8795, 'grad_norm': 4.543772220611572, 'learning_rate': 2.4961353388614894e-05, 'epoch': 16.43}


                                                        
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.6191940307617188, 'eval_wer': 110.17626749045975, 'eval_runtime': 39.9266, 'eval_samples_per_second': 22.04, 'eval_steps_per_second': 0.1, 'epoch': 16.51}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
                                                         
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.6099963188171387, 'eval_wer': 110.57604942758495, 'eval_runtime': 91.9454, 'eval_samples_per_second': 9.571, 'eval_steps_per_second': 0.044, 'epoch': 17.0}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 600/350000 [54:04<979:16:33, 10.09s/it] 

{'loss': 1.8381, 'grad_norm': 4.748746871948242, 'learning_rate': 2.495956750084294e-05, 'epoch': 17.14}


                                                        
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.6652913093566895, 'eval_wer': 108.21370161729966, 'eval_runtime': 40.118, 'eval_samples_per_second': 21.935, 'eval_steps_per_second': 0.1, 'epoch': 17.49}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 625/350000 [55:59<275:56:49,  2.84s/it] 

{'loss': 1.7743, 'grad_norm': 5.405233383178711, 'learning_rate': 2.4957781613070987e-05, 'epoch': 17.86}


                                                        
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.662707567214966, 'eval_wer': 112.10248955115392, 'eval_runtime': 39.7702, 'eval_samples_per_second': 22.127, 'eval_steps_per_second': 0.101, 'epoch': 17.97}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
                                                         
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.721320152282715, 'eval_wer': 110.77594039614755, 'eval_runtime': 39.8415, 'eval_samples_per_second': 22.088, 'eval_steps_per_second': 0.1, 'epoch': 18.46}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 650/350000 [58:38<726:35:15,  7.49s/it] 

{'loss': 1.7118, 'grad_norm': 4.13706636428833, 'learning_rate': 2.495599572529903e-05, 'epoch': 18.57}


                                                        
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.7118959426879883, 'eval_wer': 107.92295111757224, 'eval_runtime': 39.8158, 'eval_samples_per_second': 22.102, 'eval_steps_per_second': 0.1, 'epoch': 18.94}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 675/350000 [1:00:30<282:03:38,  2.91s/it] 

{'loss': 1.6861, 'grad_norm': 4.026116847991943, 'learning_rate': 2.4954209837527076e-05, 'epoch': 19.29}


                                                          
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.7925026416778564, 'eval_wer': 112.32055242594947, 'eval_runtime': 40.0428, 'eval_samples_per_second': 21.976, 'eval_steps_per_second': 0.1, 'epoch': 19.43}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
                                                           
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.7625222206115723, 'eval_wer': 113.9196801744503, 'eval_runtime': 41.59, 'eval_samples_per_second': 21.159, 'eval_steps_per_second': 0.096, 'epoch': 19.91}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 700/350000 [1:03:12<904:14:50,  9.32s/it] 

{'loss': 1.6613, 'grad_norm': 6.823472023010254, 'learning_rate': 2.495242394975512e-05, 'epoch': 20.0}


                                                          
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.7812204360961914, 'eval_wer': 107.92295111757224, 'eval_runtime': 39.7736, 'eval_samples_per_second': 22.125, 'eval_steps_per_second': 0.101, 'epoch': 20.4}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 725/350000 [1:05:06<295:40:28,  3.05s/it] 

{'loss': 1.5764, 'grad_norm': 3.800647735595703, 'learning_rate': 2.4950638061983166e-05, 'epoch': 20.71}


                                                          
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.7972614765167236, 'eval_wer': 110.5033618026531, 'eval_runtime': 40.0104, 'eval_samples_per_second': 21.994, 'eval_steps_per_second': 0.1, 'epoch': 20.89}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
                                                           
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.8295469284057617, 'eval_wer': 110.52153370888607, 'eval_runtime': 39.8474, 'eval_samples_per_second': 22.084, 'eval_steps_per_second': 0.1, 'epoch': 21.37}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 750/350000 [1:07:47<1238:40:11, 12.77s/it]

{'loss': 1.5583, 'grad_norm': 4.04218864440918, 'learning_rate': 2.494885217421121e-05, 'epoch': 21.43}


                                                           
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.8642237186431885, 'eval_wer': 108.95874977285118, 'eval_runtime': 39.8626, 'eval_samples_per_second': 22.076, 'eval_steps_per_second': 0.1, 'epoch': 21.86}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 775/350000 [1:09:39<302:51:07,  3.12s/it] 

{'loss': 1.5396, 'grad_norm': 4.847492218017578, 'learning_rate': 2.4947066286439255e-05, 'epoch': 22.14}


                                                          
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.8761839866638184, 'eval_wer': 110.23078320915864, 'eval_runtime': 39.8793, 'eval_samples_per_second': 22.067, 'eval_steps_per_second': 0.1, 'epoch': 22.34}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
                                                           
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.8497154712677, 'eval_wer': 111.8117390514265, 'eval_runtime': 39.9271, 'eval_samples_per_second': 22.04, 'eval_steps_per_second': 0.1, 'epoch': 22.83}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 800/350000 [1:12:22<1670:32:00, 17.22s/it]

{'loss': 1.4828, 'grad_norm': 6.223440170288086, 'learning_rate': 2.49452803986673e-05, 'epoch': 22.86}


                                                           
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.87460994720459, 'eval_wer': 111.48464473923315, 'eval_runtime': 40.1004, 'eval_samples_per_second': 21.945, 'eval_steps_per_second': 0.1, 'epoch': 23.31}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 825/350000 [1:14:14<336:21:25,  3.47s/it] 

{'loss': 1.4434, 'grad_norm': 4.4652228355407715, 'learning_rate': 2.4943494510895345e-05, 'epoch': 23.57}


                                                          
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.8732523918151855, 'eval_wer': 111.557332364165, 'eval_runtime': 40.2688, 'eval_samples_per_second': 21.853, 'eval_steps_per_second': 0.099, 'epoch': 23.8}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 850/350000 [1:16:06<259:43:06,  2.68s/it] 

{'loss': 1.4167, 'grad_norm': 3.9840736389160156, 'learning_rate': 2.494170862312339e-05, 'epoch': 24.29}


                                                          
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.93746280670166, 'eval_wer': 111.75722333272759, 'eval_runtime': 39.9148, 'eval_samples_per_second': 22.047, 'eval_steps_per_second': 0.1, 'epoch': 24.29}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
                                                           
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.899087905883789, 'eval_wer': 110.43067417772123, 'eval_runtime': 39.9682, 'eval_samples_per_second': 22.017, 'eval_steps_per_second': 0.1, 'epoch': 24.77}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 875/350000 [1:18:45<318:03:00,  3.28s/it] 

{'loss': 1.4114, 'grad_norm': 7.347766399383545, 'learning_rate': 2.4939922735351435e-05, 'epoch': 25.0}


                                                          
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.9547677040100098, 'eval_wer': 111.15755042703978, 'eval_runtime': 40.2666, 'eval_samples_per_second': 21.854, 'eval_steps_per_second': 0.099, 'epoch': 25.26}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 900/350000 [1:20:39<263:17:43,  2.72s/it] 

{'loss': 1.346, 'grad_norm': 5.880929946899414, 'learning_rate': 2.493813684757948e-05, 'epoch': 25.71}


                                                          
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.9590952396392822, 'eval_wer': 110.12175177176087, 'eval_runtime': 40.4275, 'eval_samples_per_second': 21.767, 'eval_steps_per_second': 0.099, 'epoch': 25.74}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
                                                           
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.9844913482666016, 'eval_wer': 109.12229692894783, 'eval_runtime': 40.2067, 'eval_samples_per_second': 21.887, 'eval_steps_per_second': 0.099, 'epoch': 26.23}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 925/350000 [1:23:20<421:38:20,  4.35s/it] 

{'loss': 1.3253, 'grad_norm': 3.218549966812134, 'learning_rate': 2.4936350959807524e-05, 'epoch': 26.43}


                                                          
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.994642734527588, 'eval_wer': 109.17681264764674, 'eval_runtime': 40.047, 'eval_samples_per_second': 21.974, 'eval_steps_per_second': 0.1, 'epoch': 26.71}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 950/350000 [1:25:12<256:39:58,  2.65s/it] 

{'loss': 1.3396, 'grad_norm': 8.815756797790527, 'learning_rate': 2.493456507203557e-05, 'epoch': 27.14}


                                                          
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.983703136444092, 'eval_wer': 109.6492822097038, 'eval_runtime': 40.1237, 'eval_samples_per_second': 21.932, 'eval_steps_per_second': 0.1, 'epoch': 27.2}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
                                                           
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 4.008893013000488, 'eval_wer': 108.81337452298747, 'eval_runtime': 39.6663, 'eval_samples_per_second': 22.185, 'eval_steps_per_second': 0.101, 'epoch': 27.69}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 975/350000 [1:27:54<491:44:03,  5.07s/it] 

{'loss': 1.2973, 'grad_norm': 3.3412015438079834, 'learning_rate': 2.4932779184263617e-05, 'epoch': 27.86}


                                                          
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 3.9875690937042236, 'eval_wer': 109.2495002725786, 'eval_runtime': 39.922, 'eval_samples_per_second': 22.043, 'eval_steps_per_second': 0.1, 'epoch': 28.17}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 1000/350000 [1:29:47<269:57:33,  2.78s/it]

{'loss': 1.2842, 'grad_norm': 4.048829078674316, 'learning_rate': 2.4930993296491662e-05, 'epoch': 28.57}


                                                           
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 4.003838062286377, 'eval_wer': 109.8491731782664, 'eval_runtime': 39.9004, 'eval_samples_per_second': 22.055, 'eval_steps_per_second': 0.1, 'epoch': 28.66}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
                                                            
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 4.033875465393066, 'eval_wer': 112.32055242594947, 'eval_runtime': 39.9756, 'eval_samples_per_second': 22.013, 'eval_steps_per_second': 0.1, 'epoch': 29.14}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 1025/350000 [1:32:28<592:02:19,  6.11s/it] 

{'loss': 1.2713, 'grad_norm': 4.133058071136475, 'learning_rate': 2.4929207408719703e-05, 'epoch': 29.29}


                                                           
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 4.064146995544434, 'eval_wer': 110.39433036525533, 'eval_runtime': 40.3511, 'eval_samples_per_second': 21.809, 'eval_steps_per_second': 0.099, 'epoch': 29.63}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 1050/350000 [1:34:21<223:32:14,  2.31s/it] 

{'loss': 1.2478, 'grad_norm': 7.588901996612549, 'learning_rate': 2.4927421520947748e-05, 'epoch': 30.0}


                                                           
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 4.066523551940918, 'eval_wer': 109.8491731782664, 'eval_runtime': 39.9925, 'eval_samples_per_second': 22.004, 'eval_steps_per_second': 0.1, 'epoch': 30.11}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
                                                            
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 4.048065185546875, 'eval_wer': 110.08540795929494, 'eval_runtime': 40.0234, 'eval_samples_per_second': 21.987, 'eval_steps_per_second': 0.1, 'epoch': 30.6}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 1075/350000 [1:37:04<746:19:04,  7.70s/it] 

{'loss': 1.1986, 'grad_norm': 4.376393795013428, 'learning_rate': 2.4925635633175796e-05, 'epoch': 30.71}


                                                           
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 4.111323356628418, 'eval_wer': 108.1955297110667, 'eval_runtime': 39.8824, 'eval_samples_per_second': 22.065, 'eval_steps_per_second': 0.1, 'epoch': 31.09}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 1100/350000 [1:38:55<282:36:38,  2.92s/it] 

{'loss': 1.184, 'grad_norm': 3.461092472076416, 'learning_rate': 2.492384974540384e-05, 'epoch': 31.43}


                                                           
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 4.084444522857666, 'eval_wer': 110.1399236779938, 'eval_runtime': 39.7734, 'eval_samples_per_second': 22.125, 'eval_steps_per_second': 0.101, 'epoch': 31.57}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
                                                            
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 4.1445770263671875, 'eval_wer': 108.10467017990186, 'eval_runtime': 40.2273, 'eval_samples_per_second': 21.876, 'eval_steps_per_second': 0.099, 'epoch': 32.06}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 1125/350000 [1:41:32<913:28:01,  9.43s/it] 

{'loss': 1.1725, 'grad_norm': 3.181932210922241, 'learning_rate': 2.4922063857631886e-05, 'epoch': 32.14}


                                                           
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 4.152517795562744, 'eval_wer': 108.35907686716337, 'eval_runtime': 40.0458, 'eval_samples_per_second': 21.975, 'eval_steps_per_second': 0.1, 'epoch': 32.54}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 1150/350000 [1:43:24<293:53:59,  3.03s/it] 

{'loss': 1.1343, 'grad_norm': 4.477943420410156, 'learning_rate': 2.492027796985993e-05, 'epoch': 32.86}


                                                           
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 4.149371147155762, 'eval_wer': 108.32273305469744, 'eval_runtime': 40.3175, 'eval_samples_per_second': 21.827, 'eval_steps_per_second': 0.099, 'epoch': 33.03}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
                                                            
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 4.17894172668457, 'eval_wer': 108.75885880428856, 'eval_runtime': 40.1648, 'eval_samples_per_second': 21.91, 'eval_steps_per_second': 0.1, 'epoch': 33.51}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 1175/350000 [1:46:02<1208:43:03, 12.47s/it]

{'loss': 1.1017, 'grad_norm': 3.9043357372283936, 'learning_rate': 2.4918492082087976e-05, 'epoch': 33.57}


                                                            
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 4.1751508712768555, 'eval_wer': 109.63111030347083, 'eval_runtime': 49.7149, 'eval_samples_per_second': 17.701, 'eval_steps_per_second': 0.08, 'epoch': 34.0}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 1200/350000 [1:48:02<320:30:56,  3.31s/it] 

{'loss': 1.0826, 'grad_norm': 3.7513222694396973, 'learning_rate': 2.491670619431602e-05, 'epoch': 34.29}


                                                           
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 4.209670066833496, 'eval_wer': 108.84971833545339, 'eval_runtime': 39.8613, 'eval_samples_per_second': 22.077, 'eval_steps_per_second': 0.1, 'epoch': 34.49}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
                                                            
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 4.187067031860352, 'eval_wer': 109.41304742867526, 'eval_runtime': 39.8952, 'eval_samples_per_second': 22.058, 'eval_steps_per_second': 0.1, 'epoch': 34.97}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]


{'loss': 1.0525, 'grad_norm': 8.758500099182129, 'learning_rate': 2.4914920306544065e-05, 'epoch': 35.0}


                                                            
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 4.210619926452637, 'eval_wer': 111.43012902053427, 'eval_runtime': 40.1376, 'eval_samples_per_second': 21.925, 'eval_steps_per_second': 0.1, 'epoch': 35.46}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 1250/350000 [1:52:31<332:54:56,  3.44s/it] 

{'loss': 0.9958, 'grad_norm': 3.4650697708129883, 'learning_rate': 2.491313441877211e-05, 'epoch': 35.71}


                                                           
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 4.231884956359863, 'eval_wer': 109.68562602216974, 'eval_runtime': 40.1567, 'eval_samples_per_second': 21.914, 'eval_steps_per_second': 0.1, 'epoch': 35.94}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 1275/350000 [1:54:22<260:28:27,  2.69s/it] 

{'loss': 0.9757, 'grad_norm': 3.9782111644744873, 'learning_rate': 2.4911348531000155e-05, 'epoch': 36.43}


                                                           
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 4.244344711303711, 'eval_wer': 108.79520261675451, 'eval_runtime': 40.0644, 'eval_samples_per_second': 21.965, 'eval_steps_per_second': 0.1, 'epoch': 36.43}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
                                                            
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 4.286551475524902, 'eval_wer': 108.83154642922041, 'eval_runtime': 39.8504, 'eval_samples_per_second': 22.083, 'eval_steps_per_second': 0.1, 'epoch': 36.91}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 1300/350000 [1:56:59<357:54:27,  3.70s/it] 

{'loss': 0.9376, 'grad_norm': 4.66718864440918, 'learning_rate': 2.4909562643228203e-05, 'epoch': 37.14}


                                                           
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 4.306234359741211, 'eval_wer': 107.66854443031073, 'eval_runtime': 40.289, 'eval_samples_per_second': 21.842, 'eval_steps_per_second': 0.099, 'epoch': 37.4}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 1325/350000 [1:58:51<262:47:31,  2.71s/it] 

{'loss': 0.886, 'grad_norm': 4.196593284606934, 'learning_rate': 2.4907776755456248e-05, 'epoch': 37.86}


                                                           
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 4.335963249206543, 'eval_wer': 108.15918589860077, 'eval_runtime': 40.0534, 'eval_samples_per_second': 21.971, 'eval_steps_per_second': 0.1, 'epoch': 37.89}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
                                                            
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 4.402575969696045, 'eval_wer': 108.06832636743596, 'eval_runtime': 39.7724, 'eval_samples_per_second': 22.126, 'eval_steps_per_second': 0.101, 'epoch': 38.37}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 1350/350000 [2:01:28<412:50:12,  4.26s/it] 

{'loss': 0.8194, 'grad_norm': 5.18052339553833, 'learning_rate': 2.490599086768429e-05, 'epoch': 38.57}


                                                           
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 4.399927616119385, 'eval_wer': 108.83154642922041, 'eval_runtime': 39.7692, 'eval_samples_per_second': 22.128, 'eval_steps_per_second': 0.101, 'epoch': 38.86}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 1375/350000 [2:03:19<263:31:38,  2.72s/it] 

{'loss': 0.7938, 'grad_norm': 5.0511016845703125, 'learning_rate': 2.4904204979912334e-05, 'epoch': 39.29}


                                                           
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 4.398131847381592, 'eval_wer': 110.01272033436307, 'eval_runtime': 40.2647, 'eval_samples_per_second': 21.855, 'eval_steps_per_second': 0.099, 'epoch': 39.34}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
                                                            
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 4.4977216720581055, 'eval_wer': 107.2869343994185, 'eval_runtime': 40.4676, 'eval_samples_per_second': 21.746, 'eval_steps_per_second': 0.099, 'epoch': 39.83}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 1400/350000 [2:05:56<431:25:39,  4.46s/it] 

{'loss': 0.7425, 'grad_norm': 7.803206443786621, 'learning_rate': 2.4902419092140382e-05, 'epoch': 40.0}


                                                           
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 4.518362998962402, 'eval_wer': 108.44993639832819, 'eval_runtime': 39.7805, 'eval_samples_per_second': 22.121, 'eval_steps_per_second': 0.101, 'epoch': 40.31}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 1425/350000 [2:07:49<269:03:20,  2.78s/it] 

{'loss': 0.6639, 'grad_norm': 4.827066898345947, 'learning_rate': 2.4900633204368427e-05, 'epoch': 40.71}


                                                           
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 4.510213851928711, 'eval_wer': 108.37724877339635, 'eval_runtime': 40.0728, 'eval_samples_per_second': 21.96, 'eval_steps_per_second': 0.1, 'epoch': 40.8}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
                                                            
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 4.5667829513549805, 'eval_wer': 108.57713974195893, 'eval_runtime': 40.0186, 'eval_samples_per_second': 21.99, 'eval_steps_per_second': 0.1, 'epoch': 41.29}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 1450/350000 [2:10:25<577:57:42,  5.97s/it] 

{'loss': 0.6253, 'grad_norm': 6.165742874145508, 'learning_rate': 2.4898847316596472e-05, 'epoch': 41.43}


                                                           
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 4.560951232910156, 'eval_wer': 108.63165546065783, 'eval_runtime': 39.623, 'eval_samples_per_second': 22.209, 'eval_steps_per_second': 0.101, 'epoch': 41.77}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 1475/350000 [2:12:16<265:38:53,  2.74s/it] 

{'loss': 0.5752, 'grad_norm': 3.332235097885132, 'learning_rate': 2.4897061428824517e-05, 'epoch': 42.14}


                                                           
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 4.577775001525879, 'eval_wer': 107.81391968017444, 'eval_runtime': 39.8209, 'eval_samples_per_second': 22.099, 'eval_steps_per_second': 0.1, 'epoch': 42.26}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
                                                            
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 4.633566379547119, 'eval_wer': 107.17790296202072, 'eval_runtime': 40.4637, 'eval_samples_per_second': 21.748, 'eval_steps_per_second': 0.099, 'epoch': 42.74}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 1500/350000 [2:14:55<720:59:59,  7.45s/it] 

{'loss': 0.5379, 'grad_norm': 3.9266817569732666, 'learning_rate': 2.4895275541052558e-05, 'epoch': 42.86}


                                                           
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 4.669999122619629, 'eval_wer': 109.39487552244232, 'eval_runtime': 39.6176, 'eval_samples_per_second': 22.212, 'eval_steps_per_second': 0.101, 'epoch': 43.23}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 1525/350000 [2:16:45<282:00:37,  2.91s/it] 

{'loss': 0.5083, 'grad_norm': 17.514564514160156, 'learning_rate': 2.4893489653280606e-05, 'epoch': 43.57}


                                                           
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 4.656050682067871, 'eval_wer': 108.08649827366892, 'eval_runtime': 39.8701, 'eval_samples_per_second': 22.072, 'eval_steps_per_second': 0.1, 'epoch': 43.71}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
                                                            
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 4.696556568145752, 'eval_wer': 107.88660730510631, 'eval_runtime': 39.7851, 'eval_samples_per_second': 22.119, 'eval_steps_per_second': 0.101, 'epoch': 44.2}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 1550/350000 [2:19:21<911:47:05,  9.42s/it] 

{'loss': 0.502, 'grad_norm': 3.227090835571289, 'learning_rate': 2.489170376550865e-05, 'epoch': 44.29}


                                                           
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 4.692729949951172, 'eval_wer': 108.63165546065783, 'eval_runtime': 40.7297, 'eval_samples_per_second': 21.606, 'eval_steps_per_second': 0.098, 'epoch': 44.69}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 1575/350000 [2:21:12<241:54:35,  2.50s/it] 

{'loss': 0.4909, 'grad_norm': 5.766022682189941, 'learning_rate': 2.4889917877736696e-05, 'epoch': 45.0}


                                                           
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 4.674964427947998, 'eval_wer': 106.81446483736143, 'eval_runtime': 39.6221, 'eval_samples_per_second': 22.21, 'eval_steps_per_second': 0.101, 'epoch': 45.17}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
                                                            
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 4.709524154663086, 'eval_wer': 109.2495002725786, 'eval_runtime': 39.9007, 'eval_samples_per_second': 22.055, 'eval_steps_per_second': 0.1, 'epoch': 45.66}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 1600/350000 [2:23:51<1202:54:20, 12.43s/it]

{'loss': 0.4661, 'grad_norm': 3.703808546066284, 'learning_rate': 2.488813198996474e-05, 'epoch': 45.71}


                                                            
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 4.72481632232666, 'eval_wer': 107.97746683627112, 'eval_runtime': 40.3693, 'eval_samples_per_second': 21.799, 'eval_steps_per_second': 0.099, 'epoch': 46.14}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 1625/350000 [2:25:43<310:33:21,  3.21s/it] 

{'loss': 0.4634, 'grad_norm': 2.20501708984375, 'learning_rate': 2.4886346102192785e-05, 'epoch': 46.43}


                                                           
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 4.725179672241211, 'eval_wer': 107.19607486825369, 'eval_runtime': 40.0706, 'eval_samples_per_second': 21.961, 'eval_steps_per_second': 0.1, 'epoch': 46.63}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
                                                            
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 4.732062339782715, 'eval_wer': 108.90423405415228, 'eval_runtime': 39.6093, 'eval_samples_per_second': 22.217, 'eval_steps_per_second': 0.101, 'epoch': 47.11}


  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]
  0%|          | 1650/350000 [2:28:20<1604:50:28, 16.59s/it]

{'loss': 0.4601, 'grad_norm': 2.1047542095184326, 'learning_rate': 2.488456021442083e-05, 'epoch': 47.14}


                                                            
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 4.711363792419434, 'eval_wer': 108.9950935853171, 'eval_runtime': 39.9736, 'eval_samples_per_second': 22.015, 'eval_steps_per_second': 0.1, 'epoch': 47.6}


RuntimeError: [enforce fail at inline_container.cc:595] . unexpected pos 2575904448 vs 2575904340



  input_features = [torch.tensor(feature["input_features"]) for feature in features]
  label_features = [torch.tensor(feature["labels"]) for feature in features]


[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A

In [None]:
# Fixed-order (unshuffled) loaders over both splits, 8 samples per batch.
# The generator builds the train loader first, then the test loader.
train_dataloader, test_dataloader = (
    torch.utils.data.DataLoader(split, batch_size=8, shuffle=False)
    for split in (train_dataset, test_dataset)
)


In [None]:
# Disabled one-off preprocessing step for the training split: wraps
# `train_dataloader` in a VQBrainDataset (with `vq_vae`, device='cuda') and
# pickles the result to 'preproc_dataset.pickle'.
# NOTE(review): the output recorded under this cell is a CUDA device-side
# assert at 0/1098 batches -- presumably the last attempt failed; confirm
# before re-enabling.
# train_dataset_new = VQBrainDataset(train_dataloader, vq_vae=vq_vae, device='cuda')
# with open('preproc_dataset.pickle', 'wb') as f:
#     pickle.dump(train_dataset_new, f) 
# # train_dataset_new

  0%|          | 0/1098 [00:00<?, ?it/s]


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# Disabled one-off preprocessing step for the test split: wraps
# `test_dataloader` in a VQBrainDataset (with `vq_vae`, device='cuda') and
# pickles the result to 'preproc_test_dataset.pickle'.
# NOTE(review): the recorded output shows 110/110 batches processed, so this
# run presumably completed -- confirm the pickle on disk is current.
# import pickle
# test_dataset_new = VQBrainDataset(test_dataloader, vq_vae=vq_vae, device='cuda')
# with open('preproc_test_dataset.pickle', 'wb') as f:
#     pickle.dump(test_dataset_new, f) 

100%|██████████| 110/110 [01:20<00:00,  1.37it/s]


len of the dataset: 880
max input len 96
median len 96.0


Layer (type:depth-idx)                             Param #
├─WhisperModel: 1-1                                --
|    └─WhisperEncoder: 2-1                         --
|    |    └─Conv1d: 3-1                            787,456
|    |    └─Conv1d: 3-2                            3,146,752
|    |    └─Embedding: 3-3                         (49,152)
|    |    └─ModuleList: 3-4                        302,284,800
|    |    └─LayerNorm: 3-5                         2,048
|    └─WhisperDecoder: 2-2                         --
|    |    └─Embedding: 3-6                         (53,109,760)
|    |    └─WhisperPositionalEmbedding: 3-7        (458,752)
|    |    └─ModuleList: 3-8                        (403,070,976)
|    |    └─LayerNorm: 3-9                         (2,048)
├─Linear: 1-2                                      (53,109,760)
Total params: 816,021,504
Trainable params: 306,221,056
Non-trainable params: 509,800,448


Layer (type:depth-idx)                             Param #
├─WhisperModel: 1-1                                --
|    └─WhisperEncoder: 2-1                         --
|    |    └─Conv1d: 3-1                            787,456
|    |    └─Conv1d: 3-2                            3,146,752
|    |    └─Embedding: 3-3                         (49,152)
|    |    └─ModuleList: 3-4                        302,284,800
|    |    └─LayerNorm: 3-5                         2,048
|    └─WhisperDecoder: 2-2                         --
|    |    └─Embedding: 3-6                         (53,109,760)
|    |    └─WhisperPositionalEmbedding: 3-7        (458,752)
|    |    └─ModuleList: 3-8                        (403,070,976)
|    |    └─LayerNorm: 3-9                         (2,048)
├─Linear: 1-2                                      (53,109,760)
Total params: 816,021,504
Trainable params: 306,221,056
Non-trainable params: 509,800,448

In [None]:
# Smoke test: one CPU forward pass on the first training example.
# Index the dataset once so features and token ids come from the same
# __getitem__ call (matters if the dataset applies random augmentation).
first_sample = train_dataset[0]
input_features = einops.rearrange(torch.tensor(first_sample[0]), 't c -> 1 c t')
decoder_input_ids = torch.tensor(first_sample[1])

# "IndexError: index out of range in self" is what torch's embedding lookup
# raises for an id outside the embedding table, so check the ids against the
# model vocabulary up front and report the offenders explicitly.
# NOTE(review): negative ids would typically be label padding (e.g. -100) --
# presumably such labels are being fed as decoder inputs; verify in the dataset.
vocab_size = model.config.vocab_size
invalid_ids = decoder_input_ids[(decoder_input_ids < 0) | (decoder_input_ids >= vocab_size)]
if invalid_ids.numel() > 0:
    raise ValueError(
        f"decoder_input_ids has {invalid_ids.numel()} id(s) outside [0, {vocab_size}); "
        f"examples: {invalid_ids.unique().tolist()[:10]}"
    )

# Inference only: skip building the autograd graph for this throwaway pass.
with torch.no_grad():
    forward_output = model(input_features, decoder_input_ids=decoder_input_ids)
forward_output

IndexError: index out of range in self

In [None]:
# Move the model to the 'cuda' device; the cells that follow send their
# input tensors to 'cuda' as well before calling the model.
model = model.to('cuda')

In [None]:
# Shape of the first training sample's features after being sent to 'cuda'
# and rearranged with 't c -> 1 c t' -- the same layout the neighbouring
# cells pass to the model.
probe_features = torch.tensor(train_dataset[0][0]).to('cuda')
einops.rearrange(probe_features, 't c -> 1 c t').shape

torch.Size([1, 256, 96])

In [None]:
# Generation smoke test on the first training sample: features go to 'cuda',
# are rearranged with 't c -> 1 c t', and at most 26 new tokens are requested.
# NOTE(review): the recorded run hit "CUDA error: device-side assert
# triggered"; once that happens later CUDA calls in the same kernel may keep
# failing -- presumably a kernel restart is needed before retrying; confirm.
probe_features = einops.rearrange(
    torch.tensor(train_dataset[0][0]).to('cuda'),
    't c -> 1 c t',
)
model.generate(probe_features, max_new_tokens=26)

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# Forward pass on the first training sample with both inputs sent to 'cuda'.
# The features are rearranged with 't c -> 1 c t'; the sample's second
# element is supplied as the decoder input ids.
# NOTE(review): the recorded run reports "CUDA error: device-side assert
# triggered"; CUDA errors can be reported asynchronously, so the real cause
# may lie in an earlier call -- rerun with CUDA_LAUNCH_BLOCKING=1 to confirm.
probe_features = einops.rearrange(
    torch.tensor(train_dataset[0][0]).to('cuda'),
    't c -> 1 c t',
)
probe_tokens = torch.tensor(train_dataset[0][1]).to('cuda')
model(probe_features, decoder_input_ids=probe_tokens)

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# Experiment bookkeeping: both values are forwarded to TrainConfig below.
project_name = 'vqvae_whisper'
save_folder = Path("logs/chizhov")

# Training settings. Everything is passed by keyword, so the grouping below
# (by apparent purpose, inferred from the parameter names) has no effect on
# the resulting config.
train_config = TrainConfig(
    # run identity and outputs
    exp_name='vqvae_pretrain_large_whisper',
    project_name=project_name,
    save_folder=save_folder,
    # evaluation cadence
    eval_interval=500,
    # data loading
    batch_size=2,
    num_workers=3,
    pin_memory=True,
    # optimisation
    mixed_precision=True,
    learning_rate=3e-4,
    weight_decay=0,
    grad_clip=5,
    # learning-rate schedule
    warmup_iters=1000,
    lr_decay_iters=30_000,
)

# torch.compile is currently switched off:
# model = torch.compile(model)

# Positional arguments for run_train_model: the model, the (train, test)
# dataset pair, and the config built above.
args = (
    model,
    (train_dataset, test_dataset),
    train_config,
)
# Run the training entry point through accelerate's notebook launcher in a
# single process.
notebook_launcher(run_train_model, args, num_processes=1)

In [None]:
# Inspect the encoder's first convolution layer; its repr shows the number of
# input channels the encoder is configured for.
model.model.encoder.conv1

Conv1d(128, 1280, kernel_size=(3,), stride=(1,), padding=(1,))

In [None]:
# The whisper encoder takes model.model.encoder.conv1.in_channels channels as input.
# Display the shapes of the first sample's two elements for comparison.
# NOTE(review): the recorded outputs show conv1 with 128 input channels while
# the sample has a 256-sized axis -- confirm the two are meant to match.
train_dataset_new[0][0].shape, train_dataset_new[0][1].shape 


((96, 256), (26,))

In [None]:
# Print the per-layer parameter table for the model.
# `summary` already prints the table itself; the trailing semicolon stops the
# notebook from also echoing the returned object, which otherwise renders the
# same table a second time in the cell output.
summary(model);

Layer (type:depth-idx)                             Param #
├─WhisperModel: 1-1                                --
|    └─WhisperEncoder: 2-1                         --
|    |    └─Conv1d: 3-1                            492,800
|    |    └─Conv1d: 3-2                            4,916,480
|    |    └─Embedding: 3-3                         (1,920,000)
|    |    └─ModuleList: 3-4                        629,637,120
|    |    └─LayerNorm: 3-5                         2,560
|    └─WhisperDecoder: 2-2                         --
|    |    └─Embedding: 3-6                         66,388,480
|    |    └─WhisperPositionalEmbedding: 3-7        573,440
|    |    └─ModuleList: 3-8                        839,557,120
|    |    └─LayerNorm: 3-9                         2,560
├─Linear: 1-2                                      66,388,480
Total params: 1,609,879,040
Trainable params: 1,607,959,040
Non-trainable params: 1,920,000


Layer (type:depth-idx)                             Param #
├─WhisperModel: 1-1                                --
|    └─WhisperEncoder: 2-1                         --
|    |    └─Conv1d: 3-1                            492,800
|    |    └─Conv1d: 3-2                            4,916,480
|    |    └─Embedding: 3-3                         (1,920,000)
|    |    └─ModuleList: 3-4                        629,637,120
|    |    └─LayerNorm: 3-5                         2,560
|    └─WhisperDecoder: 2-2                         --
|    |    └─Embedding: 3-6                         66,388,480
|    |    └─WhisperPositionalEmbedding: 3-7        573,440
|    |    └─ModuleList: 3-8                        839,557,120
|    |    └─LayerNorm: 3-9                         2,560
├─Linear: 1-2                                      66,388,480
Total params: 1,609,879,040
Trainable params: 1,607,959,040
Non-trainable params: 1,920,000