In [16]:
from datasets import load_dataset

# Download the transliteration corpus from the Hugging Face Hub.
dataset = load_dataset("SKNahin/bengali-transliteration-data")

# Hold out 20% of the "train" split for validation. The seed is fixed so that
# a fresh kernel (Restart & Run All) reproduces the same partition; without it
# the shuffle, and therefore every downstream metric, differs between runs.
dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)
train_data = dataset["train"]
val_data = dataset["test"]

In [None]:
from transformers import AutoTokenizer

# Tokenizer for the "t5-small" checkpoint; preprocess_function reads it as a global.
# NOTE(review): the output recorded for this cell is a ValueError about a failed
# slow->fast tokenizer conversion that mentions SentencePiece; presumably the
# `sentencepiece` package is missing from the environment — confirm.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="t5-small")

# Tokenization function
def preprocess_function(batch, max_length=128):
    """Tokenize a batch of transliteration pairs for seq2seq training.

    Args:
        batch: Mapping with a "bn" list (used as model input) and an "rm" list
            (used as target), as supplied by ``Dataset.map(..., batched=True)``.
            Presumably "bn" is Bengali script and "rm" its romanized form —
            TODO confirm against the dataset card.
        max_length: Number of tokens every sequence is truncated/padded to.

    Returns:
        The input encoding with an extra "labels" entry holding the target
        token ids, where every padding position is replaced by -100.
    """
    inputs = tokenizer(batch["bn"], max_length=max_length, truncation=True, padding="max_length")
    targets = tokenizer(batch["rm"], max_length=max_length, truncation=True, padding="max_length")

    # Targets are padded to max_length, so most label positions are padding.
    # -100 is the label value the loss ignores; leaving the pad id in place
    # would train the model to predict padding and distort the reported loss.
    pad_id = tokenizer.pad_token_id
    inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in label_ids]
        for label_ids in targets["input_ids"]
    ]
    return inputs

# Tokenize both splits in batched mode (training split first, then validation).
train_dataset, val_dataset = (
    split.map(preprocess_function, batched=True)
    for split in (train_data, val_data)
)


ValueError: Converting from Tiktoken failed, if a converter for SentencePiece is available, provide a model path with a SentencePiece tokenizer.model file.Currently available slow->fast convertors: ['AlbertTokenizer', 'BartTokenizer', 'BarthezTokenizer', 'BertTokenizer', 'BigBirdTokenizer', 'BlenderbotTokenizer', 'CamembertTokenizer', 'CLIPTokenizer', 'CodeGenTokenizer', 'ConvBertTokenizer', 'DebertaTokenizer', 'DebertaV2Tokenizer', 'DistilBertTokenizer', 'DPRReaderTokenizer', 'DPRQuestionEncoderTokenizer', 'DPRContextEncoderTokenizer', 'ElectraTokenizer', 'FNetTokenizer', 'FunnelTokenizer', 'GPT2Tokenizer', 'HerbertTokenizer', 'LayoutLMTokenizer', 'LayoutLMv2Tokenizer', 'LayoutLMv3Tokenizer', 'LayoutXLMTokenizer', 'LongformerTokenizer', 'LEDTokenizer', 'LxmertTokenizer', 'MarkupLMTokenizer', 'MBartTokenizer', 'MBart50Tokenizer', 'MPNetTokenizer', 'MobileBertTokenizer', 'MvpTokenizer', 'NllbTokenizer', 'OpenAIGPTTokenizer', 'PegasusTokenizer', 'Qwen2Tokenizer', 'RealmTokenizer', 'ReformerTokenizer', 'RemBertTokenizer', 'RetriBertTokenizer', 'RobertaTokenizer', 'RoFormerTokenizer', 'SeamlessM4TTokenizer', 'SqueezeBertTokenizer', 'T5Tokenizer', 'UdopTokenizer', 'WhisperTokenizer', 'XLMRobertaTokenizer', 'XLNetTokenizer', 'SplinterTokenizer', 'XGLMTokenizer', 'LlamaTokenizer', 'CodeLlamaTokenizer', 'GemmaTokenizer', 'Phi3Tokenizer']

In [None]:
def filter_length(batch):
    """Tell whether the "rm" and "bn" values both have a length in [3, 128].

    The argument is indexed by column name and measured with ``len``; it is
    passed to ``Dataset.filter`` without ``batched=True`` in this notebook.

    Returns:
        True when both lengths are within the bounds (inclusive), else False.
    """
    shortest, longest = 3, 128
    return all(shortest <= len(batch[column]) <= longest for column in ("rm", "bn"))

# Drop rows rejected by filter_length from both tokenized splits.
train_dataset, val_dataset = (
    split.filter(filter_length) for split in (train_dataset, val_dataset)
)


In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Checkpoint identifier shared by the tokenizer and the seq2seq model.
model_name = "t5-small"

# NOTE(review): the T5 vocabulary presumably does not cover Bengali script, in
# which case the "bn" inputs would mostly tokenize to <unk> — confirm, and
# consider a multilingual checkpoint if so.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


In [None]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

# Training configuration: one epoch, evaluated at the end of the epoch, with
# checkpoints written every 500 steps (at most one kept on disk).
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    # `eval_strategy` replaces the deprecated `evaluation_strategy` keyword.
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    save_total_limit=1,
    # Run generate() during evaluation instead of a plain forward pass.
    predict_with_generate=True,
    logging_dir="./logs",
    gradient_accumulation_steps=1,
    # NOTE(review): fp16 presumably requires a CUDA device, and T5 checkpoints
    # are reported to be numerically unstable in fp16 — confirm on the target
    # hardware (bf16 or full precision may be safer).
    fp16=True,
    logging_steps=100,
    save_steps=500,
    # No `eval_steps`: evaluation is scheduled per epoch, so a step interval
    # would never be used.
)

# Trainer wiring: the tokenized/filtered splits plus the tokenizer used to build them.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

trainer.train()


In [None]:
# Write the model and then the tokenizer into the same directory.
output_path = "./bitfest-hackathon-preli-ch1-rubberDuck-model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_path)
