<a href="https://colab.research.google.com/github/AshishKhatiwada/Account/blob/master/colabs/intro/Intro_to_Weights_%26_Biases.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
# Step 1: Install libraries
# NOTE: the leading "!" is an IPython/Colab shell escape, so this line only
# works when the file is run as a notebook cell, not as a plain Python script.
!pip install transformers datasets sentencepiece accelerate torch pandas nltk

# Step 2: Import packages
import re

import nltk
import pandas as pd
import torch
from datasets import load_dataset
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast, Seq2SeqTrainer, Seq2SeqTrainingArguments
# Download NLTK's 'punkt' resource.
# NOTE(review): nothing in the visible code uses nltk afterwards — presumably
# it is needed by another cell; confirm before removing.
nltk.download('punkt')

# Step 3: Bring the raw parallel corpus into the Colab runtime
from google.colab import files
files.upload()  # Upload your 'english_kriol_dataset.csv'

# Step 4: Read the CSV and clean it
text_columns = ['english', 'kriol']
max_words = 50

# Rows missing either side of the pair are dropped straight away.
df = pd.read_csv('english_kriol_dataset.csv').dropna(subset=text_columns)

# Trim surrounding whitespace in both language columns.
for column in text_columns:
    df[column] = df[column].str.strip()

# Discard pairs where either side is empty after trimming.
has_text = (df['english'] != "") & (df['kriol'] != "")
df = df[has_text]

# Keep only pairs whose sentences have at most `max_words`
# whitespace-separated words (English is filtered first, then Kriol).
for column in text_columns:
    word_counts = df[column].str.split().str.len()
    df = df[word_counts <= max_words]

# Persist the cleaned pairs so they can be reloaded as a Hugging Face dataset.
cleaned_csv = 'cleaned_english_kriol_dataset.csv'
df.to_csv(cleaned_csv, index=False)

# Step 5: Load the cleaned CSV into a Hugging Face Dataset
dataset = load_dataset("csv", data_files=cleaned_csv)

# Step 6: Load the pretrained mBART-50 checkpoint and its tokenizer
# (the tokenizer is configured with English, "en_XX", as the source language).
model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = MBart50TokenizerFast.from_pretrained(model_name, src_lang="en_XX")
model = MBartForConditionalGeneration.from_pretrained(model_name)

# Step 7: Preprocess function
def preprocess_function(examples):
    """Tokenize a batch of English/Kriol pairs into model inputs and labels.

    Intended to be passed to ``dataset.map(..., batched=True)``.

    Args:
        examples: Batch mapping with 'english' and 'kriol' entries, each a
            list of strings.

    Returns:
        The tokenizer output for the English text with an added "labels"
        entry holding the Kriol token ids. Both sides are truncated/padded
        to 64 tokens; padding positions in the labels are set to -100.
    """
    max_length = 64
    model_inputs = tokenizer(examples['english'], truncation=True, padding="max_length", max_length=max_length)
    # The targets go through the same tokenizer call as the source text
    # (no separate target-language mode is used).
    targets = tokenizer(examples['kriol'], truncation=True, padding="max_length", max_length=max_length)

    # Padding must not contribute to the training loss. -100 is the label
    # value ignored by the cross-entropy loss, so every pad id in the labels
    # is replaced by it; otherwise the model is mostly trained to emit padding.
    pad_token_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_token_id else -100 for token_id in label_ids]
        for label_ids in targets["input_ids"]
    ]
    return model_inputs

# Tokenize every split of the dataset in batches.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Step 8: Training Arguments
# fp16 mixed precision is meant for CUDA GPUs; requesting it unconditionally
# makes the setup fail on a runtime without one, so it is only enabled when a
# GPU is actually available.
use_fp16 = torch.cuda.is_available()

training_args = Seq2SeqTrainingArguments(
    output_dir="./results_mbart_kriol",
    eval_strategy="epoch",           # evaluate once at the end of each epoch
    learning_rate=3e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=1,
    save_total_limit=1,              # keep at most one checkpoint on disk
    predict_with_generate=True,      # evaluation uses generate()
    fp16=use_fp16,
)

# Step 9: Trainer setup
# Hold out part of the data for evaluation: scoring the model on the very
# examples it was trained on says nothing about how it generalises.
# Datasets too small to split meaningfully keep the previous behaviour
# (evaluate on the training data).
full_train_split = tokenized_dataset["train"]
if len(full_train_split) >= 10:
    # Fixed seed so the held-out examples are the same on every run.
    held_out = full_train_split.train_test_split(test_size=0.1, seed=42)
    train_split, eval_split = held_out["train"], held_out["test"]
else:
    train_split = eval_split = full_train_split

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    tokenizer=tokenizer
)

# Step 10: Fine-tune model
# Runs training with the model, arguments and datasets given to the trainer.
trainer.train()

# Step 11: Save Fine-tuned Model
# Write the fine-tuned model to a local directory, zip that directory
# recursively (the "!" line is an IPython/Colab shell escape, not Python),
# then pass the archive to google.colab's `files.download`.
trainer.save_model("./fine_tuned_mbart_kriol")
!zip -r fine_tuned_mbart_kriol.zip ./fine_tuned_mbart_kriol
files.download("fine_tuned_mbart_kriol.zip")

# Step 12: Translate example English → Kriol
def translate_mbart_kriol(text):
    """Translate English text to Kriol with the fine-tuned mBART model.

    Args:
        text: English input text.

    Returns:
        The first decoded generation as a string, with special tokens removed.
    """
    # After training, the model may live on the GPU while freshly tokenized
    # tensors are created on the CPU; move the inputs to the model's device
    # so generate() does not fail with a device mismatch.
    inputs = tokenizer(text, return_tensors="pt").to(model.device)

    # Switch to inference mode so dropout does not perturb the output.
    model.eval()

    forced_bos_token_id = tokenizer.convert_tokens_to_ids("en_XX")  # Staying within English since Kriol is not natively supported
    output = model.generate(
        **inputs,
        forced_bos_token_id=forced_bos_token_id,
        max_length=128
    )
    return tokenizer.batch_decode(output, skip_special_tokens=True)[0]

# Test Translation: print one sample sentence and the model's output for it.
english_text = "How are you?"
print("English:", english_text)
kriol_text = translate_mbart_kriol(english_text)
print("Kriol:", kriol_text)


Dictionary extracted successfully: 0 entries found
