In [35]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import MarianMTModel, MarianTokenizer, AdamW
from tqdm import tqdm

In [36]:
import csv

file_path = r"/content/Badaga_Prasunethon - Final Dataset (1).csv"
first_column = []
second_column = []

with open(file_path, newline='') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        first_column.append(row[0])
        second_column.append(row[1])


In [38]:
# Define a custom dataset
class BadagaEnglishDataset(Dataset):
    def __init__(self, badaga_texts, english_texts, tokenizer, max_length=128):
        self.badaga_texts = badaga_texts
        self.english_texts = english_texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.badaga_texts)

    def __getitem__(self, idx):
        badaga_text = self.badaga_texts[idx]
        english_text = self.english_texts[idx]

        # Tokenize the input
        inputs = self.tokenizer.encode_plus(
            badaga_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Tokenize the target
        targets = self.tokenizer.encode_plus(
            english_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'labels': targets['input_ids'].squeeze()
        }

In [39]:
# Load pre-trained model and tokenizer
model_name = "Helsinki-NLP/opus-mt-en-ROMANCE"  # You might need to experiment with different pre-trained models
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)



In [40]:
# Prepare your data
badaga_texts = first_column
english_texts = second_column

In [50]:
dataset = BadagaEnglishDataset(badaga_texts, english_texts, tokenizer)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)


optimizer = AdamW(model.parameters(), lr=5e-5)

num_epochs = 100
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in tqdm(dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss:.4f}")

model.save_pretrained("./badaga_english_model")
tokenizer.save_pretrained("./badaga_english_model")

# Example usage of the fine-tuned model
def translate(text):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    inputs = tokenizer(text, return_tensors="pt", padding=True)
    # Move input tensors to the same device as the model
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model.generate(**inputs)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test the model
test_text = "Namaskara, OLLenge Idhdhaya"
print(f"Badaga: {test_text}")
print(f"English: {translate(test_text)}")

Epoch 1/2: 100%|██████████| 189/189 [00:15<00:00, 12.06it/s]


Epoch 1/2, Average Loss: 0.0026


Epoch 2/2: 100%|██████████| 189/189 [00:14<00:00, 12.68it/s]
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


Epoch 2/2, Average Loss: 0.0027
Badaga: Namaskara, OLLenge Idhdhaya
English: Greetings, How are You?


In [52]:
test_text = "Ollitha ethi hollava thallu"
print(f"Badaga: {test_text}")
print(f"English: {translate(test_text)}")

Badaga: Ollitha ethi hollava thallu
English: Take only the good leaving behind the bad and win all in this world
