<a href="https://colab.research.google.com/github/AbhiRam162105/Abhiram/blob/main/Sentence_EN_to_TE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install transformers



In [None]:
%pip install tqdm



In [None]:
%pip install torch torchvision torchaudio torchtext

Collecting torchtext
  Downloading torchtext-0.18.0-cp310-cp310-manylinux1_x86_64.whl.metadata (7.9 kB)
Downloading torchtext-0.18.0-cp310-cp310-manylinux1_x86_64.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torchtext
Successfully installed torchtext-0.18.0


In [None]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
from tqdm import tqdm
import re

In [None]:
# Preprocessing functions

def decontractions(phrase):
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    phrase = re.sub(r"won\’t", "will not", phrase)
    phrase = re.sub(r"can\’t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)

    phrase = re.sub(r"n\’t", " not", phrase)
    phrase = re.sub(r"\’re", " are", phrase)
    phrase = re.sub(r"\’s", " is", phrase)
    phrase = re.sub(r"\’d", " would", phrase)
    phrase = re.sub(r"\’ll", " will", phrase)
    phrase = re.sub(r"\’t", " not", phrase)
    phrase = re.sub(r"\’ve", " have", phrase)
    phrase = re.sub(r"\’m", " am", phrase)

    return phrase

def preprocess_english(text):
    # convert all the text into lower letters
    # use this function to remove the contractions: https://gist.github.com/anandborad/d410a49a493b56dace4f814ab5325bbd
    # remove all the spacial characters: except space ' '
    text = text.lower()
    text = decontractions(text)
    text = re.sub(r"([?.!,])", r" \1 ", text)
    text = re.sub(r'[" "]+', " ", text)
    text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text)
    text = '<start> ' + text + ' <end>'
    text = text.strip()
    return text

def preprocess_telugu(text):
    # convert all the text into lower letters
    # remove the words betweent brakets ()
    # remove these characters: {'$', ')', '?', '"', '’', '.',  '°', '!', ';', '/', "'", '€', '%', ':', ',', '('}
    # replace these spl characters with space: '\u200b', '\xa0', '-', '/'
    # we have found these characters after observing the data points, feel free to explore more and see if you can do find more
    # you are free to do more proprocessing
    # note that the model will learn better with better preprocessed data
    text = re.sub(r"([?.!,])", r" \1 ", text)
    text = re.sub(r'[" "]+', " ", text)
    text = re.sub('[$)\"’°;\'€%:,(/]', '', text)
    text = re.sub('\n', ' ', text)
    text = re.sub('\u200d', ' ', text)
    text = re.sub('\u200c', ' ', text)
    text = re.sub('-', ' ', text)
    text = re.sub('  ', ' ', text)
    text = re.sub('   ', ' ', text)
    text =" ".join(text.split())
    #     text = '<start> ' + text + ' <end>'
    return text


In [None]:
def pad_sentences(sentences, max_len, pad_token_id):
    """Pad each sentence to the maximum length using the pad token ID."""
    return [sentence + [pad_token_id] * (max_len - len(sentence)) for sentence in sentences]

def process_data(df, tokenizer, max_len=None):
    """Process data from DataFrame, tokenize sentences, and optionally pad them."""
    en_lines = []
    te_lines = []
    max_len_current = 0

    for _, row in tqdm(df.iterrows(), desc="Processing CSV data", total=len(df)):
        en_sentence = tokenizer.encode(row['en'], add_special_tokens=True)
        te_sentence = tokenizer.encode(row['te'], add_special_tokens=True)

        en_lines.append(en_sentence)
        te_lines.append(te_sentence)

        max_len_current = max(max_len_current, len(en_sentence), len(te_sentence))

    if max_len is None:
        max_len = max_len_current

    en_padded_lines = pad_sentences(en_lines, max_len, tokenizer.pad_token_id)
    te_padded_lines = pad_sentences(te_lines, max_len, tokenizer.pad_token_id)

    return en_padded_lines, te_padded_lines, max_len

In [None]:
# Dataset class
class TranslationDataset(Dataset):
    def __init__(self, en_data, te_data):
        self.en_data = en_data
        self.te_data = te_data

    def __len__(self):
        return len(self.en_data)

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.en_data[idx]),
            'labels': torch.tensor(self.te_data[idx])
        }

In [None]:
# Model definition
class TransformerModel(nn.Module):
    def __init__(self, src_vocab_size, trg_vocab_size, d_model=512, nhead=8, num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=2048, dropout=0.1):
        super(TransformerModel, self).__init__()
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.trg_embedding = nn.Embedding(trg_vocab_size, d_model)
        self.transformer = nn.Transformer(d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, dropout)
        self.fc_out = nn.Linear(d_model, trg_vocab_size)
        self.src_vocab_size = src_vocab_size
        self.trg_vocab_size = trg_vocab_size
        self.d_model = d_model

    def forward(self, src, trg):
        src = self.src_embedding(src) * (self.d_model ** 0.5)
        trg = self.trg_embedding(trg) * (self.d_model ** 0.5)

        src = src.permute(1, 0, 2)  # Shape [seq_len, batch_size, d_model]
        trg = trg.permute(1, 0, 2)  # Shape [seq_len, batch_size, d_model]

        output = self.transformer(src, trg)
        output = self.fc_out(output)
        return output

In [None]:
# Training function
def train(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0

    for batch in tqdm(dataloader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, labels[:, :-1])
        loss = criterion(outputs.contiguous().view(-1, outputs.size(-1)), labels[:, 1:].contiguous().view(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

In [None]:
# Main execution
if __name__ == "__main__":
    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained("Telugu-LLM-Labs/TinyLlama-1.1B-Telugu-Romanization-v0-Instruct")

    # Load data from CSV
    df = pd.read_csv('English-Telugu (1_new).csv')  # Replace with the actual CSV filename

    # Process the data (tokenize and pad)
    en_lines_padded, te_lines_padded, max_len = process_data(df, tokenizer)

    # Create dataset and dataloader
    dataset = TranslationDataset(en_lines_padded, te_lines_padded)
    dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

    # Initialize model
    model = TransformerModel(
        src_vocab_size=tokenizer.vocab_size,
        trg_vocab_size=tokenizer.vocab_size
    ).to(device)

    # Set up optimizer and loss function
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
    criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

    # Training loop
    num_epochs = 10
    for epoch in range(num_epochs):
        avg_loss = train(model, dataloader, optimizer, criterion, device)
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

    # Save the model
    torch.save(model.state_dict(), 'en_te_translation_model.pth')
    print("Training completed and model saved.")

    # Print some statistics
    print(f"Maximum sentence length: {max_len}")
    print(f"English data shape: {len(en_lines_padded)} sentences, each with {len(en_lines_padded[0])} tokens")
    print(f"Telugu data shape: {len(te_lines_padded)} sentences, each with {len(te_lines_padded[0])} tokens")

    # Print the first sentence from each language
    print("\nFirst English sentence (tokenized and padded):")
    print(en_lines_padded[0])
    print("\nFirst Telugu sentence (tokenized and padded):")
    print(te_lines_padded[0])

Processing CSV data: 100%|██████████| 14666/14666 [00:16<00:00, 913.06it/s] 
Training:   0%|          | 0/459 [00:00<?, ?it/s]

# Start of Transfer Learning Model

In [None]:
%pip install xformers

Collecting xformers
  Downloading xformers-0.0.28.post1-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Downloading xformers-0.0.28.post1-cp310-cp310-manylinux_2_28_x86_64.whl (16.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.7/16.7 MB[0m [31m98.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xformers
Successfully installed xformers-0.0.28.post1


In [None]:
%pip install sentencepiece datasets

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.9 MB/s[0m eta [36m0:00:

# Initial Training Loop

In [None]:
import csv
import sentencepiece as spm
from transformers import AutoModelForSeq2SeqLM, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
import torch
import numpy as np

# Step 1: Train a SentencePiece model on the English-Telugu data

# Assuming your parallel data file is a CSV with 'en' and 'te' columns
dataset_file = 'English-Telugu (1_new).csv'

# Prepare data for SentencePiece training by concatenating English and Telugu data into a single file
with open(dataset_file, 'r') as f:
    reader = csv.reader(f)
    next(reader)  # Skip the header row

    with open('combined_texts.txt', 'w') as combined_file:
        for row in reader:
            if len(row) >= 2:  # Check if there are at least two columns
                en_sentence = row[0]
                te_sentence = row[1]
                combined_file.write(f"{en_sentence.strip()}\n")
                combined_file.write(f"{te_sentence.strip()}\n")

# Train SentencePiece BPE model
spm.SentencePieceTrainer.train(
    input='combined_texts.txt',
    model_prefix='en_te_bpe',
    vocab_size=16000,  # Adjust vocab size as needed
    model_type='bpe'
)

# Step 2: Load the SentencePiece model for tokenization
sp = spm.SentencePieceProcessor(model_file='en_te_bpe.model')

# Tokenize data using SentencePiece
def sentencepiece_tokenize(text, sp_model):
    tokenized = sp_model.encode(text, out_type=int)  # Output token IDs
    return tokenized

# Step 3: Update dataset tokenization function to use SentencePiece
def tokenize_function_sentencepiece(examples):
    source = {
        'input_ids': [sentencepiece_tokenize(ex, sp) for ex in examples['en']],
        'attention_mask': [[1] * len(sentencepiece_tokenize(ex, sp)) for ex in examples['en']]
    }
    target = {
        'input_ids': [sentencepiece_tokenize(ex, sp) for ex in examples['te']],
        'attention_mask': [[1] * len(sentencepiece_tokenize(ex, sp)) for ex in examples['te']]
    }
    return {
        'input_ids': source['input_ids'],
        'attention_mask': source['attention_mask'],
        'labels': target['input_ids']
    }

# Step 4: Load the dataset
dataset = load_dataset('csv', data_files=dataset_file, split='train', column_names=['en', 'te'])

# Split into training and evaluation sets
train_dataset = dataset.train_test_split(test_size=0.2)['train']
eval_dataset = dataset.train_test_split(test_size=0.2)['test']

# Tokenize the dataset using SentencePiece
tokenized_train_dataset = train_dataset.map(tokenize_function_sentencepiece, batched=True)
tokenized_eval_dataset = eval_dataset.map(tokenize_function_sentencepiece, batched=True)

# Custom data collator for padding
class CustomDataCollator:
    def __call__(self, features):
        max_input_length = max(len(f['input_ids']) for f in features)
        max_label_length = max(len(f['labels']) for f in features)

        input_ids = []
        attention_masks = []
        labels = []

        for feature in features:
            input_padding_length = max_input_length - len(feature['input_ids'])
            label_padding_length = max_label_length - len(feature['labels'])

            input_ids.append(feature['input_ids'] + [0] * input_padding_length)
            attention_masks.append(feature['attention_mask'] + [0] * input_padding_length)
            labels.append(feature['labels'] + [-100] * label_padding_length)  # Use -100 for label padding

        return {
            'input_ids': torch.tensor(input_ids),
            'attention_mask': torch.tensor(attention_masks),
            'labels': torch.tensor(labels),
        }

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",  # Changed to `eval_strategy`
    eval_steps=500,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=16,
    learning_rate=5e-5,
    weight_decay=0.01,
    num_train_epochs=8,
    save_steps=1000,
    save_total_limit=2,
    fp16=False  # No need for fp16 as flash attention is not being used
)

# Load pre-trained model
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-tc-big-en-it").to('cuda')

# Step 6: Fine-tune the model using the Trainer API
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    data_collator=CustomDataCollator()  # Use the custom data collator
)
# Before training, check the dataset
print("Sample input_ids length:", len(tokenized_train_dataset[0]['input_ids']))
print("Sample labels length:", len(tokenized_train_dataset[0]['labels']))

# Train the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("en-te-bpe-model")

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/11733 [00:00<?, ? examples/s]

Map:   0%|          | 0/2934 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/464M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

Sample input_ids length: 15
Sample labels length: 8


Step,Training Loss,Validation Loss


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[54421]], 'forced_eos_token_id': 43017}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[54421]], 'forced_eos_token_id': 43017}


# BackTranslation and Iterative Finetuning

In [10]:
import torch
from tqdm import tqdm
from transformers import AutoModelForSeq2SeqLM

# Function to translate sentences with improved handling of token generation and filtering unwanted symbols
def translate_sentence(sentence, model, sp_model, direction="en-te"):
    inputs = torch.tensor([sp_model.encode(sentence)], device='cuda')

    with torch.no_grad():
        # Adding tqdm progress bar
        for _ in tqdm(range(1), desc=f"Translating {direction}"):
            outputs = model.generate(
                inputs,
                max_length=128,
                repetition_penalty=1.2,  # Repetition penalty to avoid excessive symbols
                num_beams=5,             # Use beam search
                early_stopping=True      # Stop early for efficiency
            )

    # Convert generated tensor to list of token IDs
    token_ids = outputs[0].cpu().tolist()

    # Filter out invalid token IDs (i.e., those out of range for the SentencePiece model)
    token_ids = [token_id for token_id in token_ids if 0 <= token_id < sp_model.get_piece_size()]

    # Decode the valid token IDs to a sentence
    translated_sentence = sp_model.decode(token_ids)

    # Clean up unwanted characters, symbols, or repeated punctuation
    translated_sentence = translated_sentence.replace("?", "").replace("।", "").replace("...", "").strip()

    return translated_sentence

# Function to generate synthetic data pairs (both EN->TE and TE->EN) with progress tracking
def generate_synthetic_pairs_bidirectional(dataset, en_te_model, te_en_model, sp_model, limit=200):
    synthetic_data = {'en': [], 'te': []}

    # Limit the dataset to the first `limit` sentences
    dataset_limited = {'en': dataset['en'][:limit], 'te': dataset['te'][:limit]}

    for en_sentence, te_sentence in tqdm(zip(dataset_limited['en'], dataset_limited['te']), desc="Generating synthetic pairs", total=len(dataset_limited['en'])):
        # Translate English to Telugu
        te_translation = translate_sentence(en_sentence, en_te_model, sp_model, direction="en-te")

        # Translate Telugu to English
        en_translation = translate_sentence(te_sentence, te_en_model, sp_model, direction="te-en")

        # Append translations to synthetic data
        synthetic_data['en'].append(en_translation)
        synthetic_data['te'].append(te_translation)

    return synthetic_data

# Function to fine-tune the bidirectional model iteratively
def fine_tune_bidirectional_model(train_dataset, model, reverse_model, sp_model, tokenizer_function, trainer, num_iterations=1, limit=200):
    for iteration in range(num_iterations):
        print(f"Iteration {iteration+1}")

        # Generate synthetic data pairs
        synthetic_pairs_bidirectional = generate_synthetic_pairs_bidirectional(train_dataset, model, reverse_model, sp_model, limit=limit)

        # Combine original and synthetic data
        combined_data = {
            'en': train_dataset['en'][:limit] + synthetic_pairs_bidirectional['en'],
            'te': train_dataset['te'][:limit] + synthetic_pairs_bidirectional['te']
        }

        combined_dataset = Dataset.from_dict(combined_data)

        # Tokenize combined data
        tokenized_combined_dataset = combined_dataset.map(tokenizer_function, batched=True)

        # Fine-tune the model on the combined dataset
        trainer.train()

        # Save the model after each iteration
        model.save_pretrained(f"en-te-bpe-model-iter-{iteration+1}")

# Step 7: Load Telugu to English model
reverse_model = AutoModelForSeq2SeqLM.from_pretrained("en-te-bpe-model").to('cuda')

# Example usage for translation in both directions
english_sentence = "Hello, how are you?"
telugu_sentence = "నీవు ఎలా ఉన్నావు?"

# Translate English to Telugu
telugu_translation = translate_sentence(english_sentence, model, sp, direction="en-te")
# Translate Telugu to English
english_back_translation = translate_sentence(telugu_sentence, reverse_model, sp, direction="te-en")

print(f"English to Telugu: {telugu_translation}")
print(f"Telugu to English: {english_back_translation}")

# Fine-tune the model iteratively (example fine-tuning call, assuming trainer, tokenizer, and dataset are set up)
# fine_tune_bidirectional_model(train_dataset, model, reverse_model, sp, tokenize_function_sentencepiece, trainer, num_iterations=3, limit=200)

Translating en-te: 100%|██████████| 1/1 [00:02<00:00,  2.54s/it]
Translating te-en: 100%|██████████| 1/1 [00:02<00:00,  2.17s/it]

English to Telugu: . మీరు, మీరు, మీరు ఈ మ్యాచ్?????????????????????????????????????????????????????????????????????????????????????????????.????????????????????????
Telugu to English: ఈ అది????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????





# Calculating BLEU Score

In [12]:
%pip install sacrebleu pandas

Translating en-te: 100%|██████████| 1/1 [00:02<00:00,  2.72s/it]
Translating te-en: 100%|██████████| 1/1 [00:02<00:00,  2.60s/it]

English to Telugu: ఆ తర్వాత, మీరు మీరు ఎలా ఉన్నాయి . .. . .. . . .
Telugu to English: ఈ అది





In [11]:
import pandas as pd
import random
import sacrebleu

# Step 1: Load the CSV data
csv_file_path = 'translations.csv'  # Path to your CSV file
data = pd.read_csv(csv_file_path)

# Step 2: Select random sentences from the CSV
n = 10  # Number of random samples to select
random_samples = data.sample(n)

# Step 3: Prepare the translations and references
references = random_samples['reference'].apply(lambda ref: [ref]).tolist()  # SacreBLEU expects a list of lists
translations = random_samples['translation'].tolist()

# Step 4: Calculate the BLEU score
bleu = sacrebleu.corpus_bleu(translations, references)

# Step 5: Print the BLEU score
print(f"BLEU score: {bleu.score:.2f}")

Translating en-te: 100%|██████████| 1/1 [00:02<00:00,  2.25s/it]
Translating te-en: 100%|██████████| 1/1 [00:01<00:00,  1.96s/it]

English to Telugu: . మీరు, ఆ తర్వాత ఎలా ఎలా ఎలా?????????????????????????????????????????????????????????????????????? ????????????????????????????????????????????????
Telugu to English: ఈ అది????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????



