In [None]:
!pip install datasets



from datasets import load_dataset

# Function to download and save a sample of a given language corpus
def save_language_sample(language_code, output_file, num_examples=700):
    """Stream the OSCAR corpus of one language and save its first documents.

    Args:
        language_code: Language suffix of the OSCAR configuration; ``"en"``
            selects ``unshuffled_deduplicated_en``.
        output_file: Path of the UTF-8 text file to create or overwrite.
        num_examples: Maximum number of documents to write. Zero or a
            negative value yields an empty file.

    The ``text`` field of every document is written followed by a newline, so
    a document that itself contains newlines spans several lines of the file.
    """
    from itertools import islice

    # Streaming avoids downloading the whole corpus to keep only a small sample.
    # NOTE: trust_remote_code=True runs the dataset's own loading script.
    dataset = load_dataset(
        "oscar",
        f"unshuffled_deduplicated_{language_code}",
        split="train",
        trust_remote_code=True,
        streaming=True,
    )

    # islice stops before fetching an extra document, and (unlike a break placed
    # after the write) writes nothing when num_examples is not positive.
    with open(output_file, "w", encoding="utf-8") as f:
        for example in islice(dataset, max(num_examples, 0)):
            f.write(example["text"] + "\n")
    print(f"Saved a sample of {language_code.upper()} corpus to {output_file}")

# Download one sample file per language: English, French, then Spanish
for language_code, output_file in (
    ("en", "english_sample.txt"),
    ("fr", "french_sample.txt"),
    ("es", "spanish_sample.txt"),
):
    save_language_sample(language_code, output_file)





!pip install torch
!pip install transformers
!pip install pytorch_lightning
!pip install datasets
!pip install nltk
!pip install sentencepiece
import nltk
# Fetch NLTK's 'punkt' tokenizer data into the local NLTK data directory.
# NOTE(review): nothing in the visible part of this notebook calls an NLTK
# tokenizer — confirm whether this download is still required.
nltk.download('punkt')




!pip install "accelerate>=0.26.0"



%pip install evaluate



%pip install transformers
%pip install torch 
%pip install torchtext
%pip install datasets
%pip install numpy tqdm




import torch
from transformers import MarianMTModel, MarianTokenizer
from torch.utils.data import DataLoader, Dataset
import random

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Define a Dataset class for loading data from text files
class TextDataset(Dataset):
    """Line-based text dataset that tokenizes each line when it is accessed.

    Args:
        file_path: UTF-8 text file; every line is one example.
        tokenizer: Callable invoked as ``tokenizer(line, return_tensors="pt",
            max_length=..., padding="max_length", truncation=True)`` that
            returns a mapping with ``"input_ids"`` and ``"attention_mask"``
            tensors carrying a leading batch dimension of size one.
        max_length: Length every example is padded or truncated to.
        fraction: Share of the file's lines to keep. The lines are drawn
            without replacement via ``random.sample``; the resulting count is
            clamped to ``[0, number of lines]``.
    """

    def __init__(self, file_path, tokenizer, max_length=128, fraction=1.0):
        with open(file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
        # Clamp so an out-of-range fraction cannot make random.sample raise.
        selected_size = max(0, min(len(lines), int(len(lines) * fraction)))
        self.lines = random.sample(lines, selected_size)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of sampled lines."""
        return len(self.lines)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask)`` for the line at ``idx``."""
        line = self.lines[idx].strip()
        encoding = self.tokenizer(
            line,
            return_tensors="pt",
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
        )
        # Drop only the batch dimension: a bare squeeze() would also remove the
        # sequence dimension when max_length == 1 and return 0-d tensors.
        return encoding["input_ids"].squeeze(0), encoding["attention_mask"].squeeze(0)

# Load one pretrained Marian tokenizer and model per translation direction.
# The last three pairs are the reverse directions of the first three.
tokenizers = {
    pair: MarianTokenizer.from_pretrained(f"Helsinki-NLP/opus-mt-{pair}")
    for pair in ("en-fr", "en-es", "fr-es", "fr-en", "es-en", "es-fr")
}

# Every model is moved to the selected device right after loading
models = {
    pair: MarianMTModel.from_pretrained(f"Helsinki-NLP/opus-mt-{pair}").to(device)
    for pair in ("en-fr", "en-es", "fr-es", "fr-en", "es-en", "es-fr")
}

# Load the datasets with a fraction parameter (e.g., 10% of the original dataset).
# Each corpus is encoded with the tokenizer of the checkpoint that receives it
# as encoder input in the training loop (English -> "en-fr", French -> "fr-es",
# Spanish -> "es-en"): every Marian checkpoint has its own vocabulary, so ids
# produced by another checkpoint's tokenizer do not line up with the model.
english_dataset = TextDataset("english_sample.txt", tokenizers["en-fr"], fraction=0.1)
french_dataset = TextDataset("french_sample.txt", tokenizers["fr-es"], fraction=0.1)
spanish_dataset = TextDataset("spanish_sample.txt", tokenizers["es-en"], fraction=0.1)

# Define dataloaders for each language
batch_size = 8
english_loader = DataLoader(english_dataset, batch_size=batch_size, shuffle=True)
french_loader = DataLoader(french_dataset, batch_size=batch_size, shuffle=True)
spanish_loader = DataLoader(spanish_dataset, batch_size=batch_size, shuffle=True)

# A single Adam optimizer updates the parameters of all six models.
# Parameters are collected pair by pair in a fixed order.
optimizer = torch.optim.Adam(
    [
        parameter
        for pair in ("en-fr", "en-es", "fr-es", "fr-en", "es-en", "es-fr")
        for parameter in models[pair].parameters()
    ],
    lr=1e-5,
)

# Function for denoising (adding noise to a sentence)
def add_noise(input_ids, noise_prob=0.1, pad_token_id=None):
    """Return a copy of ``input_ids`` with random positions replaced by padding.

    Args:
        input_ids: Integer tensor of token ids; it is not modified.
        noise_prob: Probability, applied independently to every position, of
            replacing the token with ``pad_token_id``.
        pad_token_id: Replacement id. When omitted, the pad id of the
            module-level ``tokenizers["en-fr"]`` is used, which is the value
            this function used before the parameter existed.

    Returns:
        A new tensor with the shape, dtype and device of ``input_ids``.

    The random draw comes from torch's generator, so it is seeded with
    ``torch.manual_seed`` rather than ``random.seed``.
    """
    if pad_token_id is None:
        pad_token_id = tokenizers["en-fr"].pad_token_id

    # One vectorised draw replaces a Python loop over every tensor element,
    # which is very slow and issues one write per element.
    noise_mask = torch.rand(input_ids.shape, device=input_ids.device) < noise_prob
    return input_ids.masked_fill(noise_mask, pad_token_id)

# Function for back-translation training step
def back_translate_step(source_ids, source_mask, source_lang, target_lang):
    """Round-trip a batch source -> target -> source without tracking gradients.

    Args:
        source_ids: Token ids of the source batch, already on ``device``.
        source_mask: Attention mask that belongs to ``source_ids``.
        source_lang: Language code of the input, e.g. ``"en"``.
        target_lang: Language code of the pivot language, e.g. ``"fr"``.

    Returns:
        The token ids generated by the ``target_lang``-``source_lang`` model
        from the intermediate translation.

    NOTE(review): the training loop in this file passes the returned ids as
    encoder input to the target->source model; confirm that the round-tripped
    ids (rather than the intermediate translation) are what it should receive.
    """
    forward_key = f"{source_lang}-{target_lang}"
    reverse_key = f"{target_lang}-{source_lang}"
    forward_model = models[forward_key]
    forward_tokenizer = tokenizers[forward_key]
    reverse_model = models[reverse_key]
    reverse_tokenizer = tokenizers[reverse_key]

    with torch.no_grad():
        # Translate source -> target
        generated_ids = forward_model.generate(source_ids, attention_mask=source_mask)
        target_texts = forward_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

        # Re-encode the translation with the tokenizer of the model that reads
        # it next. The forward tokenizer encodes input in the source language
        # and belongs to a different checkpoint than the reverse model.
        reverse_inputs = reverse_tokenizer(
            target_texts, return_tensors="pt", padding=True, truncation=True
        ).to(device)

        # Translate target -> source; the mask keeps the padding added above
        # from being attended to.
        generated_back = reverse_model.generate(
            reverse_inputs.input_ids, attention_mask=reverse_inputs.attention_mask
        )
    return generated_back

# Training loop
num_epochs = 1


def _run_training_step(network, description, epoch, step, **forward_kwargs):
    """Run one forward/backward/optimizer update and return the loss as a float.

    ``forward_kwargs`` are passed unchanged to ``network``; ``description`` only
    labels the progress line that is printed after the update.
    """
    loss = network(**forward_kwargs).loss
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    loss_value = loss.item()
    print(f"Epoch {epoch+1}, Step {step+1} - {description} Loss: {loss_value:.4f}")
    return loss_value


for epoch in range(num_epochs):
    for model in models.values():
        model.train()
    total_loss = 0
    num_losses = 0  # number of loss terms added to total_loss

    # zip stops at the shortest loader, so the number of steps may be smaller
    # than len(english_loader).
    for step, (english_batch, french_batch, spanish_batch) in enumerate(zip(english_loader, french_loader, spanish_loader)):
        step_losses = []

        # English: denoising autoencoding on the en-fr model
        eng_ids, eng_mask = english_batch[0].to(device), english_batch[1].to(device)
        step_losses.append(_run_training_step(
            models["en-fr"], "English Denoising", epoch, step,
            input_ids=add_noise(eng_ids), attention_mask=eng_mask, labels=eng_ids,
        ))

        # Back-translation English to French to English
        fr_ids = back_translate_step(eng_ids, eng_mask, source_lang="en", target_lang="fr")
        step_losses.append(_run_training_step(
            models["fr-en"], "English to French to English", epoch, step,
            input_ids=fr_ids, labels=eng_ids,
        ))

        # French: denoising autoencoding on the fr-es model
        fr_ids, fr_mask = french_batch[0].to(device), french_batch[1].to(device)
        step_losses.append(_run_training_step(
            models["fr-es"], "French Denoising", epoch, step,
            input_ids=add_noise(fr_ids), attention_mask=fr_mask, labels=fr_ids,
        ))

        # Back-translation French to Spanish to French
        sp_ids = back_translate_step(fr_ids, fr_mask, source_lang="fr", target_lang="es")
        step_losses.append(_run_training_step(
            models["es-fr"], "French to Spanish to French", epoch, step,
            input_ids=sp_ids, labels=fr_ids,
        ))

        # Spanish: denoising autoencoding on the es-en model
        sp_ids, sp_mask = spanish_batch[0].to(device), spanish_batch[1].to(device)
        step_losses.append(_run_training_step(
            models["es-en"], "Spanish Denoising", epoch, step,
            input_ids=add_noise(sp_ids), attention_mask=sp_mask, labels=sp_ids,
        ))

        # Back-translation Spanish to English to Spanish
        en_back_ids = back_translate_step(sp_ids, sp_mask, source_lang="es", target_lang="en")
        step_losses.append(_run_training_step(
            models["en-es"], "Spanish to English to Spanish", epoch, step,
            input_ids=en_back_ids, labels=sp_ids,
        ))

        total_loss += sum(step_losses)
        num_losses += len(step_losses)

    # Average over the loss terms that were actually accumulated. Dividing by
    # len(english_loader) mixed six losses per step into a per-batch figure and
    # raised ZeroDivisionError when a loader was empty.
    avg_loss = total_loss / num_losses if num_losses else float("nan")
    print(f"Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_loss:.4f}")

# Write every model's state dict to marianmt_<source>_<target>.pth
model_save_paths = {
    pair: f"marianmt_{pair.replace('-', '_')}.pth"
    for pair in ("en-fr", "en-es", "fr-es", "fr-en", "es-en", "es-fr")
}
for lang_pair, model in models.items():
    destination = model_save_paths[lang_pair]
    torch.save(model.state_dict(), destination)
    print(f"Saved {lang_pair} model to {destination}")




import nltk
# Fetch the WordNet corpus into the local NLTK data directory.
# Presumably needed by single_meteor_score, which is imported further down in
# this file — confirm against the NLTK METEOR documentation.
nltk.download('wordnet')



import sacrebleu
from nltk.translate.meteor_score import single_meteor_score
from sacrebleu.metrics import TER
import random

# Load test data with fraction sampling
def load_test_data(file_path, fraction=0.1):
    """Read a random sample of ``source ||| target`` pairs from a text file.

    Args:
        file_path: UTF-8 text file with one ``source ||| target`` pair per
            line. Lines that are empty or whitespace-only are ignored.
        fraction: Share of the non-blank lines to keep. The lines are drawn
            without replacement via ``random.sample``; the resulting count is
            clamped to ``[0, number of non-blank lines]``.

    Returns:
        ``(source_sentences, target_sentences)``: two lists of equal length
        whose entries are aligned by index.

    Raises:
        ValueError: If a sampled line does not contain exactly one ``" ||| "``
            separator.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        stripped_lines = [line.strip() for line in f]
    # Blank lines (e.g. a trailing newline at the end of the file) carry no pair
    # and would otherwise break the two-way split below.
    lines = [line for line in stripped_lines if line]

    # Select a fraction of the dataset
    selected_size = max(0, min(len(lines), int(len(lines) * fraction)))
    lines = random.sample(lines, selected_size)

    source_sentences = []
    target_sentences = []
    for line in lines:
        parts = line.split(" ||| ")
        if len(parts) != 2:
            raise ValueError(
                f"Expected 'source ||| target' in {file_path}, got: {line!r}"
            )
        source_sentences.append(parts[0])
        target_sentences.append(parts[1])
    return source_sentences, target_sentences

# Function to generate translations
def generate_translations(source_sentences, tokenizer, model, max_length=128):
    """Translate each sentence with ``model`` and return the decoded strings.

    Args:
        source_sentences: Iterable of source-language strings.
        tokenizer: Tokenizer that matches ``model``; it encodes the input and
            decodes the generated ids.
        model: Translation model exposing ``generate``. It is switched to
            evaluation mode for the duration of the call and put back into its
            previous mode afterwards.
        max_length: Maximum number of input tokens; longer inputs are truncated.

    Returns:
        A list with one translation per input sentence, in input order.
    """
    translations = []
    # The training loop leaves the models in train mode; without eval() dropout
    # stays active and the translations become noisy and non-deterministic.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for sentence in source_sentences:
                # A single sentence needs no padding, so nothing is padded up
                # to max_length; only truncation is applied.
                inputs = tokenizer(
                    sentence, return_tensors="pt", max_length=max_length, truncation=True
                ).to(device)
                translated_tokens = model.generate(**inputs)
                translations.append(
                    tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
                )
    finally:
        model.train(was_training)
    return translations

# Function to compute evaluation metrics
def evaluate_translations(predictions, references):
    """Score translations against references with BLEU, METEOR and TER.

    Args:
        predictions: List of translated sentences.
        references: List of reference sentences aligned with ``predictions``.

    Returns:
        ``(bleu_score, avg_meteor_score, ter_score)``: the corpus-level BLEU
        score, the mean of the sentence-level METEOR scores and the
        corpus-level TER score. All three are ``nan`` when there are no
        sentence pairs to score.

    Raises:
        ValueError: If ``predictions`` and ``references`` differ in length.
    """
    if len(predictions) != len(references):
        raise ValueError(
            f"Got {len(predictions)} predictions but {len(references)} references"
        )
    if not predictions:
        # Nothing to score: report "undefined" instead of dividing by zero.
        undefined = float("nan")
        return undefined, undefined, undefined

    # BLEU score
    bleu_score = sacrebleu.corpus_bleu(predictions, [references]).score

    # METEOR score (average); the scorer receives whitespace-split token lists,
    # reference first and hypothesis second
    meteor_scores = [
        single_meteor_score(ref.split(), pred.split())
        for ref, pred in zip(references, predictions)
    ]
    avg_meteor_score = sum(meteor_scores) / len(meteor_scores)

    # TER score
    ter_metric = TER()
    ter_score = ter_metric.corpus_score(predictions, [references]).score

    return bleu_score, avg_meteor_score, ter_score

# Evaluate on each test set
test_files = {
    "english_french": "english_french_test_data.txt",
    "english_spanish": "english_spanish_test_data.txt",
    "french_spanish": "french_spanish_test_data.txt"
}

# Key into `tokenizers` / `models` for every test set. An explicit table makes
# an unknown test name fail with a KeyError instead of silently reusing the
# tokenizer and model of the previous iteration.
test_language_pairs = {
    "english_french": "en-fr",
    "english_spanish": "en-es",
    "french_spanish": "fr-es",
}

# Fraction of the dataset to use for testing
test_fraction = 0.001  # 0.1% of each test dataset

for test_name, file_path in test_files.items():
    print(f"Evaluating {test_name} translation...")

    # Load a fraction of the test data
    source_sentences, target_sentences = load_test_data(file_path, fraction=test_fraction)

    # With such a small fraction, a file of fewer than 1000 lines yields no
    # sentences at all; skip it rather than computing metrics on nothing.
    if not source_sentences:
        print(f"No sentences sampled from {file_path}; skipping {test_name}.")
        print("\n")
        continue

    # Select the appropriate tokenizer and model for the test language pair
    pair = test_language_pairs[test_name]
    tokenizer = tokenizers[pair]
    model = models[pair]

    # Generate translations
    predictions = generate_translations(source_sentences, tokenizer, model)

    # Calculate metrics
    bleu, meteor, ter = evaluate_translations(predictions, target_sentences)

    # Print results
    print(f"{test_name} Evaluation Results:")
    print(f"BLEU Score: {bleu:.2f}")
    print(f"METEOR Score: {meteor:.2f}")
    print(f"TER Score: {ter:.2f}")
    print("\n")


