In [None]:
!pip install -U bitsandbytes
!pip install datasets
!pip install evaluation
!pip install evaluate
!pip install tensorboard
!pip install sacrebleu

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-

In [None]:
import gc
import json
import os
import random
import re
import shutil
import time
import zipfile

import evaluate
import numpy as np
import openai
import pandas as pd
import torch
from accelerate import Accelerator
from datasets import Dataset
from peft import LoraConfig, get_peft_model, TaskType
from tqdm.auto import tqdm
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    GPT2LMHeadModel,
    GPT2TokenizerFast,
    BitsAndBytesConfig
)

In [None]:
# Seed every random number generator the notebook uses.
seed = 42
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(seed)

# Extract and Load Dataset

zip_path = "/content/txt.zip"
with zipfile.ZipFile(zip_path, "r") as archive:
    archive.extractall("/content/de-fr")

extracted_files = os.listdir("/content/de-fr")
print("Extracted Files:", extracted_files)

# Per-language folders read by the loading code below.
de_folder = "/content/de-fr/txt/de"
fr_folder = "/content/de-fr/txt/fr"

def load_and_clean_folder(folder_path):
    """
    Read every ``.txt`` file directly inside ``folder_path`` and clean its text.

    Cleaning strips anything that looks like an XML tag and collapses all runs
    of whitespace (including newlines) into single spaces.

    Args:
        folder_path: Directory to scan; other file types are ignored.

    Returns:
        dict mapping each file name to its cleaned content.
    """
    tag_pattern = re.compile(r"<[^>]+>")
    cleaned_by_name = {}
    for entry in os.listdir(folder_path):
        if not entry.endswith(".txt"):
            continue
        with open(os.path.join(folder_path, entry), "r", encoding="utf-8") as handle:
            raw_text = handle.read()
        without_tags = tag_pattern.sub("", raw_text)
        # split()/join normalises tabs, newlines and repeated blanks in one go.
        cleaned_by_name[entry] = " ".join(without_tags.split())
    return cleaned_by_name

# Load and clean files from both folders
german_texts = load_and_clean_folder(de_folder)
french_texts = load_and_clean_folder(fr_folder)

# Pair files by filename.
# The shared names are sorted because raw set order of strings changes between
# interpreter runs (hash randomisation), which made the seeded sample below
# pick different pairs on every fresh kernel.
common_files = sorted(set(german_texts).intersection(french_texts))
paired_data = [
    {"german": german_texts[filename], "french": french_texts[filename]}
    for filename in common_files
]

print(f"Total paired files found: {len(paired_data)}")

# Sample 1,000 pairs
if len(paired_data) > 1000:
    paired_data = random.sample(paired_data, 1000)

df = pd.DataFrame(paired_data)
# Column-wise .str.strip() replaces DataFrame.applymap, deprecated since pandas 2.1.
df = df.apply(lambda column: column.str.strip())
print("Sample Data:\n", df.head())

BadZipFile: File is not a zip file

In [None]:
# Turn the DataFrame rows into a Hugging Face Dataset (one record per row).
pairs = df.to_dict(orient="records")
dataset = Dataset.from_list(pairs)

# Split the dataset: 20% is held out for testing, using the notebook-wide seed.
split_dataset = dataset.train_test_split(test_size=0.2, seed=seed)
train_count = len(split_dataset["train"])
test_count = len(split_dataset["test"])
print("Train samples:", train_count)
print("Test samples:", test_count)

def add_prompt(example):
    """Add the model input and the expected output to one translation record.

    Args:
        example: Mapping with "german" and "french" text fields.

    Returns:
        The same mapping, updated in place with "prompt" (the instruction
        prefix followed by the German text) and "target" (the French text).
    """
    source_text = example["german"]
    reference_text = example["french"]
    example["prompt"] = "translate German to French: " + source_text
    example["target"] = reference_text
    return example

# Add the prompt/target columns to every split, then preview one training row.
split_dataset = split_dataset.map(add_prompt)
first_train_example = split_dataset["train"][0]
print("Sample example from training set:", first_train_example)


Train samples: 800
Test samples: 200


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Sample example from training set: {'german': 'Tagesordnung der nächsten Sitzung: siehe Protokoll', 'french': 'Ordre du jour de la prochaine séance: voir procès-verbal', 'prompt': 'translate German to French: Tagesordnung der nächsten Sitzung: siehe Protokoll', 'target': 'Ordre du jour de la prochaine séance: voir procès-verbal'}


In [None]:
# Model and Tokenizer for microsoft/phi-2
model_name = "microsoft/phi-2"

# Quantization configuration: 8-bit weights with the fp32 CPU-offload flag enabled.
quant_config = BitsAndBytesConfig(load_in_8bit=True, llm_int8_enable_fp32_cpu_offload=True)

# Load the model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The EOS token doubles as the padding token; padding goes on the left.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

# Configure LoRA
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=32,
    lora_alpha=64,
    lora_dropout=0.1,
)

# Apply LoRA to the model
model = get_peft_model(model, peft_config)

# Freeze base model parameters except for LoRA layers
for name, param in model.named_parameters():
    if "lora" in name:
        continue
    param.requires_grad = False


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [None]:
# Tokenize the Dataset using batched processing
def tokenize_function(examples):
    """Tokenize a batch of prompt/target pairs as causal-LM training sequences.

    Args:
        examples: Batch mapping with parallel "prompt" and "target" lists.

    Returns:
        The output of the global ``tokenizer`` for the batch, truncated to
        1024 tokens and padded to the longest sequence in the batch.
    """
    # Each sequence is "<prompt> <target><eos>". The EOS marker used to sit
    # between prompt and target, so the model saw an end-of-text token before
    # every translation, a layout no inference prompt in this notebook uses.
    # NOTE(review): with pad_token == eos_token a language-modeling collator may
    # still mask the final EOS label - confirm if learning to stop matters.
    sequences = [
        f"{prompt} {target}{tokenizer.eos_token}"
        for prompt, target in zip(examples["prompt"], examples["target"])
    ]

    return tokenizer(sequences, truncation=True, max_length=1024, padding=True)

# Run the tokenizer over the training split, one batch of rows per call.
train_split = split_dataset["train"]
tokenized_train = train_split.map(tokenize_function, batched=True)


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

In [None]:
# Evaluating base model before fine-tune
import sacrebleu
# Base checkpoint evaluated before any fine-tuning.
model_name = "microsoft/phi-2"
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Tokenizer for the base model: reuse EOS for padding when no pad token is set,
# and pad on the left.
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

# Load the model without quantization and switch it to inference mode.
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
model.eval()
model.config.use_cache = False

# Function to generate translations
def generate_translation_base(german_sentence, num_beams=4, max_length=150):
    """Translate one German sentence into French with the globally loaded model.

    Args:
        german_sentence: German source text inserted into the instruction prompt.
        num_beams: Beam width passed to ``model.generate``.
        max_length: Value passed as ``max_length`` to ``model.generate``
            (the tokenized prompt itself is truncated to 128 tokens).

    Returns:
        The first line of the generated continuation, stripped, or ``None``
        when generation raised an exception (the error is printed).
    """
    prompt_text = (
        "Translate the following German sentence into French accurately and uniquely.\n\n"
        f"German: {german_sentence}\n\nFrench:"
    )
    inputs = tokenizer(prompt_text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    prompt_token_count = inputs["input_ids"].shape[1]
    try:
        # Sampling flags (top_k/top_p/temperature) were dropped: sampling is not
        # enabled here, so beam search never used them.
        output_ids = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=max_length,
            num_beams=num_beams,
            early_stopping=True,
            repetition_penalty=1.2,
            no_repeat_ngram_size=2,
            pad_token_id=tokenizer.eos_token_id,
        )
        # Cut the prompt off by token count. Matching the decoded text against
        # prompt_text failed whenever the prompt had been truncated, and the
        # first line of the prompt was then returned as the "translation".
        continuation_ids = output_ids[0][prompt_token_count:]
        generated_text = tokenizer.decode(continuation_ids, skip_special_tokens=True)
        return generated_text.strip().split("\n")[0].strip()
    except Exception as e:
        print(f"Error generating translation: {e}")
        return None

# Load the test dataset (assuming split_dataset is defined)
test_dataset = split_dataset["test"]

# Store predictions and references (kept index-aligned with the test set)
predictions = []
references = []

# Process in batches (the progress bar advances once per batch)
batch_size = 8
for i in tqdm(range(0, len(test_dataset), batch_size), desc="Evaluating Base Model"):
    batch_end = min(i + batch_size, len(test_dataset))
    for row_index in range(i, batch_end):
        row = test_dataset[row_index]
        pred = generate_translation_base(row["german"], num_beams=4, max_length=150)
        # Failed or empty generations stay in the corpus as empty hypotheses;
        # dropping them would score only the successes and inflate BLEU.
        predictions.append(pred or "")
        references.append(row["target"])

# Compute BLEU score
if any(predictions):
    # sacrebleu.corpus_bleu takes a list of reference streams, each as long as the
    # hypothesis list; [[ref] for ref in ...] is the shape of a different API.
    bleu = sacrebleu.corpus_bleu(predictions, [references])
    print("BLEU Score for Base Model:", bleu.score)
else:
    print("No valid translations generated. Skipping BLEU score.")



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Evaluating Base Model:   0%|          | 0/25 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Fine-tune base model

from transformers import (
    TrainerCallback
)

# Clear GPU Cache After Each Step
class ClearCacheCallback(TrainerCallback):
    """Trainer callback that frees unused memory after every training step."""

    def on_step_end(self, args, state, control, **kwargs):
        """Collect Python garbage, then release cached CUDA memory.

        The arguments follow the TrainerCallback hook signature and are unused.
        """
        # Collect first: tensors freed by the garbage collector can only be
        # released from the CUDA cache afterwards.
        gc.collect()
        torch.cuda.empty_cache()

# Release cached GPU memory and Python garbage before loading a new model.
torch.cuda.empty_cache()
gc.collect()

# Model and Tokenizer Setup
model_name = "microsoft/phi-2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Reuse EOS as the padding token when none is set; pad on the left.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

# Quantization Configuration (8-bit weights)
quant_config = BitsAndBytesConfig(load_in_8bit=True)

# Load the model with the quantization configuration.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config,
    device_map={"": 0},  # Forces all parameters onto GPU 0
)
model.config.use_cache = False

# LoRA (PEFT) Configuration
lora_config = LoraConfig(r=8, lora_alpha=64, lora_dropout=0.1, bias="none", task_type="CAUSAL_LM")

# Apply LoRA configuration to the model
model = get_peft_model(model, lora_config)

# Only parameters whose name contains "lora" stay trainable.
for name, param in model.named_parameters():
    param.requires_grad = "lora" in name

# Fail early if nothing ended up trainable.
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Trainable parameters: {trainable_params}")
if not trainable_params:
    raise ValueError("No parameters are trainable. Check your LoRA configuration.")

# Data Collator (mlm=False selects causal language modeling)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Fine-Tuning Training Arguments
training_args = TrainingArguments(
    output_dir="./phi-2-finetuned-b",
    overwrite_output_dir=True,
    num_train_epochs=2,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,
    learning_rate=1e-5,
    fp16=True,
    save_steps=1000,
    logging_dir="./logs/model_B_logs",
    logging_steps=100,
    save_total_limit=2,
    # Log to TensorBoard only (logging_dir above). Without this the Trainer
    # reports to every installed integration, and wandb then blocks the run
    # with an interactive API-key prompt.
    report_to="tensorboard",
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    data_collator=data_collator,
    callbacks=[ClearCacheCallback]
)

torch.cuda.empty_cache()
gc.collect()

print("Starting fine-tuning on Dataset A to create Model B...")
train_results = trainer.train()
print(train_results)

trainer.save_model("./my_finetuned_phi_2")
print("Fine-tuning complete. Model saved successfully.")


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Trainable parameters: 9175040




Starting fine-tuning on Dataset A to create Model B...


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33maida-farshiali[0m ([33maida-farshiali-rptu[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss
100,2.8025
200,2.3279
300,2.1447
400,2.0288
500,2.0436
600,1.9778
700,1.9884
800,1.8654


TrainOutput(global_step=800, training_loss=2.147381553649902, metrics={'train_runtime': 4482.0323, 'train_samples_per_second': 0.357, 'train_steps_per_second': 0.178, 'total_flos': 2.6127108145152e+16, 'train_loss': 2.147381553649902, 'epoch': 2.0})
Fine-tuning complete. Model saved successfully.


In [None]:
# Mount Google Drive at /content/drive; later cells read and write under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
save_path = "/content/drive/MyDrive/my_finetuned_phi_2"

local_model_path = "./my_finetuned_phi_2"

# Save the tokenizer files into the same folder as the saved model.
tokenizer.save_pretrained(local_model_path)

# dirs_exist_ok=True keeps the cell re-runnable: by default copytree raises
# FileExistsError once the Drive folder exists from an earlier run.
shutil.copytree(local_model_path, save_path, dirs_exist_ok=True)

print(f"Model and tokenizer saved to {save_path}")

Model and tokenizer saved to /content/drive/MyDrive/my_finetuned_phi_2


In [None]:
# Evaluate model on fine-tuned data

# Metric used to score the translations.
bleu_metric = evaluate.load("sacrebleu")

torch.cuda.empty_cache()
gc.collect()

# Folder on Drive that holds the fine-tuned model and its tokenizer.
model_path = "/content/drive/MyDrive/my_finetuned_phi_2"

quant_config = BitsAndBytesConfig(load_in_8bit=True, llm_int8_enable_fp32_cpu_offload=True)

tokenizer = AutoTokenizer.from_pretrained(model_path)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

load_kwargs = {"quantization_config": quant_config, "device_map": "auto"}
model = AutoModelForCausalLM.from_pretrained(model_path, **load_kwargs)
model.eval()

def generate_translations_batch(prompts, num_beams=4, max_length=128, max_new_tokens=50):
    """
    Generates translations for a batch of input prompts.

    Args:
        prompts: List of prompt strings.
        num_beams: Beam width passed to ``model.generate``.
        max_length: Truncation limit (in tokens) applied to each tokenized prompt.
        max_new_tokens: Maximum number of tokens generated per prompt.

    Returns:
        List with one decoded continuation per prompt. Only newly generated
        tokens are decoded, so the prompt text is not part of the result.
    """
    with torch.no_grad():
        inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        prompt_width = inputs["input_ids"].shape[1]
        # Only max_new_tokens bounds the output: passing max_length as well
        # triggered a "both set" warning on every call and was overridden anyway.
        # The sampling flags were dropped because sampling is not enabled.
        output_ids = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            num_beams=num_beams,
            early_stopping=True,
            repetition_penalty=1.2,
            no_repeat_ngram_size=2,
            pad_token_id=tokenizer.eos_token_id,
            max_new_tokens=max_new_tokens,
        )
        # Generated tokens follow the padded prompt, so slicing by width removes
        # the prompt even when it was truncated and no longer matches the
        # original prompt string.
        outputs = tokenizer.batch_decode(output_ids[:, prompt_width:], skip_special_tokens=True)
    return outputs

test_dataset = split_dataset["test"]

# Build the prompt and the reference for every test example.
# NOTE(review): this template differs from the "translate German to French: "
# prefix added by add_prompt - confirm the mismatch is intended.
all_prompts = []
all_references = []
for ex in test_dataset:
    all_prompts.append(f"Translate the following German sentence into French:\n\nGerman: {ex['german']}\n\nFrench:")
    all_references.append(ex["target"])

predictions = []
batch_size = 16
for i in tqdm(range(0, len(all_prompts), batch_size), desc="Evaluating Model B"):
    batch_prompts = all_prompts[i: i + batch_size]
    batch_outputs = generate_translations_batch(batch_prompts, num_beams=4, max_length=128, max_new_tokens=50)

    # Remove the echoed prompt and keep only the first line of each output.
    predictions.extend(
        output.replace(prompt, "").strip().split("\n")[0].strip()
        for prompt, output in zip(batch_prompts, batch_outputs)
    )

if not predictions:
    print("No valid translations generated.")
else:
    bleu_result = bleu_metric.compute(predictions=predictions, references=[[ref] for ref in all_references])
    print("SacreBLEU Score for Model B:", bleu_result["score"])

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Evaluating Model B:   0%|          | 0/13 [00:00<?, ?it/s]

Both `max_new_tokens` (=50) and `max_length`(=128) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=50) and `max_length`(=128) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=50) and `max_length`(=128) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=50) and `max_length`(=128) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both

SacreBLEU Score for Model B: 7.568234239548304e-196


In [None]:
# Generate synthetic data (set 1)

GDRIVE_PATH = "/content/drive/MyDrive/generated_pairs"
os.makedirs(GDRIVE_PATH, exist_ok=True)

# Set up the OpenAI-compatible client
# The key is read from the environment instead of being hard-coded; an empty
# key (the previous literal) remains the fallback when the variable is unset.
client = openai.Client(
    api_key=os.environ.get("SAMBANOVA_API_KEY", ""),
    base_url="https://api.sambanova.ai/v1",
)

# Raw responses already written, used to skip exact repeats.
existing_sentences = set()

def generate_synthetic_data(prompt, max_tokens=4096, temperature=0.9, top_p=0.95, retries=5):
    """Request translation pairs from the SambaNova API, retrying on rate limits.

    Args:
        prompt: User message sent to the chat model.
        max_tokens: Completion length limit forwarded to the API.
        temperature: Sampling temperature forwarded to the API.
        top_p: Nucleus-sampling value forwarded to the API.
        retries: Maximum number of attempts.

    Returns:
        The text of the first completion choice, or ``None`` if a non-rate-limit
        API error occurred or every attempt was rate limited.
    """
    for attempt in range(retries):
        try:
            response = client.chat.completions.create(
                model="Meta-Llama-3.1-70B-Instruct",
                messages=[
                    {"role": "system", "content": "You are an advanced AI assistant. Your task is to generate diverse, complex, and unique translation pairs between German and French."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=max_tokens,
                temperature=temperature,
                top_p=top_p,
            )
            return response.choices[0].message.content
        except openai.APIError as e:
            # Check the exception type first: the substring test alone missed rate
            # limits whose message does not contain "rate_limit_exceeded".
            rate_limited = isinstance(e, openai.RateLimitError) or "rate_limit_exceeded" in str(e)
            if not rate_limited:
                print(f"API request failed due to unexpected error: {e}")
                return None
            print(f"Rate limit exceeded. Retrying... (Attempt {attempt+1}/{retries})")
            # No point in waiting after the final attempt.
            if attempt + 1 < retries:
                time.sleep(30)
    print("Max retries reached. Skipping this batch.")
    return None

# Number of API calls needed to request TOTAL_PAIRS pairs, BATCH_SIZE at a time.
TOTAL_PAIRS = 1000
BATCH_SIZE = 25
NUM_CALLS = TOTAL_PAIRS // BATCH_SIZE

for i in range(NUM_CALLS):
    print(f"Requesting batch {i+1}/{NUM_CALLS} ({BATCH_SIZE} pairs)...")
    prompt_text = (
        "Generate exactly 25 unique and diverse German–French translation pairs. "
        "Each pair must be represented as a JSON object with two keys: 'german' and 'french'.\n\n"
        "### INSTRUCTIONS:\n"
        "- Ensure that the generated sentences are distinct and vary in structure.\n"
        "- Use a range of sentence types such as questions, commands, and statements.\n"
        "- Include complex sentence structures, including passive voice, relative clauses, and indirect speech.\n"
        "- Incorporate formal and informal vocabulary, ensuring a balance of both.\n"
        "- Explore different tenses, moods, and aspects of the verbs (e.g., subjunctive, future tense).\n"
        "- The sentences should reflect cultural nuances, including idiomatic expressions or formal phrasing.\n"
        "- Do not repeat sentence patterns; every translation must be unique.\n\n"
        "Example translations:\n"
        "[\n"
        "  {\"german\": \"Hätte ich gewusst, dass er kommt, hätte ich mich besser vorbereitet.\",\n"
        "   \"french\": \"Si j'avais su qu'il venait, je me serais mieux préparé.\"},\n"
        "  {\"german\": \"Die Entscheidung, das Angebot anzunehmen, war alles andere als einfach.\",\n"
        "   \"french\": \"La décision d'accepter l'offre n'a pas été facile du tout.\"},\n"
        "  {\"german\": \"Wenn wir nicht bald handeln, wird sich die Situation weiter verschlechtern.\",\n"
        "   \"french\": \"Si nous n'agissons pas bientôt, la situation continuera à se détériorer.\"}\n"
        "]\n\n"
        "### IMPORTANT RULES:\n"
        "- Generate **exactly 25 pairs**. Do not include more or fewer pairs.\n"
        "- Ensure **every sentence is unique**. Avoid slight variations of the same structure.\n"
        "- The sentences must be **original** and not be derived from common or frequently used phrases."
    )

    generated_pairs = generate_synthetic_data(prompt_text, max_tokens=4096, temperature=0.9, top_p=0.95)

    if generated_pairs is None:
        print(f"Skipping batch {i+1} due to API request failure.")
    elif generated_pairs in existing_sentences:
        # An identical raw response was already written.
        print(f"Duplicate response detected in batch {i+1}. Skipping.")
    else:
        existing_sentences.add(generated_pairs)

        # Store the raw response; it is parsed in a later cell.
        raw_filename = f"{GDRIVE_PATH}/generated_pair_{i+1}.txt"
        with open(raw_filename, "w", encoding="utf-8") as raw_file:
            raw_file.write(generated_pairs)
        print(f"Batch {i+1} saved to {raw_filename}")

        # Pause between successful requests.
        time.sleep(10)

print("\nAll batches completed!")


In [None]:
# Generate synthetic data (set 2)

GDRIVE_PATH = "/content/drive/MyDrive/generated_pairs_2"
os.makedirs(GDRIVE_PATH, exist_ok=True)

# Set up the OpenAI-compatible client
# The key is read from the environment instead of being hard-coded; an empty
# key (the previous literal) remains the fallback when the variable is unset.
client = openai.Client(
    api_key=os.environ.get("SAMBANOVA_API_KEY", ""),
    base_url="https://api.sambanova.ai/v1",
)

# Raw responses already written, used to skip exact repeats.
existing_sentences = set()

def generate_synthetic_data(prompt, max_tokens=4096, temperature=0.9, top_p=0.95, retries=5):
    """Request translation pairs from the SambaNova API, retrying on rate limits.

    Args:
        prompt: User message sent to the chat model.
        max_tokens: Completion length limit forwarded to the API.
        temperature: Sampling temperature forwarded to the API.
        top_p: Nucleus-sampling value forwarded to the API.
        retries: Maximum number of attempts.

    Returns:
        The text of the first completion choice, or ``None`` if a non-rate-limit
        API error occurred or every attempt was rate limited.
    """
    for attempt in range(retries):
        try:
            response = client.chat.completions.create(
                model="Meta-Llama-3.3-70B-Instruct",
                messages=[
                    {"role": "system", "content": "You are an advanced AI assistant. Your task is to generate diverse, complex, and unique translation pairs between German and French."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=max_tokens,
                temperature=temperature,
                top_p=top_p,
            )
            return response.choices[0].message.content
        except openai.APIError as e:
            # Check the exception type first: the substring test alone missed rate
            # limits whose message does not contain "rate_limit_exceeded".
            rate_limited = isinstance(e, openai.RateLimitError) or "rate_limit_exceeded" in str(e)
            if not rate_limited:
                print(f"API request failed due to unexpected error: {e}")
                return None
            print(f"Rate limit exceeded. Retrying... (Attempt {attempt+1}/{retries})")
            # No point in waiting after the final attempt.
            if attempt + 1 < retries:
                time.sleep(30)
    print("Max retries reached. Skipping this batch.")
    return None

# Number of API calls needed to request TOTAL_PAIRS pairs, BATCH_SIZE at a time.
TOTAL_PAIRS = 1000
BATCH_SIZE = 25
NUM_CALLS = TOTAL_PAIRS // BATCH_SIZE

for i in range(NUM_CALLS):
    print(f"Requesting batch {i+1}/{NUM_CALLS} ({BATCH_SIZE} pairs)...")
    prompt_text = (
        "Generate exactly 25 unique and diverse German–French translation pairs. "
        "Each pair must be represented as a JSON object with two keys: 'german' and 'french'.\n\n"
        "### INSTRUCTIONS:\n"
        "- Ensure that the generated sentences are distinct and vary in structure.\n"
        "- Use a range of sentence types such as questions, commands, and statements.\n"
        "- Include complex sentence structures, including passive voice, relative clauses, and indirect speech.\n"
        "- Incorporate formal and informal vocabulary, ensuring a balance of both.\n"
        "- Explore different tenses, moods, and aspects of the verbs (e.g., subjunctive, future tense).\n"
        "- The sentences should reflect cultural nuances, including idiomatic expressions or formal phrasing.\n"
        "- Do not repeat sentence patterns; every translation must be unique.\n\n"
        "Example translations:\n"
        "[\n"
        "  {\"german\": \"Hätte ich gewusst, dass er kommt, hätte ich mich besser vorbereitet.\",\n"
        "   \"french\": \"Si j'avais su qu'il venait, je me serais mieux préparé.\"},\n"
        "  {\"german\": \"Die Entscheidung, das Angebot anzunehmen, war alles andere als einfach.\",\n"
        "   \"french\": \"La décision d'accepter l'offre n'a pas été facile du tout.\"},\n"
        "  {\"german\": \"Wenn wir nicht bald handeln, wird sich die Situation weiter verschlechtern.\",\n"
        "   \"french\": \"Si nous n'agissons pas bientôt, la situation continuera à se détériorer.\"}\n"
        "]\n\n"
        "### IMPORTANT RULES:\n"
        "- Generate **exactly 25 pairs**. Do not include more or fewer pairs.\n"
        "- Ensure **every sentence is unique**. Avoid slight variations of the same structure.\n"
        "- The sentences must be **original** and not be derived from common or frequently used phrases."
    )

    generated_pairs = generate_synthetic_data(prompt_text, max_tokens=4096, temperature=0.9, top_p=0.95)

    if generated_pairs is None:
        print(f"Skipping batch {i+1} due to API request failure.")
    elif generated_pairs in existing_sentences:
        # An identical raw response was already written.
        print(f"Duplicate response detected in batch {i+1}. Skipping.")
    else:
        existing_sentences.add(generated_pairs)

        # Store the raw response; it is parsed in a later cell.
        raw_filename = f"{GDRIVE_PATH}/generated_pair_2_{i+1}.txt"
        with open(raw_filename, "w", encoding="utf-8") as raw_file:
            raw_file.write(generated_pairs)
        print(f"Batch {i+1} saved to {raw_filename}")

        # Pause between successful requests.
        time.sleep(10)

print("\nAll batches completed!")


In [None]:
# Concatenate the synthesized data

GDRIVE_PATH_1 = "/content/drive/MyDrive/generated_pairs"
GDRIVE_PATH_2 = "/content/drive/MyDrive/generated_pairs_2"
OUTPUT_PATH = "/content/dataset_b.json"
MAX_UNIQUE_PAIRS = 1600

# Function to clean the file content and extract the JSON data
def clean_json_content(raw_content):
    """Return the span from the first '[' to the last ']' of ``raw_content``.

    Args:
        raw_content: Raw text that may wrap a JSON array in extra prose.

    Returns:
        The bracketed substring (brackets included), or ``None`` when there is
        no '[' or no ']' after it.
    """
    start_index = raw_content.find('[')
    end_index = raw_content.rfind(']')
    # Test the raw rfind() result: adding 1 before the check turned the -1
    # "not found" value into 0, so text without a closing bracket yielded ""
    # instead of None.
    if start_index == -1 or end_index < start_index:
        return None
    return raw_content[start_index:end_index + 1]

# Function to read all .txt files and check for duplicates
def read_and_deduplicate_files(gdrive_paths, max_pairs=MAX_UNIQUE_PAIRS):
    """Collect unique German-French pairs from the ``.txt`` files in several folders.

    Args:
        gdrive_paths: Iterable of directories containing raw response files.
        max_pairs: Stop as soon as this many unique pairs have been collected.

    Returns:
        List of ``{"german": ..., "french": ...}`` dicts with surrounding
        whitespace stripped and no (german, french) combination repeated.
        Files without a parsable JSON array and entries without both text
        fields are skipped.
    """
    seen_pairs = set()
    combined_data = []

    for path in gdrive_paths:
        # Sorted so the result does not depend on directory listing order.
        txt_files = sorted(f for f in os.listdir(path) if f.endswith(".txt"))

        for txt_file in txt_files:
            file_path = os.path.join(path, txt_file)

            with open(file_path, 'r', encoding='utf-8') as file:
                raw_content = file.read()

            cleaned_content = clean_json_content(raw_content)
            if cleaned_content is None:
                print(f"Error decoding JSON in file: {file_path}")
                continue

            try:
                json_data = json.loads(cleaned_content)
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON in file: {file_path}. Error: {e}")
                continue

            if not isinstance(json_data, list):
                print(f"Error decoding JSON in file: {file_path}")
                continue

            for item in json_data:
                if not isinstance(item, dict):
                    continue
                german, french = item.get("german"), item.get("french")
                if not isinstance(german, str) or not isinstance(french, str):
                    continue
                pair_key = (german.strip(), french.strip())
                # Deduplicate individual pairs; the old whole-file comparison
                # let repeated sentences through.
                if not all(pair_key) or pair_key in seen_pairs:
                    continue
                seen_pairs.add(pair_key)
                combined_data.append({"german": pair_key[0], "french": pair_key[1]})

                # Count pairs here; the old check compared the number of files
                # against max_pairs.
                if len(combined_data) >= max_pairs:
                    print(f"Reached {max_pairs} unique pairs. Stopping.")
                    return combined_data

    return combined_data

# Merge both folders of raw responses into one list of pairs.
combined_data = read_and_deduplicate_files([GDRIVE_PATH_1, GDRIVE_PATH_2])

# Write the list as indented JSON, keeping non-ASCII characters unescaped.
serialized_pairs = json.dumps(combined_data, ensure_ascii=False, indent=4)
with open(OUTPUT_PATH, 'w', encoding='utf-8') as json_file:
    json_file.write(serialized_pairs)

print(f"Dataset B has been successfully saved at {OUTPUT_PATH}")


Dataset A Train saved at /content/dataset_a_train.json with 800 pairs
Dataset A Test saved at /content/dataset_a_test.json with 200 pairs
Dataset A Train size: 800 pairs
Dataset B size: 1561 pairs
Combined dataset size (before deduplication): 2361 pairs
Unique combined dataset size: 2248 pairs
Dataset C saved at /content/dataset_c.json with 2248 pairs.


In [None]:
# Fine-tune the model on the synthesized data

# Release cached GPU memory and Python garbage before starting.
torch.cuda.empty_cache()
gc.collect()

# Load the tokenizer for microsoft/phi-2; EOS is used for padding if no pad token is set.
model_name = "microsoft/phi-2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the synthetic dataset (Dataset B)
dataset_b_path = "/content/dataset_b.json"
with open(dataset_b_path, "r", encoding="utf-8") as f:
    dataset_b = json.loads(f.read())

# Keep at most the first 1600 records and wrap them in a Dataset.
dataset_b = Dataset.from_list(dataset_b[:1600])

def tokenize_function(example):
    """Tokenize a batch of synthetic pairs as prompt-plus-translation sequences.

    Args:
        example: Batch mapping with parallel "german" and "french" lists.

    Returns:
        The output of the global ``tokenizer``, truncated and padded to 128 tokens.
    """
    # Previously only the German sentence was tokenized, so the model never saw
    # a French target. Each sequence is now the translation prompt, the French
    # reference and the EOS token.
    sequences = [
        f"translate German to French: {german} {french}{tokenizer.eos_token}"
        for german, french in zip(example["german"], example["french"])
    ]
    return tokenizer(sequences, truncation=True, padding="max_length", max_length=128)

# Tokenize Dataset B one batch of rows per call.
tokenized_dataset_b = dataset_b.map(tokenize_function, batched=True)

# Set up quantization configuration for 8-bit loading
quant_config = BitsAndBytesConfig(load_in_8bit=True, llm_int8_enable_fp32_cpu_offload=True)

# Load the model with quantization configuration
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=quant_config)

# Set up LoRA configuration
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=32,
    lora_alpha=64,
    lora_dropout=0.1,
)

# Apply LoRA, then freeze every parameter whose name does not contain "lora".
model = get_peft_model(model, peft_config)
for name, param in model.named_parameters():
    if "lora" in name:
        continue
    param.requires_grad = False

# Data collator for causal language modeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Free memory once more before training is configured.
torch.cuda.empty_cache()
gc.collect()

training_args = TrainingArguments(
    output_dir="./phi-2-finetuned-c",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=1e-5,
    fp16=True,
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs/model_C_logs",
    logging_steps=100,
    prediction_loss_only=True,
    # Log to TensorBoard only (logging_dir above). Without this the Trainer
    # reports to every installed integration, and wandb can block the run
    # with an interactive login prompt.
    report_to="tensorboard",
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_b,
    data_collator=data_collator,
)

# Check that there are trainable parameters
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Trainable parameters: {trainable_params}")
if trainable_params == 0:
    raise ValueError("No parameters are trainable. Check your LoRA configuration.")

print("Starting fine-tuning on Dataset B to create Model C...")
trainer.train()

trainer.save_model("./my_finetuned_phi_2_synthesized")
print("Fine-tuning complete. Model saved as Model C.")


Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
import shutil
from transformers import GPT2TokenizerFast  # already imported in the notebook's import cell; unused here

# Copy Model C (the directory written by trainer.save_model) together with its
# tokenizer to Google Drive so it survives the Colab runtime.
# NOTE(review): assumes Drive is already mounted at /content/drive -- confirm.
save_path = "/content/drive/MyDrive/my_finetuned_phi_2_synthesized"

local_model_path = "./my_finetuned_phi_2_synthesized"

# Store the tokenizer next to the model files so the directory is self-contained.
tokenizer.save_pretrained(local_model_path)

# dirs_exist_ok=True makes this cell re-runnable: without it copytree raises
# FileExistsError as soon as the destination directory exists (i.e. on every
# run after the first one). Existing files are overwritten with the new copy.
shutil.copytree(local_model_path, save_path, dirs_exist_ok=True)

print(f"Model and tokenizer saved to {save_path}")

Model and tokenizer saved to /content/drive/MyDrive/my_finetuned_phi_2_synthesized


In [None]:
# Evaluating model on synthesized data
# This part of the cell loads the metric, the tokenizer and the fine-tuned
# Model C; the generation helper and the scoring loop follow below.

# SacreBLEU metric object from the `evaluate` library.
bleu_metric = evaluate.load("sacrebleu")

# Free cached GPU memory / Python garbage left over from training before the
# model is loaded again.
torch.cuda.empty_cache()
gc.collect()


# Local directory written by the Model C fine-tuning cell.
model_path = "/content/my_finetuned_phi_2_synthesized"

# Same 8-bit quantization settings that were used for training.
quant_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_path)
# Fall back to EOS as the padding token when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Pad on the left -- presumably so that, in batched generation, every prompt
# ends directly where its continuation begins.
tokenizer.padding_side = "left"

# device_map="auto" lets the library decide where the weights are placed.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config=quant_config,
    device_map="auto"
)
# Switch to inference mode (e.g. disables dropout).
model.eval()


def generate_translations_batch(prompts, num_beams=4, max_length=150, max_new_tokens=None):
    """Generate one continuation per prompt with beam search.

    Uses the notebook-global ``tokenizer`` and ``model``.

    Args:
        prompts: List of prompt strings. An empty list yields an empty list.
        num_beams: Beam width passed to ``model.generate``.
        max_length: Upper bound on the TOTAL sequence length (prompt tokens
            plus generated tokens). Only used when ``max_new_tokens`` is None.
        max_new_tokens: Optional upper bound on the number of GENERATED
            tokens. When given it is used instead of ``max_length``, so the
            generation budget no longer shrinks with longer prompts.

    Returns:
        List of decoded strings, one per prompt, containing only the newly
        generated text (the prompt tokens are sliced off before decoding).
    """
    if not prompts:
        return []

    # Pass exactly one length limit to generate(); supplying both triggers a
    # warning and one of them is ignored.
    if max_new_tokens is not None:
        length_kwargs = {"max_new_tokens": max_new_tokens}
    else:
        length_kwargs = {"max_length": max_length}

    with torch.no_grad():
        inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=128)
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        # top_k / top_p / temperature were dropped: they only apply when
        # do_sample=True and are ignored (with warnings) by beam search.
        output_ids = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            num_beams=num_beams,
            early_stopping=True,
            repetition_penalty=1.2,
            no_repeat_ngram_size=2,
            pad_token_id=tokenizer.eos_token_id,
            **length_kwargs,
        )
        # All rows of the padded input share one length, and generate() returns
        # prompt + continuation, so slicing at that length isolates the new
        # tokens. This is more reliable than removing the prompt by string
        # replacement, which breaks when the prompt was truncated or when
        # decoding does not reproduce the prompt text exactly.
        prompt_len = inputs["input_ids"].shape[1]
        outputs = tokenizer.batch_decode(output_ids[:, prompt_len:], skip_special_tokens=True)
    return outputs


# Build one prompt / reference pair per example of the held-out test split.
test_dataset = split_dataset["test"]
all_prompts = []
all_references = []
for example in test_dataset:
    prompt = f"translate German to French: {example['german']}"
    all_prompts.append(prompt)
    all_references.append(example["target"])


# Generate translations batch by batch and keep only the first line of each
# continuation as the prediction.
predictions = []
batch_size = 16
for i in tqdm(range(0, len(all_prompts), batch_size), desc="Evaluating Model C"):
    batch_prompts = all_prompts[i: i + batch_size]
    batch_outputs = generate_translations_batch(batch_prompts, num_beams=4, max_length=150)
    for prompt, output in zip(batch_prompts, batch_outputs):
        # Drop the echoed prompt if the generator returned it with the output.
        if prompt in output:
            output = output.replace(prompt, "")
        output = output.strip().split("\n")[0].strip()
        predictions.append(output)

# Compute BLEU Score Using SacreBLEU
# Uses the `bleu_metric` object loaded at the top of this cell. The previous
# call to `sacrebleu.corpus_bleu` referenced a module that the notebook's
# import cell never imports, so it would fail with a NameError after all
# generation work was done. The metric expects a list of reference lists.
if predictions:
    bleu_result = bleu_metric.compute(
        predictions=predictions,
        references=[[ref] for ref in all_references],
    )
    print("SacreBLEU Score for Model C:", bleu_result["score"])
else:
    print("No valid translations generated.")

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Concatenate both datasets and shuffle

dataset_a_train_path = "/content/dataset_a_train.json"
dataset_a_test_path = "/content/dataset_a_test.json"
dataset_b_path = "/content/dataset_b.json"
dataset_c_path = "/content/dataset_c.json"

# Save Dataset A
# Each split is written once; a file that already exists is left untouched.
for split_label, split_key, split_path in (
    ("Train", "train", dataset_a_train_path),
    ("Test", "test", dataset_a_test_path),
):
    if os.path.exists(split_path):
        print(f"Dataset A {split_label} already exists at {split_path}")
        continue
    with open(split_path, "w", encoding="utf-8") as f:
        json.dump(list(split_dataset[split_key]), f, ensure_ascii=False, indent=4)
    print(f"Dataset A {split_label} saved at {split_path} with {len(split_dataset[split_key])} pairs")

def clean_json_content(raw_content):
    """Extract the outermost ``[...]`` span from raw file content.

    Everything before the first ``[`` and after the last ``]`` is discarded,
    which strips non-JSON text surrounding a JSON array.

    Args:
        raw_content: Raw text that should contain a JSON array.

    Returns:
        The substring from the first ``[`` to the last ``]`` (inclusive), or
        ``None`` when the input is empty, a bracket is missing, or the last
        ``]`` comes before the first ``[``.
    """
    if not raw_content:
        return None
    start_index = raw_content.find('[')
    end_index = raw_content.rfind(']')
    # Compare the raw rfind() result: the previous code added 1 before testing
    # against -1, so a missing ']' was never detected and an empty string was
    # returned instead of None.
    if start_index == -1 or end_index < start_index:
        return None
    return raw_content[start_index:end_index + 1]

def read_and_deduplicate_files(gdrive_paths, max_pairs=None):
    """Collect JSON entries from all ``.txt`` files in the given directories.

    Each file is expected to hold a JSON array, possibly surrounded by
    non-JSON text that ``clean_json_content`` strips. Files whose cleaned
    content is identical to an earlier file are skipped, so de-duplication
    happens at file level, not per entry. Files are visited in sorted name
    order so the result (and any ``max_pairs`` cut-off) is reproducible.

    Args:
        gdrive_paths: Iterable of directory paths to scan.
        max_pairs: Optional cap on the number of returned entries. ``None``
            or ``0`` means no limit.

    Returns:
        List of entries parsed from the files, holding at most ``max_pairs``
        items when a limit is given.
    """
    seen_contents = set()
    combined_data = []

    for path in gdrive_paths:
        # os.listdir() order is arbitrary; sort for deterministic results.
        txt_files = sorted(f for f in os.listdir(path) if f.endswith(".txt"))
        for txt_file in txt_files:
            file_path = os.path.join(path, txt_file)
            # Read and close the file before doing any parsing.
            with open(file_path, 'r', encoding='utf-8') as file:
                raw_content = file.read()

            cleaned_content = clean_json_content(raw_content)
            if not cleaned_content:
                print(f"Error decoding JSON in file: {file_path}")
                continue
            if cleaned_content in seen_contents:
                print(f"Duplicate response detected in file: {file_path}. Skipping.")
                continue
            seen_contents.add(cleaned_content)

            try:
                json_data = json.loads(cleaned_content)
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON in file: {file_path}. Error: {e}")
                continue
            combined_data.extend(json_data)

            # Count collected entries: the previous check compared the number
            # of unique FILES against max_pairs, so the limit on pairs was
            # never actually enforced.
            if max_pairs and len(combined_data) >= max_pairs:
                print(f"Reached {max_pairs} unique pairs. Stopping.")
                return combined_data[:max_pairs]
    return combined_data

# Build Dataset C = Dataset A (train split) + Dataset B
with open(dataset_a_train_path, "r", encoding="utf-8") as train_file:
    dataset_a_train = json.load(train_file)
with open(dataset_b_path, "r", encoding="utf-8") as synth_file:
    dataset_b = json.load(synth_file)

print(f"Dataset A Train size: {len(dataset_a_train)} pairs")
print(f"Dataset B size: {len(dataset_b)} pairs")

combined_dataset = dataset_a_train + dataset_b
print(f"Combined dataset size (before deduplication): {len(combined_dataset)} pairs")

# Keep one record per distinct (German, French) pair. Keys may be capitalised
# or lower-case in the input; records lacking either side are dropped.
unique_pairs = {}
for record in combined_dataset:
    de_text = record.get("German") or record.get("german")
    fr_text = record.get("French") or record.get("french")
    if de_text is not None and fr_text is not None:
        de_clean = de_text.strip()
        fr_clean = fr_text.strip()
        unique_pairs[(de_clean, fr_clean)] = {"German": de_clean, "French": fr_clean}

combined_unique_dataset = list(unique_pairs.values())
print(f"Unique combined dataset size: {len(combined_unique_dataset)} pairs")

# Fixed seed so the shuffled order is the same on every run
random.seed(42)
random.shuffle(combined_unique_dataset)

with open(dataset_c_path, "w", encoding="utf-8") as out_file:
    json.dump(combined_unique_dataset, out_file, ensure_ascii=False, indent=4)

print(f"Dataset C saved at {dataset_c_path} with {len(combined_unique_dataset)} pairs.")


Dataset A Train saved at /content/dataset_a_train.json with 800 pairs
Dataset A Test saved at /content/dataset_a_test.json with 200 pairs
Dataset A Train size: 800 pairs
Dataset B size: 1561 pairs
Combined dataset size (before deduplication): 2361 pairs
Unique combined dataset size: 2248 pairs
Dataset C saved at /content/dataset_c.json with 2248 pairs.


In [None]:
# Fine-tune model on combined dataset
# This part of the cell loads Dataset C and the base tokenizer; tokenization,
# model loading and training follow below.

# Free cached GPU memory / Python garbage from earlier cells.
torch.cuda.empty_cache()
gc.collect()

# Dataset C is the de-duplicated, shuffled list of {"German", "French"} records
# written by the dataset-combination cell.
dataset_c_path = "/content/dataset_c.json"
with open(dataset_c_path, "r", encoding="utf-8") as json_file:
    dataset_c = json.load(json_file)

# Convert Dataset C into a Hugging Face Dataset
# (the name `dataset_c` is rebound from the plain list to the Dataset object)
dataset_c = Dataset.from_list(dataset_c)

# Start again from the base checkpoint rather than from Model C.
model_name = "microsoft/phi-2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Fall back to EOS as the padding token when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples, max_length=256):
    """Tokenize a batch of Dataset C records as translation training examples.

    Every example becomes ``<prompt with German sentence> <French target><eos>``.
    The previous version tokenized only the German column, so the model was
    trained to continue German text and never saw a French target. The prompt
    is the same one used when Model D is evaluated in this notebook, so
    training and inference see the same format.

    Uses the notebook-global ``tokenizer``.

    Args:
        examples: Batch mapping with parallel ``"German"`` and ``"French"``
            lists (as produced by ``Dataset.map(..., batched=True)``).
        max_length: Length every sequence is padded / truncated to. Defaults
            to 256 (previously a fixed 128) because each example now holds the
            prompt and both sentences.

    Returns:
        The tokenizer output for the batch.
    """
    # NOTE(review): the notebook sets pad_token to eos_token, so the data
    # collator may ignore the trailing EOS in the loss -- confirm if the model
    # is expected to learn where to stop.
    eos = tokenizer.eos_token or ""
    texts = [
        "Translate the following German sentence into French:\n\n"
        f"German: {str(german)}\n\nFrench: {str(french)}{eos}"
        for german, french in zip(examples["German"], examples["French"])
    ]
    return tokenizer(texts, truncation=True, padding="max_length", max_length=max_length)

# Tokenize every record of Dataset C in batches.
tokenized_dataset_c = dataset_c.map(tokenize_function, batched=True)

# Set up quantization configuration for 8-bit precision
quant_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True
)

# Load the model with quantization configuration
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=quant_config)

# Set up the LoRA configuration for fine-tuning
# Same adapter settings as for Model C (rank 32, alpha 64, dropout 0.1).
# No target_modules are given, so the PEFT library's defaults for this
# architecture are used -- NOTE(review): confirm which layers that selects.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=32,
    lora_alpha=64,
    lora_dropout=0.1,
)

# Apply LoRA configuration to the model
model = get_peft_model(model, peft_config)

# Freeze base model parameters except for LoRA layers
# NOTE(review): get_peft_model presumably already freezes the base weights,
# which would make this loop a defensive no-op -- confirm.
for name, param in model.named_parameters():
    if "lora" not in name:
        param.requires_grad = False


# Collator for the causal objective (mlm=False disables masked-token prediction).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Batch of 2 per device with 4 accumulation steps -> 8 sequences per optimizer
# step per device. Checkpoints are written every 500 steps, keeping at most 2.
training_args = TrainingArguments(
    output_dir="./phi-2-finetuned-on-combined",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=1e-5,
    fp16=True,
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs/model_D_logs",
    logging_steps=200,
    prediction_loss_only=True,
)

# Only a training set is supplied; no eval_dataset is configured here.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_c,
    data_collator=data_collator,
)

# Fail fast instead of running a training loop that cannot update anything.
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Trainable parameters: {trainable_params}")
if trainable_params == 0:
    raise ValueError("No parameters are trainable. Check your LoRA configuration.")

print("Starting fine-tuning on Dataset C to create Model D...")
trainer.train()

# Save the fine-tuned model (Model D)
# A later cell copies this directory to Google Drive.
trainer.save_model("./my_finetuned_phi_2_on_combined")
print("Fine-tuning complete. Model saved as Model D.")


Map:   0%|          | 0/2391 [00:00<?, ? examples/s]

`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Trainable parameters: 36700160
Starting fine-tuning on Dataset C to create Model D...




Step,Training Loss
200,2.3929
400,2.112
600,1.9791
800,1.892




Fine-tuning complete. Model saved as Model D.


In [None]:
import shutil
from transformers import GPT2TokenizerFast  # already imported in the notebook's import cell; unused here

# Copy Model D (the directory written by trainer.save_model) together with its
# tokenizer to Google Drive so it survives the Colab runtime.
# NOTE(review): assumes Drive is already mounted at /content/drive -- confirm.
save_path = "/content/drive/MyDrive/my_finetuned_phi_2_on_combined"

local_model_path = "./my_finetuned_phi_2_on_combined"

# Store the tokenizer next to the model files so the directory is self-contained.
tokenizer.save_pretrained(local_model_path)

# dirs_exist_ok=True makes this cell re-runnable: without it copytree raises
# FileExistsError as soon as the destination directory exists (i.e. on every
# run after the first one). Existing files are overwritten with the new copy.
shutil.copytree(local_model_path, save_path, dirs_exist_ok=True)

print(f"Model and tokenizer saved to {save_path}")

Model and tokenizer saved to /content/drive/MyDrive/my_finetuned_phi_2_on_combined


In [None]:
# Evaluate model on combined data
# This part of the cell loads the metric, the tokenizer and the fine-tuned
# Model D; the generation helper and the scoring loop follow below.

# SacreBLEU metric object from the `evaluate` library.
bleu_metric = evaluate.load("sacrebleu")

# Free cached GPU memory / Python garbage left over from training before the
# model is loaded again.
torch.cuda.empty_cache()
gc.collect()

# Google Drive copy of Model D written by the preceding save cell.
model_path = "/content/drive/MyDrive/my_finetuned_phi_2_on_combined"

# Same 8-bit quantization settings that were used for training.
quant_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_path)
# Fall back to EOS as the padding token when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Pad on the left -- presumably so that, in batched generation, every prompt
# ends directly where its continuation begins.
tokenizer.padding_side = "left"

# device_map="auto" lets the library decide where the weights are placed.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config=quant_config,
    device_map="auto"
)
# Switch to inference mode (e.g. disables dropout).
model.eval()

def generate_translations_batch(prompts, num_beams=4, max_length=128, max_new_tokens=50):
    """Generate one translation per prompt with beam search.

    Uses the notebook-global ``tokenizer`` and ``model``.

    Args:
        prompts: List of prompt strings. An empty list yields an empty list.
        num_beams: Beam width passed to ``model.generate``.
        max_length: Maximum number of PROMPT tokens; longer prompts are
            truncated by the tokenizer.
        max_new_tokens: Maximum number of tokens generated per prompt.

    Returns:
        List of decoded strings, one per prompt, containing only the newly
        generated text (the prompt tokens are sliced off before decoding).
    """
    if not prompts:
        return []

    with torch.no_grad():
        inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        # Only max_new_tokens is forwarded: passing max_length as well made
        # generate() warn on every batch that max_new_tokens takes precedence.
        # top_k / top_p / temperature were dropped too: they only apply when
        # do_sample=True and are ignored by beam search.
        output_ids = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_new_tokens=max_new_tokens,
            num_beams=num_beams,
            early_stopping=True,
            repetition_penalty=1.2,
            no_repeat_ngram_size=2,
            pad_token_id=tokenizer.eos_token_id,
        )
        # All rows of the padded input share one length, and generate() returns
        # prompt + continuation, so slicing at that length isolates the new
        # tokens. This is more reliable than removing the prompt by string
        # replacement, which breaks when the prompt was truncated or when
        # decoding does not reproduce the prompt text exactly.
        prompt_len = inputs['input_ids'].shape[1]
        outputs = tokenizer.batch_decode(output_ids[:, prompt_len:], skip_special_tokens=True)
    return outputs

test_dataset = split_dataset["test"]

# One instruction-style prompt and one reference translation per test example.
all_prompts = []
all_references = []
for ex in test_dataset:
    all_prompts.append(
        f"Translate the following German sentence into French:\n\nGerman: {ex['german']}\n\nFrench:"
    )
    all_references.append(ex["target"])

# Generate in batches; the prediction is the first line of the text that
# remains once the prompt has been removed from the model output.
predictions = []
batch_size = 16
for start in tqdm(range(0, len(all_prompts), batch_size), desc="Evaluating Model D"):
    chunk = all_prompts[start: start + batch_size]
    generated = generate_translations_batch(chunk, num_beams=4, max_length=128, max_new_tokens=50)
    predictions.extend(
        text.replace(source_prompt, "").strip().split("\n")[0].strip()
        for source_prompt, text in zip(chunk, generated)
    )

# Corpus-level SacreBLEU; the metric expects a list of references per prediction.
if not predictions:
    print("No valid translations generated.")
else:
    bleu_result = bleu_metric.compute(predictions=predictions, references=[[ref] for ref in all_references])
    print("SacreBLEU Score for Model D:", bleu_result["score"])


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Evaluating Model D:   0%|          | 0/13 [00:00<?, ?it/s]

Both `max_new_tokens` (=50) and `max_length`(=128) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=50) and `max_length`(=128) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=50) and `max_length`(=128) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=50) and `max_length`(=128) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both

SacreBLEU Score for Model D: 7.624196513770272e-172
