In [1]:
from datasets import load_dataset

squad = load_dataset("squad", split="train")
squad = squad.train_test_split(test_size=0.2, seed=42)

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

In [2]:
import copy
augmented_squad = copy.deepcopy(squad)

In [58]:
import copy
from deep_translator import GoogleTranslator
from datasets import concatenate_datasets, Dataset
from tqdm import tqdm 



def translate_batch_and_back(sentences_list, intermediate_lang):
    """
    Performs batch back-translation.
    Returns a list of paraphrased strings or None for failed batches.
    """
    try:
        # 1. Translate the entire batch to the intermediate language
        translated_batch = GoogleTranslator(
            source='en', 
            target=intermediate_lang
        ).translate_batch(sentences_list)
        
        # 2. Translate the entire batch back to the target language (English)
        back_translated_batch = GoogleTranslator(
            source=intermediate_lang, 
            target='en'
        ).translate_batch(translated_batch)
        
        return back_translated_batch
    except Exception as e:
        print(f"Error during batch translation: {e}")
        # If the batch fails, return a list of None to skip those records
        return [None] * len(sentences_list)


In [37]:
import torch
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

# --- CONFIGURATION ---
device = "cuda" if torch.cuda.is_available() else "cpu"

# --- LOAD MODEL + TOKENIZER ---
model_name = "facebook/m2m100_418M"   # Use 418M (faster) or 1.2B for higher quality
tokenizer = M2M100Tokenizer.from_pretrained(model_name)
model = M2M100ForConditionalGeneration.from_pretrained(model_name).to(device).half()


def translate_batch(sentences, src_lang, tgt_lang, max_length=256):
    """
    Translates a batch of sentences from src_lang → tgt_lang using M2M100.
    """
    tokenizer.src_lang = src_lang
    encoded = tokenizer(
        sentences,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length
    ).to(device)

    # Force decoder to generate in target language
    generated_tokens = model.generate(
        **encoded,
        forced_bos_token_id=tokenizer.get_lang_id(tgt_lang),
        max_length=max_length
    )

    return [tokenizer.decode(t, skip_special_tokens=True) for t in generated_tokens]


def translate_batch_and_back(sentences_list, intermediate_lang=LANG):
    """
    Performs back-translation with M2M100 (English → LANG → English).
    Returns paraphrased English sentences.
    """
    try:
        # 1. English → Intermediate Language
        translated_batch = translate_batch(sentences_list, "en", intermediate_lang)

        # 2. Intermediate Language → English
        back_translated_batch = translate_batch(translated_batch, intermediate_lang, "en")

        return back_translated_batch

    except Exception as e:
        print(f"Error during batch translation: {e}")
        return [None] * len(sentences_list)


KeyboardInterrupt: 

In [3]:
import torch
from transformers import MarianMTModel, MarianTokenizer

# --- CONFIGURATION ---

device = "cuda" if torch.cuda.is_available() else "cpu"

# --- LOAD MODELS + TOKENIZERS ---

src_to_tgt_model_name = "Helsinki-NLP/opus-mt-en-de"
tgt_to_src_model_name = "Helsinki-NLP/opus-mt-de-en"

tokenizer_src2tgt = MarianTokenizer.from_pretrained(src_to_tgt_model_name, use_fast=True)
model_src2tgt = MarianMTModel.from_pretrained(src_to_tgt_model_name).to(device)

tokenizer_tgt2src = MarianTokenizer.from_pretrained(tgt_to_src_model_name, use_fast=True)
model_tgt2src = MarianMTModel.from_pretrained(tgt_to_src_model_name).to(device)

def translate_batch(sentences, tokenizer, model, max_length=256):
    """
    Translates a batch of sentences using a MarianMT model.
    """
    encoded = tokenizer(
    sentences,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=max_length
    ).to(device)


    generated_tokens = model.generate(
        **encoded,
        max_length=max_length
    )

    return [tokenizer.decode(t, skip_special_tokens=True) for t in generated_tokens]


def back_translate(sentences_list):
    """
    #Performs back-translation English → German → English.
    #Returns paraphrased English sentences.
    """
    try:
        # 1. English → German
        translated_batch = translate_batch(sentences_list, tokenizer_src2tgt, model_src2tgt)


        # 2. German → English
        back_translated_batch = translate_batch(translated_batch, tokenizer_tgt2src, model_tgt2src)

        return back_translated_batch

    except Exception as e:
        print(f"Error during back-translation: {e}")
        return [None] * len(sentences_list)


In [4]:
import copy
from datasets import Dataset, concatenate_datasets
from tqdm import tqdm
import time
import os

# Configuration
BATCH_SIZE = 8
LANG = "de"
SAVE_EVERY = 100   # save after every 10 batches

# Prepare
original_train_dataset = squad['train']
original_list = original_train_dataset.to_list()
total_records = len(original_list)

new_augmented_datasets = []   # temporary storage
batch_cache = []              # in-memory batch cache

print(f"Starting batch augmentation on {total_records} samples...")

for b, i in enumerate(tqdm(range(0, total_records, BATCH_SIZE), desc="Batch Translating")):
    batch_records = original_list[i:i+BATCH_SIZE]
    batch_questions = [r['question'] for r in batch_records]
    
    paraphrased_questions = back_translate(batch_questions)
    
    for j, record in enumerate(batch_records):
        p = paraphrased_questions[j]
        if p and isinstance(p, str):
            new_record = dict(record)  # shallow copy only
            global_index = i + j
            new_record['id'] = f"{record['id']}-aug-{LANG}-{global_index}"
            new_record['question'] = p
            batch_cache.append(new_record)
    
    # Flush to Dataset every SAVE_EVERY batches
    if (b + 1) % SAVE_EVERY == 0 or i + BATCH_SIZE >= total_records:
        temp_dataset = Dataset.from_list(batch_cache)
        new_augmented_datasets.append(temp_dataset)
        batch_cache.clear()  # free memory
        #time.sleep(2)  # optional cooldown

print("Merging all temporary datasets...")
new_augmented_dataset = concatenate_datasets(new_augmented_datasets)
print("✅ Done! Final size:", len(new_augmented_dataset))


Starting batch augmentation on 70079 samples...


Batch Translating: 100%|██████████| 8760/8760 [1:15:21<00:00,  1.94it/s]

Merging all temporary datasets...
✅ Done! Final size: 70079





In [8]:
augmented_squad['train'] = new_augmented_dataset
print(augmented_squad)

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 70079
    })
    test: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 17520
    })
})


save df

In [9]:
import pandas as pd 

# Define the filename
output_file_path = "augmented_squad.csv"

augmented_squad['train'].to_pandas().to_csv(output_file_path, index=False)

print(f"DataFrame successfully saved to {output_file_path}")

DataFrame successfully saved to augmented_squad.csv


check

In [11]:
for i in range(61000,61010):
    print(augmented_squad['train']['question'][i])
    print(squad['train']['question'][i])


How many households are there in Tucson from 2010 on?
How many households are there in Tucson as of 2010?
The spread of Buddhism led to what great effort?
Buddhism's spread led to what large-scale effort?
In which year was the Jolt Hall of Fame recorded by DeveloperWorks?
The Jolt Hall of Fame inducted DeveloperWorks in what year?
What year was Ali killed?
In what year was Ali killed?
The pub "The Bag o'Nails" was a corruption of what word?
The pub "The Bag o'Nails" was a corruption of what word?
Madonna was born in what religion?
Madonna was born to which religion?
What conflict did the Corporate Library find with Comcast's board?
What conflict did Corporate Library note with Comcast's Board?
Which state had the most pilots per capita than any other U.S. state?
Which state had the most pilots per capita than any other US state?
What were slaves not allowed to do?
Slaves were not allowed to do what?
Where does the ferry end to France?
Where does the ferry to France terminate?


In [12]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [13]:
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

In [14]:
tokenized_squad = augmented_squad.map(
    preprocess_function,
    batched=True,
    remove_columns=augmented_squad["train"].column_names,
)

Map:   0%|          | 0/70079 [00:00<?, ? examples/s]

In [15]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

model = AutoModelForQuestionAnswering.from_pretrained("./drill04_qa_model")

# Freeze first 2 transformer layers
for layer in model.distilbert.transformer.layer[:2]:
    for param in layer.parameters():
        param.requires_grad = False

# Check which parameters are trainable
trainable_params = [n for n, p in model.named_parameters() if p.requires_grad]
print(f"Trainable parameters: {len(trainable_params)}")


Trainable parameters: 70


In [16]:
print(augmented_squad["train"]['question'])
print(squad["train"]['question'])


Column(['What was the cost of primary and secondary education under Gaddafi?', 'What was the price of early HDTVs?', 'At its peak, the imperialist forces controlled all but how many states in Mexico?', 'Serbo-Croatian is the only Slavic language that uses which two scripts together?', 'What is the third - hot desert in the world?'])
Column(['What was the cost of primary and secondary education under Gaddafi?', 'What was the price for early HDTVs?', 'At its zenith, the imperialist forces controlled all but how many states in Mexico?', 'Serbo-Croatian is the only Slavic language to use what two scripts together?', 'What is the third hottest desert in the world?'])


In [None]:
# ✅ Use GPU if available
import torch
from transformers import Trainer, TrainingArguments

if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"There are {torch.cuda.device_count()} GPU(s) available.")
    print("Device name:", torch.cuda.get_device_name(0))
    model.to(device)
else:
    device = torch.device("cpu")
    print("No GPU found, using CPU.")

# ✅ Training configuration (optimized for disk usage)
training_args = TrainingArguments(
    output_dir="drill04+DE_Backtranslated",       # folder to save model
    eval_strategy="epoch",         # correct parameter name
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    save_strategy="epoch",               # save checkpoint only once per epoch
    save_total_limit=1,                  # keep only the last checkpoint
    load_best_model_at_end=True,         # optional, keeps best checkpoint
    push_to_hub=False,
    report_to="none",                    # disable wandb or tensorboard logs
    logging_dir=None,                    # avoid creating logging folders
    #fp16=True
)

# ✅ Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["test"],
    tokenizer=tokenizer,                 # fixed from "processing_class"
    data_collator=data_collator,
)

# ✅ Try to resume training if a checkpoint exists
import os
last_checkpoint = None
if os.path.isdir(training_args.output_dir):
    from transformers.trainer_utils import get_last_checkpoint
    last_checkpoint = get_last_checkpoint(training_args.output_dir)

if last_checkpoint:
    print(f"Resuming training from checkpoint: {last_checkpoint}")
    trainer.train(resume_from_checkpoint=last_checkpoint)
else:
    print("Starting training from scratch...")
    trainer.train()


There are 1 GPU(s) available.
Device name: NVIDIA GeForce RTX 3050 Laptop GPU


  trainer = Trainer(


Starting training from scratch...


Epoch,Training Loss,Validation Loss
1,1.0485,1.09861
2,0.9205,1.098198
3,0.865,1.124761
4,0.7738,1.141018


In [18]:
# Save the model and tokenizer
trainer.save_model("./drill04+DE_Backtranslated")
tokenizer.save_pretrained("./drill04+DE_Backtranslated")

('./drill04+DE_Backtranslated\\tokenizer_config.json',
 './drill04+DE_Backtranslated\\special_tokens_map.json',
 './drill04+DE_Backtranslated\\vocab.txt',
 './drill04+DE_Backtranslated\\added_tokens.json',
 './drill04+DE_Backtranslated\\tokenizer.json')

In [19]:
max_length = 384
stride = 128


def preprocess_validation_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs

validation_dataset = squad["test"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=squad["test"].column_names,
)
len(squad["test"]), len(validation_dataset)

(17520, 17688)

In [20]:
from tqdm.auto import tqdm
import collections
import numpy as np
import evaluate
metric = evaluate.load("squad")

n_best = 20
max_answer_length = 30

def compute_metrics(start_logits, end_logits, features, examples):
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        answers = []

        # Loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answer = {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                    answers.append(answer)

        # Select the answer with the best score
        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append(
                {"id": example_id, "prediction_text": best_answer["text"]}
            )
        else:
            predicted_answers.append({"id": example_id, "prediction_text": ""})

    theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    return metric.compute(predictions=predicted_answers, references=theoretical_answers)

In [21]:
predictions, _, _ = trainer.predict(validation_dataset)
start_logits, end_logits = predictions
compute_metrics(start_logits, end_logits, validation_dataset, squad["test"])

  0%|          | 0/17520 [00:00<?, ?it/s]

{'exact_match': 64.01826484018265, 'f1': 78.06739535271652}

In [28]:
question = "Which is blue's car price"
context="The blue car costs $20,000 and the red car costs $25,000."
print(question)
from transformers import pipeline
question_answerer = pipeline("question-answering",
                             model="./drill04_qa_model",
                            tokenizer="./drill04_qa_model",
                            fp16=True)

result = question_answerer(question=question, context=context)
print(result['answer'])

Device set to use cuda:0


Which is blue's car price
$20,000 and the red car costs $25,000
