In [1]:
import torch
from datasets import load_dataset, Dataset
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq
)
import numpy as np
import gc
import os
from tqdm.auto import tqdm

# # Create output directories if they don't exist
# os.makedirs("./results/answer_generation", exist_ok=True)
# os.makedirs("./logs/answer_generation", exist_ok=True)

os.makedirs("/kaggle/working/results/answer_generation", exist_ok=True)
os.makedirs("/kaggle/working/logs/answer_generation", exist_ok=True)


# Enable memory optimization for PyTorch
torch.cuda.empty_cache()

# 1. Load dataset - load only a subset directly
print("Loading dataset...")
dataset = load_dataset("zjsd/RedStone-QA-mcq", split=f"train[:{int(0.02*100)}%]")
print(f"Dataset loaded with {len(dataset)} examples")

# 2. Define preprocessing function for answer generation
def preprocess_for_answer_generation(examples, batch_size=64):
    """Preprocess data for training the model to generate the correct answer letter."""
    inputs = []
    labels = []
    
    for i in range(0, len(examples["text"]), batch_size):
        batch_texts = examples["text"][i:i+batch_size]
        batch_questions = examples["question"][i:i+batch_size]
        batch_answers = examples["answer"][i:i+batch_size]
        
        for text, question, answer in zip(batch_texts, batch_questions, batch_answers):
            combined = f"generate answer: {text} question: {question}"
            inputs.append(combined)
            # Extract just the letter from "Answer:X" format
            labels.append(answer.replace("Answer:", "").strip())
    
    return {
        "input": inputs,
        "output": labels
    }

# 3. Process dataset in chunks to save memory
print("Preprocessing data for answer generation...")
answer_dataset = Dataset.from_dict(preprocess_for_answer_generation(dataset))

# Free up memory
del dataset
gc.collect()
torch.cuda.empty_cache()

# 4. Split dataset into train and validation
print("Splitting dataset into train and validation sets...")
answer_dataset = answer_dataset.train_test_split(test_size=0.1, seed=42)

answer_train_dataset = answer_dataset["train"]
answer_val_dataset = answer_dataset["test"]

print(f"Answer generation: {len(answer_train_dataset)} training examples, {len(answer_val_dataset)} validation examples")

# Free up memory
del answer_dataset
gc.collect()

# 5. Load tokenizer and model
print("Loading tokenizer and model...")
model_name = "google/flan-t5-small"  # Using the small model for faster training
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# 6. Define tokenization function
def tokenize_function(examples, max_input_length=512, max_target_length=128):
    model_inputs = tokenizer(
        examples["input"],
        max_length=max_input_length,
        padding="max_length",
        truncation=True
    )
    
    # Tokenize targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["output"],
            max_length=max_target_length,
            padding="max_length",
            truncation=True
        )
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# 7. Tokenize datasets with smaller batch size to reduce memory usage
print("Tokenizing datasets...")
batch_size = 32

answer_train_tokenized = answer_train_dataset.map(
    tokenize_function, 
    batched=True,
    batch_size=batch_size,
    remove_columns=["input", "output"]
)

answer_val_tokenized = answer_val_dataset.map(
    tokenize_function, 
    batched=True,
    batch_size=batch_size,
    remove_columns=["input", "output"]
)

# Free up memory
del answer_train_dataset, answer_val_dataset
gc.collect()
torch.cuda.empty_cache()

# 8. Define custom metrics computation function
def compute_answer_metrics(eval_pred):
    predictions, labels = eval_pred
    # Replace -100 with the pad_token_id
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    
    # Get the predicted tokens
    predicted_tokens = np.argmax(predictions, axis=-1)
    
    # Decode to get the actual letters - process in batches to save memory
    batch_size = 32
    num_examples = len(predicted_tokens)
    decoded_preds = []
    decoded_labels = []
    
    for i in range(0, num_examples, batch_size):
        batch_preds = [tokenizer.decode(pred, skip_special_tokens=True) 
                      for pred in predicted_tokens[i:i+batch_size]]
        batch_labels = [tokenizer.decode(label, skip_special_tokens=True) 
                       for label in labels[i:i+batch_size]]
        decoded_preds.extend(batch_preds)
        decoded_labels.extend(batch_labels)
    
    # Calculate accuracy
    correct = sum(1 for pred, label in zip(decoded_preds, decoded_labels) if pred.strip() == label.strip())
    accuracy = correct / len(decoded_labels) if len(decoded_labels) > 0 else 0
    
    # Print just a few examples for debugging
    print("\nAnswer Generation Examples (Prediction, Reference):")
    for i in range(min(3, len(decoded_preds))):
        print(f"  {decoded_preds[i]} | {decoded_labels[i]}")
    
    # Return metrics
    return {
        "accuracy": accuracy,
        "exact_match_ratio": correct / len(decoded_labels) if len(decoded_labels) > 0 else 0,
    }

# 9. Define training arguments for answer generation
answer_training_args = TrainingArguments(
    output_dir="./results/answer_generation",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=2,
    weight_decay=0.01,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    logging_dir="./logs/answer_generation",
    logging_steps=100,
    report_to=["tensorboard"],
    # Memory optimizations
    fp16=True if torch.cuda.is_available() else False,  # Use mixed precision if available
    gradient_accumulation_steps=2,  # Accumulate gradients to simulate larger batch sizes
    dataloader_num_workers=1,  # Parallelize data loading
    dataloader_pin_memory=True,  # Speed up data transfer to GPU
    # Ensure progress bar is shown
    disable_tqdm=False,
)

# 10. Define data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding="max_length",
    max_length=512,
    pad_to_multiple_of=8  # Optimize for tensor operations
)

# 11. Train answer generation model
print("Initializing answer generation trainer...")
answer_trainer = Trainer(
    model=model,
    args=answer_training_args,
    train_dataset=answer_train_tokenized,
    eval_dataset=answer_val_tokenized,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_answer_metrics,
)

print("Training answer generation model...")
answer_trainer.train()

# Get validation results
print("Evaluating answer generation model...")
answer_eval_results = answer_trainer.evaluate()
print("\n" + "="*50)
print("ANSWER GENERATION EVALUATION RESULTS")
print("="*50)
print(f"Accuracy: {answer_eval_results['eval_accuracy']:.4f}")
print(f"Exact Match: {answer_eval_results['eval_exact_match_ratio']:.4f}")
print("="*50 + "\n")

# Save model
print("Saving final model...")
answer_trainer.save_model("./results/answer_generation/final_model")
print("Model saved successfully!")

# 12. Memory-efficient inference function for demonstration
def generate_answer(context, question, model, tokenizer):
    # Set model to evaluation mode
    model.eval()
    
    # Get the device
    device = next(model.parameters()).device
    
    # Prepare input
    answer_input = f"generate answer: {context} question: {question}"
    answer_input_ids = tokenizer(answer_input, return_tensors="pt").input_ids.to(device)
    
    print("Generating answer letter...")
    with torch.no_grad():  # Disable gradient calculation to save memory
        answer_outputs = model.generate(
            answer_input_ids, 
            max_length=10,
            num_beams=4,
            early_stopping=True
        )
    answer_letter = tokenizer.decode(answer_outputs[0], skip_special_tokens=True).strip()
    print(f"Generated answer letter: {answer_letter}")
    
    # Clean up GPU memory
    del answer_outputs, answer_input_ids
    torch.cuda.empty_cache()
    
    return answer_letter

# Optional: Demonstrate the trained model
print("\n" + "="*50)
print("DEMONSTRATION")
print("="*50)
try:
    # Sample context and question
    context = "The Python programming language was created by Guido van Rossum and first released in 1991. It emphasizes code readability with its notable use of significant whitespace."
    question = "Who created the Python programming language?"
    
    # Generate answer
    answer_letter = generate_answer(context, question, model, tokenizer)
    print(f"Context: {context}")
    print(f"Question: {question}")
    print(f"Generated Answer: {answer_letter}")
except Exception as e:
    print(f"Error during demonstration: {e}")

print("\nTraining of Answer Generation model complete!")

Loading dataset...


README.md:   0%|          | 0.00/813 [00:00<?, ?B/s]

02f2eecfda5642699cec7306db349dd7.parquet:   0%|          | 0.00/41.3M [00:00<?, ?B/s]

23afc2c9c15646b5b42c3c1fd8191bb3.parquet:   0%|          | 0.00/41.1M [00:00<?, ?B/s]

3e76396d09c3438c82927f7147f79e4e.parquet:   0%|          | 0.00/43.0M [00:00<?, ?B/s]

4c027dcad52241f596b09eb8c6c893a9.parquet:   0%|          | 0.00/45.5M [00:00<?, ?B/s]

69e90546dac941528cdd07d49b93b140.parquet:   0%|          | 0.00/41.9M [00:00<?, ?B/s]

a2687c7645db40708ea8db9371a0e703.parquet:   0%|          | 0.00/42.6M [00:00<?, ?B/s]

cc8885cabdc8437b93220777f74d0c3f.parquet:   0%|          | 0.00/43.2M [00:00<?, ?B/s]

ea94289cf49746818839e882f248f13d.parquet:   0%|          | 0.00/8.93M [00:00<?, ?B/s]

f10233c4ea29412b9a07e7ca8aedd169.parquet:   0%|          | 0.00/41.9M [00:00<?, ?B/s]

f458d85ec1e747cf8b47ac222a46d8c2.parquet:   0%|          | 0.00/42.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1661603 [00:00<?, ? examples/s]

Dataset loaded with 33232 examples
Preprocessing data for answer generation...
Splitting dataset into train and validation sets...
Answer generation: 29908 training examples, 3324 validation examples
Loading tokenizer and model...


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Tokenizing datasets...


Map:   0%|          | 0/29908 [00:00<?, ? examples/s]



Map:   0%|          | 0/3324 [00:00<?, ? examples/s]

  answer_trainer = Trainer(


Initializing answer generation trainer...
Training answer generation model...


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 5.39 GiB. GPU 0 has a total capacity of 14.74 GiB of which 3.81 GiB is free. Process 3053 has 10.93 GiB memory in use. Of the allocated memory 6.37 GiB is allocated by PyTorch, and 4.37 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [2]:
import os
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
import torch
from torch.utils.data import Dataset
import random
import gc
from tqdm.auto import tqdm

# Set random seeds for reproducibility
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.empty_cache()

# Define constants
MODEL_NAME = "google/flan-t5-small"
MAX_INPUT_LENGTH = 384  # Reduced from 512
MAX_TARGET_LENGTH = 48  # Reduced from 64
BATCH_SIZE = 4  # Reduced from 8
GRADIENT_ACCUMULATION_STEPS = 4  # Increases effective batch size without increasing memory
LEARNING_RATE = 3e-4
NUM_EPOCHS = 3
SAMPLE_RATIO = 0.1  # Reduced from 0.15
MIXED_PRECISION = "fp16"  # Use mixed precision training

# Function to free memory
def free_memory():
    gc.collect()
    torch.cuda.empty_cache()

# Load the dataset in streaming mode to reduce memory usage
print("Loading dataset...")
dataset = load_dataset("zjsd/RedStone-QA-mcq", split="train", streaming=True)

# Convert to regular dataset for sampling
# Buffer a smaller amount to not load everything in memory
dataset = dataset.take(int(1.66e6 * SAMPLE_RATIO * 1.2))  # Buffer slightly more than needed
dataset = list(dataset)
random.shuffle(dataset)
dataset = dataset[:int(1.66e6 * SAMPLE_RATIO)]

# Split into train and validation
train_val_split = 0.9
train_size = int(len(dataset) * train_val_split)

# Create train and validation datasets
train_dataset = dataset[:train_size]
val_dataset = dataset[train_size:train_size + min(2000, len(dataset) - train_size)]  # Limit validation set

print(f"Total sampled examples: {len(dataset)}")
print(f"Training examples: {len(train_dataset)}")
print(f"Validation examples: {len(val_dataset)}")

# Free memory
del dataset
free_memory()

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# Define a processing function for batch tokenization
def preprocess_function(examples):
    batch_size = len(examples)
    input_texts = []
    target_texts = []
    
    for i in range(batch_size):
        context = examples[i]['text']
        question = examples[i]['question']
        answer = examples[i]['answer'].replace("Answer:", "").strip()
        
        # Format the input (context + question)
        input_text = f"Context: {context} Question: {question}"
        input_texts.append(input_text)
        target_texts.append(answer)
    
    # Tokenize inputs
    model_inputs = tokenizer(
        input_texts,
        max_length=MAX_INPUT_LENGTH,
        padding="max_length",
        truncation=True
    )
    
    # Tokenize targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            target_texts,
            max_length=MAX_TARGET_LENGTH,
            padding="max_length",
            truncation=True
        )
    
    model_inputs["labels"] = labels["input_ids"]
    
    # Replace padding token id with -100 so it's ignored in the loss
    for i in range(len(model_inputs["labels"])):
        model_inputs["labels"][i] = [
            -100 if token == tokenizer.pad_token_id else token 
            for token in model_inputs["labels"][i]
        ]
    
    return model_inputs

# Process data in batches to save memory
print("Processing training data...")
batch_size = 512
train_processed = []
for i in tqdm(range(0, len(train_dataset), batch_size)):
    batch = train_dataset[i:i+batch_size]
    processed_batch = preprocess_function(batch)
    for j in range(len(batch)):
        train_processed.append({
            "input_ids": processed_batch["input_ids"][j],
            "attention_mask": processed_batch["attention_mask"][j],
            "labels": processed_batch["labels"][j]
        })

print("Processing validation data...")
val_processed = []
for i in tqdm(range(0, len(val_dataset), batch_size)):
    batch = val_dataset[i:i+batch_size]
    processed_batch = preprocess_function(batch)
    for j in range(len(batch)):
        val_processed.append({
            "input_ids": processed_batch["input_ids"][j],
            "attention_mask": processed_batch["attention_mask"][j],
            "labels": processed_batch["labels"][j]
        })

# Free memory
del train_dataset, val_dataset
free_memory()

# Create PyTorch datasets
class MemoryEfficientDataset(Dataset):
    def __init__(self, examples):
        self.examples = examples
    
    def __len__(self):
        return len(self.examples)
    
    def __getitem__(self, idx):
        return {
            "input_ids": torch.tensor(self.examples[idx]["input_ids"]),
            "attention_mask": torch.tensor(self.examples[idx]["attention_mask"]),
            "labels": torch.tensor(self.examples[idx]["labels"])
        }

train_dataset = MemoryEfficientDataset(train_processed)
val_dataset = MemoryEfficientDataset(val_processed)

# Free memory
del train_processed, val_processed
free_memory()

# Load the model with 8-bit precision to save memory
print("Loading model...")
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./answer_generation_model",
    overwrite_output_dir=True,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,  # Keep only the best model
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    logging_dir="./logs",
    logging_steps=50,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    warmup_ratio=0.05,
    predict_with_generate=False,  # Save memory during evaluation
    fp16=MIXED_PRECISION == "fp16",  # Mixed precision training
    optim="adamw_torch",
    report_to="none",  # Disable W&B reporting
    disable_tqdm=False,  # Enable tqdm progress bar
)

# Set up the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding=True),
)

# Train the model
print("Training model...")
trainer.train()

# Save the model
print("Saving model...")
model.save_pretrained("./answer_generation_model_final")
tokenizer.save_pretrained("./answer_generation_model_final")

# Test on a few examples
print("Testing on some examples...")
model.eval()
test_examples = [
    val_dataset[i] for i in range(min(3, len(val_dataset)))
]

for example in test_examples:
    input_ids = example["input_ids"].unsqueeze(0).to(model.device)
    attention_mask = example["attention_mask"].unsqueeze(0).to(model.device)
    
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=MAX_TARGET_LENGTH,
            num_beams=2,  # Reduced for memory efficiency
            early_stopping=True
        )
    
    input_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)
    predicted_answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    print(f"Input: {input_text}")
    print(f"Predicted Answer: {predicted_answer}")
    print("=" * 50)

print("Training complete!")

Loading dataset...
Total sampled examples: 166000
Training examples: 149400
Validation examples: 2000
Processing training data...


  0%|          | 0/292 [00:00<?, ?it/s]



Processing validation data...


  0%|          | 0/4 [00:00<?, ?it/s]

Loading model...


  trainer = Seq2SeqTrainer(


Training model...


  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


Epoch,Training Loss,Validation Loss
1,0.0,0.0
2,0.0,0.0


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


Saving model...
Testing on some examples...
Input: Context: The primary distinction between the short term scheduler and the long term scheduler is __________ A. The length of their queues B. The type of processes they schedule C. The frequency of their execution D. None of the mentioned Answer:C Question: The primary distinction between the short term scheduler and the long term scheduler is __________
Predicted Answer: C
Input: Context: Consider the situation that the transaction 'P' holds shared key lock X. Also, other transaction 'Q' requests for shared key lock X, then : A. request will be immediately granted B. the deadlock situation is created C. request will be rejected after some time D. request will be granted at it is released by P. Answer:A Question: Consider the situation that the transaction 'P' holds shared key lock X. Also, other transaction 'Q' requests for shared key lock X, then :
Predicted Answer: A
Input: Context: The following wounds will heal without a scar forma

In [3]:
import os
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
import torch
from torch.utils.data import Dataset
import random
import gc
from tqdm.auto import tqdm
import re

# Set random seeds for reproducibility
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.empty_cache()

# Define constants
MODEL_NAME = "google/flan-t5-small"
MAX_INPUT_LENGTH = 384
MAX_TARGET_LENGTH = 96
BATCH_SIZE = 4
GRADIENT_ACCUMULATION_STEPS = 4
LEARNING_RATE = 3e-4
NUM_EPOCHS = 3
SAMPLE_RATIO = 0.1
MIXED_PRECISION = "fp16"

# Function to free memory
def free_memory():
    gc.collect()
    torch.cuda.empty_cache()

# Extract actual answer content and distractors
def extract_answers_and_distractors(row):
    correct_answer = row['answer'].replace("Answer:", "").strip()
    choices = row['choices']
    
    # Identify correct answer letter
    correct_letter = None
    
    # If answer is just a letter (A, B, C, D)
    if re.match(r'^[A-D]$', correct_answer):
        correct_letter = correct_answer
    else:
        # If answer starts with a letter followed by a period/colon (A., A:)
        match = re.match(r'^([A-D])[:\.]?\s*', correct_answer)
        if match:
            correct_letter = match.group(1)
    
    # If we still don't have a letter, try to infer from content
    if not correct_letter:
        correct_content = correct_answer
        for choice in choices:
            parts = re.match(r'^([A-D])[.\s]+(.*)', choice)
            if parts:
                letter, content = parts.groups()
                # If the content matches (approximately) the correct answer
                if content.strip().lower() in correct_answer.lower() or correct_answer.lower() in content.strip().lower():
                    correct_letter = letter
                    correct_content = content.strip()
                    break
    
    # If we still don't have a letter, take a guess based on position
    if not correct_letter and "A" in correct_answer:
        correct_letter = "A"
    elif not correct_letter and "B" in correct_answer:
        correct_letter = "B"
    elif not correct_letter and "C" in correct_answer:
        correct_letter = "C"
    elif not correct_letter and "D" in correct_answer:
        correct_letter = "D"
    elif not correct_letter and len(choices) > 0:
        correct_letter = "A"  # Default to first option
    
    # Extract correct answer content and distractors
    correct_content = None
    distractors = []
    
    for choice in choices:
        parts = re.match(r'^([A-D])[.\s]+(.*)', choice)
        if parts:
            letter, content = parts.groups()
            content = content.strip()
            if letter == correct_letter:
                correct_content = content
            elif content:  # Only add non-empty distractors
                distractors.append(content)
    
    # If we didn't extract content from choices, use the original answer
    if not correct_content:
        if re.match(r'^[A-D][:\.]?\s+(.+)$', correct_answer):
            correct_content = re.match(r'^[A-D][:\.]?\s+(.+)$', correct_answer).group(1)
        else:
            correct_content = correct_answer
    
    return correct_content, distractors

# Load the dataset in streaming mode to reduce memory usage
print("Loading dataset...")
dataset = load_dataset("zjsd/RedStone-QA-mcq", split="train", streaming=True)

# Convert to regular dataset for sampling
dataset = dataset.take(int(1.66e6 * SAMPLE_RATIO * 1.2))
dataset = list(dataset)
random.shuffle(dataset)
dataset = dataset[:int(1.66e6 * SAMPLE_RATIO)]

# Process dataset to extract actual answers and distractors
print("Extracting answers and distractors...")
processed_dataset = []
for row in tqdm(dataset):
    actual_answer, distractors = extract_answers_and_distractors(row)
    if actual_answer and distractors:  # Only keep examples with valid answers and distractors
        row['actual_answer'] = actual_answer
        row['distractors_list'] = distractors
        processed_dataset.append(row)

# Free memory
del dataset
free_memory()

# Split into train and validation
train_val_split = 0.9
train_size = int(len(processed_dataset) * train_val_split)

# Create train and validation datasets
train_dataset = processed_dataset[:train_size]
val_dataset = processed_dataset[train_size:train_size + min(2000, len(processed_dataset) - train_size)]

print(f"Total processed examples: {len(processed_dataset)}")
print(f"Training examples: {len(train_dataset)}")
print(f"Validation examples: {len(val_dataset)}")

# Free memory
del processed_dataset
free_memory()

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# Define a processing function for batch tokenization
def preprocess_function(examples):
    batch_size = len(examples)
    input_texts = []
    target_texts = []
    
    for i in range(batch_size):
        context = examples[i]['text']
        question = examples[i]['question']
        answer = examples[i]['actual_answer']
        
        # Format distractors as a single string with separators
        distractors = " | ".join(examples[i]['distractors_list'])
        
        # Format the input (context + question + correct answer)
        input_text = f"Context: {context} Question: {question} Correct Answer: {answer}"
        input_texts.append(input_text)
        target_texts.append(distractors)
    
    # Tokenize inputs
    model_inputs = tokenizer(
        input_texts,
        max_length=MAX_INPUT_LENGTH,
        padding="max_length",
        truncation=True
    )
    
    # Tokenize targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            target_texts,
            max_length=MAX_TARGET_LENGTH,
            padding="max_length",
            truncation=True
        )
    
    model_inputs["labels"] = labels["input_ids"]
    
    # Replace padding token id with -100 so it's ignored in the loss
    for i in range(len(model_inputs["labels"])):
        model_inputs["labels"][i] = [
            -100 if token == tokenizer.pad_token_id else token 
            for token in model_inputs["labels"][i]
        ]
    
    return model_inputs

# Process data in batches to save memory
print("Processing training data...")
batch_size = 512
train_processed = []
for i in tqdm(range(0, len(train_dataset), batch_size)):
    batch = train_dataset[i:i+batch_size]
    processed_batch = preprocess_function(batch)
    for j in range(len(batch)):
        train_processed.append({
            "input_ids": processed_batch["input_ids"][j],
            "attention_mask": processed_batch["attention_mask"][j],
            "labels": processed_batch["labels"][j]
        })

print("Processing validation data...")
val_processed = []
for i in tqdm(range(0, len(val_dataset), batch_size)):
    batch = val_dataset[i:i+batch_size]
    processed_batch = preprocess_function(batch)
    for j in range(len(batch)):
        val_processed.append({
            "input_ids": processed_batch["input_ids"][j],
            "attention_mask": processed_batch["attention_mask"][j],
            "labels": processed_batch["labels"][j]
        })

# Free memory
del train_dataset, val_dataset
free_memory()

# Create PyTorch datasets
class MemoryEfficientDataset(Dataset):
    def __init__(self, examples):
        self.examples = examples
    
    def __len__(self):
        return len(self.examples)
    
    def __getitem__(self, idx):
        return {
            "input_ids": torch.tensor(self.examples[idx]["input_ids"]),
            "attention_mask": torch.tensor(self.examples[idx]["attention_mask"]),
            "labels": torch.tensor(self.examples[idx]["labels"])
        }

train_dataset = MemoryEfficientDataset(train_processed)
val_dataset = MemoryEfficientDataset(val_processed)

# Free memory
del train_processed, val_processed
free_memory()

# Load the model
print("Loading model...")
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./distractor_generation_model",
    overwrite_output_dir=True,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    logging_dir="./logs",
    logging_steps=50,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    warmup_ratio=0.05,
    predict_with_generate=False,
    fp16=MIXED_PRECISION == "fp16",
    optim="adamw_torch",
    report_to="none",
    disable_tqdm=False,
)

# Set up the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding=True),
)

# Train the model
print("Training model...")
trainer.train()

# Save the model
print("Saving model...")
model.save_pretrained("./distractor_generation_model_final")
tokenizer.save_pretrained("./distractor_generation_model_final")

# Test on a few examples
print("Testing on some examples...")
model.eval()
test_examples = [
    val_dataset[i] for i in range(min(3, len(val_dataset)))
]

for example in test_examples:
    input_ids = example["input_ids"].unsqueeze(0).to(model.device)
    attention_mask = example["attention_mask"].unsqueeze(0).to(model.device)
    
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=MAX_TARGET_LENGTH,
            num_beams=2,
            early_stopping=True
        )
    
    input_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)
    predicted_distractors = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    print(f"Input: {input_text}")
    print(f"Predicted Distractors: {predicted_distractors}")
    print("=" * 50)

print("Training complete!")

Loading dataset...
Extracting answers and distractors...


  0%|          | 0/166000 [00:00<?, ?it/s]

Total processed examples: 165420
Training examples: 148878
Validation examples: 2000
Processing training data...


  0%|          | 0/291 [00:00<?, ?it/s]



Processing validation data...


  0%|          | 0/4 [00:00<?, ?it/s]

Loading model...


  trainer = Seq2SeqTrainer(


Training model...




Epoch,Training Loss,Validation Loss
1,0.0081,0.003985
2,0.0061,0.002732


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


Saving model...
Testing on some examples...
Input: Context: Why might individuals who report eating chocolate more often be more likely to be thin (not overweight)? A. Because overweight individuals are more likely to be on a diet, and therefore eat chocolate less frequently B. Because individuals who eat chocolate more often also exhibit more self control and actually eat less total chocolate C. Because overweight individuals are less likely to honestly report eating foods that are perceived as “unhealthy” D. Because chocolate contains certain nutrients that make a person fuller and therefore less likely to eat as many calories of other foods Answer:A Question: Why might individuals who report eating chocolate more often be more likely to be thin (not overweight)? Correct Answer: Because overweight individuals are more likely to be on a diet, and therefore eat chocolate less frequently
Predicted Distractors: Because individuals who eat chocolate more often also exhibit more self contr

In [4]:
# Add these imports at the top of your script
from huggingface_hub import login, HfApi


from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
HF_TOKEN = user_secrets.get_secret("HF_TOKEN")

print("Logging in to Hugging Face Hub...")
login(token=HF_TOKEN)

# Define your Hugging Face repository name
# Format: "username/repository-name"
HF_REPO_ID = "aayeshanakarmi/distractor-generation-redstone-flant5small-2"  # Replace with your desired repo name

# Save the model to the Hub
print(f"Uploading model to Hugging Face Hub as {HF_REPO_ID}...")
model.push_to_hub(HF_REPO_ID, use_auth_token=HF_TOKEN)
tokenizer.push_to_hub(HF_REPO_ID, use_auth_token=HF_TOKEN)


print(f"Model successfully uploaded to Hugging Face Hub: https://huggingface.co/{HF_REPO_ID}")

Logging in to Hugging Face Hub...
Uploading model to Hugging Face Hub as aayeshanakarmi/distractor-generation-redstone-flant5small-2...




model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]



README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Model successfully uploaded to Hugging Face Hub: https://huggingface.co/aayeshanakarmi/distractor-generation-redstone-flant5small-2


# Answer Generation Model 

In [None]:
import os
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
import torch
from torch.utils.data import Dataset
import random
import gc
from tqdm.auto import tqdm
import re

# Set random seeds for reproducibility
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.empty_cache()

# Define constants
MODEL_NAME = "google/flan-t5-small"
MAX_INPUT_LENGTH = 384
MAX_TARGET_LENGTH = 48
BATCH_SIZE = 4
GRADIENT_ACCUMULATION_STEPS = 4
LEARNING_RATE = 3e-4
NUM_EPOCHS = 3
SAMPLE_RATIO = 0.1
MIXED_PRECISION = "fp16"

# Function to free memory
def free_memory():
    gc.collect()
    torch.cuda.empty_cache()

# Enhanced function to extract actual answer content (not just A, B, C, D)
def extract_actual_answer(row):
    correct_answer = row['answer'].replace("Answer:", "").strip()
    choices = row['choices']
    
    # If answer is just a single letter (A, B, C, D)
    if re.match(r'^[A-D]$', correct_answer):
        letter = correct_answer
        for choice in choices:
            # Look for the choice that starts with this letter
            if choice.startswith(letter + ".") or choice.startswith(letter + " ") or choice.startswith(letter + ":"):
                # Extract everything after the letter and separator
                content = re.sub(r'^[A-D][.\s:]+', '', choice).strip()
                return content
    
    # If answer starts with letter followed by period/colon/space (e.g., "A. text", "A: text", "A text")
    match = re.match(r'^([A-D])[.\s:]+(.+)$', correct_answer)
    if match:
        # Extract the content part after the letter
        content = match.group(2).strip()
        if content:
            return content
        
        # If no content after the letter in the answer, find it in choices
        letter = match.group(1)
        for choice in choices:
            if choice.startswith(letter + ".") or choice.startswith(letter + " ") or choice.startswith(letter + ":"):
                content = re.sub(r'^[A-D][.\s:]+', '', choice).strip()
                return content
    
    # Handle case where the answer might be the full text that matches one of the choices
    for choice in choices:
        # Extract the content part of the choice (removing any leading A., B., etc.)
        choice_content = re.sub(r'^[A-D][.\s:]+', '', choice).strip()
        # If the answer matches this content exactly, return it
        if correct_answer == choice_content:
            return correct_answer
    
    # If we couldn't match it to a specific choice or extract a letter,
    # just return the original answer as a fallback
    return correct_answer

# Load the dataset in streaming mode to reduce memory usage
print("Loading dataset...")
dataset = load_dataset("zjsd/RedStone-QA-mcq", split="train", streaming=True)

# Convert to regular dataset for sampling
dataset = dataset.take(int(1.66e6 * SAMPLE_RATIO * 1.2))
dataset = list(dataset)
random.shuffle(dataset)
dataset = dataset[:int(1.66e6 * SAMPLE_RATIO)]

# Process dataset to extract actual answers
print("Extracting actual answers...")
processed_dataset = []
for row in tqdm(dataset):
    actual_answer = extract_actual_answer(row)
    if actual_answer.strip():  # Only keep examples with non-empty answers
        row['actual_answer'] = actual_answer
        processed_dataset.append(row)

# Free memory
del dataset
free_memory()

# Split into train and validation
train_val_split = 0.9
train_size = int(len(processed_dataset) * train_val_split)

# Create train and validation datasets
train_dataset = processed_dataset[:train_size]
val_dataset = processed_dataset[train_size:train_size + min(2000, len(processed_dataset) - train_size)]

print(f"Total examples with actual answers: {len(processed_dataset)}")
print(f"Training examples: {len(train_dataset)}")
print(f"Validation examples: {len(val_dataset)}")

# Free memory
del processed_dataset
free_memory()

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# Define a processing function for batch tokenization
def preprocess_function(examples):
    batch_size = len(examples)
    input_texts = []
    target_texts = []
    
    for i in range(batch_size):
        context = examples[i]['text']
        question = examples[i]['question']
        answer = examples[i]['actual_answer']
        
        # Format the input (context + question)
        input_text = f"Context: {context} Question: {question}"
        input_texts.append(input_text)
        target_texts.append(answer)
    
    # Tokenize inputs
    model_inputs = tokenizer(
        input_texts,
        max_length=MAX_INPUT_LENGTH,
        padding="max_length",
        truncation=True
    )
    
    # Tokenize targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            target_texts,
            max_length=MAX_TARGET_LENGTH,
            padding="max_length",
            truncation=True
        )
    
    model_inputs["labels"] = labels["input_ids"]
    
    # Replace padding token id with -100 so it's ignored in the loss
    for i in range(len(model_inputs["labels"])):
        model_inputs["labels"][i] = [
            -100 if token == tokenizer.pad_token_id else token 
            for token in model_inputs["labels"][i]
        ]
    
    return model_inputs

# Process data in batches to save memory
print("Processing training data...")
batch_size = 512
train_processed = []
for i in tqdm(range(0, len(train_dataset), batch_size)):
    batch = train_dataset[i:i+batch_size]
    processed_batch = preprocess_function(batch)
    for j in range(len(batch)):
        train_processed.append({
            "input_ids": processed_batch["input_ids"][j],
            "attention_mask": processed_batch["attention_mask"][j],
            "labels": processed_batch["labels"][j]
        })

print("Processing validation data...")
val_processed = []
for i in tqdm(range(0, len(val_dataset), batch_size)):
    batch = val_dataset[i:i+batch_size]
    processed_batch = preprocess_function(batch)
    for j in range(len(batch)):
        val_processed.append({
            "input_ids": processed_batch["input_ids"][j],
            "attention_mask": processed_batch["attention_mask"][j],
            "labels": processed_batch["labels"][j]
        })

# Free memory
del train_dataset, val_dataset
free_memory()

# Create PyTorch datasets
class MemoryEfficientDataset(Dataset):
    def __init__(self, examples):
        self.examples = examples
    
    def __len__(self):
        return len(self.examples)
    
    def __getitem__(self, idx):
        return {
            "input_ids": torch.tensor(self.examples[idx]["input_ids"]),
            "attention_mask": torch.tensor(self.examples[idx]["attention_mask"]),
            "labels": torch.tensor(self.examples[idx]["labels"])
        }

train_dataset = MemoryEfficientDataset(train_processed)
val_dataset = MemoryEfficientDataset(val_processed)

# Free memory
del train_processed, val_processed
free_memory()

# Load the model
print("Loading model...")
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./answer_generation_model",
    overwrite_output_dir=True,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    logging_dir="./logs",
    logging_steps=50,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    warmup_ratio=0.05,
    predict_with_generate=False,
    fp16=MIXED_PRECISION == "fp16",
    optim="adamw_torch",
    report_to="none",
    disable_tqdm=False,
)

# Set up the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding=True),
)

# Train the model
print("Training model...")
trainer.train()

# Save the model
print("Saving model...")
model.save_pretrained("./answer_generation_model_final")
tokenizer.save_pretrained("./answer_generation_model_final")

# Test on a few examples
print("Testing on some examples...")
model.eval()
test_examples = [
    val_dataset[i] for i in range(min(3, len(val_dataset)))
]

for example in test_examples:
    input_ids = example["input_ids"].unsqueeze(0).to(model.device)
    attention_mask = example["attention_mask"].unsqueeze(0).to(model.device)
    
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=MAX_TARGET_LENGTH,
            num_beams=2,
            early_stopping=True
        )
    
    input_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)
    predicted_answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    print(f"Input: {input_text}")
    print(f"Predicted Answer: {predicted_answer}")
    print("=" * 50)

print("Training complete!")

Loading dataset...
Extracting actual answers...


  0%|          | 0/166000 [00:00<?, ?it/s]

Total examples with actual answers: 165884
Training examples: 149295
Validation examples: 2000
Processing training data...


  0%|          | 0/292 [00:00<?, ?it/s]



Processing validation data...


  0%|          | 0/4 [00:00<?, ?it/s]

Loading model...


  trainer = Seq2SeqTrainer(


Training model...




Epoch,Training Loss,Validation Loss
1,0.0143,0.01177
2,0.0093,0.013052


