In [31]:
import json
import random

def generate_fake_qa_dataset(num_samples=1000, seed=None):
    """
    Generates a fake dataset for Question Answering (Q&A) fine-tuning.

    Each sample pairs a randomly chosen context with a fixed question about it
    and an answer that appears verbatim in that context.

    Args:
        num_samples: Number of samples to generate.
        seed: Optional seed for reproducible output. When given, sampling uses a
            private ``random.Random(seed)`` and the global random state is left
            untouched; when None, the module-level ``random`` functions are used.

    Returns:
        A list of dicts with the keys "id", "context", "question" and "answer".
        "answer" holds "text" and "answer_start", the character offset of the
        first occurrence of the text in the context (-1 if it does not occur).
    """
    # The `random` module exposes the same choice/randint API as a Random instance.
    rng = random.Random(seed) if seed is not None else random

    base_contexts = [
        "The capital of France is Paris. Paris is also known as the City of Lights. It is famous for the Eiffel Tower.",
        "Mount Everest is the highest mountain in the world, located in the Himalayas. Its peak is 8,848.86 meters above sea level.",
        "The Amazon River, located in South America, is the largest river by discharge volume of water in the world.",
        "Photosynthesis is the process used by plants, algae and cyanobacteria to convert light energy into chemical energy.",
        "Artificial intelligence (AI) is intelligence demonstrated by machines, unlike the natural intelligence displayed by humans and animals.",
        "The Internet is a global system of interconnected computer networks that uses the Internet protocol suite (TCP/IP) to communicate.",
        "Renewable energy sources, like solar and wind power, are naturally replenished on a human timescale.",
        "The human heart is a muscular organ that pumps blood through the circulatory system, supplying oxygen and nutrients to the body.",
        "Quantum computing is a type of computation that harnesses the phenomena of quantum mechanics, such as superposition and entanglement.",
        "Machine learning is a subset of artificial intelligence that involves training algorithms to learn from data and make predictions or decisions.",
    ]

    # (keyword, keyword, question, answer): the first rule whose two keywords both
    # occur in the context wins. Matching is case-sensitive, which is what keeps
    # the "Machine learning" context from hitting the "Artificial intelligence" rule.
    qa_rules = [
        ("Paris", "France", "What is the capital of France?", "Paris"),
        ("Mount Everest", "highest mountain", "What is the highest mountain in the world?", "Mount Everest"),
        ("Amazon River", "South America", "Where is the Amazon River located?", "South America"),
        ("Photosynthesis", "plants", "What process do plants use to convert light energy?", "Photosynthesis"),
        ("Artificial intelligence", "machines", "What is intelligence demonstrated by machines called?", "Artificial intelligence"),
        ("Internet", "global system", "What is a global system of interconnected computer networks?", "The Internet"),
        ("Renewable energy", "solar", "Name a type of renewable energy.", "solar"),
        ("human heart", "pumps blood", "What organ pumps blood through the circulatory system?", "human heart"),
        ("Quantum computing", "quantum mechanics", "What type of computation harnesses quantum mechanics?", "Quantum computing"),
        ("Machine learning", "artificial intelligence", "What is a subset of artificial intelligence that learns from data?", "Machine learning"),
    ]

    dataset = []
    for i in range(num_samples):
        context = rng.choice(base_contexts)

        for first_keyword, second_keyword, rule_question, rule_answer in qa_rules:
            if first_keyword in context and second_keyword in context:
                question = f"{rule_question} (Sample {i+1})"
                answer = rule_answer
                break
        else:
            # Fallback for a context that no rule covers: ask about a random word
            # and use a short random run of words as the answer.
            words = context.split()
            question = f"Tell me something about {rng.choice(words)}? (Sample {i+1})"
            # max(0, ...) keeps the range valid for contexts shorter than five words.
            first_word = rng.randint(0, max(0, len(words) - 5))
            answer = " ".join(words[first_word:first_word + rng.randint(2, 5)])

        dataset.append({
            "id": str(i),  # Unique ID for each sample
            "context": context,
            "question": question,
            "answer": {
                "text": answer,
                # For BERT fine-tuning, you often need the start character position
                # of the answer in the context; str.find already yields -1 when absent.
                "answer_start": context.find(answer),
            },
        })

    return dataset

if __name__ == "__main__":
    # Build the synthetic dataset and report how many samples came back
    fake_qa_data = generate_fake_qa_dataset(num_samples=1000)
    print(f"Generated {len(fake_qa_data)} samples.")

    # Persist the samples as indented JSON so the fine-tuning cell can load them
    with open("fake_qa_dataset.json", "w", encoding="utf-8") as f:
        json.dump(fake_qa_data, f, indent=4)
    print("Dataset saved to fake_qa_dataset.json")

    # Show up to three samples as a quick sanity check
    print("\n--- First 3 Samples ---")
    for j, sample in enumerate(fake_qa_data[:3]):
        sample_answer = sample['answer']
        print(f"Sample {j+1}:")
        print(f"  Context: {sample['context']}")
        print(f"  Question: {sample['question']}")
        print(f"  Answer: {sample_answer['text']} (Start: {sample_answer['answer_start']})")
        print("-" * 20)


Generated 1000 samples.
Dataset saved to fake_qa_dataset.json

--- First 3 Samples ---
Sample 1:
  Context: Artificial intelligence (AI) is intelligence demonstrated by machines, unlike the natural intelligence displayed by humans and animals.
  Question: What is intelligence demonstrated by machines called? (Sample 1)
  Answer: Artificial intelligence (Start: 0)
--------------------
Sample 2:
  Context: Machine learning is a subset of artificial intelligence that involves training algorithms to learn from data and make predictions or decisions.
  Question: What is a subset of artificial intelligence that learns from data? (Sample 2)
  Answer: Machine learning (Start: 0)
--------------------
Sample 3:
  Context: Machine learning is a subset of artificial intelligence that involves training algorithms to learn from data and make predictions or decisions.
  Question: What is a subset of artificial intelligence that learns from data? (Sample 3)
  Answer: Machine learning (Start: 0)
------

In [33]:
import json
from datasets import Dataset, DatasetDict # Using Hugging Face's datasets library
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
import torch
import os

# --- 1. Load the dataset ---
def load_qa_dataset(file_path="fake_qa_dataset.json"):
    """
    Reads the Q&A samples stored as JSON at `file_path`.

    Returns the parsed JSON content. If the file does not exist or does not
    contain valid JSON, an error message is printed and None is returned.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as dataset_file:
            samples = json.load(dataset_file)
    except FileNotFoundError:
        print(f"Error: Dataset file not found at {file_path}. Please run the data generation script first.")
        return None
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from {file_path}. Check file format.")
        return None
    else:
        # Only reached when the file was opened and parsed without error
        print(f"Successfully loaded dataset from {file_path}. Found {len(samples)} samples.")
        return samples

# Load your dataset
raw_datasets = load_qa_dataset()

if raw_datasets is None:
    # load_qa_dataset() has already printed the reason. Raise instead of calling
    # exit(): exit() is the interactive helper and, inside a notebook kernel, does
    # not stop the rest of this cell from running against a missing dataset.
    raise SystemExit("Dataset loading failed; aborting before tokenization and training.")

# Convert the list of dictionaries to a Hugging Face Dataset object, then hold out
# 20% of it for evaluation. The fixed seed keeps the split identical across runs.
# In a real scenario, you'd split into train, validation, and test.
hf_dataset = Dataset.from_list(raw_datasets)
train_test_split = hf_dataset.train_test_split(test_size=0.2, seed=42)
dataset_dict = DatasetDict({
    'train': train_test_split['train'],
    'validation': train_test_split['test'] # The held-out 20% serves as the validation split
})
print(f"Dataset split into: {dataset_dict}")


# --- 2. Load pre-trained BERT model and tokenizer ---
# You can choose different BERT-like models, e.g., 'bert-base-uncased', 'distilbert-base-uncased'
model_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

print(f"Loaded tokenizer and model: {model_checkpoint}")

# --- 3. Preprocess the dataset ---
# This is the most crucial part for Q&A fine-tuning.
# We need to map answers to token spans.

max_length = 384  # The maximum length of a feature (context and question)
doc_stride = 128   # The authorized overlap between two consecutive chunks

def preprocess_training_examples(examples):
    """
    Preprocesses a batch of training examples for BERT Q&A.

    Tokenizes each question/context pair with the module-level `tokenizer`
    (using `max_length` and `doc_stride`) and labels every resulting feature with
    the token span of the answer.

    Args:
        examples: Batch with the columns "question", "context" and "answer".
            Each answer is a dict with "text" and "answer_start" (character
            offset of the answer in its context; a negative value means the
            answer was not found).

    Returns:
        The tokenizer output with "offset_mapping" and
        "overflow_to_sample_mapping" removed and "start_positions" /
        "end_positions" added, one entry per feature. Features that do not fully
        contain the answer are labelled (0, 0), i.e. the [CLS] token.
    """
    questions = [q.strip() for q in examples["question"]]
    contexts = examples["context"]
    answers = examples["answer"]

    # Tokenize contexts and questions together.
    # `truncation="only_second"` truncates the context if the combined length exceeds max_length.
    # `return_offsets_mapping` is crucial for mapping token spans back to original text.
    # `padding="max_length"` pads to max_length.
    # `stride` handles overlapping chunks for long contexts.
    tokenized_examples = tokenizer(
        questions,
        contexts,
        max_length=max_length,
        truncation="only_second",
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
        stride=doc_stride,
    )

    # One example can yield several features when its context is long, so each
    # feature is mapped back to the example it came from.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # The offsets mapping gives a (start_char, end_char) tuple for each token.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        # Answer of the example this feature came from
        answer = answers[sample_mapping[i]]

        # Start and end character positions of the answer in the original context
        start_char = answer["answer_start"]
        end_char = start_char + len(answer["text"])

        # A negative start marks an answer that is not in the context at all.
        # Label it as "no answer" explicitly instead of relying on the offset
        # comparison below happening to reject negative positions.
        if start_char < 0:
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Sequence ID tells us if a token belongs to the question (0) or context (1)
        sequence_ids = tokenized_examples.sequence_ids(i)

        # First context token: skip [CLS] and the question tokens
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start_token = idx

        # Last context token: walk back from the end over padding and [SEP]
        idx = len(sequence_ids) - 1
        while sequence_ids[idx] != 1:
            idx -= 1
        context_end_token = idx

        # If the answer is not fully contained in this chunk, set positions to 0 (CLS token).
        # The model will learn to predict the [CLS] token in such cases, indicating no answer.
        if (offsets[context_start_token][0] > start_char or
            offsets[context_end_token][1] < end_char):
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Advance past every token that starts at or before the answer start;
            # the token just before the stopping point is the first answer token.
            token_start_index = context_start_token
            while token_start_index <= context_end_token and offsets[token_start_index][0] <= start_char:
                token_start_index += 1
            start_positions.append(token_start_index - 1)

            # Mirror image from the right for the last answer token.
            token_end_index = context_end_token
            while token_end_index >= context_start_token and offsets[token_end_index][1] >= end_char:
                token_end_index -= 1
            end_positions.append(token_end_index + 1)

    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions
    return tokenized_examples

# Apply preprocessing to the dataset
print("Preprocessing training examples...")
tokenized_datasets = dataset_dict.map(
    preprocess_training_examples,
    batched=True,
    remove_columns=dataset_dict["train"].column_names # Remove original columns not needed for training
)
print("Preprocessing complete.")

# --- 4. Define training arguments ---
# Define output directory for saving checkpoints
output_dir = "./results"
# exist_ok=True makes this safe to re-run and avoids a separate existence check
os.makedirs(output_dir, exist_ok=True)

training_args = TrainingArguments(
    output_dir=output_dir,
    eval_strategy="epoch", # Evaluate on the validation split after every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=8, # Adjust based on your GPU memory
    per_device_eval_batch_size=8,
    num_train_epochs=3, # Number of epochs to train for
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch", # Save model every epoch (matches eval_strategy, as load_best_model_at_end needs)
    load_best_model_at_end=True, # Load the best model at the end of training
    metric_for_best_model="eval_loss", # Metric to use for determining the best model
    push_to_hub=False, # Set to True if you want to push to Hugging Face Hub (requires login)
    do_train=True, # Explicitly enable training
    do_eval=True,  # Explicitly enable evaluation
)

# --- 5. Initialize Trainer ---
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    # `processing_class` replaces the deprecated `tokenizer` argument; it is what
    # lets the Trainer save the tokenizer alongside the model.
    processing_class=tokenizer,
)

# --- 6. Train the model ---
print("Starting model training...")
trainer.train()
print("Training complete!")

# Save the fine-tuned model and tokenizer
model_save_path = "./fine_tuned_bert_qa"
os.makedirs(model_save_path, exist_ok=True)

trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)
print(f"Fine-tuned model and tokenizer saved to {model_save_path}")

print("\nYou can now load this model for inference:")
print(f"""
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
tokenizer = AutoTokenizer.from_pretrained("{model_save_path}")
model = AutoModelForQuestionAnswering.from_pretrained("{model_save_path}")
""")


Successfully loaded dataset from fake_qa_dataset.json. Found 1000 samples.
Dataset split into: DatasetDict({
    train: Dataset({
        features: ['id', 'context', 'question', 'answer'],
        num_rows: 800
    })
    validation: Dataset({
        features: ['id', 'context', 'question', 'answer'],
        num_rows: 200
    })
})


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded tokenizer and model: bert-base-uncased
Preprocessing training examples...


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Preprocessing complete.
Starting model training...


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.0023,0.000965
2,0.0014,0.000563
3,0.0011,0.000486


Training complete!
Fine-tuned model and tokenizer saved to ./fine_tuned_bert_qa

You can now load this model for inference:

from transformers import AutoTokenizer, AutoModelForQuestionAnswering
tokenizer = AutoTokenizer.from_pretrained("./fine_tuned_bert_qa")
model = AutoModelForQuestionAnswering.from_pretrained("./fine_tuned_bert_qa")



In [34]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
import torch
import os

# Define the path where your fine-tuned model and tokenizer are saved
model_path = "./fine_tuned_bert_qa"

# --- 1. Load the fine-tuned model and tokenizer ---
print(f"Loading fine-tuned model and tokenizer from: {model_path}")
if not os.path.exists(model_path):
    print(f"Error: Model directory not found at {model_path}. "
          "Please ensure the fine-tuning script completed successfully "
          "and saved the model to this location.")
    # Raise rather than call exit(): exit() is the interactive helper and, inside
    # a notebook kernel, does not stop the remaining statements of this cell.
    # A non-zero code also marks the run as failed when executed as a script.
    raise SystemExit(1)

try:
    # We use Auto classes to ensure compatibility with the saved model
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForQuestionAnswering.from_pretrained(model_path)
    print("Model and tokenizer loaded successfully!")
except Exception as e:
    print(f"Failed to load model or tokenizer. Error: {e}")
    print("This might happen if the model was not saved correctly or if there's a version mismatch.")
    # Chain the original error so its traceback stays available for debugging
    raise SystemExit(1) from e

# --- 2. Create a Question-Answering pipeline ---
# The pipeline handles tokenization, model inference, and post-processing (extracting the answer span)
# You can specify the device if you have a GPU (e.g., device=0 for the first GPU)
qa_pipeline = pipeline(
    "question-answering",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1 # Use GPU if available, else CPU
)

print(f"QA pipeline initialized. Using device: {'GPU' if torch.cuda.is_available() else 'CPU'}")

# --- 3. Define a function for prediction ---
def get_answer(question: str, context: str):
    """
    Asks the module-level `qa_pipeline` for the answer to `question` within `context`.

    Returns the 'answer' entry of the pipeline result. If anything goes wrong,
    the error is printed and the string "Could not find an answer." is returned.
    """
    try:
        # The pipeline returns a dictionary with 'score', 'start', 'end', and 'answer'
        prediction = qa_pipeline(question=question, context=context)
        answer_text = prediction['answer']
    except Exception as e:
        print(f"An error occurred during prediction: {e}")
        return "Could not find an answer."
    return answer_text

# --- 4. Test with example questions and contexts ---

print("\n--- Running Predictions ---")

# Example 1: Based on one of your generated contexts
context1 = "The capital of France is Paris. Paris is also known as the City of Lights. It is famous for the Eiffel Tower."
question1 = "What is the capital of France?"

# Example 2: Another context from your generated data
context2 = "Mount Everest is the highest mountain in the world, located in the Himalayas. Its peak is 8,848.86 meters above sea level."
question2 = "Where is Mount Everest located?"

# Example 3: A slightly different question to test generalization
context3 = "Photosynthesis is the process used by plants, algae and cyanobacteria to convert light energy into chemical energy."
question3 = "What do plants do with light energy?"

# Example 4: A question for which the answer might not be directly in the context
# (the model might return a less specific answer or an empty string depending on training)
context4 = "Artificial intelligence (AI) is intelligence demonstrated by machines, unlike the natural intelligence displayed by humans and animals."
question4 = "What is the future of AI?" # This specific answer is not directly in the context

# Run every example through the same report format, in order
demo_cases = (
    (context1, question1),
    (context2, question2),
    (context3, question3),
    (context4, question4),
)
for demo_context, demo_question in demo_cases:
    print(f"\nContext: {demo_context}")
    print(f"Question: {demo_question}")
    print(f"Predicted Answer: {get_answer(demo_question, demo_context)}")
    print("-" * 30)

print("\nPrediction process complete.")


Loading fine-tuned model and tokenizer from: ./fine_tuned_bert_qa


Device set to use cpu


Model and tokenizer loaded successfully!
QA pipeline initialized. Using device: CPU

--- Running Predictions ---

Context: The capital of France is Paris. Paris is also known as the City of Lights. It is famous for the Eiffel Tower.
Question: What is the capital of France?
Predicted Answer: Paris
------------------------------

Context: Mount Everest is the highest mountain in the world, located in the Himalayas. Its peak is 8,848.86 meters above sea level.
Question: Where is Mount Everest located?
Predicted Answer: Mount Everest
------------------------------

Context: Photosynthesis is the process used by plants, algae and cyanobacteria to convert light energy into chemical energy.
Question: What do plants do with light energy?
Predicted Answer: Photosynthesis
------------------------------

Context: Artificial intelligence (AI) is intelligence demonstrated by machines, unlike the natural intelligence displayed by humans and animals.
Question: What is the future of AI?
Predicted Answ

In [40]:
# Ad-hoc check on a context that is not one of the generated training contexts.
# Here the expected answer ("Sam Altman") is stated verbatim in the context.
context4 = "Sam Altman is the CEO of OpenAI and he is also co-founder of OpenAI"
question4 = "who is the ceo of OpenAI?"
print(f"\nContext: {context4}", f"Question: {question4}", sep="\n")
print(f"Predicted Answer: {get_answer(question4, context4)}")
print("-" * 30)


Context: Sam Altman is the CEO of OpenAI and he is also co-founder of OpenAI
Question: who is the ceo of OpenAI?
Predicted Answer: OpenAI
------------------------------
