In [9]:
pip install fuzzywuzzy[speedup]  # or rapidfuzz




In [18]:
from fuzzywuzzy import process
import json
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the SQuAD dataset
with open('/content/dev-v1.1.json', 'r') as f:
    squad_data = json.load(f)

# Preprocess the dataset to map questions to their corresponding context
question_context_map = {}

for article in squad_data['data']:
    for paragraph in article['paragraphs']:
        context = paragraph['context']
        for qa in paragraph['qas']:
            question = qa['question']
            question_context_map[question] = context

# Initialize tokenizer and model
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def get_closest_question(input_question):
    """
    Use fuzzy matching to find the closest question in the SQuAD dataset
    to the one provided by the user.
    """
    questions = list(question_context_map.keys())
    closest_question, _ = process.extractOne(input_question, questions)
    return closest_question

def answer_question_auto(question):
    """
    Takes a question as input, retrieves the closest matching context
    from the dataset, and uses the fine-tuned T5 model to generate an answer.
    """
    # Get the closest question based on the input question
    closest_question = get_closest_question(question)
    context = question_context_map[closest_question]

    print(f"Closest Question Found: {closest_question}")
    print(f"Context: {context}")

    # Combine question and context for the model input
    input_text = f"question: {question} context: {context}"

    # Encode input text using the tokenizer
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

    # Generate output (answer)
    with torch.no_grad():
        outputs = model.generate(input_ids, max_length=50, num_beams=5, early_stopping=True)

    # Decode the generated output to readable text
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Now you can input any question
user_question = input("Please ask your question: ")
predicted_answer = answer_question_auto(user_question)
print(f"Predicted Answer: {predicted_answer}")


Please ask your question: what form of poetry was developed in yuan
Closest Question Found: What form of poetry was developed in the Yuan?
Context: In the China of the Yuan, or Mongol era, various important developments in the arts occurred or continued in their development, including the areas of painting, mathematics, calligraphy, poetry, and theater, with many great artists and writers being famous today. Due to the coming together of painting, poetry, and calligraphy at this time many of the artists practicing these different pursuits were the same individuals, though perhaps more famed for one area of their achievements than others. Often in terms of the further development of landscape painting as well as the classical joining together of the arts of painting, poetry, and calligraphy, the Song dynasty and the Yuan dynasty are linked together. In the area of Chinese painting during the Yuan dynasty there were many famous painters. In the area of calligraphy many of the great calli

In [25]:
import json
import random
import numpy as np
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from fuzzywuzzy import process

# Load the SQuAD dataset
with open('/content/dev-v1.1.json', 'r') as f:
    squad_data = json.load(f)

# Preprocess the dataset to map questions to their corresponding context
question_context_map = {}
question_answer_map = {}

for article in squad_data['data']:
    for paragraph in article['paragraphs']:
        context = paragraph['context']
        for qa in paragraph['qas']:
            question = qa['question']
            answer = qa['answers'][0]['text'] if qa['answers'] else ''
            question_context_map[question] = context
            question_answer_map[question] = answer

# Initialize tokenizer and model
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def get_closest_question(input_question):
    """
    Use fuzzy matching to find the closest question in the SQuAD dataset
    to the one provided by the user.
    """
    questions = list(question_context_map.keys())
    closest_question, _ = process.extractOne(input_question, questions)
    return closest_question

def answer_question_auto(question):
    """
    Takes a question as input, retrieves the closest matching context
    from the dataset, and uses the fine-tuned T5 model to generate an answer.
    """
    # Get the closest question based on the input question
    closest_question = get_closest_question(question)
    context = question_context_map[closest_question]

    print(f"Closest Question Found: {closest_question}")
    print(f"Context: {context}")

    # Combine question and context for the model input
    input_text = f"question: {question} context: {context}"

    # Encode input text using the tokenizer
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

    # Generate output (answer)
    with torch.no_grad():
        outputs = model.generate(input_ids, max_length=50, num_beams=3, early_stopping=True)

    # Decode the generated output to readable text
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def sample_random_subset(data, num_samples=50000):
    """
    Randomly sample a subset of the dataset.
    """
    questions = list(data.keys())
    sampled_questions = random.sample(questions, num_samples)
    return {q: data[q] for q in sampled_questions}

def answer_questions_batch(questions, contexts):
    """
    Generate answers for a batch of questions and contexts.
    """
    input_texts = [f"question: {q} context: {c}" for q, c in zip(questions, contexts)]
    inputs = tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)

    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=30, num_beams=3, early_stopping=True)

    answers = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return answers

def evaluate_model_on_subset(subset_question_context_map, batch_size=8):
    """
    Evaluate the model on a subset of the dataset.
    """
    model.eval()
    predictions = []
    references = []

    questions = list(subset_question_context_map.keys())
    contexts = [subset_question_context_map[q] for q in questions]
    ground_truths = [question_answer_map.get(q, '') for q in questions]

    # Process in batches
    for start_idx in range(0, len(questions), batch_size):
        end_idx = min(start_idx + batch_size, len(questions))
        batch_questions = questions[start_idx:end_idx]
        batch_contexts = contexts[start_idx:end_idx]
        batch_ground_truths = ground_truths[start_idx:end_idx]

        batch_predictions = answer_questions_batch(batch_questions, batch_contexts)
        predictions.extend(batch_predictions)
        references.extend(batch_ground_truths)

    # Compute metrics
    em_score = np.mean([p.lower() == r.lower() for p, r in zip(predictions, references)])
    f1_scores = []
    for pred, ref in zip(predictions, references):
        pred_tokens = set(pred.lower().split())
        ref_tokens = set(ref.lower().split())
        precision = len(pred_tokens & ref_tokens) / len(pred_tokens) if pred_tokens else 0
        recall = len(pred_tokens & ref_tokens) / len(ref_tokens) if ref_tokens else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0
        f1_scores.append(f1)

    avg_f1_score = np.mean(f1_scores)

    print(f"Exact Match Score: {em_score * 100:.2f}%")
    print(f"Average F1 Score: {avg_f1_score:.2f}")

# Use the subset for evaluation
subset_question_context_map = sample_random_subset(question_context_map, num_samples=100)
evaluate_model_on_subset(subset_question_context_map, batch_size=8)


Exact Match Score: 59.00%
Average F1 Score: 0.77
