In [1]:
!pip install PyPDF2
!pip install faiss-cpu
!pip install langchain_google_genai
!pip install scikit-learn

In [2]:
import os
import time
import json
import random
import PyPDF2
import requests
import faiss
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from langchain_google_genai import ChatGoogleGenerativeAI

In [3]:
# Step 1: Extract Text from PDF
def extract_pdf_text(pdf_path):
    """Return the text of every page in the PDF at ``pdf_path``.

    Pages whose ``extract_text()`` result is empty or ``None`` are skipped;
    every other page contributes its text followed by a newline.

    Any exception (missing file, unreadable PDF, ...) is reported on stdout
    and an empty string is returned instead of raising.
    """
    try:
        page_texts = []
        # Pages are read while the file handle is still open.
        with open(pdf_path, "rb") as pdf_file:
            for page in PyPDF2.PdfReader(pdf_file).pages:
                content = page.extract_text()
                if not content:
                    continue
                page_texts.append(content + "\n")
        return "".join(page_texts)
    except Exception as e:
        print(f"Error extracting PDF: {str(e)}")
        return ""

In [4]:
# Step 2: Chunk the Extracted Text
def chunk_text(text, chunk_size=200, overlap=50):
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Args:
        text: The text to split. Words are obtained with ``str.split()``.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared by consecutive chunks. Must satisfy
            ``0 <= overlap < chunk_size`` so the window always moves forward.

    Returns:
        A list of chunk strings (words joined by single spaces). Empty or
        whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        # A non-positive step would either make range() fail or silently
        # produce no chunks at all.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []

    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once a window reaches the last word, any further window would only
        # repeat words already contained in this chunk.
        if start + chunk_size >= len(words):
            break

    return chunks

In [5]:
# Step 3: Define Improved Retrieval Function with TF-IDF and FAISS
class FAISSRetriever:
    """Retrieve the text chunks most similar to a query.

    Chunks are embedded as TF-IDF vectors (English stop words removed) and
    stored in a flat FAISS L2 index. Vectors are normalized to unit length
    before indexing and querying; for unit vectors, ranking by L2 distance
    gives the same order as ranking by cosine similarity.

    Attributes:
        chunks: The list of chunk strings passed to the constructor.
        vectorizer: The ``TfidfVectorizer`` fitted on ``chunks``.
        vectors: Dense float32 TF-IDF matrix, or ``None`` if there are no chunks.
        dimension: Vocabulary size of the TF-IDF matrix, or ``None`` if there
            are no chunks.
        index: The FAISS index, or ``None`` if there are no chunks.
    """

    def __init__(self, chunks):
        """Fit the vectorizer on ``chunks`` and build the FAISS index.

        With an empty ``chunks`` list nothing is fitted and ``index``,
        ``vectors`` and ``dimension`` stay ``None``.
        """
        self.chunks = chunks
        self.vectorizer = TfidfVectorizer(stop_words='english')
        self.index = None
        self.vectors = None
        self.dimension = None

        if not chunks:
            return

        # Create the dense float32 TF-IDF matrix that FAISS consumes
        self.vectors = self.vectorizer.fit_transform(chunks).toarray().astype('float32')

        # Initialize FAISS index - using L2 distance
        self.dimension = self.vectors.shape[1]
        self.index = faiss.IndexFlatL2(self.dimension)

        # Normalize in place so that L2 ranking matches cosine ranking
        faiss.normalize_L2(self.vectors)
        self.index.add(self.vectors)

    def retrieve(self, query, top_k=3):
        """Return up to ``top_k`` chunks closest to ``query``, best first.

        Returns an empty list when there are no chunks, no index, or
        ``top_k`` is not positive. If the FAISS search raises, the error is
        printed and ``tfidf_fallback`` is used instead.
        """
        if not self.chunks or self.index is None or top_k <= 0:
            return []

        k = min(top_k, len(self.chunks))

        try:
            # Vectorize the query
            query_vector = self.vectorizer.transform([query]).toarray().astype('float32')
            faiss.normalize_L2(query_vector)  # Normalize for cosine similarity

            # Search using FAISS
            _, indices = self.index.search(query_vector, k)

            # FAISS reports missing neighbours as -1; skip them so they are
            # not misread as "last chunk" by Python's negative indexing.
            return [self.chunks[i] for i in indices[0] if i >= 0]
        except Exception as e:
            print(f"Error in FAISS retrieval: {str(e)}")
            # Fallback to TF-IDF if FAISS fails
            return self.tfidf_fallback(query, top_k)

    def tfidf_fallback(self, query, top_k=3):
        """Rank chunks by cosine similarity without FAISS.

        Returns up to ``top_k`` chunks, best first. If the similarity
        computation itself fails, the error is printed and a random sample
        of chunks is returned as a last resort.
        """
        if not self.chunks or top_k <= 0:
            # Without this guard, a top_k of 0 would slice with [-0:] and
            # return every chunk.
            return []

        k = min(top_k, len(self.chunks))

        try:
            query_vector = self.vectorizer.transform([query])
            similarities = cosine_similarity(query_vector, self.vectors)[0]
            top_indices = similarities.argsort()[-k:][::-1]
            return [self.chunks[i] for i in top_indices]
        except Exception as e:
            print(f"Error in TF-IDF fallback: {str(e)}")
            # Final fallback to random selection
            return random.sample(self.chunks, k)

In [6]:
# Step 4: Define Bloom's Taxonomy Levels with improved descriptions
# Maps each Bloom's Taxonomy level name to an instruction describing the kind
# of question expected at that level. The main block uses the key as the level
# label passed to the generator and the value as the retrieval query text.
bloom_levels = {
    "Knowledge": "Create a question that tests recall of factual information, definitions, or key concepts from the text.",
    "Comprehension": "Create a question that requires summarizing or explaining the main ideas in one's own words.",
    "Application": "Create a question that asks how to apply concepts from the text to new situations or problems.",
    "Analysis": "Create a question that requires analyzing the relationships between different components or examining the structure of ideas.",
    "Synthesis": "Create a question that asks to combine ideas to create something new or propose alternative solutions.",
    "Evaluation": "Create a question that requires making judgments based on criteria, critically assessing arguments, or evaluating the validity of ideas."
}
 
# Define answer word count ranges based on marks
# Keyed by the mark value of a question (only 1, 3 and 5 are defined).
# "min"/"max" are the inclusive bounds, in words, that a model answer must fall
# within; "description" is the wording inserted into the generation prompt.
answer_word_counts = {
    1: {"min": 20, "max": 50, "description": "brief and concise"},
    3: {"min": 150, "max": 200, "description": "moderately detailed"},
    5: {"min": 300, "max": 500, "description": "comprehensive and in-depth"}
}

In [7]:
# Step 5: Implement RAG with LangChain Gemini Integration
def _parse_generated_response(generated_text):
    """Split a model reply into ``(question, answer, marking_scheme)`` strings.

    The reply is expected to contain the labels ``Question:``, ``Answer:``,
    ``Marking Scheme:`` and ``Word Count:`` in that order. Any section whose
    label is missing is returned as an empty string; text after
    ``Word Count:`` is discarded.
    """
    if "Question:" not in generated_text:
        return "", "", ""
    after_question = generated_text.split("Question:", 1)[1]

    if "Answer:" not in after_question:
        return "", "", ""
    question_part, after_answer = after_question.split("Answer:", 1)
    question_part = question_part.strip()

    if "Marking Scheme:" not in after_answer:
        return question_part, after_answer.strip(), ""
    answer_part, after_scheme = after_answer.split("Marking Scheme:", 1)

    # When "Word Count:" is absent the split leaves the text unchanged.
    marking_scheme = after_scheme.split("Word Count:", 1)[0].strip()
    return question_part, answer_part.strip(), marking_scheme


def generate_question_with_gemini(retriever, mark, bloom_level, query, retry_count=2):
    """Generate one exam question from retrieved context using Gemini.

    Args:
        retriever: Object with a ``retrieve(query, top_k=...)`` method that
            returns a list of context strings.
        mark: Mark value of the question; must be a key of
            ``answer_word_counts``.
        bloom_level: Bloom's Taxonomy level name inserted into the prompt.
        query: Text used to retrieve context chunks.
        retry_count: Number of extra attempts after the first one. It is also
            passed to the LangChain client as ``max_retries``.

    Returns:
        On success, a dict with the keys ``question``, ``answer``,
        ``marking_scheme``, ``bloom_taxonomy``, ``marks``, ``word_count``,
        ``retrieved_context`` and ``retrieved_chunks``. If every attempt
        produced an answer outside the required word range, the last attempt
        is still returned, so callers should check ``word_count``.
        On failure, a dict with a single ``error`` key.

    The API key is read from the ``GOOGLE_API_KEY`` environment variable;
    it is never embedded in the source.
    """
    relevant_chunks = retriever.retrieve(query, top_k=3)

    if not relevant_chunks:
        return {"error": "No relevant context found in the document"}

    context = "\n".join(relevant_chunks)

    # Get the word count requirements for this mark value
    word_count = answer_word_counts.get(mark)
    if word_count is None:
        return {"error": f"Unsupported mark value: {mark}"}

    # More detailed prompt with marking scheme and word count requirements
    prompt = (
        f"Context from the document:\n\n{context}\n\n"
        f"Based on the above context, create a {mark}-mark exam question at the '{bloom_level}' level of Bloom's Taxonomy.\n"
        f"For a {mark}-mark question, the student's answer should have {mark} distinct points or aspects to receive full marks.\n\n"
        f"IMPORTANT: The model answer MUST be {word_count['min']}-{word_count['max']} words ({word_count['description']}).\n\n"
        f"Format your response exactly as follows:\n"
        f"Question: [Write your question here]\n"
        f"Answer: [Provide a model answer that would receive full marks and is {word_count['min']}-{word_count['max']} words]\n"
        f"Marking Scheme: [List {mark} specific points that should be included for full marks]\n"
        f"Word Count: [Provide the exact word count of your answer]"
    )

    api_key = os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        return {"error": "GOOGLE_API_KEY environment variable is not set"}

    # Initialize LangChain's Gemini model
    try:
        from langchain_core.messages import SystemMessage, HumanMessage

        llm = ChatGoogleGenerativeAI(
            model="gemini-2.0-flash",
            google_api_key=api_key,
            temperature=0.4,
            max_output_tokens=1500,
            max_retries=retry_count,
            timeout=30
        )

        # Define system message for role context
        system_message = "You are an expert education professional creating exam questions based on provided text. Your task is to generate questions with answers that strictly adhere to specified word count requirements."

        # The messages are identical on every attempt, so build them once.
        messages = [
            SystemMessage(content=system_message),
            HumanMessage(content=prompt)
        ]
    except Exception as e:
        print(f"Failed to initialize LangChain model: {str(e)}")
        return {"error": f"Failed to initialize LangChain model: {str(e)}"}

    for attempt in range(retry_count + 1):
        try:
            print(f"Attempt {attempt+1}: Making request with LangChain to Gemini")

            # Invoke the model
            response = llm.invoke(messages)
            generated_text = response.content
            if not isinstance(generated_text, str):
                # LangChain message content may be a list of parts rather
                # than a plain string; the parser below needs a string.
                generated_text = str(generated_text)

            question_part, answer_part, marking_scheme = _parse_generated_response(generated_text)

            # Verify word count is within requirements. The count is always
            # measured here; the model's self-reported figure is not trusted.
            actual_word_count = len(answer_part.split())
            within_limits = word_count["min"] <= actual_word_count <= word_count["max"]

            # If not within limits and we have retries left, try again
            if not within_limits and attempt < retry_count:
                print(f"Answer word count ({actual_word_count}) outside required range ({word_count['min']}-{word_count['max']}). Retrying...")
                time.sleep(1)
                continue

            return {
                "question": question_part,
                "answer": answer_part,
                "marking_scheme": marking_scheme,
                "bloom_taxonomy": bloom_level,
                "marks": mark,
                "word_count": actual_word_count,
                "retrieved_context": context,  # Include the retrieved context
                "retrieved_chunks": relevant_chunks  # Also include individual chunks for more detailed analysis
            }

        except Exception as e:
            print(f"Error in LangChain Gemini request (attempt {attempt+1}/{retry_count+1}): {str(e)}")
            if attempt < retry_count:
                wait_time = min(2 ** (attempt + 1), 10)  # Exponential backoff: 2s, 4s, 8s, capped at 10s
                print(f"Waiting {wait_time} seconds before retrying...")
                time.sleep(wait_time)
                continue
            return {"error": f"LangChain request failed: {str(e)}"}

    # Only reached when retry_count is negative and the loop never ran.
    return {"error": "Failed to generate question after multiple attempts"}


In [8]:
# # At the beginning of your main block, add this function to validate the API key
# def test_api_connection(api_key):
#     """Test the API connection with a simple request using LangChain"""
#     try:
#         print("Testing API connection...")

#         # Initialize LangChain model with minimal settings for a quick test
#         llm = ChatGoogleGenerativeAI(
#             model="gemini-2.0-flash",
#             google_api_key=api_key,
#             temperature=0.1,
#             max_output_tokens=50,
#             timeout=10
#         )

#         # Simple test message
#         from langchain_core.messages import HumanMessage
#         test_message = HumanMessage(content="Hello, please respond with 'API connection successful' if you receive this message.")

#         # Make a simple request
#         response = llm.invoke([test_message])

#         if response and response.content:
#             print("API connection test successful!")
#             return True
#         else:
#             print("API test failed: No valid response content")
#             return False

#     except Exception as e:
#         print(f"API test failed with exception: {str(e)}")
#         return False

In [9]:
# Function to format output for each question - now including retrieved context
def format_question_output(question_data):
    """Map an internal question record onto the output schema.

    The keys ``question``, ``answer``, ``bloom_taxonomy``, ``marks`` and
    ``retrieved_context`` are required (a missing one raises ``KeyError``);
    ``retrieved_chunks`` is optional and defaults to an empty list.
    """
    # (output label, source key) pairs, in the order they appear in the output
    required_fields = (
        ("Question", "question"),
        ("Answer", "answer"),
        ("Bloom's Taxonomy", "bloom_taxonomy"),
        ("Marks", "marks"),
        ("Retrieved Context", "retrieved_context"),
    )

    formatted = {}
    for label, source_key in required_fields:
        formatted[label] = question_data[source_key]

    # The individual chunks are the only optional part of the record
    formatted["Individual Context Chunks"] = question_data.get("retrieved_chunks", [])
    return formatted

In [10]:
# Main execution block
if __name__ == "__main__":
    # The API key must come from the environment (or an interactive prompt),
    # never from the source: notebooks get shared and committed, and a key
    # written here would leak with them.
    if not os.environ.get("GOOGLE_API_KEY"):
        from getpass import getpass

        entered_key = getpass("Enter your Google API key: ").strip()
        if not entered_key:
            print("Error: GOOGLE_API_KEY is not set")
            raise SystemExit(1)
        os.environ["GOOGLE_API_KEY"] = entered_key

    # Test API connection before proceeding
    # if not test_api_connection(os.environ["GOOGLE_API_KEY"]):
    #     print("API connection test failed. Please check your API key and try again.")
    #     exit(1)

    # Define your PDF file path here - use an environment variable or config file in production
    PDF_PATH = "hess205.pdf"

    # SystemExit is raised directly instead of calling exit(): the exit()
    # helper is meant for interactive sessions, not for program flow.
    if not os.path.exists(PDF_PATH):
        print(f"Error: PDF file not found at {PDF_PATH}")
        raise SystemExit(1)

    # Extract text from PDF
    print(f"Extracting text from {PDF_PATH}...")
    pdf_text = extract_pdf_text(PDF_PATH)

    if not pdf_text:
        print("Error: Could not extract text from PDF")
        raise SystemExit(1)

    # Create chunks
    print("Chunking text...")
    chunks = chunk_text(pdf_text, chunk_size=300, overlap=50)
    print(f"Created {len(chunks)} chunks")

    # Initialize FAISS retriever
    print("Initializing FAISS retriever...")
    retriever = FAISSRetriever(chunks)

    # Generate specific number of questions for each mark value
    questions = []
    marks_distribution = {1: 5, 3: 5, 5: 5}  # mark value -> number of questions to generate
    taxonomy_levels = list(bloom_levels.keys())

    print("Generating questions...")

    # Generate questions according to the required distribution
    for mark, count in marks_distribution.items():
        generated_count = 0
        retries = 0
        max_retries = 10  # Max number of failed attempts per mark value

        # Try to generate the required number of questions for this mark value
        while generated_count < count and retries < max_retries:
            # Cycle through the taxonomy levels; a failed attempt retries the same level
            level = taxonomy_levels[generated_count % len(taxonomy_levels)]
            query = bloom_levels[level]

            print(f"Generating a {mark}-mark question at {level} level (attempt {retries+1})...")
            q = generate_question_with_gemini(retriever, mark, level, query, retry_count=3)

            if "error" in q:
                print(f"Error: {q['error']}")
                retries += 1
                time.sleep(1)  # Slight delay before retrying
            else:
                # Verify the word count is within the specified range
                word_count = q.get("word_count", 0)
                word_count_range = answer_word_counts[mark]

                if word_count_range["min"] <= word_count <= word_count_range["max"]:
                    questions.append(q)
                    print(f"Generated {mark}-mark question with {word_count} words: {q['question'][:50]}...")
                    print(f"Retrieved context length: {len(q['retrieved_context'])} characters")
                    generated_count += 1
                else:
                    print(f"Rejected question with {word_count} words (outside range {word_count_range['min']}-{word_count_range['max']})")
                    retries += 1

    # Sort questions by mark value
    questions.sort(key=lambda q: q.get("marks", 0))

    # Format questions according to the required output format
    formatted_questions = [format_question_output(q) for q in questions]

    # Save questions to JSON file in the required format
    output_file = "generated_questions.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(formatted_questions, f, indent=4, ensure_ascii=False)

    # Print example of formatted output
    if formatted_questions:
        print("\nExample of formatted question output (truncated for readability):")
        # Shallow copy: only top-level keys are reassigned below, so the
        # stored question and its chunk list are left untouched.
        example = dict(formatted_questions[0])

        # Truncate context for display purposes only
        if "Retrieved Context" in example:
            full_context = example["Retrieved Context"]
            example["Retrieved Context"] = (
                full_context[:150] + "..." if len(full_context) > 150 else full_context
            )

        if "Individual Context Chunks" in example and example["Individual Context Chunks"]:
            # Build a new list rather than editing the shared one in place.
            example["Individual Context Chunks"] = [
                chunk[:100] + "..." if len(chunk) > 100 else chunk
                for chunk in example["Individual Context Chunks"]
            ]

        print(json.dumps(example, indent=2))

    # Print summary
    mark_counts = {}
    for q in questions:
        mark = q.get("marks", 0)
        mark_counts[mark] = mark_counts.get(mark, 0) + 1

    print(f"\nSuccessfully generated {len(questions)} questions with the following distribution:")
    for mark, count in mark_counts.items():
        print(f"  {mark}-mark questions: {count}")
    print(f"Questions saved to {output_file}")

Extracting text from hess205.pdf...
Chunking text...
Created 20 chunks
Initializing FAISS retriever...
Generating questions...
Generating a 1-mark question at Knowledge level (attempt 1)...
Attempt 1: Making request with LangChain to Gemini
Generated 1-mark question with 34 words: Who was proclaimed the leader of the sepoys by the...
Retrieved context length: 5158 characters
Generating a 1-mark question at Comprehension level (attempt 1)...
Attempt 1: Making request with LangChain to Gemini
Generated 1-mark question with 30 words: What action by the British ignited the revolt in M...
Retrieved context length: 5344 characters
Generating a 1-mark question at Application level (attempt 1)...
Attempt 1: Making request with LangChain to Gemini
Generated 1-mark question with 31 words: Based on the text, how could the British have pote...
Retrieved context length: 5152 characters
Generating a 1-mark question at Analysis level (attempt 1)...
Attempt 1: Making request with LangChain to Gemini
G

Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..


Error in LangChain Gemini request (attempt 1/4): 429 Resource has been exhausted (e.g. check quota).
Waiting 2 seconds before retrying...
Attempt 2: Making request with LangChain to Gemini


Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..


Answer word count (520) outside required range (300-500). Retrying...
Attempt 3: Making request with LangChain to Gemini
Generated 5-mark question with 483 words: The events of 1857, triggered by the controversial...
Retrieved context length: 5152 characters
Generating a 5-mark question at Analysis level (attempt 1)...
Attempt 1: Making request with LangChain to Gemini
Answer word count (529) outside required range (300-500). Retrying...
Attempt 2: Making request with LangChain to Gemini
Generated 5-mark question with 435 words: Analyze the multiple factors that contributed to t...
Retrieved context length: 5259 characters
Generating a 5-mark question at Synthesis level (attempt 1)...
Attempt 1: Making request with LangChain to Gemini
Generated 5-mark question with 494 words: Synthesize the various factors that contributed to...
Retrieved context length: 5242 characters

Example of formatted question output (truncated for readability):
{
  "Question": "Who was proclaimed the leader of 