In [3]:
import pickle
from tqdm.notebook import tqdm
from sagerag import ResearchAssistant
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

def generate_question_from_chunk(assistant, chunk_text):
    """Uses an LLM to generate a question that the chunk_text can answer."""
    
    question_generation_prompt = PromptTemplate.from_template(
        "You are an expert researcher. Given the following paragraph from a scientific paper, "
        "generate one clear, specific question that this text could perfectly answer. "
        "Do not ask a question that is too broad. Focus only on the information present in the text. "
        "Provide only the question and nothing else.\n\n"
        "Text: \"{context}\"\n\n"
        "Question:"
    )
    
    chain = question_generation_prompt | assistant.llm | StrOutputParser()
    question = chain.invoke({"context": chunk_text}).strip()
    return question

# --- Example Usage in your notebook ---

# Load the chunks you created earlier
with open("../Database/processed_chunks.pkl", "rb") as f:
    all_chunks = pickle.load(f)

# Create a sample of 200 chunks
import random
chunk_sample = random.sample(all_chunks, 200)

golden_dataset = []
for chunk in tqdm(chunk_sample, desc="Generating Golden Questions"):
    question = generate_question_from_chunk(assistant, chunk.page_content)
    golden_dataset.append({
        "generated_question": question,
        "source_chunk_text": chunk.page_content,
        "source_metadata": chunk.metadata
    })

# Save your new evaluation dataset
df_golden = pd.DataFrame(golden_dataset)
df_golden.to_csv("golden_evaluation_questions.csv", index=False)

Generating Golden Questions:   0%|          | 0/200 [00:00<?, ?it/s]

NameError: name 'assistant' is not defined