In [None]:
# from google import genai
# from pydantic import BaseModel
# from dotenv import load_dotenv
# import os
# load_dotenv()

# GOOGLE_API_KEY = os.getenv("GEMINI_API")

# class Recipe(BaseModel):
#     recipe_name: str
#     ingredients: list[str]

# client = genai.Client(api_key=GOOGLE_API_KEY)
# response = client.models.generate_content(
#     model="gemini-2.0-flash",
#     contents="List a few popular cookie recipes, and include the amounts of ingredients.",
#     config={
#         "response_mime_type": "application/json",
#         "response_schema": list[Recipe],
#     },
# )
# # Use the response as a JSON string.
# print(response.text)

# # Use instantiated objects.
# my_recipes: list[Recipe] = response.parsed

In [None]:
from google import genai
from pydantic import BaseModel
from dotenv import load_dotenv
import os
from pathlib import Path
import json

# Load variables defined in a .env file (via python-dotenv) into the process environment.
load_dotenv()

# Gemini API key, read from the GEMINI_API environment variable; None when it is unset.
GOOGLE_API_KEY = os.environ.get("GEMINI_API")

# One generated question/answer item; the element type of QAResponse.qa_pairs.
# NOTE(review): documented with `#` comments on purpose -- pydantic is understood to copy a
# model's docstring into the generated JSON schema as `description`, which would change the
# schema handed to the API. Confirm before converting this to a docstring.
class QAPair(BaseModel):
    # Text of the question.
    question: str
    # Text of the answer to that question.
    answer: str

# Top-level response shape: passed as `response_schema` to generate_content in
# process_markdown_files, which then reads `.qa_pairs` from the parsed response.
class QAResponse(BaseModel):
    # The question/answer items produced for one input text.
    qa_pairs: list[QAPair]

def process_markdown_files(directory_path, output_file="qa_pairs.json",
                           model="gemini-2.0-flash-exp", max_files=None):
    """Generate question/answer pairs for the markdown files in a directory.

    Every ``*.md`` file directly inside ``directory_path`` is read, sent to the
    Gemini model together with a fixed prompt, and the structured response is
    flattened into ``{"source_file", "question", "answer"}`` records. All
    records are then written as one JSON list to ``directory_path/output_file``.
    Progress and errors are reported with ``print``.

    Args:
        directory_path: Directory to scan for ``*.md`` files (non-recursive).
        output_file: Name of the JSON file created inside ``directory_path``.
        model: Model identifier passed to ``generate_content``.
        max_files: Optional cap on the number of files that are successfully
            processed (empty or failing files do not count). ``None`` means
            process every file; use a small number for a cheap trial run.

    Returns:
        None. If the directory is missing or holds no ``*.md`` files, a message
        is printed and nothing is written.
    """
    directory = Path(directory_path)
    if not directory.exists():
        print(f"Error: Directory '{directory_path}' does not exist")
        return

    # glob() yields files in arbitrary order; sort so runs are reproducible.
    md_files = sorted(directory.glob("*.md"))
    if not md_files:
        print(f"No .md files found in {directory_path}")
        return

    # Create the client only once there is real work to do, so the early
    # exits above never depend on the SDK or on the API key being present.
    client = genai.Client(api_key=GOOGLE_API_KEY)

    all_qa_pairs = []
    files_done = 0

    print(f"Found {len(md_files)} markdown files")

    for md_file in md_files:
        # Explicit, opt-in limit (replaces an unconditional `break` that
        # silently stopped after the first successfully processed file).
        if max_files is not None and files_done >= max_files:
            break

        print(f"Processing {md_file.name}...")

        try:
            content = md_file.read_text(encoding='utf-8')

            if not content.strip():
                print(f"  Skipping empty file: {md_file.name}")
                continue

            # Create the prompt
            prompt = f"""Given this text: {content}

Generate 3-5 question-answer pairs that capture the key information. Create a mix of:
* Factual questions ("What is X?")
* Analytical prompts ("Explain why X happens")
* Application-focused ("How would you use X?")
* Comparative ("How does X differ from Y?")

Make sure the questions are specific to the content and the answers are comprehensive but concise."""

            # Ask for JSON constrained to the QAResponse schema.
            response = client.models.generate_content(
                model=model,
                contents=prompt,
                config={
                    "response_mime_type": "application/json",
                    "response_schema": QAResponse,
                },
            )

            # `parsed` is None when the SDK could not build a QAResponse from
            # the reply; report that clearly instead of an AttributeError.
            qa_response = response.parsed
            if qa_response is None:
                print(f"  No structured Q&A returned for {md_file.name}; skipping")
                continue

            # Tag every pair with the file it came from.
            all_qa_pairs.extend(
                {
                    "source_file": md_file.name,
                    "question": qa_pair.question,
                    "answer": qa_pair.answer,
                }
                for qa_pair in qa_response.qa_pairs
            )

            print(f"  Generated {len(qa_response.qa_pairs)} Q&A pairs")

        except Exception as e:
            # Deliberately broad: one bad file or failed API call must not
            # abort the whole batch.
            print(f"Error processing {md_file.name}: {e}")
            continue

        files_done += 1

    # Save all Q&A pairs to JSON (an empty list is still written if nothing
    # was generated, matching the previous behavior).
    output_path = directory / output_file
    try:
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(all_qa_pairs, f, indent=2, ensure_ascii=False)

        print(f"\nSuccessfully generated {len(all_qa_pairs)} total Q&A pairs")
        print(f"Results saved to {output_path}")

    except Exception as e:
        print(f"Error saving results: {e}")


# Folder containing the markdown sources; the JSON output is written into this same folder.
dir_path = "data/distil/Legends"

process_markdown_files(directory_path=dir_path)

Error: Directory 'data/distil/Lengends' does not exist
