In [None]:
from google import genai
from pydantic import BaseModel
from dotenv import load_dotenv
import os
from pathlib import Path
import json
import time

# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()

# API key later handed to genai.Client in process_markdown_files.
# Evaluates to None when the GEMINI_API variable is not set.
GOOGLE_API_KEY = os.environ.get("GEMINI_API")

# One generated question together with its answer; QAResponse.qa_pairs is a list of these.
# NOTE: documented with comments rather than a class docstring on purpose. This model is
# part of the `response_schema` sent with each generate_content request, and pydantic
# copies a model's docstring into its JSON schema description, so a docstring here
# would change what is sent to the API.
class QAPair(BaseModel):
    question: str  # Question text produced by the model for the source document.
    answer: str  # Answer text paired with `question`.

# Top-level shape of the structured model reply: process_markdown_files passes this
# class as `response_schema` and reads the result back from `response.parsed`.
# (Comments instead of a docstring so the schema generated from this model is unchanged.)
class QAResponse(BaseModel):
    qa_pairs: list[QAPair]  # Generated pairs; the prompt asks for 3-5 per document.

def _is_rate_limit_error(error):
    """Return True when the error message looks like an API rate-limit failure."""
    message = str(error)
    return "429" in message or "RESOURCE_EXHAUSTED" in message


def _build_qa_prompt(content):
    """Return the Q&A generation prompt with the document text embedded."""
    return f"""Given this text: {content}

Generate 3-5 question-answer pairs that capture the key information. Create a mix of:
* Factual questions ("What is X?")
* Analytical prompts ("Explain why X happens")
* Application-focused ("How would you use X?")
* Comparative ("How does X differ from Y?")

Make sure the questions are specific to the content and the answers are comprehensive but concise."""


def _generate_qa_records(client, model, source_name, content):
    """Request Q&A pairs for one document and return them as JSON-ready dicts."""
    response = client.models.generate_content(
        model=model,
        contents=_build_qa_prompt(content),
        config={
            "response_mime_type": "application/json",
            "response_schema": QAResponse,
        },
    )

    qa_response = response.parsed
    # `parsed` may be None when the reply could not be fitted to the schema;
    # fail with a readable message instead of an AttributeError further down.
    if qa_response is None:
        raise ValueError("model response could not be parsed into Q&A pairs")

    return [
        {
            "source_file": source_name,
            "question": pair.question,
            "answer": pair.answer,
        }
        for pair in qa_response.qa_pairs
    ]


def _write_json_atomically(output_path, payload):
    """Write payload as JSON to output_path without ever leaving a partial file there."""
    # Existing outputs are used as the "already processed" marker, so a truncated
    # file at the final path would cause that document to be skipped on every
    # later run. Write next to the target and rename once the dump has finished.
    temp_path = output_path.with_name(output_path.name + ".tmp")
    try:
        with open(temp_path, 'w', encoding='utf-8') as f:
            json.dump(payload, f, indent=2, ensure_ascii=False)
        temp_path.replace(output_path)
    except BaseException:
        # Includes KeyboardInterrupt: remove the leftover temp file, then propagate.
        temp_path.unlink(missing_ok=True)
        raise


def process_markdown_files(
    directory_path,
    output_dir,
    *,
    model="gemini-2.0-flash-exp",
    request_delay=7,
    rate_limit_wait=65,
    max_attempts=2,
):
    """Generate Q&A pairs for every ``*.md`` file in a directory.

    For each markdown file directly inside ``directory_path`` the file content
    is sent to the model and the returned pairs are saved to
    ``<output_dir>/<stem>_qa_pairs.json`` as a list of
    ``{"source_file", "question", "answer"}`` objects. Files whose output
    already exists are skipped, which makes the function safe to re-run after
    an interruption. Progress and a final summary are printed to stdout.

    Args:
        directory_path: Directory to scan for ``*.md`` files (non-recursive).
        output_dir: Directory that receives the JSON files; created if missing.
        model: Model name passed to ``generate_content``.
        request_delay: Seconds to sleep after each successful request to stay
            under the request-per-minute limit.
        rate_limit_wait: Seconds to sleep after a rate-limit error.
        max_attempts: Maximum requests per file when rate-limit errors occur
            (values below 1 are treated as 1). Other errors are not retried.

    Returns:
        None. Errors for individual files are printed and counted, not raised.
    """
    directory = Path(directory_path)
    output_directory = Path(output_dir)

    if not directory.exists():
        print(f"Error: Directory '{directory_path}' does not exist")
        return

    # Create output directory if it doesn't exist
    output_directory.mkdir(parents=True, exist_ok=True)

    # Sorted so that runs process files in a reproducible order.
    md_files = sorted(directory.glob("*.md"))
    if not md_files:
        print(f"No .md files found in {directory_path}")
        return

    print(f"Found {len(md_files)} markdown files")

    # Build the client only once there is real work to do.
    client = genai.Client(api_key=GOOGLE_API_KEY)

    attempts_allowed = max(1, max_attempts)
    processed_count = 0
    skipped_count = 0
    failed_count = 0

    for md_file in md_files:
        output_path = output_directory / f"{md_file.stem}_qa_pairs.json"
        if output_path.exists():
            print(f"Skipping {md_file.name} - output file already exists")
            skipped_count += 1
            continue

        print(f"Processing {md_file.name}...")

        try:
            content = md_file.read_text(encoding='utf-8')
        except (OSError, UnicodeError) as e:
            print(f"Error processing {md_file.name}: {e}")
            failed_count += 1
            continue

        if not content.strip():
            print(f"  Skipping empty file: {md_file.name}")
            continue

        for attempt in range(1, attempts_allowed + 1):
            if attempt > 1:
                print(f"  Retrying {md_file.name} (attempt {attempt} of {attempts_allowed})...")
            try:
                records = _generate_qa_records(client, model, md_file.name, content)
                print(f"  Generated {len(records)} Q&A pairs")
                _write_json_atomically(output_path, records)
                print(f"  Saved to {output_path}")
            except Exception as e:
                # Per-file boundary of a batch job: report and move on so one
                # bad document does not abort the whole run.
                print(f"Error processing {md_file.name}: {e}")
                if not _is_rate_limit_error(e):
                    failed_count += 1
                    break
                print(f"Rate limit hit, waiting {rate_limit_wait} seconds...")
                time.sleep(rate_limit_wait)
                if attempt == attempts_allowed:
                    failed_count += 1
            else:
                processed_count += 1
                # Pace requests to respect the per-minute rate limit.
                print(f"  Waiting {request_delay} seconds before next request...")
                time.sleep(request_delay)
                break

    print("\nProcessing complete!")
    print(f"Processed: {processed_count} files")
    print(f"Skipped (already exists): {skipped_count} files")
    print(f"Failed: {failed_count} files")
    print(f"Total files: {len(md_files)}")

# Directory scanned for *.md inputs, and the directory that receives the
# generated *_qa_pairs.json files.
dir_path = "data/distil/Legends"
output_dir = "data/QA Pairs/Legends"

# Kick off the batch run for the paths configured above.
process_markdown_files(directory_path=dir_path, output_dir=output_dir)