In [None]:
import os
import json
import glob
from pathlib import Path
import requests
import time
from typing import List, Dict, Any
import logging

In [None]:
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load environment variables from .env file.
# `load_dotenv` is imported here because no other cell brings it into scope;
# calling it without this import raises NameError on a fresh kernel.
# If python-dotenv is unavailable, fall back to the process environment.
try:
    from dotenv import load_dotenv
except ImportError:
    logger.warning("python-dotenv is not installed; skipping .env loading")
else:
    load_dotenv()

# Read the API key
NVIDIA_API_KEY = os.getenv("NVIDIA_API_KEY")
if not NVIDIA_API_KEY:
    # Surface the misconfiguration early instead of sending "Bearer None".
    logger.warning("NVIDIA_API_KEY is not set; requests to the NVIDIA API will be unauthenticated")
NVIDIA_API_URL = "https://api.nvcf.nvidia.com/v2/nvcf/pexec/functions/chat-completion"

In [35]:
def load_json_files(base_dir: str) -> List[Dict[str, Any]]:
    """Load all JSON files from the given directory and its subdirectories.

    Args:
        base_dir: Root directory to search. It is treated as a literal path,
            so glob metacharacters in it (``[``, ``*``, ``?``) are escaped.

    Returns:
        One dict per successfully parsed file, with keys ``'file_path'``
        (the path as matched) and ``'content'`` (the decoded JSON value).
        Entries are ordered by sorted path so repeated runs process files
        in the same order. Files that cannot be read or parsed are logged
        and skipped.
    """
    # Escape only the directory part; the "**/*.json" suffix must stay a pattern.
    pattern = os.path.join(glob.escape(base_dir), "**/*.json")

    json_files = []
    # glob returns matches in arbitrary order; sort for reproducible runs.
    for json_file in sorted(glob.glob(pattern, recursive=True)):
        try:
            # 'utf-8-sig' reads plain UTF-8 unchanged and also strips a leading
            # BOM, which json.load would otherwise reject.
            with open(json_file, 'r', encoding='utf-8-sig') as f:
                data = json.load(f)
                json_files.append({
                    'file_path': json_file,
                    'content': data
                })
            logger.info(f"Successfully loaded {json_file}")
        except Exception as e:
            # Best-effort loading: one bad file must not abort the whole run.
            logger.error(f"Error loading {json_file}: {str(e)}")
    return json_files

def _parse_qa_pairs(text: str) -> List[Dict[str, str]]:
    """Decode the model's reply into a list of question/answer dicts.

    Accepts a bare JSON array, or a JSON array surrounded by other text such
    as markdown code fences. Raises ``json.JSONDecodeError`` or ``ValueError``
    when no usable array can be recovered.
    """
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        # Fall back to the outermost [...] span when the reply wraps the JSON
        # in fences or explanatory prose.
        start, end = text.find('['), text.rfind(']')
        if start == -1 or end <= start:
            raise
        parsed = json.loads(text[start:end + 1])

    # Tolerate a single object returned instead of a one-element array.
    if isinstance(parsed, dict) and 'question' in parsed and 'answer' in parsed:
        parsed = [parsed]
    if not isinstance(parsed, list):
        raise ValueError(f"expected a JSON array, got {type(parsed).__name__}")

    pairs = [
        item for item in parsed
        if isinstance(item, dict) and 'question' in item and 'answer' in item
    ]
    if len(pairs) != len(parsed):
        logger.warning(f"Dropped {len(parsed) - len(pairs)} malformed Q&A entries")
    return pairs


def generate_qa_pairs(content: Dict[str, Any], timeout: float = 60.0) -> List[Dict[str, str]]:
    """Generate Q&A pairs using NVIDIA's LLM API.

    Args:
        content: JSON-serialisable data embedded in the prompt.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot block the run indefinitely.

    Returns:
        A list of dicts, each with ``'question'`` and ``'answer'`` keys.
        Returns an empty list when the API key is missing, the request fails,
        or the reply cannot be parsed; the error is logged, never raised.
    """
    if not NVIDIA_API_KEY:
        # Avoid sending a literal "Bearer None" header.
        logger.error("Error generating Q&A pairs: NVIDIA_API_KEY is not set")
        return []

    headers = {
        "Authorization": f"Bearer {NVIDIA_API_KEY}",
        "Content-Type": "application/json"
    }
    
    # Prepare the prompt for the LLM
    prompt = f"""Based on the following medical information, generate 3 relevant question-answer pairs. 
    Make the questions specific and the answers detailed but concise.
    
    Information:
    {json.dumps(content, indent=2)}
    
    Format the response as a JSON array of objects with 'question' and 'answer' fields."""
    
    payload = {
        "messages": [
            {
                "role": "user",
                "content": prompt
            }
        ],
        "temperature": 0.7,
        "max_tokens": 1000
    }
    
    try:
        response = requests.post(
            NVIDIA_API_URL, headers=headers, json=payload, timeout=timeout
        )
        response.raise_for_status()
        result = response.json()
        
        # Extract the generated Q&A pairs from the response
        return _parse_qa_pairs(result['choices'][0]['message']['content'])
    except Exception as e:
        logger.error(f"Error generating Q&A pairs: {str(e)}")
        return []


In [None]:
def main():
    """Generate Q&A files for every JSON document under ``raw_files``.

    Reads JSON files from ``<cwd>/raw_files`` (recursively), asks
    ``generate_qa_pairs`` for Q&A pairs for each one, and writes non-empty
    results to ``<cwd>/generated_qa``. The output path mirrors the input's
    location relative to ``raw_files`` (``sub/x.json`` -> ``sub/x_qa.json``),
    so same-named files in different subdirectories no longer overwrite each
    other. Files directly under ``raw_files`` keep the ``<stem>_qa.json`` name.
    """
    base_dir = os.path.join(os.getcwd(), "raw_files")
    output_dir = os.path.join(os.getcwd(), "generated_qa")
    os.makedirs(output_dir, exist_ok=True)
    
    # Load all JSON files
    json_files = load_json_files(base_dir)
    logger.info(f"Found {len(json_files)} JSON files")
    
    # Process each file
    for file_data in json_files:
        file_path = file_data['file_path']
        content = file_data['content']
        
        # Generate Q&A pairs
        qa_pairs = generate_qa_pairs(content)
        
        if qa_pairs:
            # Create output filename, keeping the input's relative directory
            # so that equal stems in different folders stay distinct.
            relative_path = Path(os.path.relpath(file_path, base_dir))
            output_path = Path(output_dir) / relative_path.with_name(
                f"{relative_path.stem}_qa.json"
            )
            output_path.parent.mkdir(parents=True, exist_ok=True)
            output_filename = str(output_path)
            
            # Save Q&A pairs
            with open(output_filename, 'w', encoding='utf-8') as f:
                json.dump(qa_pairs, f, indent=2)
            logger.info(f"Saved Q&A pairs to {output_filename}")
        
        # Add a small delay to avoid rate limiting
        time.sleep(1)

# Entry point: run the pipeline only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main() 

### Generation Summary 