# Config

In [None]:
# Gemini API key. `process` passes this to LLMService; an empty value makes
# LLMService fall back to GeminiConfig.get_api_key(). Do not commit a real key.
API_KEY = ""
# Model name; GeminiConfig uses it as DEFAULT_MODEL.
LLM_MODEL = "gemini-2.5-pro-preview-03-25"

In [None]:
# Sampling temperature stored under "temperature" in GeminiConfig.DEFAULT_PARAMETERS.
TEMPERATURE = 1

# Code

In [None]:
import json

In [None]:
"""
Configuration settings for Google Gemini API.

This module provides configuration settings and utilities for working with
the Google Gemini API, including environment variable loading and default
parameters for different use cases.
"""

import os
from typing import Dict, Any, Optional

class GeminiConfig:
    """Configuration settings for Google Gemini API.

    Holds the API key, the default model name, three named sets of generation
    parameters ("default", "deterministic", "creative") and the safety
    settings that are applied when a caller does not supply its own.
    """

    # API key: the notebook-level ``API_KEY`` constant takes precedence; when
    # it is empty, fall back to the GOOGLE_API_KEY environment variable so the
    # key does not have to be written into the notebook.
    API_KEY: str = API_KEY or os.environ.get("GOOGLE_API_KEY", "")

    # Default model (taken from the notebook-level ``LLM_MODEL`` constant)
    DEFAULT_MODEL: str = LLM_MODEL

    # Default parameters, used when no parameter set is requested
    DEFAULT_PARAMETERS: Dict[str, Any] = {
        "temperature": TEMPERATURE,
        "top_p": 0.95,
        "top_k": 40,
        "max_output_tokens": 60000,
    }

    # Parameters for more deterministic responses (e.g., for structured data extraction)
    DETERMINISTIC_PARAMETERS: Dict[str, Any] = {
        "temperature": 0.2,
        "top_p": 0.95,
        "top_k": 40,
        "max_output_tokens": 2048,
    }

    # Parameters for more creative responses (e.g., for CS agent)
    CREATIVE_PARAMETERS: Dict[str, Any] = {
        "temperature": 1.0,
        "top_p": 0.99,
        "top_k": 40,
        "max_output_tokens": 4096,
    }

    # Safety settings: harm category (lower-case, without the HARM_CATEGORY_
    # prefix) -> block threshold name. Every category is set to BLOCK_NONE.
    SAFETY_SETTINGS: Dict[str, str] = {
        "harassment": "BLOCK_NONE",
        "hate_speech": "BLOCK_NONE",
        "sexually_explicit": "BLOCK_NONE",
        "dangerous_content": "BLOCK_NONE",
    }

    @classmethod
    def get_api_key(cls) -> str:
        """
        Get the configured API key.

        Returns:
            str: The API key.

        Raises:
            ValueError: If no API key is configured (``API_KEY`` is empty).
        """
        if not cls.API_KEY:
            raise ValueError(
                "No Gemini API key configured. Set API_KEY in the config cell "
                "or the GOOGLE_API_KEY environment variable."
            )
        return cls.API_KEY

    @classmethod
    def get_parameters(cls, parameter_set: str = "default") -> Dict[str, Any]:
        """
        Get parameters for a specific use case.

        Args:
            parameter_set: The parameter set to use. One of "default",
                "deterministic", or "creative" (case-insensitive). Any other
                name selects the default set.

        Returns:
            Dict[str, Any]: A fresh copy of the selected parameters, so callers
            can modify the result without altering the class-level defaults.
        """
        parameter_sets = {
            "default": cls.DEFAULT_PARAMETERS,
            "deterministic": cls.DETERMINISTIC_PARAMETERS,
            "creative": cls.CREATIVE_PARAMETERS,
        }

        selected = parameter_sets.get(parameter_set.lower(), cls.DEFAULT_PARAMETERS)
        # Copy: the class-level dicts are shared by every caller.
        return dict(selected)


In [None]:
"""
LLM Service for Google Gemini.

This module provides a service for interacting with Google Gemini API,
offering methods for content generation, structured (JSON) output,
streaming responses, batch generation, and validated retries.
"""

import time
import logging
from typing import Dict, List, Any, Optional, Union, Generator, Callable

import google.generativeai as genai
from google.generativeai.types import GenerationConfig
from google.generativeai.types.safety_types import HarmCategory, HarmBlockThreshold
from google.generativeai.types.generation_types import GenerateContentResponse

# Configure logging: request INFO-level output via the root logger and create
# the module-level logger used by LLMService and process().
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class LLMService:
    """Service for interacting with Google Gemini API.

    Wraps ``google.generativeai`` with the defaults from ``GeminiConfig`` and
    adds retry with exponential backoff, JSON parsing, streaming, batching
    and validation-driven regeneration.
    """

    def __init__(self, api_key: Optional[str] = None):
        """
        Initialize the LLM service.

        Args:
            api_key: The API key to use. If empty or not provided, the key is
                taken from ``GeminiConfig.get_api_key()``.

        Raises:
            ValueError: If no key is passed and ``GeminiConfig`` has none.
        """
        self.api_key = api_key or GeminiConfig.get_api_key()
        self.GeminiConfig = (
            GeminiConfig  # Make GeminiConfig accessible as an instance attribute
        )
        self._initialize_client()

    def _initialize_client(self) -> None:
        """Initialize the Google Gemini client."""
        genai.configure(api_key=self.api_key)
        logger.info("Google Gemini client initialized")

    def _build_model(
        self,
        model: Optional[str],
        parameters: Optional[Dict[str, Any]],
        safety_settings: Optional[Dict[str, str]],
    ) -> "genai.GenerativeModel":
        """
        Build a ``GenerativeModel``, filling unset arguments from ``GeminiConfig``.

        Shared by the blocking and streaming paths so both resolve defaults
        and translate safety settings the same way.

        Args:
            model: Model name, or None/empty for the configured default.
            parameters: Generation parameters, or None/empty for the defaults.
            safety_settings: Mapping of harm category (e.g. "harassment") to
                threshold name (e.g. "BLOCK_NONE"), or None/empty for the defaults.

        Returns:
            The configured model object.

        Raises:
            AttributeError: If a category or threshold name is not a member of
                ``HarmCategory`` / ``HarmBlockThreshold``.
        """
        model = model or self.GeminiConfig.DEFAULT_MODEL
        parameters = parameters or self.GeminiConfig.get_parameters()
        safety_settings = safety_settings or self.GeminiConfig.SAFETY_SETTINGS

        # Convert parameters to GenerationConfig
        generation_config = GenerationConfig(**parameters)

        # Translate the short names into the SDK's enum members
        safety_settings_list = [
            {
                "category": getattr(HarmCategory, f"HARM_CATEGORY_{category.upper()}"),
                "threshold": getattr(HarmBlockThreshold, f"{threshold.upper()}"),
            }
            for category, threshold in safety_settings.items()
        ]

        return genai.GenerativeModel(
            model_name=model,
            generation_config=generation_config,
            safety_settings=safety_settings_list,
        )

    @staticmethod
    def _strip_code_fence(text: str) -> str:
        """
        Remove a surrounding Markdown code fence (```` ```json ... ``` ````) if present.

        Replies are often wrapped in a fenced block even when plain JSON is
        requested; ``json.loads`` cannot parse the fence lines.

        Args:
            text: The raw model reply.

        Returns:
            The reply without leading/trailing whitespace and without the
            opening and closing fence lines. Text that does not start with a
            fence is returned stripped but otherwise unchanged.
        """
        stripped = text.strip()
        if not stripped.startswith("```"):
            return stripped

        # Drop the opening fence line (it may carry a language tag such as "json")
        lines = stripped.splitlines()[1:]
        if lines and lines[-1].strip() == "```":
            lines = lines[:-1]
        return "\n".join(lines)

    def generate_content(
        self,
        prompt: Optional[str] = None,  # Make prompt optional
        contents: Optional[List[Union[str, Dict]]] = None,  # Add contents parameter
        model: Optional[str] = None,
        parameters: Optional[Dict[str, Any]] = None,
        safety_settings: Optional[Dict[str, str]] = None,
        retry_count: int = 3,
        retry_delay: float = 1.0,
        max_output_tokens: Optional[int] = None,
    ) -> GenerateContentResponse:
        """
        Generate content using Google Gemini.

        Can accept either a simple text prompt or a list of contents for multi-modal input.

        Args:
            prompt: The text prompt to generate content from (used if contents is None).
            contents: A list of content parts (e.g., text, PDF data) for multi-modal input.
                      Example: [{"mime_type": "application/pdf", "data": pdf_bytes}, "prompt text"]
            model: The model to use. If not provided, the default model will be used.
            parameters: The parameters to use. If not provided, the default parameters will be used.
                The passed dict is never modified.
            safety_settings: The safety settings to use. If not provided, the default safety settings will be used.
            retry_count: The total number of attempts. Values below 1 are treated as 1.
            retry_delay: The delay before the first retry in seconds; it doubles after each failed attempt.
            max_output_tokens: Optional maximum number of tokens to generate; overrides the
                value in ``parameters`` for this call only.

        Returns:
            The generated content response.

        Raises:
            ValueError: If neither prompt nor contents is provided.
            Exception: The last API error, if the call fails on every attempt.
        """
        if prompt is None and contents is None:
            raise ValueError("Either 'prompt' or 'contents' must be provided.")
        if prompt is not None and contents is not None:
            logger.warning(
                "Both 'prompt' and 'contents' provided. 'contents' will be used."
            )

        # Work on a copy so the max_output_tokens override below never leaks
        # into the caller's dict or the shared GeminiConfig defaults.
        parameters = dict(parameters or self.GeminiConfig.get_parameters())
        if max_output_tokens is not None:
            parameters["max_output_tokens"] = max_output_tokens

        gemini_model = self._build_model(model, parameters, safety_settings)

        # Determine the content to send
        content_to_send = contents if contents is not None else prompt

        # Always make at least one attempt, even for retry_count <= 0
        attempts = max(1, retry_count)
        for attempt in range(1, attempts + 1):
            try:
                return gemini_model.generate_content(content_to_send)
            except Exception as e:
                if attempt == attempts:
                    # Out of attempts: surface the real error instead of
                    # silently returning None to the caller.
                    logger.error(
                        "API call failed after %d attempts: %s", attempts, e
                    )
                    raise
                logger.warning(
                    "API call failed (attempt %d/%d): %s. Retrying in %s seconds...",
                    attempt,
                    attempts,
                    e,
                    retry_delay,
                )
                time.sleep(retry_delay)
                # Increase delay for next retry (exponential backoff)
                retry_delay *= 2

    def generate_structured_content(
        self,
        prompt: str,
        model: Optional[str] = None,
        parameters: Optional[Dict[str, Any]] = None,
        safety_settings: Optional[Dict[str, str]] = None,
        retry_count: int = 3,
        retry_delay: float = 1.0,
    ) -> Dict[str, Any]:
        """
        Generate structured content (JSON) using Google Gemini.

        This method is optimized for generating structured data by using more deterministic
        parameters and adding instructions to format the response as JSON. A Markdown
        code fence around the reply is removed before parsing.

        Args:
            prompt: The prompt to generate content from.
            model: The model to use. If not provided, the default model will be used.
            parameters: The parameters to use. If not provided, deterministic parameters will be used.
            safety_settings: The safety settings to use. If not provided, the default safety settings will be used.
            retry_count: The number of attempts for the API call.
            retry_delay: The delay before the first retry in seconds.

        Returns:
            The parsed JSON value (a dictionary when the model returns a JSON object).

        Raises:
            ValueError: If the response cannot be parsed as JSON.
            Exception: If the API call fails after all retries.
        """
        import json

        # Use deterministic parameters by default for structured content
        parameters = parameters or self.GeminiConfig.get_parameters("deterministic")

        # Add instructions to format the response as JSON
        structured_prompt = f"{prompt}\n\nProvide your response as valid JSON."

        response = self.generate_content(
            prompt=structured_prompt,
            model=model,
            parameters=parameters,
            safety_settings=safety_settings,
            retry_count=retry_count,
            retry_delay=retry_delay,
        )

        # The response.text contains the generated text
        raw_text = response.text
        try:
            return json.loads(self._strip_code_fence(raw_text))
        except json.JSONDecodeError as e:
            logger.error("Failed to parse response as JSON: %s", e)
            logger.error("Response text: %s", raw_text)
            raise ValueError(f"Failed to parse response as JSON: {str(e)}") from e

    def stream_content(
        self,
        prompt: str,
        model: Optional[str] = None,
        parameters: Optional[Dict[str, Any]] = None,
        safety_settings: Optional[Dict[str, str]] = None,
    ) -> Generator[str, None, None]:
        """
        Stream content using Google Gemini.

        Args:
            prompt: The prompt to generate content from.
            model: The model to use. If not provided, the default model will be used.
            parameters: The parameters to use. If not provided, the default parameters will be used.
            safety_settings: The safety settings to use. If not provided, the default safety settings will be used.

        Yields:
            Non-empty text chunks of the generated content, in arrival order.

        Raises:
            Exception: If the API call fails (logged, then re-raised; no retry).
        """
        gemini_model = self._build_model(model, parameters, safety_settings)

        try:
            response = gemini_model.generate_content(prompt, stream=True)
            for chunk in response:
                if chunk.text:
                    yield chunk.text
        except Exception as e:
            logger.error("Streaming API call failed: %s", e)
            raise

    def batch_generate(
        self,
        prompts: List[str],
        model: Optional[str] = None,
        parameters: Optional[Dict[str, Any]] = None,
        safety_settings: Optional[Dict[str, str]] = None,
        retry_count: int = 3,
        retry_delay: float = 1.0,
    ) -> List[GenerateContentResponse]:
        """
        Generate content for multiple prompts, one request after another.

        Args:
            prompts: The prompts to generate content from.
            model: The model to use. If not provided, the default model will be used.
            parameters: The parameters to use. If not provided, the default parameters will be used.
            safety_settings: The safety settings to use. If not provided, the default safety settings will be used.
            retry_count: The number of attempts for each API call.
            retry_delay: The delay before the first retry in seconds.

        Returns:
            A list of generated content responses, in the order of ``prompts``.

        Raises:
            Exception: If any API call fails after all retries; responses
                obtained before the failure are not returned.
        """
        return [
            self.generate_content(
                prompt=prompt,
                model=model,
                parameters=parameters,
                safety_settings=safety_settings,
                retry_count=retry_count,
                retry_delay=retry_delay,
            )
            for prompt in prompts
        ]

    def generate_with_retry(
        self,
        prompt: str,
        model: Optional[str] = None,
        parameters: Optional[Dict[str, Any]] = None,
        safety_settings: Optional[Dict[str, str]] = None,
        max_retries: int = 3,
        retry_delay: float = 1.0,
        validation_func: Optional[Callable[[str], bool]] = None,
    ) -> GenerateContentResponse:
        """
        Generate content with retry logic for validation failures.

        This method regenerates the response while the validation function
        returns False. API-level failures are handled separately by
        ``generate_content`` with its own default retry settings.

        Args:
            prompt: The prompt to generate content from.
            model: The model to use. If not provided, the default model will be used.
            parameters: The parameters to use. If not provided, the default parameters will be used.
            safety_settings: The safety settings to use. If not provided, the default safety settings will be used.
            max_retries: The maximum number of generation attempts.
            retry_delay: The delay before the second attempt in seconds; it doubles after each failed validation.
            validation_func: A function that takes the generated text and returns True if it's valid, False otherwise.
                When None, the first response is returned without validation.

        Returns:
            The first generated content response that passes validation.

        Raises:
            ValueError: If validation fails after all retries.
        """
        for attempt in range(max_retries):
            response = self.generate_content(
                prompt=prompt,
                model=model,
                parameters=parameters,
                safety_settings=safety_settings,
            )

            # If no validation function or validation passes, return the response
            if validation_func is None or validation_func(response.text):
                return response

            # If validation fails and we have retries left, try again
            if attempt < max_retries - 1:
                logger.warning(
                    "Validation failed (attempt %d/%d). Retrying in %s seconds...",
                    attempt + 1,
                    max_retries,
                    retry_delay,
                )
                time.sleep(retry_delay)
                # Increase delay for next retry (exponential backoff)
                retry_delay *= 2

        # If we get here, validation failed after all retries
        raise ValueError(f"Validation failed after {max_retries} attempts")


In [None]:
def process(contents):
    """Send *contents* to Gemini and print the text of the reply.

    A new LLMService is created with the notebook-level API_KEY on every
    call. Nothing is returned: an initialization ValueError is logged and
    ends the call, and any error raised while generating or printing the
    reply is printed instead of propagated.

    Args:
        contents: Content parts forwarded unchanged to
            ``LLMService.generate_content(contents=...)``.
    """
    try:
        service = LLMService(api_key=API_KEY)
    except ValueError as init_error:
        # Cannot proceed without API key
        logger.error(f"Failed to initialize LLM Service: {init_error}")
        return None

    try:
        reply = service.generate_content(contents=contents)
        print(reply.text)
    except Exception as call_error:
        print(call_error)

In [None]:
def open_and_read_json_file(json_name):
    """Load a JSON file and return it re-serialized as pretty-printed JSON text.

    The file is parsed first, so the result is the document itself, indented
    by four spaces, rather than the raw file text wrapped in one escaped
    JSON string literal.

    Args:
        json_name: Path of the JSON file to read (decoded as UTF-8).

    Returns:
        str: The document as indented JSON text with string keys.

    Raises:
        OSError: If the file cannot be opened or read.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(json_name, 'r', encoding='utf-8') as f:
        json_data = convert_numeric_keys_to_string(json.load(f))
    # ': ' keeps each key and its value on one line in the indented output.
    return json.dumps(json_data, indent=4, separators=(',', ': '))


def convert_numeric_keys_to_string(data):
    """Return a copy of *data* in which every dict key is converted with str().

    Dicts and lists are walked recursively and rebuilt; any other value is
    returned unchanged.

    Args:
        data: A parsed JSON-like value (dict, list or scalar).

    Returns:
        The same structure with all dict keys as strings.
    """
    if isinstance(data, dict):
        return {str(k): convert_numeric_keys_to_string(v) for k, v in data.items()}
    if isinstance(data, list):
        return [convert_numeric_keys_to_string(item) for item in data]
    return data

In [None]:
def open_and_read_pdf_data(pdf_file_path):
    """Read a file in binary mode and return its full contents.

    Any exception raised while opening, reading or closing the file is
    printed rather than propagated.

    Args:
        pdf_file_path: Path of the file to read.

    Returns:
        bytes | None: The file contents, or None if reading failed.
    """
    payload = None
    try:
        with open(pdf_file_path, "rb") as pdf_stream:
            payload = pdf_stream.read()
    except Exception as read_error:
        print(f"Error reading PDF file: {read_error}")
        # Discard anything read before the failure so errors always yield None
        payload = None
    return payload

# Main


In [None]:
# --- Prompt Template ---
# Filled in with str.format(policy_tier=...). Literal braces in the JSON
# example are doubled ({{ }}) so that str.format leaves them in place.
# The example is a valid JSON array (one object per checked coverage, no
# trailing commas) so that it agrees with the "strict JSON" instruction.
PROMPT_TEMPLATE = """[POLICY TIER NAME]: {policy_tier}
---
**Input:**
1. A json document (JSON)
2. A travel insurance policy document (PDF)
**Your Task:**
Act as a meticulous data extraction assistant.
In the JSON document, pick 10 of the **coverage_categories** at random and verify the **coverage_name**, **limits**, **details** and **source_info** matches the information provided in the PDF document. You should process the provided travel insurance policy document and extract specific coverage information for the **`[POLICY TIER NAME]`** plan/tier.
**Output Format (Strict JSON):**
Your output **MUST** be a single, valid JSON array of result objects conforming to the following structure. Do **NOT** include any text before or after the JSON code block.
```json
[
    {{
        "result": "match",
        "coverage_name": "War Cover"
    }},
    {{
        "result": "match",
        "coverage_name": "Flight Deviation"
    }},
    {{
        "result": "no match",
        "coverage_name": "Medical Expenses",
        "json_detail": "listed activites - fishing, dancing, jogging",
        "pdf_detail": "listed activites - swimming, flying"
    }},
    {{
        "result": "no match",
        "coverage_name": "INCONVENIENCE / LIABILITY COVERS",
        "json_detail": "limit for per family is 200",
        "pdf_detail": "limit per family is 180"
    }}
]
```
**Crucial Instructions:**
1.  **Tier Specificity:** Extract data *only* for the `[POLICY TIER NAME]` tier. If limits are shared across tiers, use the value specified for this tier.
2.  **Placeholder:** In the JSON, if placeholder text like "Refer to Section X" is observed, you **MUST** read the relevant section(s) in the policy text within the PDF for each coverage and include the key applicable conditions, definitions, major inclusions/exclusions, or other pertinent information for that coverage and specified tier. If a benefit is not covered for a specific tier, explicitly state that if mentioned.
3.  **Currency:** Identify and include the correct currency code (e.g., "SGD").
4.  **Source Info: ** `source_info` show additional info related to the **coverage_name**, all details from the source page also should be studied for no match, missing info or extra info
5.  **Display Error: ** display clearly why the JSON document is different from the PDF document under `json_detail` and `pdf_detail`
---
Please process the provided policy document for the **`[POLICY TIER NAME]`** tier and generate the JSON output according to these instructions.
"""

In [None]:
# Inputs for this run: the tier to verify, the extracted JSON and the source PDF.
policy_tier_name = "Vital"
json_file_path = 'sompo_{Vital}.json'
pdf_file_path = 'Sompo-traveljoy.pdf'

# open_and_read_pdf_data returns None on a read error; in that case the
# "data" entry below is None.
pdf_data = open_and_read_pdf_data(pdf_file_path)
formatted_prompt = PROMPT_TEMPLATE.format(policy_tier=policy_tier_name)

# Multi-part request: instructions first, then the JSON under review, then the PDF bytes.
contents = [
    formatted_prompt,
    open_and_read_json_file(json_file_path),
    dict(mime_type="application/pdf", data=pdf_data),
]

In [None]:
# Run #1: send the prompt, policy JSON and PDF to the model and print its reply.
process(contents) #1

```json
[
    {
        "result": "match",
        "coverage_name": "Personal Accident"
    },
    {
        "result": "match",
        "coverage_name": "Medical Expenses Incurred Overseas"
    },
    {
        "result": "match",
        "coverage_name": "Loss or Damage to Baggage & Personal Effects"
    },
    {
        "result": "match",
        "coverage_name": "Trip Cancellation or Postponement"
    },
    {
        "result": "match",
        "coverage_name": "Personal Liability / Family Liability"
    },
    {
        "result": "match",
        "coverage_name": "Baggage Delay"
    },
    {
        "result": "match",
        "coverage_name": "Travel Delay"
    },
    {
        "result": "match",
        "coverage_name": "Flight Deviation"
    },
    {
        "result": "match",
        "coverage_name": "Alternative Accommodation Arrangement"
    },
    {
        "result": "match",
        "coverage_name": "Quarantine Allowance Due to 17 Infectious Diseases Upon Return to Singapore"

In [None]:
# Run #2: same `contents` again; the reply is printed below.
process(contents) #2

```json
[
    {
        "result": "match",
        "coverage_name": "Personal Accident"
    },
    {
        "result": "match",
        "coverage_name": "Emergency Medical Evacuation & Repatriation (Including Mortal Remains) Back to Singapore"
    },
    {
        "result": "match",
        "coverage_name": "Compassionate Visit by a Relative or Friend"
    },
    {
        "result": "match",
        "coverage_name": "Loss or Damage to Baggage & Personal Effects"
    },
    {
        "result": "match",
        "coverage_name": "Trip Cancellation or Postponement"
    },
    {
        "result": "match",
        "coverage_name": "Personal Liability / Family Liability"
    },
    {
        "result": "match",
        "coverage_name": "Travel Delay"
    },
    {
        "result": "match",
        "coverage_name": "Flight Deviation"
    },
    {
        "result": "match",
        "coverage_name": "Alternative Travel Arrangement"
    },
    {
        "result": "match",
        "coverage_name": 

In [None]:
# Run #3: same `contents` again; the reply is printed below.
process(contents) #3



```json
{
    "result": "match",
    "coverage_name": "Emergency Medical Evacuation & Repatriation (Including Mortal Remains) Back to Singapore"
}
```
```json
{
    "result": "match",
    "coverage_name": "Alternative Accommodation Arrangement"
}
```
```json
{
    "result": "match",
    "coverage_name": "Loss or Damage to Baggage & Personal Effects"
}
```
```json
{
    "result": "match",
    "coverage_name": "Financial Collapse of Licensed Tour Operators"
}
```
```json
{
    "result": "match",
    "coverage_name": "Personal Accident"
}
```
```json
{
    "result": "match",
    "coverage_name": "Travel Delay"
}
```
```json
{
    "result": "match",
    "coverage_name": "Quarantine Allowance Due to 17 Infectious Diseases Upon Return to Singapore"
}
```
```json
{
    "result": "match",
    "coverage_name": "Flight Deviation"
}
```
```json
{
    "result": "match",
    "coverage_name": "Trip Cancellation or Postponement"
}
```
```json
{
    "result": "match",
    "coverage_name": "Medical Exp

In [None]:
# Run #4: same `contents` again; the reply is printed below.
process(contents) #4

```json
[
    {
        "result": "match",
        "coverage_name": "Personal Accident"
    },
    {
        "result": "match",
        "coverage_name": "Medical Expenses Incurred Overseas"
    },
    {
        "result": "no match",
        "coverage_name": "Emergency Medical Evacuation & Repatriation (Including Mortal Remains) Back to Singapore",
        "json_detail": "Pre-existing medical conditions are not covered for this benefit under the Vital tier (N.A.).",
        "pdf_detail": "PDF Benefit table (Page 3) shows N.A. for pre-existing condition coverage under this benefit for Vital Tier, but the general description for the benefit (across all tiers) states 'Includes pregnancy-related complications and pre-existing medical conditions'. The JSON correctly notes N.A. for Vital based on the table, but the PDF description is slightly confusing without reading the table values."
    },
    {
        "result": "no match",
        "coverage_name": "Loss or Damage to Baggage & Personal E

In [None]:
# Run #5: same `contents` again; the reply is printed below.
process(contents) #5

```json
[
    {
        "result": "match",
        "coverage_name": "Personal Accident"
    },
    {
        "result": "match",
        "coverage_name": "Medical Expenses Incurred Overseas"
    },
    {
        "result": "match",
        "coverage_name": "Emergency Medical Evacuation & Repatriation (Including Mortal Remains) Back to Singapore"
    },
    {
        "result": "match",
        "coverage_name": "Compassionate Visit by a Relative or Friend"
    },
    {
        "result": "match",
        "coverage_name": "Loss or Damage to Baggage & Personal Effects"
    },
    {
        "result": "match",
        "coverage_name": "Trip Cancellation or Postponement"
    },
    {
        "result": "match",
        "coverage_name": "Travel Delay"
    },
    {
        "result": "match",
        "coverage_name": "Flight Deviation"
    },
    {
        "result": "match",
        "coverage_name": "Alternative Accommodation Arrangement"
    },
    {
        "result": "match",
        "coverage_nam