# Flood Use Case Parser

## Import Libraries

In [46]:
import os
import json
import re
import requests
from tqdm import tqdm

## Load the Data

In [67]:
# Read the disaster details JSON into memory; later cells iterate over it.
with open("./disaster_details.json", mode="r", encoding="utf-8") as details_file:
    extracted_data = json.load(details_file)

In [68]:
# extracted_data

## CBR Extractor

In [69]:
# This cell contains the logic for the LLM model wrappers

## Import Libraries
import os
import requests
from abc import ABC, abstractmethod
from typing import *
from functools import lru_cache
import tiktoken


## Main Logic 

class BaseAIModel(ABC):
    """
    Abstract contract shared by the model wrappers in this file.

    Construction stores the config, asks the subclass to validate it and then
    asks the subclass for a tokenizer (which may be ``None``).
    """

    def __init__(self, config: Dict[str, Any]):
        """
        Store the configuration, validate it and load the tokenizer.

        :param config: Configuration dictionary for the model.
        """
        self.config = config
        self._validate_config()
        self.tokenizer = self.get_tokenizer()

    @abstractmethod
    def _validate_config(self) -> None:
        """
        Check ``self.config``.
        Subclasses implement this to enforce the keys they require.
        """

    @abstractmethod
    def get_tokenizer(self) -> Any:
        """
        Return the tokenizer for this model.
        Subclasses implement this; the result is stored on ``self.tokenizer``.
        """

    @abstractmethod
    def inference(self, prompt: str, args: Dict[str, Any]) -> str:
        """
        Produce the model's response for a prompt.
        Subclasses implement this.

        :param prompt: The input prompt for the model.
        :param args: Additional arguments for the model's inference method.
        :return: The generated text from the model.
        """

    def get_num_tokens(self, text: str) -> int:
        """
        Count the tokens the tokenizer produces for a text.

        :param text: The input text to tokenize.
        :return: The number of tokens in the text.
        :raises NotImplementedError: If the model has no (truthy) tokenizer.
        """
        if not self.tokenizer:
            raise NotImplementedError("Tokenization is not implemented for this model type.")
        token_ids = self.tokenizer.encode(text)
        return len(token_ids)

    def get_token_ids(self, text: str) -> List[int]:
        """
        Encode a text into its ordered token ids using the tokenizer.

        :param text: The input text to tokenize.
        :return: The list of token ids in the text.
        :raises NotImplementedError: If the model has no (truthy) tokenizer.
        """
        if not self.tokenizer:
            raise NotImplementedError("Tokenization is not implemented for this model type.")
        return self.tokenizer.encode(text)

class LocalAIModel(BaseAIModel):
    """Wrapper around a model object supplied in the config under 'model'."""

    def _validate_config(self) -> None:
        """
        Check the configuration dictionary for LocalAIModel.
        Raises ValueError when 'model_path' is absent from the config.
        """
        for required in ("model_path",):
            if required not in self.config:
                raise ValueError(f"Missing required config key: {required}")

    def get_tokenizer(self) -> tiktoken.Encoding:
        """
        Build the tokenizer for the LocalAIModel.
        The config's 'model_path' value is handed to tiktoken.get_encoding.

        :return: An instance of tiktoken.Encoding.
        :raises RuntimeError: If the encoding cannot be loaded.
        """
        try:
            encoding = tiktoken.get_encoding(self.config["model_path"])
        except Exception as e:
            raise RuntimeError(f"Failed to load tokenizer: {e}")
        return encoding

    def inference(self, prompt: str, args: Dict[str, Any]) -> str:
        """
        Run the locally supplied model on a prompt.

        :param prompt: The input prompt for the model.
        :param args: Keyword arguments forwarded to the model's generate method.
        :return: The decoded first entry of the model's output.
        :raises ValueError: If the prompt is empty or the config has no 'model'.
        """
        if not prompt:
            raise ValueError("Prompt is required for inference.")

        local_model = self.config.get("model")
        if local_model is None:
            raise ValueError("Local model not provided in config.")

        token_ids = self.tokenizer.encode(prompt)
        generated = local_model.generate(token_ids, **args)
        first_sequence = generated[0]
        return self.tokenizer.decode(first_sequence)

class APIAIModel(BaseAIModel):
    """Wrapper that posts the prompt to an HTTP endpoint taken from the config."""

    def _validate_config(self) -> None:
        """
        Check the configuration dictionary for APIAIModel.
        Raises ValueError when 'api_url' or 'api_key' is absent from the config.
        """
        for required in ("api_url", "api_key"):
            if required not in self.config:
                raise ValueError(f"Missing required config key: {required}")

    def get_tokenizer(self) -> None:
        """
        Return the tokenizer for the APIAIModel.
        This class uses no tokenizer, so the result is always None.

        :return: None
        """
        return None

    def inference(self, prompt: str, args: Dict[str, Any]) -> str:
        """
        Send the prompt to the configured API and return its text.

        :param prompt: The input prompt for the model.
        :param args: Extra fields merged into the JSON request body.
        :return: The 'generated_text' field of the JSON response.
        :raises ValueError: If the prompt is empty.
        :raises RuntimeError: If the response status code is not 200.
        """
        if not prompt:
            raise ValueError("Prompt is required for inference.")

        endpoint = self.config["api_url"]
        auth_headers = {"Authorization": f"Bearer {self.config['api_key']}"}
        payload = {"prompt": prompt, **args}
        response = requests.post(endpoint, headers=auth_headers, json=payload)

        if response.status_code == 200:
            return response.json()["generated_text"]

        raise RuntimeError(f"API request failed with status code {response.status_code}: {response.text}")


class OpenAI_Model(BaseAIModel):
    """
    Client for an OpenAI-style chat-completions HTTP endpoint.

    Every optional setting has a default; 'api_key' falls back to the
    OPENAI_API_KEY environment variable. Successful responses are memoised
    per instance, keyed by the endpoint and the full request body.
    """

    # Maximum number of responses memoised per instance.
    _CACHE_SIZE = 32

    def __init__(self, config: Dict[str, Any]):
        """
        Initialize the OpenAI_Model with a configuration dictionary.
        Set default values for optional parameters.

        :param config: Configuration dictionary for the model.
        :raises ValueError: If no API key is available or a setting is out of range.
        :raises RuntimeError: If the tokenizer for the model cannot be loaded.
        """
        self._load_settings(config)
        if not self.api_key:
            raise ValueError("API key is required.")

        super().__init__(config)

    def _load_settings(self, config: Dict[str, Any]) -> None:
        """
        Copy the settings from ``config`` onto the instance, applying defaults.

        :param config: Configuration dictionary to read from.
        """
        self.model: str = config.get("model", "gpt-3.5-turbo")
        self.api_url: str = config.get("api_url", "https://api.openai.com/v1/chat/completions")
        self.api_key: str = config.get("api_key", os.getenv("OPENAI_API_KEY"))
        self.temperature: float = config.get("temperature", 1.0)
        self.max_tokens: int = config.get("max_tokens", 100)
        self.n: int = config.get("n", 1)
        self.stop: Optional[List[str]] = config.get("stop", None)
        self.frequency_penalty: float = config.get("frequency_penalty", 0.0)
        self.presence_penalty: float = config.get("presence_penalty", 0.0)

    def _validate_config(self) -> None:
        """
        Validate the resolved settings of the OpenAI_Model.

        The resolved attributes are checked rather than the raw config keys, so
        the defaults applied in ``_load_settings`` (and the environment
        fallback for the API key) count as provided.

        :raises ValueError: If a required setting is empty or a value is out of range.
        """
        resolved = {"api_url": self.api_url, "api_key": self.api_key, "model": self.model}
        for key, value in resolved.items():
            if not value:
                raise ValueError(f"Missing required config key: {key}")

        if not (0 <= self.temperature <= 2):
            raise ValueError("Temperature must be between 0 and 2.")
        # Penalties may be negative; the bounds now match the error messages.
        if not (-2 <= self.frequency_penalty <= 2):
            raise ValueError("Frequency penalty must be between -2 and 2.")
        if not (-2 <= self.presence_penalty <= 2):
            raise ValueError("Presence penalty must be between -2 and 2.")
        # None means "no explicit limit" and is skipped instead of crashing the comparison.
        if self.max_tokens is not None and not (0 <= self.max_tokens <= 4096):  # Example max token limit for GPT-3.5-turbo
            raise ValueError("Max tokens must be between 0 and the model's context length limit.")

    def get_tokenizer(self) -> tiktoken.Encoding:
        """
        Get the tokenizer for the OpenAI_Model.
        Uses tiktoken to get the tokenizer corresponding to the specified model.

        :return: An instance of tiktoken.Encoding.
        :raises RuntimeError: If tiktoken cannot provide an encoding for the model.
        """
        try:
            return tiktoken.encoding_for_model(self.model)
        except Exception as e:
            raise RuntimeError(f"Failed to load tokenizer: {e}") from e

    def _chat_completion(self, prompt: str, args: Optional[Dict[str, Any]], json_mode: bool) -> Dict[str, Any]:
        """
        Build the request, serve it from the cache or post it, and return the parsed JSON.

        :param prompt: The input prompt for the model.
        :param args: Per-call overrides for the request parameters (may be None).
        :param json_mode: If true, requests a JSON-formatted response.
        :return: The full decoded response body.
        :raises ValueError: If the prompt is empty.
        :raises RuntimeError: If the response status code is not 200.
        """
        if not prompt:
            raise ValueError("Prompt is required for inference.")

        overrides = args or {}
        request_data = {
            "model": self.model,
            "messages": [{"role": "user", "content": prompt}],
            "temperature": overrides.get("temperature", self.temperature),
            "max_tokens": overrides.get("max_tokens", self.max_tokens),
            "n": overrides.get("n", self.n),
            "stop": overrides.get("stop", self.stop),
            "frequency_penalty": overrides.get("frequency_penalty", self.frequency_penalty),
            "presence_penalty": overrides.get("presence_penalty", self.presence_penalty)
        }

        if json_mode:
            request_data["response_format"] = {"type": "json_object"}

        # Created lazily so subclasses that customise __init__ still get a cache.
        cache = getattr(self, "_response_cache", None)
        if cache is None:
            cache = self._response_cache = {}

        # dicts and lists are unhashable, so the key is the repr of the request;
        # request_data is always built in the same key order, which keeps it stable.
        cache_key = repr((self.api_url, request_data))
        if cache_key in cache:
            # Re-insert to mark the entry as most recently used.
            cache[cache_key] = cache.pop(cache_key)
            return cache[cache_key]

        response = requests.post(
            self.api_url,
            headers={"Authorization": f"Bearer {self.api_key}"},
            json=request_data
        )

        if response.status_code != 200:
            raise RuntimeError(f"API request failed with status code {response.status_code}: {response.text}")

        payload = response.json()
        cache[cache_key] = payload
        if len(cache) > self._CACHE_SIZE:
            # Plain dicts keep insertion order, so the first key is the least recently used.
            del cache[next(iter(cache))]
        return payload

    def inference(self, prompt: str, args: Optional[Dict[str, Any]] = None, json_mode: bool = False) -> str:
        """
        Perform inference using the configured model with caching.

        A repeated identical request is answered from the per-instance cache,
        even when the temperature is above zero.

        :param prompt: The input prompt for the model.
        :param args: Additional arguments for the API request.
        :param json_mode: If true, requests a JSON-formatted response.
        :return: The generated text from the API response.
        :raises ValueError: If the prompt is empty.
        :raises RuntimeError: If the response status code is not 200.
        """
        payload = self._chat_completion(prompt, args, json_mode)
        return payload["choices"][0]["message"]["content"]

    def get_full_response(self, prompt: str, args: Optional[Dict[str, Any]] = None, json_mode: bool = False) -> Dict[str, Any]:
        """
        Perform inference using the configured model and return the full response.

        Shares the per-instance cache with ``inference``.

        :param prompt: The input prompt for the model.
        :param args: Additional arguments for the API request.
        :param json_mode: If true, requests a JSON-formatted response.
        :return: The full response from the API.
        :raises ValueError: If the prompt is empty.
        :raises RuntimeError: If the response status code is not 200.
        """
        return self._chat_completion(prompt, args, json_mode)

    def update_config(self, new_config: Dict[str, Any]) -> None:
        """
        Update the configuration parameters, apply them and revalidate.

        The instance attributes are refreshed from the merged config so that
        validation sees the new values. If anything fails, the previous config
        and attributes are restored before the error is re-raised.

        :param new_config: Dictionary containing the new configuration parameters.
        :raises ValueError: If the merged configuration is invalid.
        :raises RuntimeError: If the model changed and its tokenizer cannot be loaded.
        """
        previous_config = dict(self.config)
        previous_model = self.model
        self.config.update(new_config)
        try:
            self._load_settings(self.config)
            self._validate_config()
            if self.model != previous_model:
                # Token counts must follow the model that is now in use.
                self.tokenizer = self.get_tokenizer()
        except Exception:
            # Roll back in place: self.config may be shared with the caller.
            self.config.clear()
            self.config.update(previous_config)
            self._load_settings(self.config)
            raise

    def validate_prompt(self, prompt: List[Dict[str, str]]) -> None:
        """
        Validate the format of the prompt.

        :param prompt: The input prompt to validate.
        :raises ValueError: If the prompt format is invalid.
        """
        if not isinstance(prompt, list):
            raise ValueError("Prompt must be a list of dictionaries.")

        valid_roles = {"system", "user", "assistant"}

        for message in prompt:
            if not isinstance(message, dict):
                raise ValueError("Each message in the prompt must be a dictionary.")
            if "role" not in message or "content" not in message:
                raise ValueError("Each message must have 'role' and 'content' keys.")
            if message["role"] not in valid_roles:
                raise ValueError(f"Invalid role '{message['role']}'. Role must be one of {valid_roles}.")




In [70]:
class GPT4o_Model(OpenAI_Model):
    """
    GPT-4o Model class extending the OpenAI_Model class.
    Utilizes the OpenAI client call method for API interaction.
    Inherits from OpenAI_Model and overrides the inference method for GPT-4 specific usage.
    """

    def __init__(self, config: Dict[str, Any]):
        """
        Initialize the GPT4o_Model with the configuration dictionary.
        Uses OpenAI client for API interaction with additional API settings.

        :param config: Configuration dictionary for the model.
        :raises ValueError: If no API key is available or a setting is invalid.
        """
        # Set up default model and API key details, and initialize the client
        self.model = config.get("model", "gpt-4")
        api_key = config.get("api_key", os.getenv("OPENAI_API_KEY"))
        helicone_key = os.getenv("HELICONE_API_KEY")

        # If no API key is provided in config or environment, raise an error
        if not api_key:
            raise ValueError("API key is required for GPT-4 model initialization.")

        # Route through the Helicone proxy only when its key is configured.
        # Without one the header would be the literal "Bearer None", so the
        # client's default endpoint is used instead.
        client_options: Dict[str, Any] = {"api_key": api_key}
        if helicone_key:
            client_options["base_url"] = "https://oai.hconeai.com/v1"
            client_options["default_headers"] = {"Helicone-Auth": f"Bearer {helicone_key}"}
        self.client = OpenAI(**client_options)

        # NOTE(review): OpenAI_Model.__init__ assigns these attributes again
        # from the same config; confirm whether its max_tokens default should
        # really replace the None default used here.
        self.temperature = config.get("temperature", 1.0)
        self.max_tokens = config.get("max_tokens", None)
        self.n = config.get("n", 1)
        self.stop = config.get("stop", None)
        self.frequency_penalty = config.get("frequency_penalty", 0.0)
        self.presence_penalty = config.get("presence_penalty", 0.0)

        super().__init__(config)

    def _validate_config(self) -> None:
        """
        Override config validation for the GPT-4o Model.
        Ensures 'model' is present in the config and the numeric settings are in range.

        :raises ValueError: If required keys or configurations are invalid.
        """
        required_keys = ["model"]
        for key in required_keys:
            if key not in self.config:
                raise ValueError(f"Missing required config key: {key}")

        # Validate temperature
        if not (0 <= self.temperature <= 2):
            raise ValueError("Temperature must be between 0 and 2.")

        # Validate frequency penalty
        if not (-2 <= self.frequency_penalty <= 2):
            raise ValueError("Frequency penalty must be between -2 and 2.")

        # Validate presence penalty
        if not (-2 <= self.presence_penalty <= 2):
            raise ValueError("Presence penalty must be between -2 and 2.")

        # Validate max_tokens only if provided
        if self.max_tokens is not None and not (0 <= self.max_tokens <= 4096):
            raise ValueError("Max tokens must be between 0 and 4096 for GPT-4.")

    def _create_completion(
        self,
        prompt_text: str,
        images_dict: Optional[Dict[str, str]],
        max_tokens: Optional[int],
        json_mode: bool,
    ) -> Any:
        """
        Build the user message and call the chat-completions endpoint of the client.

        :param prompt_text: The input text prompt for the model.
        :param images_dict: Base64-encoded images keyed by image number (may be None or empty).
        :param max_tokens: Optional token limit; only sent when truthy.
        :param json_mode: If true, requests a JSON-object response format.
        :return: The response object returned by the client.
        """
        # Construct message content with the prompt and base64-encoded images
        message_content = [{"type": "text", "text": prompt_text}]
        message_content.extend(
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
            }
            for base64_image in (images_dict or {}).values()
        )

        request_kwargs: Dict[str, Any] = {
            "model": self.model,
            "temperature": self.temperature,
            "messages": [{"role": "user", "content": message_content}],
        }
        # Only pass max_tokens if provided
        if max_tokens:
            request_kwargs["max_tokens"] = max_tokens
        if json_mode:
            request_kwargs["response_format"] = {"type": "json_object"}

        return self.client.chat.completions.create(**request_kwargs)

    def generate_api_request(self, prompt_text: str, images_dict: Dict[str, str], max_tokens: Optional[int] = None) -> Any:
        """
        Function to call the GPT-4o API with a prompt and associated images in base64 format.

        :param prompt_text: The input text prompt for the model.
        :param images_dict: A dictionary where keys are image numbers and values are base64-encoded images.
        :param max_tokens: Optional maximum number of tokens for the response. Only included if provided.
        :return: The response object from the client (not just its text); read
                 ``.choices[0].message.content`` for the generated text.
        """
        return self._create_completion(prompt_text, images_dict, max_tokens, json_mode=False)

    def generate_api_request_with_json(self, prompt_text: str, images_dict: Dict[str, str], max_tokens: Optional[int] = None) -> Any:
        """
        Same as ``generate_api_request`` but requests a JSON-object response format.

        :param prompt_text: The input text prompt for the model.
        :param images_dict: A dictionary where keys are image numbers and values are base64-encoded images.
        :param max_tokens: Optional maximum number of tokens for the response. Only included if provided.
        :return: The response object from the client (not just its text); read
                 ``.choices[0].message.content`` for the generated text.
        """
        return self._create_completion(prompt_text, images_dict, max_tokens, json_mode=True)

    def inference(self, prompt_text: str, images_dict: Optional[Dict[str, str]] = None, args: Optional[Dict[str, Any]] = None) -> Any:
        """
        Perform inference using the GPT-4 model via the OpenAI client call method.

        :param prompt_text: The input text prompt for the model.
        :param images_dict: Dictionary where keys are image numbers and values are base64-encoded image data.
        :param args: Additional arguments; only 'max_tokens' is read.
        :return: The response object from the client (see ``generate_api_request``).
        """
        max_tokens = (args or {}).get("max_tokens", None)  # Optional max_tokens argument

        # Generate the API request using the prompt and images
        return self.generate_api_request(prompt_text, images_dict or {}, max_tokens)

    def inference_with_json(self, prompt_text: str, images_dict: Optional[Dict[str, str]] = None, args: Optional[Dict[str, Any]] = None) -> Any:
        """
        Perform JSON-mode inference using the GPT-4 model via the OpenAI client call method.

        :param prompt_text: The input text prompt for the model.
        :param images_dict: Dictionary where keys are image numbers and values are base64-encoded image data.
        :param args: Additional arguments; only 'max_tokens' is read.
        :return: The response object from the client (see ``generate_api_request_with_json``).
        """
        max_tokens = (args or {}).get("max_tokens", None)  # Optional max_tokens argument

        # Generate the API request using the prompt and images
        return self.generate_api_request_with_json(prompt_text, images_dict or {}, max_tokens)

In [71]:
def generate_instructions_template():
    """
    Return the extraction prompt template for building CBR cases.

    The text is returned from an ordinary string literal instead of this
    function's ``__doc__``: docstrings are removed under ``python -OO`` and
    are dedented by the compiler on Python 3.13+, either of which would change
    (or drop) the prompt.

    :return: The prompt text; it contains the ``$raw_content`` placeholder.
    """
    return """
    You are a state-of-the-art extraction tool tasked with converting raw ReliefWeb documents into structured cases for a Case-Based Reasoning (CBR) system in disaster recovery and management.

    Each case should capture the following key elements:
      - "case_identifier_url": URL of the Case in the CBR System for later Indexing. 
      - "disaster_title": The title of the disaster encountered in the content
      - "disaster_type": List of classifications of the disaster/s (e.g., Flood, Cyclone, Epidemic, etc.).
      - "date": The date of the disaster event or report.
      - "disaster_description": A detailed description of the disaster, including its causes, impacts, and key events.
      - "interventions": A summary of any relief responses, interventions, or actions taken.
      - "affected_countries": A list of countries or regions affected by the disaster.
      - "sources": List of original source/s of the information (e.g., UN DHA, UNICEF, etc.).
      - "language": The language of the content (if identifiable).

    You are provided with the raw HTML content in the variable "$raw_content". Your task is to:
      1. Parse the provided raw content.
      2. Extract the required fields while handling variations in document structure (e.g., sections for "Overview", "Latest Updates", "Affected Countries", etc.) and multilingual text.
      3. Construct a JSON object in which each key is a unique case identifier (or disaster event identifier) and its value is an object with the extracted fields.

    The expected JSON structure should be as follows:
    {
       "<case_identifier_url>": {
           "disaster_title": "<Disaster title>",
           "disaster_type": ["<Disaster type>", ...],
           "date": "<Date>",
           "disaster_description": "<Detailed disaster description>",
           "interventions": "<Interventions and responses>",
           "affected_countries": ["<Country1>", "<Country2>", ...],
           "sources": ["<Source information1>", "<Source information2>", ...],
           "language": "<Identified language (if available)>"
       },
       ...
    }

    This case formulation is designed for a CBR system because historical disaster cases—each containing both the event description and the corresponding relief interventions—can be compared to new disaster scenarios. By matching similar case elements, decision-makers can quickly identify effective response strategies from past events. The multilingual aspect ensures that cases are standardized across different languages, making the system applicable globally.

    Your output should be strictly the final JSON object with no extra commentary.

    Here is the content to parse:
    `````
    $raw_content
    `````
    """

In [72]:
# Preview the prompt template as returned, before "$raw_content" is substituted.
print(generate_instructions_template())


    You are a state-of-the-art extraction tool tasked with converting raw ReliefWeb documents into structured cases for a Case-Based Reasoning (CBR) system in disaster recovery and management.

    Each case should capture the following key elements:
      - "case_identifier_url": URL of the Case in the CBR System for later Indexing. 
      - "disaster_title": The title of the disaster encountered in the content
      - "disaster_type": List of classifications of the disaster/s (e.g., Flood, Cyclone, Epidemic, etc.).
      - "date": The date of the disaster event or report.
      - "disaster_description": A detailed description of the disaster, including its causes, impacts, and key events.
      - "interventions": A summary of any relief responses, interventions, or actions taken.
      - "affected_countries": A list of countries or regions affected by the disaster.
      - "sources": List of original source/s of the information (e.g., UN DHA, UNICEF, etc.).
      - "language": The l

In [73]:
import os
from openai import OpenAI

# Set the API key as an environment variable (the model classes fall back to
# os.getenv("OPENAI_API_KEY") when the config has no 'api_key').
# NOTE(review): the Python variable OPENAI_API_KEY is not defined anywhere in the
# visible notebook — presumably it was set in a removed cell; confirm before
# running, otherwise this line raises NameError.
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [62]:
# Settings for the GPT-4o wrapper used by the extraction cells below.
GPT_4o_CONFIG = dict(
    model="gpt-4o-2024-08-06",
    api_key=OPENAI_API_KEY,
    temperature=0.0,
)
GPT_4o_model = GPT4o_Model(GPT_4o_CONFIG)

In [63]:
def get_cbr_case(rich_interaction_data):
    """
    Ask the GPT-4o model to turn one raw record into a CBR case.

    :param rich_interaction_data: The raw record; it is converted with str()
                                  before being placed into the prompt.
    :return: The message content of the first choice of the JSON-mode response.
    """
    template = generate_instructions_template()

    # The template mentions "$raw_content" twice: once in the explanatory
    # sentence and once between the backtick fences. Only the last occurrence
    # is the real slot; replacing both would embed the whole record twice.
    head, placeholder, tail = template.rpartition("$raw_content")
    if placeholder:
        cbr_instructions = head + str(rich_interaction_data) + tail
    else:
        # No placeholder present: send the template unchanged.
        cbr_instructions = template

    cbr_case = GPT_4o_model.inference_with_json(cbr_instructions).choices[0].message.content
    return cbr_case

In [64]:
# print(get_reasonability_scores(user_interaction_rich_data[user_interaction_list[3]]))

In [65]:
# Case base filled by the extraction loop: record URL -> content string returned by get_cbr_case.
cbr_dict = {}

In [75]:
# Extract one case per record and checkpoint the case base after every new case,
# so re-running this cell after an interruption resumes where it stopped.
for cbr_content in tqdm(extracted_data):
    cbr_url = cbr_content["url"]

    # Already extracted in an earlier run of this cell.
    if cbr_url in cbr_dict:
        continue

    cbr_case = get_cbr_case(cbr_content)
    cbr_dict[cbr_url] = cbr_case

    # Write to a temporary file and swap it in, so an interruption mid-write
    # cannot leave a truncated checkpoint behind.
    checkpoint_tmp = "cbr_casebase_dict.json.tmp"
    with open(checkpoint_tmp, "w", encoding="utf-8") as f:
        json.dump(cbr_dict, f)
    os.replace(checkpoint_tmp, "cbr_casebase_dict.json")

100%|████████████████████████████████████████████████████████████████████████████| 3589/3589 [2:38:46<00:00,  2.65s/it]


In [None]:
# extracted_data[0]

In [321]:
# NOTE(review): `reasonability_dict` is not defined anywhere in the visible
# notebook, so this cell looks like a leftover from another workflow and would
# raise NameError as written — confirm whether it should be removed.
with open("assets/reasonability_dict.json", "w") as f:
    json.dump(reasonability_dict, f)

In [1]:
import json

# Reload the case base from the checkpoint file written by the extraction loop.
with open("cbr_casebase_dict.json", mode="r") as checkpoint_file:
    cbr_dict = json.load(checkpoint_file)

In [2]:
# Inspect the reloaded case base; each value is still the JSON text stored by the extraction loop.
cbr_dict

{'https://reliefweb.int/disaster/ep-2020-000120-caf': '{\n    "https://reliefweb.int/disaster/ep-2020-000120-caf": {\n        "disaster_title": "Central African Republic: Measles Outbreak - Jan 2020",\n        "disaster_type": ["Epidemic"],\n        "date": "24 January 2020",\n        "disaster_description": "On 24 January, the Ministry of Health declared a national epidemic of measles. More than 3,600 cases were registered and 53 persons have died so far between February 2019 and January 2020. Several cases are reported in northern and central provincial towns such as Paoua, Vakaga, Nana-Gribizi, Batangafo, Bocaranga, Ngaoundaye and Bambari. The authorities are calling for support from technical partners and donors to scale up the response against the outbreak to immediately secure sufficient quantities of vaccines, targeting all children aged between 6 months and nine years old. The measles outbreak in Central African Republic, formally declared by the Ministry of Health on 24 Januar

In [3]:
# Parse each stored JSON string with json.loads instead of eval: the strings are
# model output, and eval would execute arbitrary expressions and fails on the
# JSON literals true / false / null.
cbr_dict = {url: json.loads(raw_case) for url, raw_case in cbr_dict.items()}

In [4]:
# Inspect the case base again now that each value has been parsed into a nested dict.
cbr_dict

{'https://reliefweb.int/disaster/ep-2020-000120-caf': {'https://reliefweb.int/disaster/ep-2020-000120-caf': {'disaster_title': 'Central African Republic: Measles Outbreak - Jan 2020',
   'disaster_type': ['Epidemic'],
   'date': '24 January 2020',
   'disaster_description': "On 24 January, the Ministry of Health declared a national epidemic of measles. More than 3,600 cases were registered and 53 persons have died so far between February 2019 and January 2020. Several cases are reported in northern and central provincial towns such as Paoua, Vakaga, Nana-Gribizi, Batangafo, Bocaranga, Ngaoundaye and Bambari. The authorities are calling for support from technical partners and donors to scale up the response against the outbreak to immediately secure sufficient quantities of vaccines, targeting all children aged between 6 months and nine years old. The measles outbreak in Central African Republic, formally declared by the Ministry of Health on 24 January 2020, is evolving rapidly in numb

In [10]:
# Distinct "language" labels across all cases (each case is nested under its own URL key).
{case_map[url]["language"] for url, case_map in cbr_dict.items()}

{'EN / FR', 'English', 'English, French', 'French', 'en'}

In [14]:
# Cases whose "language" label is exactly "English, French".
{
    url: case_map
    for url, case_map in cbr_dict.items()
    if case_map[url]["language"] == "English, French"
}

{'https://reliefweb.int/disaster/2013-000034-mdg': {'https://reliefweb.int/disaster/2013-000034-mdg': {'disaster_title': 'Madagascar: Locusts - Mar 2013',
   'disaster_type': ['Locust Plague'],
   'date': 'Mar 2013',
   'disaster_description': 'The plague of the Malagasy Migratory Locust began in April 2012 in Madagascar, in a context where food insecurity and malnutrition rates were already high. Given the extent of the plague, it was estimated that the food security of about 13 million people (60 percent of the population) could be affected in the absence of large-scale locust control operations. To cope with this dire situation, in December 2012, the Ministry of Agriculture and Rural Development of Madagascar and the Food and Agriculture Organization of the United Nations developed a Three-year Programme (2013–2016) in response to the locust plague. Donor response has been timely and positive. The first locust campaign was fully funded and successfully implemented. The specific obje