### Set up the xAI (Grok) API

In [1]:
import os
import json

# xAI API key used as the Bearer token below. It is blank in the saved notebook;
# a real key has to be supplied at runtime (never commit one).
XAI_API_KEY = ""

# NOTE(review): `headers` is not referenced anywhere else in the visible notebook;
# the curl command below passes the same two headers itself.
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {XAI_API_KEY}"
}

# Request body for a single-turn chat-completions smoke test.
data = {
    "messages": [
        {
            "role": "user",
            "content": "What is the meaning of life, the universe, and everything?"
        }
    ],
    "model": "grok-3-latest",
    "stream": False,
    "temperature": 0.7
}

# Write the body to disk so that curl can load it with `-d @payload.json`.
with open("payload.json", "w") as f:
    json.dump(data, f)

# IPython shell escape: {XAI_API_KEY} is interpolated from the Python variable
# before the command is run.
!curl https://api.x.ai/v1/chat/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer {XAI_API_KEY}" \
  -d @payload.json

# Model identifier shown in run_LLM's progress line. Note that it differs from
# the "grok-3-latest" model requested by the smoke test above.
LLMName = 'grok-4-0709'

{"id":"d4dfce8d-e445-11a0-3973-34f767ee2d8a","object":"chat.completion","created":1753494234,"model":"grok-3","choices":[{"index":0,"message":{"role":"assistant","content":"The question \"What is the meaning of life, the universe, and everything?\" is famously associated with Douglas Adams' science fiction series *The Hitchhiker's Guide to the Galaxy*. In the story, a supercomputer called Deep Thought is tasked with finding the answer to this ultimate question. After 7.5 million years of computation, Deep Thought reveals that the answer is **42**—but it also admits that the question itself is not fully understood, and a second computer (Earth) is needed to figure out the actual question.\n\nPhilosophically and culturally, this has become a humorous way to highlight the complexity and perhaps absurdity of seeking a single, definitive meaning to existence. Beyond the fictional reference, the \"meaning of life\" varies depending on individual perspectives, cultural beliefs, and philosophi

In [2]:
%%capture
# ✅ STEP 1: Load packages and ChatGPT API
# Install all necessary packages (%%capture hides the installers' output)
# System tools installed through apt
!apt-get install -y poppler-utils
!apt-get install -y tesseract-ocr
# Python packages installed through pip
! pip install -U langchain openai langchain-chroma langchain-experimental # (newest versions required for multi-modal)
! pip install "unstructured[all-docs]" pillow pydantic lxml pillow matplotlib chromadb tiktoken
!pip install -U langchain-openai
!pip install xai-sdk

In [3]:
import os
import base64
from PIL import Image
from io import BytesIO
from langchain_core.documents import Document
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryStore
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
import os
import time
import pandas as pd
import base64
from joblib import Parallel, delayed
from pathlib import Path
import random

from google.colab import drive
import pandas as pd
from pathlib import Path
from IPython.display import display
from PIL import Image

drive.mount('/content/drive')

# Base folders
base_folder = Path("/content/drive/MyDrive/LLMs/MLLM_CW/0723_Shuffle")

# The OpenAI key is taken from the environment, or asked for interactively when
# it is not set. Keys must never be written into the notebook itself: anything
# saved here ends up in the file history, so a key that was ever stored in this
# file should be treated as leaked and rotated.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass  # local import: only needed for the prompt
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Data loading and partitioning

In [4]:
from pathlib import Path
import json

# Assuming you have already mounted the drive and defined:
# base_folder = Path("/content/drive/MyDrive/LLMs/MLLM_CW/0723_Shuffle")

# Location of the filtered QA file inside the shared Drive folder
json_path = base_folder / "Final_QA_JSON_filtered_0723.json"

# Read the file as UTF-8 text and parse it in one step
QASet = json.loads(json_path.read_text(encoding="utf-8"))

# Top-level keys of the QA set, in file order (run_LLM looks events up by position in this list)
EventNames = [event_name for event_name in QASet.keys()]

In [5]:
import os
import base64

def write_message(Q_current, InputFileFolder):
    """Build the chat message list for one question.

    Args:
        Q_current: mapping with the keys 'modalities', 'context', 'prompt',
            'question' and 'answer'. The last three items of 'context' are
            used as storm, year and lead time to form the input file name.
        InputFileFolder: folder that holds one sub-folder per modality.

    Returns:
        (messages, true_answer): the system prompt, one user message per
        modality (base64 PNG data URL for graphics, raw text otherwise) and
        finally the question; plus ``Q_current['answer']``.

    Raises:
        ValueError: for a modality that is neither a graphic nor a text type.
    """
    graphic_modalities = ('Graphic_Uncertainty_cone', 'Graphic_Wind')
    text_modalities = ('text_advisory', 'Table_wind')

    requested = Q_current['modalities']
    storm, year, leadtime = Q_current['context'][-3:]

    messages = [{"role": "system", "content": Q_current['prompt']}]

    for modality in requested:
        # Every modality stores its file under the same "<storm>_<year>_<lead>h" stem
        stem = f"{storm}_{year}_{leadtime}h"
        modality_dir = os.path.join(InputFileFolder, modality)

        if modality in graphic_modalities:
            with open(os.path.join(modality_dir, f"{stem}.png"), "rb") as handle:
                encoded = base64.b64encode(handle.read()).decode("utf-8")
            payload = [{
                "type": "image_url",
                "image_url": {"url": f"data:image/png;base64,{encoded}"}
            }]
        elif modality in text_modalities:
            with open(os.path.join(modality_dir, f"{stem}.txt"), "r", encoding="utf-8") as handle:
                payload = handle.read()
        else:
            raise ValueError('Unknown Modality Input!')

        messages.append({"role": "user", "content": payload})

    # The question always comes last
    messages.append({"role": "user", "content": Q_current['question']})

    return messages, Q_current['answer']

In [6]:
def run_LLM(EventIdx, InputFileFolder):
    """Ask the LLM every question of one event and save the answers as CSV.

    Uses the notebook globals ``EventNames``, ``QASet``, ``LLMName``,
    ``safe_LLMName``, ``base_folder`` and ``call_xai_llm``.

    Args:
        EventIdx: position of the event in ``EventNames``.
        InputFileFolder: folder with the per-modality inputs, passed on to
            ``write_message``.

    Side effects:
        Writes ``EvtID<idx>_<event name>.csv`` (columns ``response`` and
        ``ground_truth``, rows in original question order) under
        ``<base_folder>/MultiModalOutputs/<safe_LLMName>/``.
    """
    EventName_thisevent = EventNames[EventIdx]
    print(f"EventIdx = {EventIdx}, EventName = {EventName_thisevent}, LLM = {LLMName}")

    # Resolve and create the output location before any API call is made, so a
    # missing folder or an undefined global fails fast instead of after all the
    # responses have been collected.
    final_output_path = Path(base_folder) / "MultiModalOutputs" / safe_LLMName / f"EvtID{EventIdx}_{EventName_thisevent}.csv"
    final_output_path.parent.mkdir(parents=True, exist_ok=True)

    QASet_thisevent = QASet[EventName_thisevent]['qa']
    # Questions are asked in random order; the original index is kept so the
    # results can be put back into question order afterwards.
    QASet_thisevent_indexed = list(enumerate(QASet_thisevent))
    random.shuffle(QASet_thisevent_indexed)

    results_dict = {}

    for idx_q, q in QASet_thisevent_indexed:
        t1 = time.time()
        message_thisq, true_answer = write_message(q, InputFileFolder)
        response = call_xai_llm(message_thisq)
        t2 = time.time()

        results_dict[idx_q] = {
            "response": response,
            "ground_truth": true_answer
        }

        print(f"Evtid = {EventIdx}, Q_idx = {idx_q}, elapsed time = {t2 - t1:.4f}")

    # Restore the original question order before saving
    results_thisevent = [results_dict[idx] for idx in sorted(results_dict)]
    df = pd.DataFrame(results_thisevent)
    df.to_csv(final_output_path, index=False)

In [7]:

from xai_sdk import Client
from xai_sdk.chat import user, system

# The xAI key is read from the environment, or asked for interactively when it
# is not set. It must never be stored in the notebook: any key that was ever
# saved in this file should be treated as leaked and rotated.
XAI_API_KEY = os.environ.get("XAI_API_KEY")
if not XAI_API_KEY:
    from getpass import getpass  # local import: only needed for the prompt
    XAI_API_KEY = getpass("xAI API key: ")
client = Client(api_key=XAI_API_KEY)

# Choose model (a model id served by the xAI API, since requests go through the xAI client)
LLMName = 'grok-4-0709'
# ':' is replaced so the model name can be used as a folder name
safe_LLMName = LLMName.replace(':', '_')
InputFileFolder = base_folder / 'MultiModalInputs'

# Output path
path = Path(base_folder) / "MultiModalOutputs" / safe_LLMName
path.mkdir(parents=True, exist_ok=True)

# Wrapper function to mimic `llm.invoke()`
def _to_xai_content(content):
    """Turn one message's ``content`` into positional arguments for ``user()`` / ``system()``."""
    if not isinstance(content, list):
        return [content]

    from xai_sdk.chat import image  # local import: only multimodal messages need it

    parts = []
    for part in content:
        kind = part.get("type") if isinstance(part, dict) else None
        if kind == "image_url":
            # write_message stores images as {"image_url": {"url": <data URL>}};
            # a bare URL string is accepted as well.
            ref = part["image_url"]
            parts.append(image(image_url=ref["url"] if isinstance(ref, dict) else ref))
        elif kind == "text":
            parts.append(part["text"])
        else:
            parts.append(str(part))
    # An empty list still has to yield one (empty) text part
    return parts or [""]


def call_xai_llm(messages):
    """Send OpenAI-style ``messages`` to the xAI chat API and return the reply text.

    Args:
        messages: list of ``{"role": ..., "content": ...}`` dicts. ``content``
            is either a string or a list of parts as built by
            ``write_message``. Image parts are forwarded as real image inputs;
            previously they were stringified, so the model only received the
            text representation of the dict.

    Returns:
        The sampled response content with surrounding whitespace stripped.

    Raises:
        ValueError: for a role other than "user" or "system".
    """
    # Use the model selected in the configuration so that the model queried and
    # the output folder name (derived from LLMName) cannot drift apart.
    chat = client.chat.create(model=LLMName, temperature=0)

    for msg in messages:
        role = msg["role"]
        parts = _to_xai_content(msg["content"])

        if role == "user":
            chat.append(user(*parts))
        elif role == "system":
            chat.append(system(*parts))
        else:
            raise ValueError(f"Unsupported role: {role}")

    response = chat.sample()
    return response.content.strip()

In [8]:
# EventIdx 0 through 100 inclusive (the first 101 events), limited to the
# number of events actually loaded so a shorter QA set does not raise IndexError.
for event_index in range(min(101, len(EventNames))):
    print(f"\nRunning LLM for EventIdx = {event_index}")
    run_LLM(event_index, InputFileFolder)


Running LLM for EventIdx = 0
EventIdx = 0, EventName = ANA_2015_Port_of_Savannah,_GA, LLM = grok-4-0709
Evtid = 0, Q_idx = 4, elapsed time = 29.7537
Evtid = 0, Q_idx = 121, elapsed time = 68.2214


KeyboardInterrupt: 

In [None]:
from langchain_core.messages import HumanMessage

# ✅ STEP 2: Image Encoding and Summary Helper
# Load and encode image
def encode_image(image_path):
    """Return the image at ``image_path`` re-encoded as JPEG in base64 text form.

    The image is converted to RGB first, so any input mode can be saved as JPEG.
    """
    # Both the opened file and the converted copy are context-managed; before,
    # only the converted copy was, and the opened source image was never closed
    # explicitly.
    with Image.open(image_path) as source_img, source_img.convert("RGB") as img:
        buffered = BytesIO()
        img.save(buffered, format="JPEG")
        return base64.b64encode(buffered.getvalue()).decode("utf-8")

# Summarize cyclone scenario with structured schema
def summarize_scenario(cone_img_path, wind_img_path, advisory_text, wind_text,
                             port_name, cyclone_name):
    """Ask the model for a structured summary of one cyclone/port scenario.

    Args:
        cone_img_path: path of the uncertainty-cone graphic.
        wind_img_path: path of the wind graphic.
        advisory_text: advisory text, sent to the model verbatim.
        wind_text: wind probability text, sent to the model verbatim.
        port_name: port name, inserted into the prompt and the schema.
        cyclone_name: cyclone name, inserted into the prompt and the schema.

    Returns:
        ``msg.content`` of the model reply. A JSON-object response format is
        requested, so this is presumably a JSON string (not parsed here).
    """
    # Encode both images
    base64_cone = encode_image(cone_img_path)
    base64_wind = encode_image(wind_img_path)

    # Enhanced prompt. Doubled braces are literal braces inside the f-string.
    # The "time_to_50kt_max" annotation used to repeat the 34kt wording.
    prompt = f"""
Act as expert in port operation specialist and tropical cyclone analyst. Based on the provided graphics and texts, extract information into the following structured JSON schema. The current senario information are given:

**Port Name**: {port_name}
**Cyclone Name**: {cyclone_name}

You must extract relevant details and populate the following **structured JSON schema**. This schema supports operational decision-making and resilience analysis for ports. Carefully read the **inline annotations** next to each field — they describe exactly what must be extracted. Use all available sources: graphics for movement, proximity, and timing; text for classification, warnings, and hazard probabilities.

JSON Format:
{{
  "cyclone": {{
    "name": "{cyclone_name}",                   // Cyclone identifier
    "category": "",                             // Current SSHS classification (e.g., "Category 1")
    "current_location": "",                     // Verbal location (e.g., "about 200 miles SE of Galveston")
    "coordinates": "",                          // Lat/Lon format (e.g., "22.3N, 86.6W")
    "motion": {{
      "direction": "",                          // e.g., "WNW"
      "speed": ""                               // e.g., "14 mph"
    }},
    "trajectory": "",                           // Describe the cyclone's predicted path direction and target area (e.g., "tracking northwest toward Gulf Coast")
    "expected_landfall_location": "",           // Closest landfall or coastal encounter
    "expected_landfall_date": "",               // e.g., "2022-06-05"
    "hours_to_landfall": "",                    // e.g., "48"
    "hours_to_offshore": "",                    // Estimated number of hours until the storm moves offshore from {port_name}'s coast (e.g., "24")
    "hours_of_strike": "",                      // Estimated number of hours from the storm’s initial strike to its departure at {port_name} (e.g., "12")
  }},
  "port": {{
    "name": "{port_name}"                       // Full port name
  }},
  "weather_observation": {{
    "within_uncertainty_cone": "",              // If the port located within the uncertainty cone of the tropical cyclone, e.g. "Yes" or "No"
    "forecast_window":"",                       // How many days are there in the forecast window shown from the uncertainty cone track, e.g. 5
    "under_watch": "",                          // The watch information at the port, e.g., "Tropical Storm Watch","Hurricane Watch", "None"
    "under_warning": "",                        // The warning information at the port, e.g., "Tropical Storm Warning","Hurricane Warning", "None"
    "watch_coast": "",                          // Describe geographic extent of watch area
    "warning_coast": "",                        // Describe geographic extent of warning area
    "impacted_coast": "",                       // Describe geographic extent of coast area impacted by the tropical cyclone
    "expected_closest_date_to_port": "",        // Closest approach date
    "expected_leaving_port_date": "",           // Estimated time cyclone moves away from port influence
    "closest_wind_data_location":"",            // The closest wind location to {port_name} in wind probability table, e.g. "BOSTON MA"
    "culmulative_wind_34kt":"",                 // cumulative probability of 34kt wind arrival at {port_name} in future 5 days
    "culmulative_wind_50kt":"",                 // cumulative probability of 50kt wind arrival at {port_name} in future 5 days
    "34kt_max":"",                              // probability value when 34kt wind probability peaks at {port_name} in future 5 days
    "time_to_34kt_max":"",                      // number of hours to 34kt wind probability peaks at {port_name} in future 5 days
    "time_to_34kt_non_zero":"",                 // number of hours to 34kt wind probability become non-zero at {port_name} in future 5 days
    "time_to_50kt_max":"",                      // number of hours to 50kt wind probability peaks at {port_name} in future 5 days
    "gale_probability": {{
      "in 12h": "",                             // Estimate the gale force probability at the port from the wind forecast table in future 12, 24, 48, 72 and 96 hours e.g., "12%"
      "in 24h": "",
      "in 48h": "",
      "in 72h": "",
      "in 96h": ""
    }},
    "first_hazard":"",                          // Select the earliest possible hazard happen at port, from "TC’s landfall","Arrival of tropical storm-force winds", "Start of heavy rainfall" or "Start of storm surge"
    "rain_fall_above_4_inch":"",                // Estimate if rainfall at {port_name} could reach 4 inch in the event
    "storm_surge_above_3_feet":"",              // Estimate if surge height at {port_name} could reach 3 feet in the event
    "wind":"",                                  // Describe possible strong wind that could at {port_name}'s side, if no information is avaliable, output "None"
    "storm_surge":"",                           // Describe possible storm surge at {port_name}'s side, if no information is avaliable, output "None"
    "rainfall":"",                              // Describe possible heavy rainfall at {port_name}'s side, if no information is avaliable, output "None"
    "other_hazards": ""                         // Describe possible other hazards at {port_name}'s side that will impact operation
  }}
}}

Evaluate on how the {port_name} is impacted by {cyclone_name}, be specific and rely on graphics for movement direction, cone shape, and timing. Use the advisory and text of wind probability table for wind speed, warnings, probability values, and hazard descriptions.
"""

    # LLM call: one human message carrying the prompt, both graphics and both texts
    chat = ChatOpenAI(model="o3",max_tokens=4096,model_kwargs={"response_format":{"type":"json_object"}})
    msg = chat.invoke([
        HumanMessage(
            content=[
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_cone}"}},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_wind}"}},
                {"type": "text", "text": f"Advisory Text:\n{advisory_text}"},
                {"type": "text", "text": f"Wind Text:\n{wind_text}"}
            ]
        )
    ])
    return msg.content

In [None]:
# 🧪 Test Code: Hurricane Florance on Port of Charleston
from google.colab import drive
import pandas as pd
from pathlib import Path
from IPython.display import display
from PIL import Image

drive.mount('/content/drive')

# Root folder of the QA-generation data and the scenario index file
base_folder = Path("/content/drive/MyDrive/LLMs/QA_Generation_GPT4o")
metadata_file = base_folder / "senarios_filtered.csv"

# Scenario metadata table
senario_metadata = pd.read_csv(metadata_file)

# One sub-folder per modality
cone_graphic_dir = base_folder / "Cyclone Graphics Archive Uncertainty Cone"
wind_graphic_dir = base_folder / "Cyclone Graphics Archive Wind"
advisory_text_dir = base_folder / "Cyclone Text Archive Advisory"
wind_text_dir = base_folder / "Cyclone Text Archive Wind"

# Preview only the first scenario of the table
for idx, row in senario_metadata.head(1).iterrows():
    name, year, hour = row['tc_name'], int(row['year']), int(row['hour'])

    # Graphics are looked up by stem because their extension is not fixed
    cone_image = cone_graphic_dir / f"{name}_{year}_{hour}h"
    matches_cone_image = list(cone_image.parent.glob(cone_image.name + ".*"))
    wind_image = wind_graphic_dir / f"{name}_{year}_{hour}h"
    matches_wind_image = list(wind_image.parent.glob(wind_image.name + ".*"))
    # Text inputs always use the .txt extension
    advisory_text_file = advisory_text_dir / f"{name}_{year}_{hour}h.txt"
    wind_text_file = wind_text_dir / f"{name}_{year}_{hour}h.txt"

    print(f"\n📍 Scenario {idx + 1}: {row['port_name']} - {name} (Hour {hour} before landfall)")

    # Cone graphic: show a 400 px wide thumbnail with the aspect ratio kept
    print(f"🌀 Cone Image Path: {cone_image}")
    if not matches_cone_image:
        print("⚠️ Cone image not found.")
    else:
        img = Image.open(matches_cone_image[0])
        resized_img = img.resize((400, int(img.height * 400 / img.width)))
        display(resized_img)

    # Wind graphic: same thumbnail treatment
    print(f"💨 Wind Image Path: {wind_image}")
    if not matches_wind_image:
        print("⚠️ Wind image not found.")
    else:
        img = Image.open(matches_wind_image[0])
        resized_img = img.resize((400, int(img.height * 400 / img.width)))
        display(resized_img)

    # Advisory text: print the first 500 characters
    if not advisory_text_file.exists():
        print("⚠️ Advisory text file not found.")
    else:
        advisory_text = advisory_text_file.read_text()
        print(f"📄 Advisory Text:\n{advisory_text[:500]}...")

    # Wind text: print the first 500 characters
    if not wind_text_file.exists():
        print("⚠️ Wind text file not found.")
    else:
        wind_text = wind_text_file.read_text()
        print(f"🌬️ Wind Text:\n{wind_text[:500]}...")

In [None]:
# Bare expression: the notebook renders the scenario metadata table as this cell's output.
senario_metadata

In [None]:
import json
from pathlib import Path

# Summaries are written to "summary_JSON" inside the data folder on Drive.
# The folder that is actually written to is created here; before, a relative
# "summary_JSON" folder was created instead, so every save failed (after the
# model call had already been made) whenever the Drive folder did not exist.
base_folder = Path("/content/drive/MyDrive/LLMs/QA_Generation_GPT4o")
summary_JSON = base_folder / "summary_JSON"
summary_JSON.mkdir(parents=True, exist_ok=True)

# Select for experiment: only rows from position 2570 onward.
# NOTE(review): 2570 is a magic number, presumably a resume point after an
# earlier partial run; confirm before re-running.
subset_df = senario_metadata.iloc[2570:]

# (tc_name, year) pairs that are left out of this run
excluded_pairs = [
    ('ELSA', 2021),
    ('ETA', 2020),
    ('FRED', 2021),
    ('IRMA', 2017),
    ('ISAIAS', 2020),
    ('LAURA', 2020),
    ('MATTHEW', 2016)
]
subset_df = subset_df[~subset_df[['tc_name', 'year']].apply(tuple, axis=1).isin(excluded_pairs)]

# Iterate through scenarios
for idx, row in subset_df.iterrows():
    name = row['tc_name']
    year = int(row['year'])
    hour = int(row['hour'])

    # Graphics are looked up by stem because their extension is not fixed;
    # the text inputs always use .txt.
    cone_image = cone_graphic_dir / f"{name}_{year}_{hour}h"
    matches_cone_image = list(cone_image.parent.glob(cone_image.name + ".*"))
    wind_image = wind_graphic_dir / f"{name}_{year}_{hour}h"
    matches_wind_image = list(wind_image.parent.glob(wind_image.name + ".*"))
    advisory_text_file = advisory_text_dir / f"{name}_{year}_{hour}h.txt"
    wind_text_file = wind_text_dir / f"{name}_{year}_{hour}h.txt"

    print(f"\n📍 Scenario {idx + 1}: {row['port_name']} - {name} (Hour {hour} before landfall)")

    # Report every missing input. The images are no longer opened and resized
    # here, because those thumbnails were never displayed or used.
    advisory_found = advisory_text_file.exists()
    wind_text_found = wind_text_file.exists()
    if not matches_cone_image:
        print("⚠️ Cone image not found.")
    if not matches_wind_image:
        print("⚠️ Wind image not found.")
    if not advisory_found:
        print("⚠️ Advisory text file not found.")
    if not wind_text_found:
        print("⚠️ Wind text file not found.")

    # A scenario is summarized only when all four inputs are present
    if not (advisory_found and wind_text_found and matches_cone_image and matches_wind_image):
        continue

    cone_img_path = matches_cone_image[0]
    wind_img_path = matches_wind_image[0]

    # Each text file is read once (it used to be read twice per scenario)
    with open(advisory_text_file, 'r') as f1, open(wind_text_file, 'r') as f2:
        advisory_text = f1.read()
        wind_text = f2.read()

    try:
        summary = summarize_scenario(
            cone_img_path=cone_img_path,
            wind_img_path=wind_img_path,
            advisory_text=advisory_text,
            wind_text=wind_text,
            port_name=row['port_name'],
            cyclone_name=row['tc_name']
        )

        print("🧠 Structured Summary:\n", summary)

        # Create safe filenames
        tc_name_safe = str(row['tc_name']).replace(" ", "_")
        port_name_safe = str(row['port_name']).replace(" ", "_")
        output_path = summary_JSON / f"{tc_name_safe}_{year}_{port_name_safe}_{hour}_summary.json"

        # Save summary: dicts are serialized, anything else is written as returned
        with open(output_path, 'w') as f:
            if isinstance(summary, dict):
                json.dump(summary, f, indent=2)
            else:
                f.write(summary)

        print(f"✅ JSON saved to: {output_path}")

    except Exception as e:
        # Best effort: a failed scenario is reported and the batch continues
        print(f"❌ Skipped due to error: {type(e).__name__} - {e}")

In [None]:
# Impact prediction


In [None]:
# Scenario Encoding


In [None]:
# Vessel routing


### Iterative code

In [None]:
# ✅ STEP 3: Scenario multimodal data organization
from google.colab import drive
import pandas as pd

drive.mount('/content/drive')

# Root folder of the QA-generation data and the scenario index file
base_folder = Path("/content/drive/MyDrive/LLMs/QA_Generation_GPT4o")
metadata_file = base_folder / "Cyclone Senarios.csv"

# Scenario metadata table
senario_metadata = pd.read_csv(metadata_file)

# One sub-folder per modality
cone_graphic_dir = base_folder / "Cyclone Graphics Archive Uncertainty Cone"
wind_graphic_dir = base_folder / "Cyclone Graphics Archive Wind"
advisory_text_dir = base_folder / "Cyclone Text Archive Advisory"
wind_text_dir = base_folder / "Cyclone Text Archive Wind"

# Walk every scenario and preview its inputs
for idx, row in senario_metadata.iterrows():
    name, day = row['NAME'], int(row['Day_before_Landfall'])

    # Files here are named "<NAME>_Day_<day>": .png for graphics, .txt for text
    cone_image = cone_graphic_dir / f"{name}_Day_{day}.png"
    wind_image = wind_graphic_dir / f"{name}_Day_{day}.png"
    advisory_text_file = advisory_text_dir / f"{name}_Day_{day}.txt"
    wind_text_file = wind_text_dir / f"{name}_Day_{day}.txt"

    print(f"\n📍 Scenario {idx + 1}: {row['PORT']} - {name} (Day {day} before landfall)")

    # Graphic paths are only printed; the images are not opened
    print(f"🌀 Cone Image Path: {cone_image}")
    print(f"💨 Wind Image Path: {wind_image}")

    # Advisory text: print the first 500 characters
    if not advisory_text_file.exists():
        print("⚠️ Advisory text file not found.")
    else:
        advisory_text = advisory_text_file.read_text()
        print(f"📄 Advisory Text:\n{advisory_text[:500]}...")

    # Wind text: print the first 500 characters
    if not wind_text_file.exists():
        print("⚠️ Wind text file not found.")
    else:
        wind_text = wind_text_file.read_text()
        print(f"🌬️ Wind Text:\n{wind_text[:500]}...")