## USCG Condition Social Media Data

In [54]:
import os, requests, time
from datetime import datetime

# -- API CONFIG -----------------------------------------------------------------
# The key is read from the environment only: a literal fallback would ship the
# credential with every copy of this notebook.
# NOTE(review): the key that used to be hard-coded here should be treated as
# leaked and rotated.
API_KEY = os.getenv("TWITTERAPI_KEY", "")
if not API_KEY:
    print("⚠️  TWITTERAPI_KEY is not set – requests will carry an empty X-API-Key header")
ENDPOINT = "https://api.twitterapi.io/twitter/tweet/advanced_search"
HEADERS  = {"X-API-Key": API_KEY}

# -- ACCOUNTS -------------------------------------------------------------------
# Handles (without the leading "@") that are searched, in this order.
uscg_accounts = [
    "USCGSoutheast",
    "ComdtUSCG",
    "USCG",
    "USCoastGuard",
    "USCGAcademy",
    "AMVER",
    "USCGAlaska",
    "USCGGreatLakes",
    "USCGHawaiiPac",
    "USCGHeartland",
    "USCGLANTAREA",
    "USCGMidAtlantic",
    "USCGNortheast",
    "USCGPacificNW",
    "MaritimeCommons",
]

# -- FETCH FUNCTION -------------------------------------------------------------
def search_tweets(account):
    """Return every tweet from one account that mentions "Port Condition".

    Pages through the advanced-search endpoint, following ``next_cursor`` for
    as long as the response reports ``has_next_page``. Collection is
    best-effort: when a request fails or the body cannot be decoded, the
    failure is printed and the tweets gathered so far are returned.

    Args:
        account: Handle to search, without the leading "@".

    Returns:
        List of tweet objects exactly as they appear under the response's
        ``"tweets"`` key; empty when nothing matched or the first request
        failed.
    """
    tweets_for_handle = []
    params = {
        "query": f'from:{account} "Port Condition"',
        "queryType": "Latest",
        "cursor": "",
    }

    while True:
        try:
            resp = requests.get(ENDPOINT, headers=HEADERS, params=params, timeout=15)
            resp.raise_for_status()
            data = resp.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a body that is not valid JSON.
            print(f"⚠️  API error for @{account}: {e}")
            break

        if not isinstance(data, dict):
            print(f"⚠️  API error for @{account}: unexpected payload of type {type(data).__name__}")
            break

        # `or []` also covers an explicit null under "tweets".
        tweets_for_handle.extend(data.get("tweets") or [])

        if not data.get("has_next_page"):
            break

        next_cursor = data.get("next_cursor") or ""
        if not next_cursor or next_cursor == params["cursor"]:
            # Without a fresh cursor the same page would be requested forever.
            break

        params["cursor"] = next_cursor
        time.sleep(0.3)       # gentle rate‑limit for each page

    if tweets_for_handle:
        print(f"✅ {len(tweets_for_handle)} tweets from @{account}")
    else:
        print(f"— No tweets found for @{account}")

    return tweets_for_handle

# -- MAIN LOOP ------------------------------------------------------------------
# Gather the matches of every handle into one flat list, in account order.
all_tweets = []
for acc in uscg_accounts:
    fetched = search_tweets(acc)
    all_tweets += fetched
    time.sleep(1.0)           # polite gap between different accounts

total_collected = len(all_tweets)
print(f"\n📦 Total tweets collected: {total_collected}")

✅ 199 tweets from @USCGSoutheast
— No tweets found for @ComdtUSCG
✅ 19 tweets from @USCG
✅ 139 tweets from @USCoastGuard
— No tweets found for @USCGAcademy
— No tweets found for @AMVER
— No tweets found for @USCGAlaska
— No tweets found for @USCGGreatLakes
✅ 17 tweets from @USCGHawaiiPac
✅ 10 tweets from @USCGHeartland
✅ 2 tweets from @USCGLANTAREA
✅ 43 tweets from @USCGMidAtlantic
✅ 9 tweets from @USCGNortheast
— No tweets found for @USCGPacificNW
✅ 1 tweets from @MaritimeCommons

📦 Total tweets collected: 439


## Scenario Encoding for Tweets

In [55]:
#!/usr/bin/env python3
"""
Download the photos attached to the collected USCG port‑condition tweets,
parse each tweet that has photos with the OpenAI o3 model,
and write the structured results to a CSV.
"""

import os
import csv
import json
import base64
from pathlib import Path
from datetime import datetime
import requests
import openai                     # pip install openai

# ========== USER CONFIG ==========

# The key is taken from the environment so it never lives in the notebook.
# NOTE(review): the key that used to be hard-coded here should be treated as
# leaked and rotated.
openai.api_key = os.getenv("OPENAI_API_KEY")
if not openai.api_key:
    # Explicit raise instead of `assert`, which is stripped under `python -O`.
    raise RuntimeError("Missing OPENAI_API_KEY environment variable")

# ========== I/O SETUP ==========

# Folder for downloaded tweet photos; created if absent, reused otherwise.
image_dir = Path("downloaded_images")
image_dir.mkdir(exist_ok=True)

# Output file plus its column order: raw tweet columns first, then the
# columns filled from the model's answer.
csv_path = Path("uscg_port_conditions_enriched.csv")
_tweet_columns = ["id", "time_utc", "text", "photos"]
_parsed_columns = [
    "CycloneName",
    "Year",
    "PortList",
    "Condition",
    "ConditionEffectiveTime",
    "VesselOps",
    "OceanVesselOps",
    "FacilityOps",
]
fieldnames = _tweet_columns + _parsed_columns

# ========== GPT HELPER ==========

def encode_image_b64(path: Path) -> str:
    """Return the contents of the file at *path* as base64 text."""
    raw_bytes = path.read_bytes()
    return base64.b64encode(raw_bytes).decode()


def call_gpt_o3(tweet_text: str, tweet_time_utc: str, image_paths: list[Path]) -> dict:
    """Ask the o3 model to extract port-condition fields from one tweet.

    A single user message is sent that contains the prompt (tweet timestamp and
    text included) followed by every image in *image_paths* as a base64 data
    URL.

    Args:
        tweet_text: Text of the tweet.
        tweet_time_utc: Timestamp string inserted into the prompt.
        image_paths: Local image files to attach.

    Returns:
        The model's reply parsed as a JSON object.

    Raises:
        ValueError: If the reply is empty, is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError``) or is not a JSON
            object.
    """
    import mimetypes  # local import: only this function needs it

    prompt = f"""
Tweet time (UTC): {tweet_time_utc}

Given the tweet content and any attached USCG image(s), extract and return the following **as a JSON object**:
- Tropical Cyclone Name             (single word, e.g. "HARVEY")
- Year of the post                  (format: "YYYY")
- Port List                         (array of single‑word port names, and regions should find all ports, e.g. keywest is not a single port, find all ports in keywest then output the list, e.g. ["Miami", "Houston"])
- Condition                         (e.g. "WHISKEY", "YANKEE")
- Condition effective time          (assume 12 h before tweet time if not explicitly stated; format "YYYY‑MM‑DD HH:00")
- Adopted operations for vessels    (describe operations need for all vessels, extract from text or photo)
- Adopted operations for ocean‑going vessels     (describe operations need for ocean‑going and commercial vessels, extract from text or photo)
- Adopted operations for ports and facilities    (describe operations need for ports and facilities, extract from text or photo)

Tweet text:
{tweet_text}

Return **only** the JSON object – no prose, no markdown.
""".strip()

    # Build one multimodal "user" message containing text + all images
    content_blocks = [{"type": "text", "text": prompt}]
    for p in image_paths:
        # Label the data URL with the file's real type (e.g. image/png) instead
        # of always claiming JPEG; fall back to JPEG when the type is unknown.
        mime = mimetypes.guess_type(p.name)[0]
        if not mime or not mime.startswith("image/"):
            mime = "image/jpeg"
        content_blocks.append(
            {
                "type": "image_url",
                "image_url": {"url": f"data:{mime};base64,{encode_image_b64(p)}"},
            }
        )

    resp = openai.chat.completions.create(
        model="o3",
        messages=[
            {
                "role": "system",
                "content": (
                    "You are a port‑operations analyst extracting structured alert information."
                ),
            },
            {"role": "user", "content": content_blocks},
        ],
    )

    raw = (resp.choices[0].message.content or "").strip()
    if not raw:
        raise ValueError("Model returned an empty reply")
    if raw.startswith("```"):
        # Tolerate a reply wrapped in a Markdown code fence (```json ... ```).
        raw = raw.strip("`").strip()
        if raw[:4].lower() == "json":
            raw = raw[4:].lstrip()

    parsed = json.loads(raw)
    if not isinstance(parsed, dict):
        raise ValueError("Model reply is not a JSON object")
    return parsed


# ========== MAIN LOOP ==========

# Imported here because the cell's import lines bring in only `datetime` and `os`.
from datetime import timezone
from urllib.parse import urlparse

# The prompt spells some field names with U+2011 (non-breaking hyphen) while
# the model may answer with a plain "-". Keys are normalised to "-" before
# lookup so those fields are not silently left blank.
_NB_HYPHEN = "\u2011"

# For every collected tweet: normalise the timestamp, download its photos, ask
# the model for structured fields when at least one photo was saved, and write
# one CSV row (model columns stay empty when there is no photo or the call fails).
with csv_path.open("w", newline="", encoding="utf-8") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    for row_id, tw in enumerate(all_tweets, start=1):
        # --- time ---
        raw_time = tw.get("createdAt", "")
        try:
            dt = datetime.strptime(raw_time, "%a %b %d %H:%M:%S %z %Y")
            # Convert to UTC so the value matches the column name whatever
            # offset the source string carries.
            time_utc = dt.astimezone(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
        except (ValueError, TypeError):
            # Unparseable or missing timestamp: keep the raw value.
            time_utc = raw_time

        # --- text ---
        tweet_text = (tw.get("text") or "").replace("\n", " ").strip()

        # --- download images ---
        photos: list[str] = []
        image_paths: list[Path] = []
        media_items = (tw.get("extendedEntities") or {}).get("media") or []
        for m_idx, media in enumerate(media_items, 1):
            if media.get("type") != "photo":
                continue
            url = media.get("media_url_https")
            if not url:
                # Nothing to download; os.path.splitext(None) would raise.
                continue
            # Take the extension from the URL path so a query string cannot
            # end up in the local file name.
            ext = os.path.splitext(urlparse(url).path)[1] or ".jpg"
            local_path = image_dir / f"{row_id}_{m_idx}{ext}"
            try:
                r = requests.get(url, timeout=15)
                r.raise_for_status()
                local_path.write_bytes(r.content)
                photos.append(local_path.name)
                image_paths.append(local_path)
            except (requests.RequestException, OSError) as e:
                print(f"⚠️  Image download failed ({url}): {e}")

        # --- row with empty model columns; filled below when the call succeeds ---
        record = {
            "id": row_id,
            "time_utc": time_utc,
            "text": tweet_text,
            "photos": "; ".join(photos),
            "CycloneName": "",
            "Year": "",
            "PortList": "",
            "Condition": "",
            "ConditionEffectiveTime": "",
            "VesselOps": "",
            "OceanVesselOps": "",
            "FacilityOps": "",
        }

        # --- GPT call (only if we have at least one image) ---
        if image_paths:
            try:
                parsed = call_gpt_o3(tweet_text, time_utc, image_paths)
                print(f"\n📤 GPT Raw Response for tweet {row_id}:\n{json.dumps(parsed, indent=2)}\n")

                fields = {str(k).replace(_NB_HYPHEN, "-"): v for k, v in parsed.items()}

                ports = fields.get("Port List") or []
                if isinstance(ports, str):
                    # A bare string would otherwise be joined character by character.
                    ports = [ports]

                record.update(
                    {
                        "CycloneName": fields.get("Tropical Cyclone Name", ""),
                        # The prompt asks for "Year of the post"; accept either spelling.
                        "Year": fields.get("Year", fields.get("Year of the post", "")),
                        "PortList": ", ".join(str(p) for p in ports),
                        "Condition": fields.get("Condition", ""),
                        "ConditionEffectiveTime": fields.get(
                            "Condition effective time", ""
                        ),
                        "VesselOps": fields.get("Adopted operations for vessels", ""),
                        "OceanVesselOps": fields.get(
                            "Adopted operations for ocean-going vessels", ""
                        ),
                        "FacilityOps": fields.get(
                            "Adopted operations for ports and facilities", ""
                        ),
                    }
                )
            except Exception as e:
                # Per-row boundary: report the failure and still write the row.
                print(f"⚠️  GPT parsing error on tweet {row_id}: {e}")

        writer.writerow(record)

print(f"✅ CSV written: {csv_path.resolve()}")
print(f"Images saved under: {image_dir.resolve()}")


📤 GPT Raw Response for tweet 1:
{
  "Tropical Cyclone Name": "RAFAEL",
  "Year": "2024",
  "Port List": [
    "Tampa",
    "Manatee",
    "StPetersburg"
  ],
  "Condition": "WHISKEY",
  "Condition effective time": "2024-11-04 11:00",
  "Adopted operations for vessels": "Vessels wishing to remain in port must obtain Captain-of-the-Port approval and submit a written safe-mooring plan; vessels unable to offload and depart within 24 hours of threatening winds should divert to an alternate port.",
  "Adopted operations for ocean-going vessels": "All ocean-going vessels, barges, and pleasure craft should make plans to depart the port or seek safe harbor.",
  "Adopted operations for ports and facilities": "Ports and facilities remain open to all commercial traffic and cargo transfer operations may continue while Condition WHISKEY is in effect."
}


📤 GPT Raw Response for tweet 2:
{
  "Tropical Cyclone Name": "EIGHTEEN",
  "Year": "2024",
  "Port List": [
    "KeyWest"
  ],
  "Condition": "X-

## Extra Fill-in

In [59]:
"""
fill_port_conditions.py  –  auto‑fills CycloneName, PortList, Condition
-----------------------------------------------------------------------
USAGE:
    export OPENAI_API_KEY="sk‑..."
    python fill_port_conditions.py  input.csv  output_filled.csv
"""

import sys, json, os, hashlib, time
import pandas as pd
import openai

# ---------------------------------------------------------------------
# 0. CLI / config
# ---------------------------------------------------------------------
if len(sys.argv) != 3:
    sys.exit("Usage: python fill_port_conditions.py  input.csv  output.csv")

INPUT_CSV  = sys.argv[1]
OUTPUT_CSV = sys.argv[2]
MODEL      = "gpt-4o-mini"       # or "gpt-4o" if you have access
# The key is taken from the environment (see USAGE) so it never lives in the file.
# NOTE(review): the key that used to be hard-coded here should be treated as
# leaked and rotated.
openai.api_key = os.getenv("OPENAI_API_KEY")
if not openai.api_key:
    # Explicit raise instead of `assert`, which is stripped under `python -O`.
    raise RuntimeError("Set OPENAI_API_KEY in your environment!")

# ---------------------------------------------------------------------
# 1. Load data
# ---------------------------------------------------------------------
# Columns that get filled in when blank.
target_cols = ["CycloneName", "PortList", "Condition"]

# NOTE(review): the file name is fixed here; INPUT_CSV from the command line is
# not used for loading — confirm whether that is intended.
df = pd.read_csv("uscg_port_conditions.csv")

# Cast the target columns to object dtype before values are written into them.
df = df.astype({col: "object" for col in target_cols})
# ---------------------------------------------------------------------
# 2. Helper:  ask GPT‑4o + simple cache
# ---------------------------------------------------------------------
# Results already obtained during this run, keyed by the MD5 hex digest of the text.
cache = {}


def md5(s: str) -> str:
    """Return the hexadecimal MD5 digest of *s* (encoded with ``str.encode()``)."""
    hasher = hashlib.md5()
    hasher.update(s.encode())
    return hasher.hexdigest()

def extract_fields(text: str) -> dict:
    """Return {'CycloneName': str, 'PortList': [..], 'Condition': str}.

    Results are memoised in ``cache`` under the MD5 of *text*, so the same text
    is sent to the model at most once per run.

    Args:
        text: Tweet text to analyse.

    Returns:
        The parsed JSON object, or ``{}`` when the reply could not be parsed or
        lacks a required key (the empty result is cached as well).

    Raises:
        openai.APIError: If the API call fails on both attempts
            (``openai.RateLimitError`` is caught alongside it).
    """
    key = md5(text)
    if key in cache:
        return cache[key]

    system_msg = (
        "You are a maritime operations analyst. "
        "When given a Coast‑Guard style tweet (text), "
        "extract:\n"
        "  • CycloneName   – single word, uppercase (e.g., HARVEY)\n"
        "  • PortList      – JSON array of single‑word port names ONLY\n"
        "  • Condition     – one of WHISKEY, X‑RAY, YANKEE, ZULU, FOUR, etc.\n\n"
        "Return a *single line* of valid JSON with keys exactly: "
        "CycloneName, PortList, Condition."
    )
    user_msg = f"TEXT:\n{text}"

    # One retry on RateLimit / transient errors
    for attempt in (1, 2):
        try:
            rsp = openai.chat.completions.create(
                model=MODEL,
                temperature=0,
                messages=[
                    {"role": "system", "content": system_msg},
                    {"role": "user",   "content": user_msg}
                ]
            )
            raw = (rsp.choices[0].message.content or "").strip()
            if raw.startswith("```"):
                # Tolerate a reply wrapped in a Markdown code fence, which would
                # otherwise be cached permanently as a parse failure.
                raw = raw.strip("`").strip()
                if raw[:4].lower() == "json":
                    raw = raw[4:].lstrip()

            j = json.loads(raw)
            # Basic sanity checks
            if not isinstance(j, dict):
                raise ValueError("Reply is not a JSON object")
            if not all(k in j for k in ("CycloneName", "PortList", "Condition")):
                raise ValueError("Missing key(s) in JSON")
            cache[key] = j
            return j

        except (openai.RateLimitError, openai.APIError):
            if attempt == 2:
                raise
            print("API busy – sleeping 20 s and retrying…", flush=True)
            time.sleep(20)
        except Exception as e:
            print("Parse error for text:\n", text, "\n→", e)
            cache[key] = {}          # avoid infinite loop on same bad row
            return {}

# ---------------------------------------------------------------------
# 3. Loop over rows, fill in blanks
# ---------------------------------------------------------------------
# For every row with at least one blank target column, ask the model for the
# fields and write the answers into the blank cells only; values that are
# already present are left untouched.
for idx, row in df.iterrows():
    blank_cols = [
        c for c in target_cols
        if pd.isna(row[c]) or str(row[c]).strip() == ""
    ]
    if not blank_cols:
        continue

    info = extract_fields(str(row["text"]))
    if not info:
        continue

    # --- write results back -----------------------------------------
    # Cast PortList (list) → nice string for the CSV
    ports = info["PortList"] or []
    if isinstance(ports, str):
        # A bare string would otherwise be joined character by character.
        ports = [ports]
    cleaned = {
        "CycloneName": info["CycloneName"],
        "PortList"   : ", ".join(str(p) for p in ports),   # "Miami, Houston"
        "Condition"  : info["Condition"]
    }
    print(cleaned)
    for col in blank_cols:
        # Store the cleaned value: writing info[col] would put the raw list
        # into the PortList cell instead of the joined string.
        df.at[idx, col] = cleaned[col]

# ---------------------------------------------------------------------
# 4. Save result
# ---------------------------------------------------------------------
# Write the table to the output path without the index column, then report it.
df.to_csv(path_or_buf=OUTPUT_CSV, index=False)
print(f"✅  Finished!  Filled CSV written to {OUTPUT_CSV}")

{'CycloneName': 'NORMAL', 'PortList': '', 'Condition': 'NORMAL'}
{'CycloneName': 'HONE', 'PortList': 'HAWAII', 'Condition': 'ZULU'}
{'CycloneName': 'HONE', 'PortList': 'OAHU, MAUI, HAWAII', 'Condition': 'X-RAY'}
{'CycloneName': 'HONE', 'PortList': 'HAWAII', 'Condition': 'WHISKEY'}
{'CycloneName': 'FREDRIKSTED', 'PortList': 'VIRGIN, ST, CROIX', 'Condition': 'FOUR'}
{'CycloneName': 'ERNESTO', 'PortList': 'PUERTO, RICO, VIRGIN, ISLANDS', 'Condition': 'FOUR'}
{'CycloneName': 'ERNESTO', 'PortList': 'VIRGIN, PUERTO', 'Condition': 'YANKEE'}
{'CycloneName': '', 'PortList': '', 'Condition': 'FOUR'}
{'CycloneName': 'TAMMY', 'PortList': 'SANJUAN, USVIRGINISLANDS, PUERTORICO', 'Condition': 'FOUR'}
{'CycloneName': 'LEE', 'PortList': '', 'Condition': 'X-RAY'}
{'CycloneName': 'LEE', 'PortList': 'PUERTO, VIRGINS', 'Condition': 'WHISKEY'}
{'CycloneName': 'IDALIA', 'PortList': 'MOREHEAD', 'Condition': 'ZULU'}
{'CycloneName': 'IDALIA', 'PortList': 'WILMINGTON', 'Condition': 'ZULU'}
{'CycloneName': 'FRANK

In [61]:
# Extra copy of the current table under a fixed name, without the index column.
df.to_csv("enriched.csv", index=False)