In [1]:
import boto3
import pandas as pd
import json
import time
import random
from tqdm import tqdm
from botocore.exceptions import ClientError

# -------------------
# 1. Load transcripts
# -------------------
# The CSV is read from the current working directory; the row count is echoed
# so a reader can confirm the expected number of calls was picked up.
df = pd.read_csv("call_transcripts.csv")
print(f"Loaded {len(df)} transcripts")

# -------------------
# 2. Connect to AWS Bedrock once
# -------------------
# A single module-level runtime client is created here and shared by every
# model call made further down in this cell.
bedrock = boto3.client("bedrock-runtime", region_name="us-east-1")

# -------------------
# 3. Summarize + categorize with retry/backoff
# -------------------
def _parse_summary_output(text_out):
    """Split raw model text into ``(summary, category, location)``.

    The model is asked for ``Emergency:`` / ``Key Details:`` / ``Category:``
    lines but may decorate them with bullets, markdown bold markers or literal
    brackets, and may append extra text after the category. This helper
    tolerates those variations so the values used as grouping keys stay clean.

    Returns:
        tuple[str, str, str]: the summary text (everything before the first
        ``Category:`` marker), the category label (``"Uncategorized"`` when
        missing) and the location (``"Unknown Location"`` when missing).
    """
    category = "Uncategorized"
    location = "Unknown Location"

    if "Category:" in text_out:
        summary_part, _, category_part = text_out.partition("Category:")
        # Drop bullet/markdown residue left dangling in front of the marker
        # (e.g. the "**" of "**Category:**" or the "-" of "- Category:").
        summary = summary_part.rstrip(" \t\r\n*-•").lstrip()
        # Only the first non-empty line is the label; anything after it is
        # commentary that must not leak into the grouping key.
        first_line = next(
            (ln for ln in category_part.splitlines() if ln.strip()), ""
        )
        label = first_line.strip(" \t*[]")
        if label:
            category = label
    else:
        summary = text_out.strip()

    # Crude location extraction from the "Emergency: <type> at <location> — ..." line.
    for line in summary.splitlines():
        cleaned = line.lstrip(" \t*-•")  # tolerate "- Emergency:" / "**Emergency:**"
        if cleaned.startswith("Emergency:") and " at " in cleaned:
            # Split on the FIRST " at " only, so locations that themselves
            # contain " at " (e.g. intersections) are not truncated.
            after_at = cleaned.split(" at ", 1)[1]
            candidate = after_at.split("—", 1)[0].strip(" \t*")
            if candidate:
                location = candidate

    return summary, category, location


def summarize_transcript(transcript, retries=5, backoff=2):
    """Summarize and categorize a single 911 call transcript via Bedrock.

    Args:
        transcript: Text of the call. Non-string or blank values (for example
            NaN from an empty CSV cell) are rejected without calling the model.
        retries: Maximum number of model invocations when throttled.
        backoff: Base of the exponential delay (``backoff ** attempt`` seconds
            plus up to one second of random jitter) between throttled attempts.

    Returns:
        tuple[str, str, str]: ``(summary, category, location)``. On any failure
        the summary is an ``"Error: ..."`` string and the other two values are
        ``"Uncategorized"`` and ``"Unknown Location"``; no exception is raised.
    """
    if not isinstance(transcript, str) or not transcript.strip():
        return "Error: Empty or non-text transcript", "Uncategorized", "Unknown Location"

    prompt = f"""Summarize this 911 call in two bullets for responders. 
Always include the location exactly as shown in the dialogue in BOTH bullets. 
Also classify the emergency into one of these categories:
[Medical, Fire, Traffic Accident, Crime, Domestic Disturbance, Other]. 

Format:
Emergency: [type] at [location] — [people], urgency [High/Medium/Low]
Key Details: [critical responder facts, including the caller's location]
Category: [category label]

911 Call Transcript: {transcript}

Provide a concise, structured summary for emergency responders."""

    for attempt in range(retries):
        try:
            response = bedrock.invoke_model(
                modelId="us.anthropic.claude-3-7-sonnet-20250219-v1:0",
                body=json.dumps({
                    "anthropic_version": "bedrock-2023-05-31",
                    "max_tokens": 300,
                    "temperature": 0.3,
                    "messages": [{"role": "user", "content": prompt}]
                })
            )

            result = json.loads(response['body'].read())
            return _parse_summary_output(result["content"][0]["text"])

        except ClientError as e:
            # Use .get so an unexpected error payload cannot raise inside the handler.
            code = e.response.get("Error", {}).get("Code")
            if code != "ThrottlingException":
                return f"Error: {e}", "Uncategorized", "Unknown Location"
            if attempt == retries - 1:
                # No attempt follows this one, so sleeping would only waste time.
                break
            sleep_time = backoff ** attempt + random.uniform(0, 1)
            print(f"⚠️ Throttled. Retrying in {sleep_time:.1f}s...")
            time.sleep(sleep_time)
        except Exception as e:
            return f"Error: {e}", "Uncategorized", "Unknown Location"

    return "Error: Max retries exceeded", "Uncategorized", "Unknown Location"

# -------------------
# 4. Consolidate multiple transcripts into one summary per group
# -------------------
def consolidate_group_summary(transcripts, retries=5, backoff=2):
    """Merge several transcripts of one incident into a single model-written summary.

    Args:
        transcripts: Sequence of transcript texts belonging to the same group.
            Non-string or blank entries are ignored, and at most the first 10
            usable transcripts are sent to keep the prompt size bounded.
        retries: Maximum number of model invocations when throttled.
        backoff: Base of the exponential delay (``backoff ** attempt`` seconds
            plus up to one second of random jitter) between throttled attempts.

    Returns:
        str: The model's consolidated summary, or an ``"Error: ..."`` string
        when there is nothing to summarize, the call fails, or throttling
        persists through every attempt. No exception is raised for API errors.
    """
    # Filter before joining: a NaN entry would make str.join raise TypeError.
    usable = [t for t in transcripts if isinstance(t, str) and t.strip()][:10]
    if not usable:
        return "Error: No transcripts to consolidate"

    transcripts_text = "\n\n".join(usable)  # limit to avoid token issues
    prompt = f"""You are a 911 call assistant.
Summarize the following transcripts about the same incident into a single concise summary.
Include:
- Emergency line (type + location + people + urgency)
- Key Details (critical facts + location)
- Category

Transcripts:
{transcripts_text}
"""
    for attempt in range(retries):
        try:
            response = bedrock.invoke_model(
                modelId="us.anthropic.claude-3-7-sonnet-20250219-v1:0",
                body=json.dumps({
                    "anthropic_version": "bedrock-2023-05-31",
                    "max_tokens": 400,
                    "temperature": 0.3,
                    "messages": [{"role": "user", "content": prompt}]
                })
            )
            result = json.loads(response['body'].read())
            return result["content"][0]["text"]
        except ClientError as e:
            # Only throttling is worth waiting out; other API errors will not
            # fix themselves, so surface them instead of retrying blindly.
            if e.response.get("Error", {}).get("Code") != "ThrottlingException":
                return f"Error: {e}"
        except Exception as e:
            # Malformed responses and similar failures are not retryable either.
            return f"Error: {e}"

        # Reached only after a throttled attempt; skip the wait when no
        # further attempt will follow.
        if attempt < retries - 1:
            time.sleep(backoff ** attempt + random.uniform(0, 1))

    return "Error: Max retries exceeded"

# -------------------
# 5. Sequential processing with grouping
# -------------------
def process_with_grouping(df):
    """Summarize every transcript, bucket by (category, location), then consolidate.

    Each row's ``"Transcription"`` value is passed to ``summarize_transcript``;
    the returned category and location form the bucket key, and the raw
    transcript is appended to that bucket. Once all rows are processed, each
    bucket receives a ``"Combined Summary"`` from ``consolidate_group_summary``.

    Args:
        df: DataFrame with a ``"Transcription"`` column.

    Returns:
        dict: Maps ``(category, location)`` to a dict with the keys
        ``"Category"``, ``"Location"``, ``"Transcripts"`` and
        ``"Combined Summary"``.
    """
    buckets = {}

    progress = tqdm(df.iterrows(), total=len(df), desc="Processing transcripts")
    for _, record in progress:
        call_text = record["Transcription"]
        # The per-call summary is not stored; only its category/location are used.
        _summary, label, place = summarize_transcript(call_text)

        bucket = buckets.setdefault(
            (label, place),
            {"Category": label, "Location": place, "Transcripts": []},
        )
        bucket["Transcripts"].append(call_text)

    # One consolidation call per bucket, over the raw transcripts collected above.
    for bucket in buckets.values():
        bucket["Combined Summary"] = consolidate_group_summary(bucket["Transcripts"])

    return buckets

# -------------------
# 6. Run processing
# -------------------
# Every transcript triggers a model call, followed by one more call per group.
print("\nProcessing transcripts with grouping and consolidation...")
grouped = process_with_grouping(df)

# -------------------
# 7. Save results
# -------------------
# Flatten each group into one output row; the raw transcripts are reduced to a count.
rows = [
    {
        "Category": group["Category"],
        "Location": group["Location"],
        "Combined Summary": group["Combined Summary"],
        "Transcript Count": len(group["Transcripts"]),
    }
    for group in grouped.values()
]

out_df = pd.DataFrame(rows)
out_df.to_csv("grouped_call_summaries.csv", index=False)
print("✅ Grouped and consolidated summaries saved to grouped_call_summaries.csv")




Loaded 13 transcripts

Processing transcripts with grouping and consolidation...


Processing transcripts:  31%|███       | 4/13 [00:07<00:17,  1.97s/it]

⚠️ Throttled. Retrying in 1.4s...


Processing transcripts:  38%|███▊      | 5/13 [00:17<00:37,  4.73s/it]

⚠️ Throttled. Retrying in 1.9s...


Processing transcripts:  54%|█████▍    | 7/13 [00:47<01:02, 10.41s/it]

⚠️ Throttled. Retrying in 1.8s...


Processing transcripts:  62%|██████▏   | 8/13 [01:02<00:59, 11.89s/it]

⚠️ Throttled. Retrying in 1.7s...


Processing transcripts:  77%|███████▋  | 10/13 [01:32<00:40, 13.47s/it]

⚠️ Throttled. Retrying in 1.1s...


Processing transcripts:  85%|████████▍ | 11/13 [01:47<00:28, 14.03s/it]

⚠️ Throttled. Retrying in 1.7s...


Processing transcripts: 100%|██████████| 13/13 [02:17<00:00, 10.56s/it]


✅ Grouped and consolidated summaries saved to grouped_call_summaries.csv


In [18]:

# Preview the first five rows of the loaded transcripts to check the column layout.
df.head(n=5)

Unnamed: 0,Transcription,Start Time,End Time,Duration (seconds),Location,Caller ID,Call Type
0,"911, what's your emergency? Caller: My husband...",2024-03-15 14:23:17,2024-03-15 14:28:42,325,"47284 Jennifer Islands, Port Kellychester, ND ...",Sarah Martinez,Chest pain
1,"Emergency services, what's happening? Caller: ...",2024-05-22 11:47:33,2024-05-22 11:52:18,285,"789 David Locks, Anthonyshire, CA 92103",Anonymous,Active shooter
2,"911 dispatch, what's your emergency? Caller: M...",2024-07-08 19:34:26,2024-07-08 19:41:15,409,"456 Maria Valley, Lake Christopherstad, TX 75201",Michael Chen,House fire
3,What's your emergency? Caller: I think my wife...,2024-01-19 09:15:44,2024-01-19 09:20:29,285,"123 Thompson Street, New Patricia, FL 33125",Robert Johnson,Stroke symptoms
4,"911, how can I help you? Caller: There's been ...",2024-09-03 16:42:11,2024-09-03 16:45:33,202,"890 Garcia Ranch, Williamsbury, IL 60614",Jennifer Lopez,Car accident (with injury)
