# Policy Severity & OpenAI Classification


This notebook fetches the cleaned policy CSV, calls OpenAI to label each update with a severity (LOW, MEDIUM, HIGH, CRITICAL), and writes out a timestamped CSV.


## Import libraries and data

In [4]:
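# One-time setup: uncomment ONE of the lines below and run it once.
# Prefer %pip over !pip so the install targets this kernel's environment.
#!pip install openai python-dotenv
#%pip install --upgrade typing_extensions openai python-dotenv
# %pip install openai==0.28.0   # NOTE: 0.28 predates the `openai.chat.completions` API used below
# %pip install --upgrade openai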



In [5]:
# Load packages / libraries
#load_ext dotenv -----> run this once
%reload_ext dotenv      
%dotenv
    
import os, openai
import pandas as pd
from time import sleep
from datetime import datetime
# Sanity check: show the kernel's working directory and its contents, since the
# relative data paths used later in the notebook are resolved against it.
print("CWD:", os.getcwd())
print("Files:", os.listdir(os.curdir))


CWD: /Volumes/Personal Drive/GitHub/gdpr-ccpa-risk-pipeline/notebooks
Files: ['Policy Severity & OpenAI.ipynb', 'validate_forecast.ipynb', '.env', '.ipynb_checkpoints']


In [6]:
# Read the API key from the process environment
key = os.environ.get("OPENAI_API_KEY")

# Hand the key to the module-level OpenAI client
openai.api_key = key

# Report only whether a key was found -- never print the key itself
print("Key loaded?", key is not None)


Key loaded? True



## Define the classifier


In [7]:
# Cell 3: Prompt & classification function
PROMPT_HEADER = """
You are a compliance analyst. Rate the severity of this GDPR/CCPA policy update as one of:
LOW, MEDIUM, HIGH, or CRITICAL.
- CRITICAL = major new obligation or enforcement action
- HIGH     = significant risk or new guidance
- MEDIUM   = incremental clarifications
- LOW      = minor updates or purely informational

Examples:
Title: "EDPB fines Company X €20 million for GDPR breach"
Summary: "First major fine under the new regime…"
Severity: CRITICAL

---
""".strip()

# Labels the prompt asks the model to choose from, plus the fallback that is
# returned when the reply does not contain any of them.
SEVERITY_LEVELS = ("LOW", "MEDIUM", "HIGH", "CRITICAL")
UNKNOWN_SEVERITY = "UNKNOWN"


def _parse_severity(reply) -> str:
    """Extract one of SEVERITY_LEVELS from a raw model reply, else UNKNOWN_SEVERITY."""
    # `reply` may be None or empty; treat both as "no answer" instead of
    # failing on .strip() or on indexing an empty line list.
    lines = (reply or "").strip().splitlines()
    if not lines:
        return UNKNOWN_SEVERITY
    # The answer is expected on the last line of the reply. Tolerate decorations
    # such as "Severity: HIGH" or "HIGH." by checking each word on that line.
    for word in lines[-1].upper().replace(":", " ").split():
        label = word.strip(".,;!?\"'*()[]")
        if label in SEVERITY_LEVELS:
            return label
    return UNKNOWN_SEVERITY


def classify_severity(title: str, summary: str) -> str:
    """Ask the OpenAI chat model to rate one policy update.

    Args:
        title: Title of the policy update; inserted verbatim into the prompt.
        summary: Summary of the policy update; inserted verbatim into the prompt.

    Returns:
        "LOW", "MEDIUM", "HIGH" or "CRITICAL" when the last line of the model's
        reply contains one of those labels, otherwise "UNKNOWN" (including when
        the reply is empty or missing).

    Raises:
        Errors raised by the OpenAI client (e.g. rate-limit or quota errors)
        are not caught here and propagate to the caller.
    """
    prompt = PROMPT_HEADER + f"\nTitle: \"{title}\"\nSummary: \"{summary}\"\nSeverity:"
    resp = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role":"system", "content":"Classify GDPR/CCPA policy updates by severity."},
            {"role":"user",   "content": prompt}
        ],
        temperature=0.0,  # lowest-randomness setting, to keep labels consistent across runs
        max_tokens=5      # the expected answer is a single label
    )
    return _parse_severity(resp.choices[0].message.content)


## Run classification and save

In [8]:
# 1) Load your cleaned policies
# The path is relative to the notebook's working directory, like the output
# path below. (A leading "/" would make it absolute and resolve to
# /data/processed/... at the filesystem root.)
df = pd.read_csv("../data/processed/cleaned_policies.csv")

# 2) Classify in a loop, pacing with sleep
severities = []
for _, row in df.iterrows():
    # A missing "summary" column yields "", and an empty CSV cell is read as NaN;
    # send an empty string in both cases rather than the literal text "nan".
    summary = row.get("summary", "")
    sev = classify_severity(
        title   = row["title"],
        summary = "" if pd.isna(summary) else summary
    )
    severities.append(sev)
    sleep(0.3)   # short pause between consecutive API requests

# 3) Attach the results and save
df["severity"] = severities

# UTC timestamp in the file name so each run writes a new file
ts = datetime.utcnow().strftime("%Y%m%dT%H%M%SZ")
out_path = f"../data/processed/cleaned_with_severity_{ts}.csv"
df.to_csv(out_path, index=False)

print("✅ Saved classified policies to:", out_path)
df.head()


RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}