In [40]:

import logging
import os
import subprocess
import sys
import threading
import time
from concurrent.futures import ThreadPoolExecutor
from typing import List

import nltk
import pandas as pd
import spacy
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from dotenv import load_dotenv
from openai import OpenAI
from pydantic import BaseModel, Field
from spacy import Language, displacy
from spacy.matcher import PhraseMatcher, Matcher
from spacy.tokens import Span, Token, Doc
from tqdm.autonotebook import tqdm
from word2number import w2n

# Load environment variables from a local .env file, if one exists.
load_dotenv()

# NLTK resources used by the notebook.
for nltk_resource in ("punkt_tab", "wordnet", "averaged_perceptron_tagger_eng"):
    nltk.download(nltk_resource)

# Load the spaCy pipeline, downloading it only when it is not installed yet.
# The download runs with the kernel's own interpreter (sys.executable) so the
# model lands in the environment this notebook actually uses, and check=True
# surfaces a failed download instead of failing later with a confusing error.
spacy_model_name = "en_core_web_sm"
try:
    nlp = spacy.load(spacy_model_name)
except OSError:
    subprocess.run([sys.executable, "-m", "spacy", "download", spacy_model_name], check=True)
    nlp = spacy.load(spacy_model_name)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\alang\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\alang\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\alang\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [2]:
# Load the two raw datasets (local news articles and police press releases).
# Paths are relative to the notebook's working directory.
news_articles_df = pd.read_csv("data/local_news_articles.csv")
press_releases_df = pd.read_csv("data/police_press_releases.csv")

In [3]:
# Translation table used by clean_articles: dotted/barred letters (ċ, ġ, ħ, ż)
# lose their diacritics, and typographic quotes, dashes and ellipses are
# replaced with their plain ASCII equivalents.
mapping = str.maketrans({
    "ċ": "c",
    "Ċ": "C",
    "ġ": "g",
    "Ġ": "G",
    "ħ": "h",
    "Ħ": "H",
    "ż": "z",
    "Ż": "Z",
    "“": '"',
    "”": '"',
    "‘": "'",
    "’": "'",
    "—": "-",
    "–": "-",
    "…": "...",
})

def clean_articles(df: pd.DataFrame, columns) -> pd.DataFrame:
    """Return a copy of ``df`` with the given text columns normalised.

    Each listed column is converted to strings and passed through ``mapping``.
    Missing values become empty strings rather than the literal text "nan"
    that a plain ``astype(str)`` would produce.

    Args:
        df: Source frame. It is not modified.
        columns: Names of the columns to normalise.

    Returns:
        A new DataFrame; columns not listed are copied unchanged.
    """
    cleaned = df.copy()
    for col in columns:
        values = cleaned[col]
        # Stringify first, then blank out the positions that were missing.
        as_text = values.astype(str).where(values.notna(), "")
        cleaned[col] = as_text.str.translate(mapping)
    return cleaned

# Normalise the free-text columns of both datasets; only the news articles
# have a "subtitle" column cleaned here.
news_articles_df = clean_articles(news_articles_df, ["title", "subtitle", "content"])
press_releases_df = clean_articles(press_releases_df, ["title", "content"])

In [4]:
# Keyword sets compared against token lemmas in refers_to_accident.
# An article passes the lemma check when it mentions a person, a vehicle and
# either an accident word or an injury word.
people_subj = {"man", "woman", "child", "driver", "motorist", "motorcyclist", "pedestrian"}
# Generic vehicle nouns plus vehicle brand names.
vehicles = {"car", "motorcycle", "truck", "van", "bus", "bicycle",
            "Audi", "BMW", "Chevrolet", "Citroen", "Dodge", "Fiat", "Ford", "Honda", "Hyundai", "Isuzu",
            "Jaguar", "Jeep", "Kia", "Kymco", "Mercedes", "Mercedes-Benz", "Mini", "Mitsubishi", "Nissan",
            "Peugeot", "Renault", "Skoda", "Subaru", "Suzuki", "Toyota", "Volkswagen", "VW", "Volvo"}
accident = {"accident", "crash", "collision"}
# Verb lemmas, so inflected forms such as "injured" or "died" are covered via lemmatisation.
injuries = {"injure", "die"}

# Literal phrases that, when present, mark an article as accident-related on
# their own (they are registered with a PhraseMatcher below).
phrases = [
    "car crash", "traffic accident", "road accident", "collision",
    "crashed", "crash", "hit by a car", "motorcycle accident",
    "injured in a crash", "overturned", "run over", "lost control"
]

# Phrase matcher for the explicit accident phrases. It deliberately has its
# own name: the notebook later binds `age_matcher` to an age Matcher, and
# sharing that name would make refers_to_accident silently match age patterns
# when it is called after that cell has run.
# NOTE(review): PhraseMatcher compares exact token text by default, so a
# capitalised "Crash" is not matched by the phrase "crash".
accident_matcher = PhraseMatcher(nlp.vocab)
# make_doc only tokenises, which is all a text-based PhraseMatcher needs.
patterns = [nlp.make_doc(text) for text in phrases]
accident_matcher.add("ACCIDENT_PATTERNS", patterns)

def refers_to_accident(text: str) -> bool:
    """Heuristically decide whether ``text`` describes a vehicle accident.

    The text qualifies when either
    * it contains one of the literal accident phrases, or
    * its token lemmas mention a person, a vehicle and at least one
      accident or injury keyword.

    Args:
        text: Article body to classify.

    Returns:
        True when the text looks accident-related, False otherwise.
    """
    doc = nlp(text)

    # Any explicit phrase hit is sufficient on its own.
    if accident_matcher(doc):
        return True

    lemmas = {token.lemma_ for token in doc}
    mentions_person = not lemmas.isdisjoint(people_subj)
    mentions_vehicle = not lemmas.isdisjoint(vehicles)
    mentions_outcome = not lemmas.isdisjoint(accident) or not lemmas.isdisjoint(injuries)
    return mentions_person and mentions_vehicle and mentions_outcome

# Ids of the articles the keyword heuristic considers unrelated to accidents.
# A comprehension keeps the loop variables out of the notebook namespace (the
# previous loop variable `id` shadowed the builtin for every later cell).
non_related_news_article_ids = [
    article_id
    for article_id, text in zip(news_articles_df["id"], news_articles_df["content"])
    if not refers_to_accident(text)
]

os.makedirs("data/intermediate", exist_ok=True)

# Split the articles once and persist both halves for inspection.
is_non_related = news_articles_df["id"].isin(non_related_news_article_ids)
non_related_news_article_df = news_articles_df[is_non_related]
related_news_article_df = news_articles_df[~is_non_related]
non_related_news_article_df.to_csv("data/intermediate/local_news_articles_exclusions.csv", index=False)
related_news_article_df.to_csv("data/intermediate/local_news_articles.csv", index=False)

In [5]:
# NOTE(review): api_version is not referenced anywhere in the code shown in
# this notebook section; kept in case a later cell relies on it.
api_version = "2025-01-01-preview"
endpoint = "https://news-analysis-resource.openai.azure.com/openai/v1/"

# Bearer-token provider built from the default Azure credential; it is handed
# to the client in place of a static API key.
token_provider = get_bearer_token_provider(
    DefaultAzureCredential(),
    "https://cognitiveservices.azure.com/.default",
)

client = OpenAI(api_key=token_provider, base_url=endpoint)

In [6]:
# Argument schema of the `set_non_accident_ids` tool call: its JSON schema is
# sent to the models and the returned arguments are validated against it.
class NonAccidentIDs(BaseModel):
    # Ids of the articles the model judged not to describe an accident.
    ids: List[int] = Field(description="A list of ids of news articles that are not accidents")

# (model name, number of CSV rows sent per request) for every model queried.
models = [("gpt-5-mini", 50), ("grok-4-fast-non-reasoning", 50), ("DeepSeek-R1-0528", 20)]

# System prompt shared by all models; it instructs them to answer only through
# the `set_non_accident_ids` function call.
system_msg = """
You are a senior data scientist reviewing a semi-structured dataset of vehicle accidents news articles. The articles were obtained by simple web scraping (ex: on the tag of the article) which means that some articles do not refer to actual accidents (for example, they refer to new accident/traffic measures).

Your job is to review the given accident CSV and return a list of news article IDs that do not refer to accidents.
Be very critical! Any article which mentions a specific accident and provides details on it should not be removed.

IMPORTANT: You MUST return the data by calling the `set_non_accident_ids` function.

Do not return anything other than a function call.
"""

def csv_prompt(dataset_csv, start_rows, end_rows, total_rows):
    """Build the user message that carries one CSV chunk of articles.

    Args:
        dataset_csv: CSV text of the rows being submitted.
        start_rows: Row number shown as the start of the range.
        end_rows: Row number shown as the end of the range.
        total_rows: Total number of rows in the dataset.

    Returns:
        The prompt text, including a reminder that returned ids must exist
        in the submitted CSV.
    """
    return f"""
MAKE SURE THAT THE RETURNED IDS EXIST IN THIS CSV!

Accident CSV Data ({start_rows}-{end_rows}/{total_rows} rows):

{dataset_csv}
"""

# Tool definition passed to the chat completion request. Its parameters are
# the JSON schema generated from NonAccidentIDs, so the tool-call arguments
# can be validated with that same model.
result_function = {
    "type": "function",
    "function": {
        "name": "set_non_accident_ids",
        "description": "Set the IDs of the news articles which do not refer to an accident",
        "parameters": NonAccidentIDs.model_json_schema()
    }
}

tools = [result_function]

# Install a re-entrant lock for tqdm, since several progress bars are updated
# from worker threads below.
tqdm.set_lock(threading.RLock())

def process_model(model, num_rows_per_request, dataset_df: pd.DataFrame, system_prompt, max_retries: int = 5) -> set[int]:
    """Ask one model which articles in ``dataset_df`` are not accidents.

    The frame is sent in consecutive chunks of ``num_rows_per_request`` rows.
    For every chunk the model must answer with a ``set_non_accident_ids`` tool
    call whose ids all belong to the rows that were submitted; otherwise the
    chunk is retried.

    Args:
        model: Deployment/model name passed to the chat completion request.
        num_rows_per_request: Number of rows per request (must be positive).
        dataset_df: Articles to review; must contain an ``id`` column.
        system_prompt: System message sent with every request.
        max_retries: Maximum attempts per chunk before giving up.

    Returns:
        The set of ids the model flagged as not describing an accident.

    Raises:
        RuntimeError: If a chunk still fails after ``max_retries`` attempts.
            Failing loudly replaces retrying the same request forever.
    """
    total_rows = len(dataset_df.index)
    total_ids: set[int] = set()

    with tqdm(desc=f"Querying {model}", total=total_rows, unit=" rows") as pbar:
        for start in range(0, total_rows, num_rows_per_request):
            end = min(start + num_rows_per_request, total_rows)

            df_section: pd.DataFrame = dataset_df.iloc[start:end]
            df_section_csv = df_section.to_csv(index=False)
            # Only ids that were actually sent are acceptable answers.
            valid_ids = set(df_section["id"].tolist())

            messages = [
                {
                    "role": "system",
                    "content": system_prompt,
                },
                {
                    "role": "user",
                    # `end` is an exclusive 0-based index, so it already equals
                    # the 1-based number of the last row in this chunk.
                    "content": csv_prompt(df_section_csv, start + 1, end, total_rows),
                }
            ]

            attempt = 0
            while True:
                attempt += 1
                try:
                    response = client.chat.completions.create(
                        model=model,
                        tools=tools,
                        messages=messages,
                    )

                    tool_calls = response.choices[0].message.tool_calls
                    if not tool_calls:
                        raise ValueError("response did not contain a tool call")

                    result: NonAccidentIDs = NonAccidentIDs.model_validate_json(tool_calls[0].function.arguments)
                    unknown_ids = sorted(set(result.ids) - valid_ids)
                    if unknown_ids:
                        raise ValueError(f"IDs {unknown_ids} not in the submitted rows")
                    break
                except Exception as e:
                    logging.warning(f"Failed to query {model} (rows {start + 1}-{end}, attempt {attempt}/{max_retries}): {e}")
                    if attempt >= max_retries:
                        raise RuntimeError(
                            f"Giving up on {model} for rows {start + 1}-{end} after {attempt} attempts"
                        ) from e
                    # Exponential backoff (capped) so a failing endpoint is not hammered.
                    time.sleep(min(2 ** attempt, 30))

            total_ids.update(result.ids)
            pbar.update(end - start)

    return total_ids

# Query all models concurrently, one worker thread per model.
with ThreadPoolExecutor(max_workers=len(models)) as executor:
    futures = {
        executor.submit(process_model, model, num_rows_per_request, related_news_article_df, system_msg): model
        for model, num_rows_per_request in models
    }

    # Block until every model has finished; keyed by model name.
    model_ids = {futures[future]: future.result() for future in futures}

    print({model: sorted(ids) for model, ids in model_ids.items()})

# An article is a candidate exclusion when at least one model flagged it.
all_ids = list(model_ids.values())
ids_union = set().union(*all_ids)

# The same candidate rows are written twice: once as the raw LLM result and
# once as the starting point of the manually reviewed exclusion list.
llm_flagged_df = related_news_article_df[related_news_article_df["id"].isin(ids_union)]
llm_flagged_df.to_csv("data/intermediate/local_news_articles_llm_exclusions.csv", index=False)
llm_flagged_df.to_csv("data/intermediate/local_news_articles_llm_manual_exclusions.csv", index=False)

Querying grok-4-fast-non-reasoning:   0%|          | 0/242 [00:00<?, ? rows/s]

Querying DeepSeek-R1-0528:   0%|          | 0/242 [00:00<?, ? rows/s]

Querying gpt-5-mini:   0%|          | 0/242 [00:00<?, ? rows/s]

{'gpt-5-mini': [46, 370, 645, 695, 894, 1387, 2022, 3827, 467185, 493223, 493920, 496068, 496154], 'grok-4-fast-non-reasoning': [46, 287, 370, 645, 2022, 2092, 3604, 3699, 3809, 3823, 3827, 490929, 491047, 491118, 491371, 491380, 491437, 493920, 494102, 495320, 495942, 496154, 496206, 496362], 'DeepSeek-R1-0528': [46, 370, 645, 695, 894, 1352, 1387, 2022, 2092, 3535, 3827, 467108, 467185, 467297, 489424, 490206, 490685, 490742, 490929, 491047, 491118, 491371, 491380, 491437, 493212, 493223, 493920, 494102, 496068, 496154]}


# Manually review LLM exclusions
## IDs removed from the exclusion list (kept as accident articles)
- 3699
- 1352
- 370
- 287
- 489424
- 491118
- 491371
- 494102
- 495320 (inconclusive evidence)
- 495942
- 496362

In [9]:
# Ids that the manual review decided to keep: they are dropped from the
# LLM exclusion list, and the reviewed list is written back in place.
manual_ids = {3699, 1352, 370, 287, 489424, 491118, 491371, 494102, 495320, 495942, 496362}
llm_manual_exclusions = pd.read_csv("data/intermediate/local_news_articles_llm_manual_exclusions.csv")
still_excluded = ~llm_manual_exclusions["id"].isin(manual_ids)
llm_manual_exclusions = llm_manual_exclusions.loc[still_excluded]
llm_manual_exclusions.to_csv("data/intermediate/local_news_articles_llm_manual_exclusions.csv", index=False)

In [7]:
# Drop every article that is still on the reviewed exclusion list and save
# the remaining articles as the working dataset.
llm_manual_exclusions = pd.read_csv("data/intermediate/local_news_articles_llm_manual_exclusions.csv")
is_excluded = related_news_article_df["id"].isin(llm_manual_exclusions["id"])
news_articles_df = related_news_article_df.loc[~is_excluded]
news_articles_df.to_csv("data/intermediate/local_news_articles.csv", index=False)

 # Per-person data

In [55]:
person_data = []

# Token patterns for age mentions:
#   "35 year old", "35-year-old" (number, punctuation, "year", punctuation, "old")
#   and ", 35," style appositions (comma, number, punctuation).
age_patterns = [
    [{"LIKE_NUM": True}, {"LOWER": "year"}, {"LOWER": "old"}],
    [{"LIKE_NUM": True}, {"IS_PUNCT": True}, {"LOWER": "year"}, {"IS_PUNCT": True}, {"LOWER": "old"}],
    [{"TEXT": ","}, {"LIKE_NUM": True}, {"IS_PUNCT": True}]
]

age_matcher = Matcher(nlp.vocab)
age_matcher.add("AGE", age_patterns)

# FIXME(review): these patterns are a verbatim copy of the age patterns, so
# this matcher currently finds ages, not injury severities. get_severities
# works from the dependency parse and does not use it. Replace the patterns
# or remove the matcher once it is confirmed to be unused.
severity_patterns = [
    [{"LIKE_NUM": True}, {"LOWER": "year"}, {"LOWER": "old"}],
    [{"LIKE_NUM": True}, {"IS_PUNCT": True}, {"LOWER": "year"}, {"IS_PUNCT": True}, {"LOWER": "old"}],
    [{"TEXT": ","}, {"LIKE_NUM": True}, {"IS_PUNCT": True}]
]

severity_matcher = Matcher(nlp.vocab)
# Register the severity list itself (it was previously wired to age_patterns),
# so editing severity_patterns actually changes this matcher.
severity_matcher.add("SEVERITY", severity_patterns)

def get_ages(doc: Doc) -> list[tuple[int, Span]]:
    """Extract plausible ages mentioned in ``doc``.

    Every ``age_matcher`` hit is inspected for its first number-like token.
    The token is read as digits first and as a number word second. Tokens
    that neither parser understands (for example "1,000" or "3.5") are
    skipped rather than aborting the extraction.

    Args:
        doc: Parsed article.

    Returns:
        ``(age, span)`` pairs for every match whose value lies strictly
        between 0 and 120.
    """
    ages = []

    for _match_id, start, end in age_matcher(doc):
        span: Span = doc[start:end]

        number_token = next((token for token in span if token.like_num), None)
        if number_token is None:
            continue

        try:
            age_int = int(number_token.text)
        except ValueError:
            try:
                age_int = w2n.word_to_num(number_token.text)
            except ValueError:
                # Number-like text that is not an age; ignore this match.
                continue

        if 0 < age_int < 120:
            ages.append((age_int, span))

    return ages

# Lemma -> gender label lookup used by get_genders.
gender_words = {
    **dict.fromkeys(("man", "male", "he", "him"), "male"),
    **dict.fromkeys(("woman", "female", "she", "her"), "female"),
}

def get_genders(doc: Doc) -> list[tuple[str, Span]]:
    """Collect gender cues from ``doc``.

    Returns one ``(gender, span)`` pair, in document order, for every token
    whose lemma is a key of ``gender_words``; the span covers that single
    token.
    """
    return [
        (gender_words[token.lemma_], doc[token.i:token.i + 1])
        for token in doc
        if token.lemma_ in gender_words
    ]

# Severity adjectives that get_severities reports.
accepted_injuries = {"serious", "slight", "grievous", "light", "critical", "fatal", "bad"}
def get_severities(doc: Doc) -> list[tuple[str, Span]]:
    """Find severity modifiers attached to injury words in ``doc``.

    For every token whose lemma is "injure" or "injury", its direct children
    with dependency label "advmod" or "amod" are reduced to an adjective form
    and kept when that form is in ``accepted_injuries``.

    Returns:
        ``(severity, span)`` pairs where the span covers the modifier token.
    """
    injury_lemmas = ("injure", "injury")
    modifier_deps = ("advmod", "amod")

    def adverb_to_adj(word: str):
        # "-ously", "-ally" and plain "-ly" all lose their last two characters.
        lowered = word.lower()
        return lowered[:-2] if lowered.endswith("ly") else lowered

    severities = []
    for token in doc:
        if token.lemma_ not in injury_lemmas:
            continue
        for child in token.children:
            if child.dep_ not in modifier_deps:
                continue
            injury_adj = adverb_to_adj(child.lemma_)
            if injury_adj in accepted_injuries:
                severities.append((injury_adj, doc[child.i:child.i + 1]))

    return severities

# Small hand-written example for inspecting the parse while developing the
# extractors (e.g. with displacy.render(test_doc) or by printing lemmas).
test_doc = nlp("One person suffered grievous injuries while another sustained light injuries in the accident")

# Run the extractors over every remaining article and print the severities
# found per article id.
for article_id, content in zip(news_articles_df["id"], news_articles_df["content"]):
    doc = nlp(content)

    ages = get_ages(doc)
    genders = get_genders(doc)
    severities = get_severities(doc)
    print(article_id, severities)



One | one
person | person
suffered | suffer
grievous | grievous
injuries | injury
while | while
another | another
sustained | sustain
light | light
injuries | injury
in | in
the | the
accident | accident
4208 []
4093 [('serious', serious)]
4110 [('serious', seriously), ('serious', serious)]
4066 [('critical', critically), ('serious', serious), ('serious', seriously)]
4067 []
4087 []
4049 [('serious', seriously), ('serious', serious)]
4034 [('serious', seriously)]
4007 []
4024 []
4014 []
3989 []
3991 [('serious', serious)]
3976 [('serious', serious)]
3964 [('slight', slight), ('serious', seriously)]
3966 [('grievous', grievously)]
3938 [('critical', critically)]
3910 []
3885 [('grievous', grievous)]
3900 [('serious', serious)]
3861 []
3834 [('grievous', grievous), ('grievous', grievous)]
3766 [('grievous', grievously), ('grievous', grievous)]
3747 []
3749 [('bad', badly), ('serious', serious), ('grievous', grievously)]
3740 [('serious', seriously)]
3704 [('grievous', grievously)]
3699 [