# Data Preparation
This script contains all the code to process the original CSV files into a structured dataset.

In [None]:
import logging
import os
import subprocess
import sys
import threading
from concurrent.futures import ThreadPoolExecutor
from typing import List, Any

import pandas as pd
import spacy
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from IPython import get_ipython
from IPython.core.magic import register_cell_magic
from openai import OpenAI
from pydantic import BaseModel, Field
from spacy import displacy
from spacy.matcher import Matcher, PhraseMatcher
from spacy.tokens import Doc, Span
from tqdm.autonotebook import tqdm

# Download the small English spaCy model. `sys.executable` is used instead of a
# bare "python" so the model is installed for the interpreter that runs this
# notebook, even when a different `python` comes first on PATH.
subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])
# Load the spaCy pipeline shared by all NLP steps below
nlp = spacy.load("en_core_web_sm")

@register_cell_magic
def skip_if(line, cell):
    """Cell magic that runs the cell body unless the given condition holds.

    Usage: start a cell with ``%%skip_if <expression>``.

    Args:
        line: Python expression written after ``%%skip_if``. It is evaluated
            with ``eval``, so it must only ever be notebook-authored code.
        cell: Source of the cell body; executed through IPython when the
            expression is falsy.
    """
    should_skip = eval(line)
    if not should_skip:
        get_ipython().run_cell(cell)

In [None]:
# Read the two raw datasets
news_articles_df = pd.read_csv("data/original/local_news_articles.csv")
press_releases_df = pd.read_csv("data/original/police_press_releases.csv")

# News articles call their identifier `article_id`; expose it as `id` instead
news_articles_df = news_articles_df.rename(columns={"article_id": "id"})

# Press releases get a new leading `id` column whose values carry on from
# the largest news article id
start = news_articles_df["id"].max() + 1
press_release_ids = range(start, start + len(press_releases_df))
press_releases_df.insert(0, "id", press_release_ids)

# All press releases are valid accidents, so they need no filtering and can
# be written out straight away
press_releases_df.to_csv("data/police_press_releases.csv", index=False)

# 1. Replace special characters
## Why?
1. Special characters are not always supported by NLP libraries.
2. Special characters are not always converted to lowercase successfully.

In [None]:
# Translation table mapping non-ASCII letters, typographic quotes, dashes and
# the ellipsis character to plain ASCII
mapping = str.maketrans({
    "ċ": "c",
    "Ċ": "C",
    "ġ": "g",
    "Ġ": "G",
    "ħ": "h",
    "Ħ": "H",
    "ż": "z",
    "Ż": "Z",
    "“": '"',
    "”": '"',
    "‘": "'",
    "’": "'",
    "—": "-",
    "–": "-",
    "…": "...",
})

def clean_articles(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
    """Replace special characters in the given columns with ASCII counterparts.

    Missing values are stored as empty strings rather than being stringified,
    so no literal "nan"/"None" text ends up in the cleaned columns.

    Args:
        df: Dataframe to clean. It is modified in place.
        columns: Names of the text columns to translate with `mapping`.

    Returns:
        The same dataframe object that was passed in.
    """
    for col in columns:
        original = df[col]
        translated = original.astype(str).str.translate(mapping)
        # `astype(str)` may turn a missing value into the text "nan"/"None";
        # blank those positions so fake words never reach the NLP steps.
        df[col] = translated.where(original.notna(), "")
    return df

# Apply the character replacement to the text columns of both datasets.
# NOTE(review): data/police_press_releases.csv has already been written before
# this point, so the saved press releases do not include this cleaning — confirm
# that is intended.
news_text_columns = ["title", "subtitle", "content"]
press_release_text_columns = ["title", "content"]
news_articles_df = clean_articles(news_articles_df, news_text_columns)
press_releases_df = clean_articles(press_releases_df, press_release_text_columns)

# 2.a. Remove non-related articles
Some articles in the dataset do not refer to vehicle accidents (e.g. they refer to work accidents or to new accident prevention policies). We need to remove these.

This is done in two ways:
1. Matching accident phrases (e.g. car crash)
2. Finding references to a person, a vehicle, and an accident or injury

In [None]:
# Lemmas that indicate a person is mentioned
people_subj = {
    "man", "woman", "child",
    "driver", "motorist", "motorcyclist", "pedestrian",
}
# Lemmas that indicate a vehicle is mentioned
vehicles = {
    # Vehicle types
    "car", "motorcycle", "truck", "van", "bus", "bicycle",
    # Vehicle makes
    "Audi", "BMW", "Chevrolet", "Citroen", "Dodge", "Fiat", "Ford", "Honda",
    "Hyundai", "Isuzu", "Jaguar", "Jeep", "Kia", "Kymco", "Mercedes",
    "Mercedes-Benz", "Mini", "Mitsubishi", "Nissan", "Peugeot", "Renault",
    "Skoda", "Subaru", "Suzuki", "Toyota", "Volkswagen", "VW", "Volvo",
}
# Lemmas that indicate an accident / an injury is mentioned
accident = {"accident", "crash", "collision"}
injuries = {"injure", "die"}

# Phrases that, on their own, mark an article as an accident report
accident_phrases = [
    "car crash",
    "traffic accident",
    "road accident",
    "collision",
    "crashed",
    "crash",
    "hit by a car",
    "motorcycle accident",
    "injured in a crash",
    "overturned",
    "run over",
    "lost control",
]

# Phrase matcher holding every accident phrase under a single rule name.
# NOTE(review): the matcher is created with its default settings; verify whether
# capitalised variants (e.g. "Crash") are expected to match as well.
accident_matcher = PhraseMatcher(nlp.vocab)
patterns = list(map(nlp, accident_phrases))
accident_matcher.add("ACCIDENT_PATTERNS", patterns)

def refers_to_accident(text: str) -> bool:
    """Decide whether an article's text describes a vehicle accident.

    The text counts as an accident report when either
    1. any phrase registered in `accident_matcher` occurs in it, or
    2. its token lemmas mention a person, a vehicle, and an accident or injury.

    Args:
        text: Article text to analyse.

    Returns:
        True when the article is assumed to refer to a vehicle accident.
    """
    doc = nlp(text)

    # A single accident phrase is enough on its own
    if accident_matcher(doc):
        return True

    # Otherwise require a person, a vehicle and an accident or injury
    lemmas = {token.lemma_ for token in doc}
    mentions_person = not lemmas.isdisjoint(people_subj)
    mentions_vehicle = not lemmas.isdisjoint(vehicles)
    mentions_incident = not lemmas.isdisjoint(accident | injuries)
    return mentions_person and mentions_vehicle and mentions_incident

# IDs of articles not referring to vehicle accidents.
# The loop variable is named `article_id` so the builtin `id` is not shadowed
# for the rest of the notebook.
non_related_news_article_ids = [
    article_id
    for article_id, text in zip(news_articles_df["id"], news_articles_df["content"])
    if not refers_to_accident(text)
]

os.makedirs("data/intermediate", exist_ok=True)

# Compute the membership mask once and use it for both halves of the split
is_non_related = news_articles_df["id"].isin(non_related_news_article_ids)
non_related_news_article_df = news_articles_df[is_non_related]
related_news_article_df = news_articles_df[~is_non_related]

# Save dataframes as CSVs to view results
non_related_news_article_df.to_csv("data/intermediate/local_news_articles_exclusions.csv", index=False)
related_news_article_df.to_csv("data/intermediate/local_news_articles.csv", index=False)

# 2.b. Using LLMs to flag non-related articles
While the previous method works quite well, some articles still get through.
To catch these, we pass the remaining articles through three LLMs (GPT-5 Mini, Grok 4 Fast, DeepSeek-R1) and flag every article that at least one of them considers unrelated.

The LLMs were set up through [Microsoft Foundry](https://ai.azure.com/) to have a unified API to communicate with different LLMs.

In [None]:
"""Initialising API"""
# NOTE(review): `api_version` is not referenced anywhere else in the visible code
api_version = "2025-01-01-preview"

endpoint = "https://news-analysis-resource.openai.azure.com/openai/v1/"

# Need to login using `az login --use-device-code`
token_provider = get_bearer_token_provider(DefaultAzureCredential(), "https://cognitiveservices.azure.com/.default")

client = OpenAI(
    base_url=endpoint,
    api_key=token_provider,
)

# Probe the credentials once: `run_cell` is read by the `%%skip_if not run_cell`
# cell so the LLM step is skipped when no token can be obtained.
try:
    token_provider()
except Exception as e:
    # `Exception` instead of a bare `except` so that Ctrl-C and interpreter
    # shutdown still propagate; the reason is logged rather than hidden.
    logging.warning(f"Could not acquire an Azure token, the LLM step will be skipped: {e}")
    run_cell = False
else:
    run_cell = True

In [None]:
%%skip_if not run_cell

# (model name, number of dataset rows sent per request) for every LLM queried
models = [
    ("gpt-5-mini", 50),
    ("grok-4-fast-non-reasoning", 50),
    ("DeepSeek-R1-0528", 20),
]

# Shape of the arguments the LLM passes to the `set_non_accident_ids` tool;
# its JSON schema is used as the tool's `parameters`.
class NonAccidentIDs(BaseModel):
    # IDs of the news articles the LLM judged not to be accidents
    ids: list[int] = Field(description="A list of ids of news articles that are not accidents")

# System message sent with every request: explains the review task and
# requires the answer to be a `set_non_accident_ids` function call
system_prompt = """
You are a senior data scientist reviewing a semi-structured dataset of vehicle accidents news articles. The articles were obtained by simple web scraping (ex: on the tag of the article) which means that some articles do not refer to actual accidents (for example, they refer to new accident/traffic measures).

Your job is to review the given accident CSV and return a list of news article IDs that do not refer to accidents.
Be very critical! Any article which mentions a specific accident and provides details on it should not be removed.

IMPORTANT: You MUST return the data by calling the `set_non_accident_ids` function.

Do not return anything other than a function call.
"""

def csv_prompt(dataset_csv, start_rows, end_rows, total_rows):
    """Build the user message that carries one chunk of the dataset as CSV.

    Args:
        dataset_csv: CSV text of the rows in this chunk.
        start_rows: Row number shown as the start of the chunk.
        end_rows: Row number shown as the end of the chunk.
        total_rows: Total number of rows in the whole dataset.

    Returns:
        The prompt text, including a header line with the row range.
    """
    return f"""
MAKE SURE THAT THE RETURNED IDS EXIST IN THIS CSV!

Accident CSV Data ({start_rows}-{end_rows}/{total_rows} rows):

{dataset_csv}
"""

# Tool (function) definition handed to the LLMs; its parameters follow the
# JSON schema generated from `NonAccidentIDs`
result_function = {
    "type": "function",
    "function": dict(
        name="set_non_accident_ids",
        description="Set the IDs of the news articles which do not refer to an accident",
        parameters=NonAccidentIDs.model_json_schema(),
    ),
}

# The only tool offered to the models
tools = [result_function]

# Give tqdm a re-entrant lock, as the progress bars are updated from several threads
tqdm.set_lock(threading.RLock())

def process_model(model, num_rows_per_request, dataset_df: pd.DataFrame, system_prompt, max_retries: int = 10) -> set[int]:
    """Ask one LLM which articles in `dataset_df` do not describe an accident.

    The dataframe is sent in consecutive chunks of `num_rows_per_request` rows
    because the models cannot process the whole file at once. A chunk is sent
    again when the request fails, when the answer cannot be parsed, or when
    the answer contains an id that is absent from `dataset_df`.

    Args:
        model: Model name passed to the chat completions API.
        num_rows_per_request: Number of rows per request; must be positive.
        dataset_df: Articles to review; must contain an `id` column.
        system_prompt: System message sent with every request.
        max_retries: How many times a single chunk may be retried before the
            function gives up instead of looping forever.

    Returns:
        The ids flagged as non-accidents across all chunks.

    Raises:
        ValueError: If `num_rows_per_request` is not positive.
        RuntimeError: If one chunk still fails after `max_retries` retries.
    """
    if num_rows_per_request <= 0:
        # A non-positive step would never advance through the dataframe
        raise ValueError("num_rows_per_request must be a positive integer")

    total_rows = len(dataset_df.index)
    # Built once so each returned id is validated with an O(1) lookup
    known_ids = set(dataset_df["id"].tolist())
    total_ids: set[int] = set()

    # The bar is advanced manually, so only the total is given to tqdm
    with tqdm(desc=f"Querying {model}", total=total_rows, unit=" rows") as pbar:
        start = 0
        failures = 0
        while start < total_rows:
            # Get row range as the LLMs cannot process the entire file at once
            end = min(start + num_rows_per_request, total_rows)
            df_section_csv = dataset_df.iloc[start:end].to_csv(index=False)

            messages = [
                {
                    "role": "system",
                    "content": system_prompt,
                },
                {
                    "role": "user",
                    # Rows are reported 1-based and inclusive: positions start..end-1 are rows start+1..end
                    "content": csv_prompt(df_section_csv, start + 1, end, total_rows),
                },
            ]

            try:
                response = client.chat.completions.create(
                    model=model,
                    tools=tools,
                    messages=messages,
                )

                result: NonAccidentIDs = NonAccidentIDs.model_validate_json(response.choices[0].message.tool_calls[0].function.arguments)

                # Throw an error if a returned ID is not in the dataset
                for returned_id in result.ids:
                    if returned_id not in known_ids:
                        raise ValueError(f"ID {returned_id} not in dataset")
            except Exception as e:
                # If we get an error, retry the same chunk (i.e. do not advance `start`)
                failures += 1
                logging.warning(f"Failed to query {model}: {e}")
                if failures > max_retries:
                    raise RuntimeError(
                        f"Giving up on {model}: rows {start + 1}-{end} failed {failures} times in a row"
                    ) from e
                continue

            total_ids.update(result.ids)

            # Update tqdm progress bar and move on to the next chunk
            pbar.update(end - start)
            start = end
            failures = 0

    return total_ids

# Query every LLM on its own worker thread
with ThreadPoolExecutor(max_workers=len(models)) as executor:
    futures = {}
    for model, num_rows_per_request in models:
        future = executor.submit(
            process_model,
            model,
            num_rows_per_request,
            related_news_article_df,
            system_prompt,
        )
        futures[future] = model

    # Wait for each model and store the ids it flagged under the model's name
    model_ids = {}
    for f, model_name in futures.items():
        model_ids[model_name] = f.result()

# Combine IDs from all LLMs: an article is excluded when any model flagged it
all_ids = list(model_ids.values())
ids_union = all_ids[0].union(*all_ids[1:])

llm_excluded_df = related_news_article_df[related_news_article_df["id"].isin(ids_union)]

# Save the LLM excluded articles as a CSV to review
llm_excluded_df.to_csv("data/intermediate/local_news_articles_llm_exclusions.csv", index=False)
# Save the same rows a second time; this copy is the one that gets manually checked and modified
llm_excluded_df.to_csv("data/intermediate/local_news_articles_llm_manual_exclusions.csv", index=False)

# 2.c. Manually review LLM exclusions
The LLMs tend to flag valid articles as invalid. Since this is a short list (because we already removed the bulk of invalid articles in step 2.a), we can go through the list manually and remove the valid articles from it.

## IDs removed from the exclusion list (valid accidents)
- 3699
- 1352
- 370
- 287
- 489424
- 491118
- 491371
- 494102
- 495320
- 495942
- 496362

*Note: `local_news_articles_llm_exclusions.csv` contains the excluded rows as given by the LLMs. `local_news_articles_llm_manual_exclusions.csv` contains the same excluded rows given by the LLMs, except that any rows referring to valid accidents were manually removed.*

In [None]:
# IDs that the manual review found to be valid accidents; their rows are taken
# out of the exclusion list
manual_ids = {
    287, 370, 1352, 3699,
    489424, 491118, 491371, 494102, 495320, 495942, 496362,
}
exclusions_path = "data/intermediate/local_news_articles_llm_manual_exclusions.csv"
llm_manual_exclusions = pd.read_csv(exclusions_path)
is_valid_accident = llm_manual_exclusions["id"].isin(manual_ids)
llm_manual_exclusions = llm_manual_exclusions[~is_valid_accident]
# Overwrite the file with the reduced exclusion list
llm_manual_exclusions.to_csv(exclusions_path, index=False)

In [None]:
# Reload the manually reviewed exclusion list
llm_manual_exclusions = pd.read_csv("data/intermediate/local_news_articles_llm_manual_exclusions.csv")

# Keep only the related articles whose id is not in the reviewed exclusion list
excluded_ids = llm_manual_exclusions["id"]
is_excluded = related_news_article_df["id"].isin(excluded_ids)
news_articles_df = related_news_article_df[~is_excluded]
news_articles_df.to_csv("data/local_news_articles.csv", index=False)

# 3. Per-person data

In [None]:
# NOTE(review): `person_data` is not filled anywhere in the visible code
person_data = []

def remove_contained_spans(spans: "list[tuple[Any, Span]]") -> "list[tuple[Any, Span]]":
    """Drop every span that lies fully inside another span of the input.

    The annotations are written as strings so that defining the function does
    not depend on `Span` being importable at that point.

    Args:
        spans: `(value, span)` pairs; only `span.start` and `span.end` are read.

    Returns:
        The surviving pairs ordered by span start. When several spans cover
        exactly the same range, only the first one from the input is kept.
    """
    # Visit the longest spans first, so a span can only be contained in one
    # that has already been kept
    by_length = sorted(spans, key=lambda pair: pair[1].end - pair[1].start, reverse=True)

    kept = []
    for value, span in by_length:
        is_contained = any(
            other.start <= span.start and span.end <= other.end
            for _, other in kept
        )
        if not is_contained:
            kept.append((value, span))

    return sorted(kept, key=lambda pair: pair[1].start)

# Token patterns that capture an age mention
age_matcher = Matcher(nlp.vocab)

age_patterns = [
    # "<number> year old"
    [{"LIKE_NUM": True}, {"LOWER": "year"}, {"LOWER": "old"}],
    # "<number>-year-old" (any punctuation token between the words)
    [{"LIKE_NUM": True}, {"IS_PUNCT": True}, {"LOWER": "year"}, {"IS_PUNCT": True}, {"LOWER": "old"}],
    # ", <number>" followed by a punctuation token
    [{"TEXT": ","}, {"LIKE_NUM": True}, {"IS_PUNCT": True}]
]

age_matcher.add("AGE", age_patterns)

severity_matcher = Matcher(nlp.vocab)

# NOTE(review): these patterns are identical to `age_patterns`, which looks like
# a copy-paste placeholder; `severity_matcher` is also not used in the visible
# code. Replace them with real severity patterns before relying on this matcher.
severity_patterns = [
    [{"LIKE_NUM": True}, {"LOWER": "year"}, {"LOWER": "old"}],
    [{"LIKE_NUM": True}, {"IS_PUNCT": True}, {"LOWER": "year"}, {"IS_PUNCT": True}, {"LOWER": "old"}],
    [{"TEXT": ","}, {"LIKE_NUM": True}, {"IS_PUNCT": True}]
]

# Register the severity patterns themselves (previously `age_patterns` was passed here)
severity_matcher.add("SEVERITY", severity_patterns)

def get_ages(doc: "Doc") -> "list[tuple[int, Span]]":
    """Extract the ages mentioned in a document.

    Every match of `age_matcher` is inspected: the first number-like token of
    the match is converted to a number, and the match is kept when that number
    lies strictly between 0 and 120.

    Args:
        doc: Parsed document to search.

    Returns:
        `(age, span)` pairs, one per accepted match, where `span` is the
        matched slice of `doc`.
    """
    ages = []

    for _, start, end in age_matcher(doc):
        span = doc[start:end]

        # The first number-like token of the match carries the age
        number_token = next((token for token in span if token.like_num), None)
        if number_token is None:
            continue

        try:
            age_int = int(number_token.text)
        except ValueError:
            # Not plain digits (e.g. a spelled-out number): fall back to `w2n`.
            # NOTE(review): `w2n` is not imported in the visible code —
            # presumably `from word2number import w2n`; confirm.
            try:
                age_int = w2n.word_to_num(number_token.text)
            except ValueError:
                # Number-like text that cannot be converted: skip this match
                # instead of aborting the whole run
                continue

        if not 0 < age_int < 120:
            continue
        ages.append((age_int, span))

    return ages

# Maps a gendered word (compared against token lemmas) to the gender it implies
gender_words = {
    **dict.fromkeys(("man", "male", "he", "him"), "male"),
    **dict.fromkeys(("woman", "female", "she", "her"), "female"),
}

def get_genders(doc: Doc) -> list[tuple[str, Span]]:
    """Collect every gender hint found in a document.

    Args:
        doc: Parsed document to search.

    Returns:
        One `(gender, span)` pair per token whose lemma is a key of
        `gender_words`; `span` covers just that token.
    """
    return [
        (gender_words[token.lemma_], Span(doc, token.i, token.i + 1))
        for token in doc
        if token.lemma_ in gender_words
    ]

# Severity words that are accepted as a description of an injury
accepted_injuries = {"bad", "critical", "fatal", "grievous", "light", "serious", "slight"}

def _adverb_to_adj(word: str) -> str:
    """Lower-case `word` and strip a trailing "ly" (e.g. "seriously" -> "serious")."""
    lowered = word.lower()
    return lowered[:-2] if lowered.endswith("ly") else lowered

def get_severities(doc: Doc) -> list[tuple[str, Span]]:
    """Collect the severity words attached to injury mentions.

    For every token whose lemma is "injure" or "injury", its `advmod` and
    `amod` children are reduced to an adjective form and kept when that form
    is one of `accepted_injuries`.

    Args:
        doc: Parsed document to search.

    Returns:
        `(severity, span)` pairs, where `span` covers the modifier token.
    """
    severities = []

    for token in doc:
        if token.lemma_ not in ("injure", "injury"):
            continue
        for child in token.children:
            if child.dep_ not in ("advmod", "amod"):
                continue
            injury_adj = _adverb_to_adj(child.lemma_)
            if injury_adj in accepted_injuries:
                severities.append((injury_adj, Span(doc, child.i, child.i + 1)))

    return severities

# Token patterns that capture a mention of a person
person_matcher = Matcher(nlp.vocab)

person_nouns = [
    "man", "woman", "boy", "girl", "teen", "teenager", "baby", "person",
    "driver", "passenger", "pedestrian", "victim", "motorcyclist", "cyclist",
    "rider", "motorist", "resident", "teenager", "youth",
]
vehicle_nouns = ["car", "motorcycle", "truck", "van", "bus", "bicycle"]

# Building blocks shared by several patterns
determiner = {"LOWER": {"IN": ["a", "an", "the", "one"]}}
optional_adjective = {"POS": "ADJ", "OP": "?"}
person_noun = {"LOWER": {"IN": person_nouns}}

person_patterns = [
    # [determiner] <number>-year-old [adjective] [person noun]
    [
        {**determiner, "OP": "?"},
        {"LIKE_NUM": True},
        {"IS_PUNCT": True},
        {"LOWER": "year"},
        {"IS_PUNCT": True},
        {"LOWER": "old"},
        optional_adjective,
        {**person_noun, "OP": "?"},
    ],
    # determiner [adjective] person noun
    [determiner, optional_adjective, person_noun],
    # determiner [adjective] vehicle noun "driver"
    [determiner, optional_adjective, {"LOWER": {"IN": vehicle_nouns}}, {"LOWER": "driver"}],
    # proper noun(s) ", <number>" followed by a punctuation token
    [{"POS": "PROPN", "OP": "+"}, {"TEXT": ","}, {"LIKE_NUM": True}, {"IS_PUNCT": True}],
]

person_matcher.add("PERSON", person_patterns)

def get_persons(doc: Doc) -> list[tuple[str, Span]]:
    """Find the mentions of people in a document.

    Args:
        doc: Parsed document to search.

    Returns:
        `(text, span)` pairs for the `person_matcher` matches, after
        `remove_contained_spans` has filtered them.
    """
    matched_spans = (doc[start:end] for _, start, end in person_matcher(doc))
    people = [(span.text, span) for span in matched_spans]
    return remove_contained_spans(people)

# Render a sample phrase with displaCy as a quick visual check
test_doc = nlp("a 54-year-old English man from Mellieha")
displacy.render(test_doc)

# Run all extractors over every remaining article.
# NOTE(review): only the detected people are printed for now; the ages, genders
# and severities are computed but not used yet.
for _, row in news_articles_df.iterrows():
    doc = nlp(row["content"])

    ages = get_ages(doc)
    genders = get_genders(doc)
    severities = get_severities(doc)
    people = get_persons(doc)
    print(row["id"], people)

