In [2]:
#%pip install pandas spacy sentence_transformers geopy

In [3]:
#!python -m spacy download en_core_web_sm

In [4]:
import pandas as pd
import re
import spacy
from spacy.matcher import PhraseMatcher, Matcher
from sentence_transformers import SentenceTransformer
from datetime import datetime
import unicodedata
from extract_location import load_towns, extract_road_and_town
import numpy as np
import requests_cache
from retry_requests import retry
import openmeteo_requests
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
LAT_CENTRE = 35.8858681
LONG_CENTRE = 14.4027236

In [6]:
police_df = pd.read_csv('../data/police_press_releases.csv')
locations_df = pd.read_csv('../data/locations.csv')
police_df.describe()

Unnamed: 0,title,date_published,date_modified,content
count,111,111,111,111
unique,79,87,86,111
top,Woman grievously injured in traffic accident,2025-02-09,2024-12-14,"Today, at around 0930hrs, the Police were info..."
freq,8,3,3,1


In [7]:

police_df['road'], police_df['town'] = zip(*police_df['content'].apply(
    lambda x: extract_road_and_town(x, locations_df)
))

police_df['location'] = police_df['content'].apply(
    lambda x: extract_road_and_town(x, locations_df)
)

In [8]:
police_df

Unnamed: 0,title,date_published,date_modified,content,road,town,location
0,Collision between a car and a motorbike in Żur...,2025-10-09,2025-10-09,"Today, at around 0930hrs, the Police were info...",Triq il-Belt Valletta,Iż-Żurrieq,"(Triq il-Belt Valletta, Iż-Żurrieq)"
1,Car-motorcycle traffic accident,2025-06-20,2025-06-20,"Yesterday, at around 1830hrs, the Police were ...",Triq Dawret il-Gudja,Il-Gudja,"(Triq Dawret il-Gudja, Il-Gudja)"
2,Car-motorcycle collision in Ħal Qormi,2025-05-12,2025-05-12,"Today, at around 0800hrs, the Police were info...",Valley Road,Ħal Qormi,"(Valley Road, Ħal Qormi)"
3,Collision between motorcycle and car in Għaxaq,2025-07-30,2025-07-30,"Yesterday, at around 1800hrs, the Police were ...",Triq Dawret Ħal Għaxaq,Ħal Ghaxaq,"(Triq Dawret Ħal Għaxaq, Ħal Ghaxaq)"
4,Car-motorcycle collision,2025-04-07,2025-04-07,"Yesterday, at around quarter to nine in the ev...",Triq il-Buqana,Ir-Rabat,"(Triq il-Buqana, Ir-Rabat)"
...,...,...,...,...,...,...,...
106,Motorcycle accident in Attard,2025-02-05,2025-02-05,"A 52-year-old man and residing in Ħaż-Żebbuġ, ...",Vjal L-Istadium Nazzjonali in Ta' Qali,Ħ'Attard,"(Vjal L-Istadium Nazzjonali in Ta' Qali, Ħ'Att..."
107,Naxxar traffic accident,2024-12-19,2024-12-19,"Today, at around 1045hrs, the Police were info...",Triq il-Ġermanja,In-Naxxar,"(Triq il-Ġermanja, In-Naxxar)"
108,Żebbuġ traffic accident,2025-03-16,2025-03-16,"Today, at around 0800hrs, the Police were info...",Vjal il-Helsien,Ħaż-Żebbuġ,"(Vjal il-Helsien, Ħaż-Żebbuġ)"
109,Collision between a car and e-scooter,2025-07-18,2025-07-18,"Yesterday, at around 2215 hrs, the Police were...",Triq il-Wied ta' Birkirkara,Birkirkara,"(Triq il-Wied ta' Birkirkara, Birkirkara)"


In [9]:
print(police_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 111 entries, 0 to 110
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   title           111 non-null    object
 1   date_published  111 non-null    object
 2   date_modified   111 non-null    object
 3   content         111 non-null    object
 4   road            109 non-null    object
 5   town            111 non-null    object
 6   location        111 non-null    object
dtypes: object(7)
memory usage: 6.2+ KB
None


In [10]:

police_df['all_text'] = police_df['title'].apply(lambda x: unicodedata.normalize("NFKD", str(x))) + ' ' + \
                        police_df['content'].apply(lambda x: unicodedata.normalize("NFKD", str(x)))

In [11]:
police_df['all_text'] = police_df['title'] + ' ' + police_df['content']

In [12]:
def extract_injuries(text):
    text = text.lower()
    
    # Capture 1-2 words immediately before injury/injuries
    #pattern = r"\b(?:\w+\s)?(\w+)\s+injur(?:y|ies)\b"
    pattern = r"\b(grievous|grevious|serious|slight)\s+injur(?:y|ies)\b"
    
    matches = re.findall(pattern, text)
    return list(set(matches))

In [13]:
# --- Data Splitting Logic ---

def split_separate_accidents(df: pd.DataFrame) -> pd.DataFrame:
    """
    Splits records that contain multiple accidents (e.g. "Separate traffic accidents").
    Works on an existing DataFrame and checks the 'all_text' column.
    """
    new_rows = []
    
    # Regex to identify split points: "The first accident...", "The second incident..."
    split_pattern = r"(The first (?:accident|incident)|The second (?:accident|incident)|The third (?:accident|incident))"
    
    for _, row in df.iterrows():
        all_text = str(row['all_text']).lower()
        
        if "separate traffic accidents" in all_text:
            # Split content based on the pattern
            parts = re.split(split_pattern, row['all_text'], flags=re.IGNORECASE)
            
            current_accident = ""
            found_accidents = []
            
            # Reconstruct the sentences
            for part in parts:
                if not part: continue
                if re.match(split_pattern, part, re.IGNORECASE):
                    if current_accident:
                        found_accidents.append(current_accident.strip())
                    current_accident = part
                else:
                    current_accident += part
            if current_accident:
                found_accidents.append(current_accident.strip())
            
            # Only keep parts that start with "The first/second/third accident/incident"
            # This filters out the intro text
            found_accidents = [acc for acc in found_accidents 
                              if re.match(split_pattern, acc, re.IGNORECASE)]
            
            # Create a new row for each found accident
            for accident_text in found_accidents:
                new_row = row.copy()
                new_row['all_text'] = accident_text
                new_rows.append(new_row)
        else:
            new_rows.append(row)
            
    return pd.DataFrame(new_rows)

# Split separate accidents in the existing police_df
police_df = split_separate_accidents(police_df)

print(f"Total {len(police_df)} records after splitting separate accidents.")

Total 115 records after splitting separate accidents.


In [14]:
_NUMBER_WORDS = {
    "one": 1, "two": 2, "three": 3, "four": 4, "five": 5,
    "six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10
}

DEATH_PATTERNS = [
    r"\bdied\b",
    r"\bdead\b",
    r"\bpassed away\b",
    r"\bpronounced dead\b",
    r"\bwas certified dead\b",
    r"\bwere certified dead\b",
    r"\bkilled\b",
]

def _word_to_int(word: str):
    return _NUMBER_WORDS.get(word.lower())


def extract_fatalities_count(text: str):
    """Try to guess a fatalities count from the text.

    This is heuristic. We return (mentioned, count_or_none).
    """
    if not text:
        return False, None

    mentioned = any(re.search(p, text, flags=re.IGNORECASE) for p in DEATH_PATTERNS)
    if not mentioned:
        return False, None

    # 1) digits near a death keyword
    m = re.search(
        r"(\d+)\s+(?:persons|people|men|women|victims|individuals)\s+(?:have\s+)?(?:died|were\s+killed|were\s+pronounced\s+dead|passed\s+away)",
        text,
        flags=re.IGNORECASE,
    )
    if m:
        return True, int(m.group(1))

    # 2) number words near a death keyword
    m = re.search(
        r"\b(one|two|three|four|five|six|seven|eight|nine|ten)\b\s+(?:persons|people|men|women|victims|individuals)\s+(?:have\s+)?(?:died|were\s+killed|were\s+pronounced\s+dead|passed\s+away)",
        text,
        flags=re.IGNORECASE,
    )
    if m:
        return True, _word_to_int(m.group(1))

    # 3) common singular phrasing: "a man ... later died"
    m = re.search(
        r"\b(a|an|one)\s+(?:man|woman|person|victim|pedestrian|motorcyclist|cyclist)\b.*?(?:died|was\s+pronounced\s+dead|passed\s+away)",
        text,
        flags=re.IGNORECASE,
    )
    if m:
        return True, 1

    # Mentioned but no count we trust
    return True, None


def add_fatalities_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Add columns:
    - fatalities_mentioned (bool)
    - fatalities_count (Int64 nullable)
    """
    mentioned_list = []
    count_list = []
    for text in df["content"].fillna("").astype(str):
        mentioned, count = extract_fatalities_count(text)
        mentioned_list.append(bool(mentioned))
        count_list.append(count)

    out = df.copy()
    out["fatalities_mentioned"] = mentioned_list
    out["fatalities_count"] = pd.array(count_list, dtype="Int64")
    return out
police_df = add_fatalities_columns(police_df)

In [15]:
nlp = spacy.load("en_core_web_sm")

def normalize(text):
    # turn car-motorcycle into car motorcycle
    text = text.lower()
    text = re.sub(r"[-/]", " ", text)
    text = unicodedata.normalize("NFKD", text)
    text = text.encode("ascii", "ignore").decode("utf-8")
    return text


# Vehicle types you want to detect
vehicle_list = [
    "car", "van", "truck", "motorcycle", "bike", "bus", "suv", "tractor",
    "lorry", "cycle", "scooter", "taxi", "school bus", "pickup", "trailer", "quadbike", "vehicle"
]

accident_terms = [
    "collision", "crash", "accident", "rear-end", "head-on",
    "side impact", "hit and run", "rollover"
]

locations_df = pd.read_csv('../data/locations.csv')
locations = [normalize(loc) for loc in locations_df['Town'].tolist()]
locations_dict = {normalize(loc.Town): "Town" for loc in locations_df.itertuples()}


vehicle_brands = ['volvo', 'volkswagen', 'toyota', 'mercedes', 'ford', 'bmw', 'kia', 'audi', 'renault', 'skoda', 'peugeot', 'saab', 'nissan', 'opel', 
'hyundai', 'citroën', 'mazda', 'nissan', 'chevrolet', 'subaru', 'tesla', 'mitsubishi', 'seat', 'honda', 'fiat', 'suzuki', 'dacia', 'porsche', 'mini', 'ski-doo', 
'polaris', 'smart', 'ferrari', 'lamborghini', 'jaguar', 'land rover', 'aston martin', 'bentley', 'rolls royce', 'mclaren', 'daf', 'iveco', 'citroen']

motorcycles_df = pd.read_csv('../data/motorcycles.csv')
motorcycle_brand = [brand.lower() for brand in motorcycles_df['Brand'].tolist()]

#remove duplicates since honda make cars and bikes
all_brands = list(dict.fromkeys(vehicle_brands + motorcycle_brand))

hospitals = ["gozo general hospital", "mater dei"]

vehicle_patterns = [nlp.make_doc(vehicle) for vehicle in vehicle_list]
accident_patterns = [nlp.make_doc(accident) for accident in accident_terms]
hospital_patterns = [nlp.make_doc(hospital) for hospital in hospitals]
brand_patterns = [nlp.make_doc(brands) for brands in all_brands]

matcher = PhraseMatcher(nlp.vocab)
matcher.add("VEHICLE", vehicle_patterns)
matcher.add("ACCIDENT_TYPE", accident_patterns)
matcher.add("HOSPITAL", hospital_patterns)
matcher.add("BRANDS", brand_patterns)

pedestrian_matcher = PhraseMatcher(nlp.vocab)
pedestrian_patterns = [nlp.make_doc("was hit by")]
pedestrian_matcher.add("PEDESTRIAN_ACCIDENT", pedestrian_patterns)

doc = nlp("The pedestrian was hit by a car.")

deteceted_vehicles = []

def extract_entities(text):
    text = normalize(text)    
    doc = nlp(text)
    matches = matcher(doc)
    vehicles = []
    accidents = []
    hospitals = []
    brands = []
    
    for match_id, start, end in matches:
        label = nlp.vocab.strings[match_id]
        span = doc[start:end].text
        if label == "VEHICLE":
            vehicles.append(span)
        if label == "ACCIDENT_TYPE":
            accidents.append(span)
        if label == "HOSPITAL":
            hospitals.append(span)
        if label == "BRANDS":
            brands.append(span)


    injuries = extract_injuries(text)
    ped_matches = pedestrian_matcher(doc)
    if ped_matches:
        vehicles.append("pedestrian")

    vehicles_cap = [vehicle.title() for vehicle in set(vehicles)]
    accidents_cap = [accident.title() for accident in set(accidents)]
    hospitals_cap = [hospital.title() for hospital in set(hospitals)]
    injuries_cap = [i.title() for i in set(injuries)]
    brands_cap = [brand.title() for brand in set(brands)]

    
    return list(set(vehicles_cap)), list(set(accidents_cap)), list(set(hospitals_cap)), list(set(injuries_cap)), list(set(brands_cap))
    
police_df["vehicles"], police_df["accident_types"], police_df["hospital"] , police_df['injuries'], police_df['brands'] = zip(*police_df["all_text"].apply(extract_entities))



In [16]:
def extract_time(text):
    # Regex pattern to extract 5 digits followed by "hrs" with or without space
    pattern = r"\b(\d{2})(\d{2})\s*hrs\b"
    match = re.search(pattern, text, flags=re.IGNORECASE)
    if match:
        hh = match.group(1)
        mm = match.group(2)
    else: # since the policeperson doesn't enter the time sometimes
        hh = '12'    
        mm = '00'
    return f"{hh}:{mm}"  
    

In [17]:
def extract_age(text):
    doc = nlp(text)
    for match in re.finditer(r"\b(\d{1,3})-year-old\b", doc.text):
        return int(match.group(1))
    return None


In [18]:
def get_day_name(date_str):
    date_obj = datetime.strptime(date_str, "%Y-%m-%d")
    day_name = date_obj.strftime("%A")

    return day_name

In [19]:
police_df["day"] = police_df["date_published"].apply(get_day_name)
police_df["time"] = police_df["all_text"].apply(extract_time)
police_df["age"] = police_df["all_text"].apply(extract_age)

In [20]:
police_df['injuries']

0                      []
1              [Grievous]
2              [Grievous]
3              [Grievous]
4              [Grievous]
              ...        
106            [Grievous]
107                    []
108    [Slight, Grievous]
109            [Grievous]
110            [Grievous]
Name: injuries, Length: 115, dtype: object

In [21]:
#police_df.to_csv('police_press_releases_enriched.csv', index=False)

In [22]:
car_dict = {brand.title(): "Car" for brand in vehicle_brands}
motorcycle_dict = {brand.title(): "Motorcycle" for brand in motorcycle_brand}
vehicle_dict = {**car_dict, **motorcycle_dict}


def fill_missing_vehicle_types(row):
    updated_vehicles = set(row['vehicles'])  # start with existing vehicles
    for brand in row['brands']:
        vehicle_type = vehicle_dict.get(brand)  # get type
        if vehicle_type:
            updated_vehicles.add(vehicle_type)  # add type, not brand
    return list(updated_vehicles)

# Apply the function
police_df['vehicles'] = police_df.apply(fill_missing_vehicle_types, axis=1)


In [23]:
geolocator = Nominatim(user_agent="GetLoc", timeout=10)

# Nominatim policy-friendly: ~1 request/second
geocode = RateLimiter(
    geolocator.geocode,
    min_delay_seconds=1,
    swallow_exceptions=True
)

cache = {}

def get_coordinates(town: str):
    if pd.isna(town) or not str(town).strip():
        return (None, None)

    town = str(town).strip()

    if town in cache:
        return cache[town]

    # Add country context + restrict to Malta
    query = f"{town}, Malta"
    loc = geocode(query, country_codes="mt", exactly_one=True)

    if loc is None:
        cache[town] = (None, None)
    else:
        cache[town] = (loc.latitude, loc.longitude)

    return cache[town]

police_df[["latitude", "longitude"]] = police_df["town"].apply(
    lambda x: pd.Series(get_coordinates(x))
).astype(float)


In [24]:
cache_session = requests_cache.CachedSession(".cache", expire_after=24 * 3600)
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
openmeteo = openmeteo_requests.Client(session=retry_session)

URL = "https://historical-forecast-api.open-meteo.com/v1/forecast"

HOURLY_VARS = [
    "temperature_2m",
    "precipitation",
    "rain",
    "showers",
    "visibility",
    "windspeed_10m",
    "winddirection_10m",
]
DAILY_VARS = [
    "weather_code",
    "rain_sum",
    "wind_gusts_10m_max",
    "wind_speed_10m_max",
    "wind_direction_10m_dominant",
]

def _ensure_datetime_series(s, utc=True):
    dt = pd.to_datetime(s, errors="coerce")
    if utc:
        if getattr(dt.dt, "tz", None) is None:
            dt = dt.dt.tz_localize("UTC")
        else:
            dt = dt.dt.tz_convert("UTC")
    return dt

def _fetch_weather_for_latlon_date(lat: float, lon: float, day: str):
    """
    Fetches hourly+daily weather for a single (lat, lon, day).
    Returns:
      hourly_df: columns ['datetime_utc'] + HOURLY_VARS
      daily_df : columns ['date_utc'] + DAILY_VARS
    """
    params = {
        "latitude": float(lat),
        "longitude": float(lon),
        "start_date": day,
        "end_date": day,
        "hourly": HOURLY_VARS,
        "daily": DAILY_VARS,
        "timezone": "auto",
    }

    responses = openmeteo.weather_api(URL, params=params)
    resp = responses[0]

    hourly = resp.Hourly()
    hourly_datetimes = pd.date_range(
        start=pd.to_datetime(hourly.Time(), unit="s", utc=True),
        end=pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True),
        freq=pd.Timedelta(seconds=hourly.Interval()),
        inclusive="left",
    )

    hourly_data = {"datetime_utc": hourly_datetimes}
    for i, var in enumerate(HOURLY_VARS):
        hourly_data[var] = hourly.Variables(i).ValuesAsNumpy()

    hourly_df = pd.DataFrame(hourly_data)

    daily = resp.Daily()
    daily_dates = pd.date_range(
        start=pd.to_datetime(daily.Time(), unit="s", utc=True),
        end=pd.to_datetime(daily.TimeEnd(), unit="s", utc=True),
        freq=pd.Timedelta(seconds=daily.Interval()),
        inclusive="left",
    )

    daily_data = {"date_utc": daily_dates}
    for i, var in enumerate(DAILY_VARS):
        daily_data[var] = daily.Variables(i).ValuesAsNumpy()

    daily_df = pd.DataFrame(daily_data)

    return hourly_df, daily_df

# Normalize date
police_df = police_df.copy()
police_df["date_published"] = pd.to_datetime(police_df["date_published"], errors="coerce").dt.date
police_df["_published_day"] = police_df["date_published"].astype(str)  # YYYY-MM-DD

police_df["_time_utc"] = pd.to_datetime(police_df["time"], errors="coerce", utc=True)

_weather_cache = {}

def enrich_row_with_weather(row: pd.Series) -> pd.Series:
    lat = row.get("latitude")
    lon = row.get("longitude")
    day = row.get("_published_day")
    t = row.get("_time_utc")

    if pd.isna(lat) or pd.isna(lon) or (day is None) or (day == "NaT"):
        return row

    key = (float(lat), float(lon), str(day))
    if key not in _weather_cache:
        try:
            _weather_cache[key] = _fetch_weather_for_latlon_date(*key)
        except Exception:
            _weather_cache[key] = (None, None)

    hourly_df, daily_df = _weather_cache[key]
    if hourly_df is None or daily_df is None or hourly_df.empty or daily_df.empty:
        for v in HOURLY_VARS:
            row[f"hourly_{v}"] = np.nan
        for v in DAILY_VARS:
            row[f"daily_{v}"] = np.nan
        return row

    if pd.isna(t):
        closest_idx = 0
    else:
        deltas = (hourly_df["datetime_utc"] - t).abs()
        closest_idx = int(deltas.idxmin())

    closest_hour = hourly_df.loc[closest_idx]

    day_dt = pd.to_datetime(str(day), utc=True)
    daily_match = daily_df.loc[daily_df["date_utc"] == day_dt]
    daily_rec = daily_match.iloc[0] if not daily_match.empty else daily_df.iloc[0]

    for v in HOURLY_VARS:
        row[f"hourly_{v}"] = closest_hour.get(v, np.nan)
    for v in DAILY_VARS:
        row[f"daily_{v}"] = daily_rec.get(v, np.nan)

    row["weather_matched_hour_utc"] = closest_hour.get("datetime_utc", pd.NaT)

    return row

police_df = police_df.apply(enrich_row_with_weather, axis=1)

police_df.drop(columns=["_published_day", "_time_utc"], inplace=True, errors="ignore")



  police_df["_time_utc"] = pd.to_datetime(police_df["time"], errors="coerce", utc=True)


In [25]:
police_df

Unnamed: 0,title,date_published,date_modified,content,road,town,location,all_text,fatalities_mentioned,fatalities_count,...,hourly_showers,hourly_visibility,hourly_windspeed_10m,hourly_winddirection_10m,daily_weather_code,daily_rain_sum,daily_wind_gusts_10m_max,daily_wind_speed_10m_max,daily_wind_direction_10m_dominant,weather_matched_hour_utc
0,Collision between a car and a motorbike in Żur...,2025-10-09,2025-10-09,"Today, at around 0930hrs, the Police were info...",Triq il-Belt Valletta,Iż-Żurrieq,"(Triq il-Belt Valletta, Iż-Żurrieq)",Collision between a car and a motorbike in Żur...,False,,...,0.0,52960.0,3.893995,146.309906,3.0,0.0,24.119999,12.574260,258.461548,2025-10-09 22:00:00+00:00
1,Car-motorcycle traffic accident,2025-06-20,2025-06-20,"Yesterday, at around 1830hrs, the Police were ...",Triq Dawret il-Gudja,Il-Gudja,"(Triq Dawret il-Gudja, Il-Gudja)","Car-motorcycle traffic accident Yesterday, at ...",False,,...,0.0,33800.0,6.763786,334.798920,80.0,0.0,21.599998,10.080000,346.301239,2025-06-20 22:00:00+00:00
2,Car-motorcycle collision in Ħal Qormi,2025-05-12,2025-05-12,"Today, at around 0800hrs, the Police were info...",Valley Road,Ħal Qormi,"(Valley Road, Ħal Qormi)","Car-motorcycle collision in Ħal Qormi Today, a...",False,,...,0.0,20300.0,11.753877,297.349792,3.0,0.0,41.399998,18.598450,334.419128,2025-05-12 22:00:00+00:00
3,Collision between motorcycle and car in Għaxaq,2025-07-30,2025-07-30,"Yesterday, at around 1800hrs, the Police were ...",Triq Dawret Ħal Għaxaq,Ħal Ghaxaq,"(Triq Dawret Ħal Għaxaq, Ħal Ghaxaq)",Collision between motorcycle and car in Għaxaq...,False,,...,0.0,25440.0,9.028754,274.573822,2.0,0.0,45.719997,18.532133,304.318237,2025-07-30 22:00:00+00:00
4,Car-motorcycle collision,2025-04-07,2025-04-07,"Yesterday, at around quarter to nine in the ev...",Triq il-Buqana,Ir-Rabat,"(Triq il-Buqana, Ir-Rabat)","Car-motorcycle collision Yesterday, at around ...",False,,...,0.0,46880.0,7.754637,68.198532,80.0,0.0,51.480000,25.849905,70.311134,2025-04-07 22:00:00+00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,Motorcycle accident in Attard,2025-02-05,2025-02-05,"A 52-year-old man and residing in Ħaż-Żebbuġ, ...",Vjal L-Istadium Nazzjonali in Ta' Qali,Ħ'Attard,"(Vjal L-Istadium Nazzjonali in Ta' Qali, Ħ'Att...",Motorcycle accident in Attard A 52-year-old ma...,False,,...,0.0,5660.0,6.877790,96.008919,3.0,0.0,27.719999,13.276144,74.127525,2025-02-05 22:00:00+00:00
107,Naxxar traffic accident,2024-12-19,2024-12-19,"Today, at around 1045hrs, the Police were info...",Triq il-Ġermanja,In-Naxxar,"(Triq il-Ġermanja, In-Naxxar)","Naxxar traffic accident Today, at around 1045h...",False,,...,0.0,43520.0,11.440979,245.854462,3.0,0.0,39.959999,20.674158,282.561737,2024-12-19 22:00:00+00:00
108,Żebbuġ traffic accident,2025-03-16,2025-03-16,"Today, at around 0800hrs, the Police were info...",Vjal il-Helsien,Ħaż-Żebbuġ,"(Vjal il-Helsien, Ħaż-Żebbuġ)","Żebbuġ traffic accident Today, at around 080...",False,,...,0.0,33520.0,10.514218,308.047089,3.0,0.0,53.639996,23.441502,313.701691,2025-03-16 22:00:00+00:00
109,Collision between a car and e-scooter,2025-07-18,2025-07-18,"Yesterday, at around 2215 hrs, the Police were...",Triq il-Wied ta' Birkirkara,Birkirkara,"(Triq il-Wied ta' Birkirkara, Birkirkara)",Collision between a car and e-scooter Yesterda...,False,,...,0.0,15260.0,4.896529,17.102825,2.0,0.0,40.680000,19.174856,317.652130,2025-07-18 22:00:00+00:00


In [26]:
police_df.to_csv('../data/police_press_releases_full_enriched.csv', index=False)

In [27]:
model = SentenceTransformer('all-MiniLM-L6-v2')

embeddings = model.encode(police_df["all_text"].values, show_progress_bar=True)

Batches: 100%|██████████| 4/4 [00:02<00:00,  1.60it/s]


In [28]:
from sklearn.cluster import KMeans

k = 5  # set number of clusters
kmeans = KMeans(n_clusters=k, random_state=42)
police_df["cluster"] = kmeans.fit_predict(embeddings)


police_df.groupby("cluster")["all_text"].head(3)

0     Collision between a car and a motorbike in Żur...
1     Car-motorcycle traffic accident Yesterday, at ...
2     Car-motorcycle collision in Ħal Qormi Today, a...
3     Collision between motorcycle and car in Għaxaq...
4     Car-motorcycle collision Yesterday, at around ...
5     Traffic accident in Marsa Yesterday, at around...
6     Van-car collision in Lija Today, at around 103...
7     Zejtun traffic accident This morning, at aroun...
9     Number of persons hospitalised following Żabba...
11    Italian woman grievously injured in a traffic ...
12    Woman grievously injured in traffic accident A...
13    Xewkija traffic accident Yesterday evening, at...
14    Woman grievously injured in traffic accident A...
15    Man grievously injured in a traffic accident A...
16    Traffic accident in Paola Two persons were inj...
Name: all_text, dtype: object