# Percent Attacks in Catchment Areas

This notebook replicates `perc_attacks_incatchmentareas.py`. It: 
- Loads ACLED and catchment/hospital Excel files.
- Builds two-week segments for the hospital timelines.
- Counts ACLED events within hospital catchment areas (area -> circular radius approximation).
- Writes results to `perc_attacks_incatchmentareas_results.xlsx`.

In [5]:
#edit one is from 02/09/2026
import math
from datetime import datetime, timedelta
from pathlib import Path
import pandas as pd
import numpy as np

def parse_date(x):
    if pd.isna(x):
        return None
    if isinstance(x, datetime):
        return x.date()
    for fmt in ("%m/%d/%Y", "%Y-%m-%d", "%d-%m-%Y"):
        try:
            return datetime.strptime(str(x), fmt).date()
        except Exception:
            continue
    try:
        return pd.to_datetime(x).date()
    except Exception:
        raise ValueError(f"Unrecognized date format: {x}")

def haversine_km(lat1, lon1, lat2, lon2):
    R = 6371.0088
    phi1 = math.radians(lat1)
    phi2 = math.radians(lat2)
    dphi = math.radians(lat2 - lat1)
    dlambda = math.radians(lon2 - lon1)
    a = math.sin(dphi/2)**2 + math.cos(phi1)*math.cos(phi2)*math.sin(dlambda/2)**2
    return R * 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))

def guess_cols(df, candidates):
    for c in candidates:
        for col in df.columns:
            if c.lower() in str(col).lower():
                return col
    return None

def load_acled(path):
    ac = pd.read_excel(path)
    date_col = guess_cols(ac, ["event_date", "date", "iso_date", "eventdate"])
    lat_col = guess_cols(ac, ["latitude", "lat", "y"])
    lon_col = guess_cols(ac, ["longitude", "lon", "long", "x"])
    if date_col is None or lat_col is None or lon_col is None:
        raise RuntimeError("Could not find date/lat/lon columns in ACLED file")
    ac["_date"] = pd.to_datetime(ac[date_col]).dt.date
    ac["_lat"] = pd.to_numeric(ac[lat_col], errors="coerce")
    ac["_lon"] = pd.to_numeric(ac[lon_col], errors="coerce")
    ac = ac.dropna(subset=["_lat", "_lon", "_date"]).copy()
    return ac

def load_hospitals(path):
    h = pd.read_excel(path)
    lat_col = guess_cols(h, ["latitude", "lat", "y"])
    lon_col = guess_cols(h, ["longitude", "lon", "long", "x"])
    name_col = guess_cols(h, ["hospital", "name"])
    if lat_col is None or lon_col is None:
        raise RuntimeError("Could not find lat/lon in hospitals file")
    return h, name_col, lat_col, lon_col

def extract_timelines(hosp_df, name_col):
    """
    Extract open/close timeline periods for each hospital.
    Returns dict: hospital_name -> list of (start_date, end_date) tuples
    """
    timelines = {}
    for _, row in hosp_df.iterrows():
        hosp_name = row[name_col]
        periods = []
        
        # Extract all Open/Closed column pairs
        open_cols = [col for col in hosp_df.columns if 'open' in col.lower()]
        closed_cols = [col for col in hosp_df.columns if 'closed' in col.lower()]
        
        # Sort to ensure we pair them correctly (Open, Closed, Open.1, Closed.1, etc.)
        open_cols_sorted = sorted(open_cols, key=lambda x: (x.count('.'), x))
        closed_cols_sorted = sorted(closed_cols, key=lambda x: (x.count('.'), x))
        
        for open_col, closed_col in zip(open_cols_sorted, closed_cols_sorted):
            start = parse_date(row[open_col])
            end = parse_date(row[closed_col])
            
            if start is not None and end is not None:
                periods.append((start, end))
        
        if periods:
            timelines[hosp_name] = periods
    
    return timelines

def segments(start_date, end_date, delta_days=14):
    """Generate two-week segments from start_date to end_date"""
    cur = start_date
    while cur <= end_date:
        seg_end = min(end_date, cur + timedelta(days=delta_days-1))
        yield cur, seg_end
        cur = seg_end + timedelta(days=1)

def get_open_hospitals(hosp_df, name_col, lat_col, lon_col, timelines, seg_date):
    """Get list of hospitals open on a given date with their lat/lon"""
    open_hospitals = []
    for _, row in hosp_df.iterrows():
        hosp_name = row[name_col]
        if hosp_name not in timelines:
            continue
        
        for start, end in timelines[hosp_name]:
            if start <= seg_date <= end:
                lat = float(row[lat_col])
                lon = float(row[lon_col])
                open_hospitals.append((hosp_name, lat, lon))
                break
    
    return open_hospitals

def calculate_catchment_areas(ac_df, hosp_df, name_col, lat_col, lon_col, timelines, seg_start, seg_end, max_distance_km=5.0):
    """
    Calculate catchment areas using Voronoi-like assignment.
    For each ACLED event, assign to the closest open hospital (max 5km).
    Returns dict: hospital_name -> number of points assigned to it
    
    METHODOLOGY:
    - For each ACLED attack event in the time segment, calculate distance to each hospital
    - Assign the event to the nearest hospital (Voronoi approach)
    - Only count events within max_distance_km (5km default) of the nearest hospital
    - The catchment area is estimated by multiplying (proportion of events) × 2500 km²
    
    WHY 2500 km²?
    - Total Gaza area is approximately 360 km² (6.5 km × 25 km strip)
    - However, some hospitals draw patients from wider regions
    - 2500 km² is used as a normalization factor to estimate effective service area
    - This assumes an idealogical uniform distribution with margin for surrounding displacement
    - Some hospitals have 2500 km² because they capture 100% of assigned catchment points in that segment
    
    Note: This uses ACLED event points to estimate the area.
    For full accuracy, this should use a regular grid or full event set.
    """
    # Sample ACLED events in this segment
    mask = (ac_df["_date"] >= seg_start) & (ac_df["_date"] <= seg_end)
    events_in_seg = ac_df.loc[mask].copy()
    
    if events_in_seg.empty:
        return {}
    
    # Get open hospitals at segment start
    open_hospitals = get_open_hospitals(hosp_df, name_col, lat_col, lon_col, timelines, seg_start)
    
    if not open_hospitals:
        return {}
    
    # For each event, find the closest open hospital
    catchment_counts = {h[0]: 0 for h in open_hospitals}
    
    for _, event in events_in_seg.iterrows():
        event_lat = event["_lat"]
        event_lon = event["_lon"]
        
        # Find closest hospital
        min_dist = float('inf')
        closest_hospital = None
        
        for hosp_name, hosp_lat, hosp_lon in open_hospitals:
            dist = haversine_km(event_lat, event_lon, hosp_lat, hosp_lon)
            if dist < min_dist:
                min_dist = dist
                closest_hospital = hosp_name
        
        # Only assign if within max distance
        if closest_hospital and min_dist <= max_distance_km:
            catchment_counts[closest_hospital] += 1
    
    return catchment_counts

def count_attacks_in_segment(ac_df, hosp_name, hosp_lat, hosp_lon, seg_start, seg_end, max_distance_km=5.0):
    """Count ACLED events in a segment that belong to this hospital's catchment"""
    mask = (ac_df["_date"] >= seg_start) & (ac_df["_date"] <= seg_end)
    events_in_seg = ac_df.loc[mask].copy()
    
    if events_in_seg.empty:
        return 0
    
    count = 0
    for _, event in events_in_seg.iterrows():
        event_lat = event["_lat"]
        event_lon = event["_lon"]
        dist = haversine_km(event_lat, event_lon, hosp_lat, hosp_lon)
        
        # Check if this hospital is the closest (would require checking all hospitals)
        # For now, count all within max distance - this will be refined when we have catchment areas
        if dist <= max_distance_km:
            count += 1
    
    return count

def process_for_hospital(hosp_name, hosp_lat, hosp_lon, ac_df, hosp_df, name_col, lat_col, lon_col, timelines, max_distance_km=5.0):
    """Process a hospital's segments and calculate attack percentages"""
    if hosp_name not in timelines:
        return pd.DataFrame()
    
    rows = []
    
    for period_start, period_end in timelines[hosp_name]:
        for seg_start, seg_end in segments(period_start, period_end, 14):
            # Count total attacks in segment
            mask = (ac_df["_date"] >= seg_start) & (ac_df["_date"] <= seg_end)
            total_attacks_in_seg = int(mask.sum())
            
            # Calculate catchment areas for this segment
            catchment_counts = calculate_catchment_areas(ac_df, hosp_df, name_col, lat_col, lon_col, 
                                                        timelines, seg_start, seg_end, max_distance_km)
            
            # Get this hospital's catchment area count
            attacks_in_catchment = catchment_counts.get(hosp_name, 0)
            
            # Calculate percentage
            pct = (attacks_in_catchment / total_attacks_in_seg * 100) if total_attacks_in_seg > 0 else 0.0
            
            # Calculate area based on catchment counts (rough estimate)
            # This is a proxy: more points = larger area
            total_catchment_points = sum(catchment_counts.values())
            if total_catchment_points > 0:
                approx_area = (attacks_in_catchment / total_catchment_points) * 2500  # Rough scaling
            else:
                approx_area = 0.0
            
            rows.append({
                "hospital": hosp_name,
                "hosp_lat": hosp_lat,
                "hosp_lon": hosp_lon,
                "catchment_area_km2": approx_area,
                "seg_start": seg_start,
                "seg_end": seg_end,
                "attacks_in_catchment": attacks_in_catchment,
                "pct_of_attacks_in_catchment": pct,
                "total_attacks_in_segment": total_attacks_in_seg,
            })
    
    return pd.DataFrame(rows)

def main(base=None):
    if base is None:
        base = Path('c:/Users/qingl/OneDrive/Desktop/Monona Zhou/bassproj/BassConnectionsFireinmysoul')
    else:
        base = Path(base)
    
    acled_path = base / "ACLED_May_09_25_Gaza.xlsx"
    hospitals_path = base / "Hospitals_OpenCloseoverTime.xlsx"

    print("Loading files...")
    ac = load_acled(acled_path)
    hosp_df, name_col, lat_col, lon_col = load_hospitals(hospitals_path)
    
    # Define hospital timelines manually (ignoring open/close spreadsheet columns)
    # Based on verified hospital operational periods
    timelines = {
        "Al Shifa Medical Hospital": [(datetime(2023, 10, 7).date(), datetime(2023, 11, 3).date())],
        "European Hospital": [(datetime(2023, 12, 11).date(), datetime(2024, 4, 28).date())],
        "Nasser Hospital": [(datetime(2024, 11, 11).date(), datetime(2025, 2, 2).date())],
    }
    
    print(f"Using specified hospital timelines:")
    for hosp, periods in timelines.items():
        print(f"  {hosp}: {periods}")
    
    out_path = base / "perc_attacks_incatchmentareas_results.xlsx"
    writer = pd.ExcelWriter(out_path, engine="openpyxl")
    
    for _, row in hosp_df.iterrows():
        hosp_name = row[name_col]
        if hosp_name not in timelines:
            print(f"Skipping {hosp_name} (no timeline data)")
            continue
        
        lat = float(row[lat_col])
        lon = float(row[lon_col])
        
        print(f"Processing {hosp_name}...")
        df_out = process_for_hospital(hosp_name, lat, lon, ac, hosp_df, name_col, lat_col, lon_col, timelines)
        
        if not df_out.empty:
            print(f"  Generated {len(df_out)} two-week segments")
            # Use first 31 chars of hospital name for sheet name (Excel limit)
            sheet_name = hosp_name[:31]
            df_out.to_excel(writer, sheet_name=sheet_name, index=False)
        else:
            print(f"  No segments generated")
    
    writer.close()
    print(f"Results written to {out_path}")

In [6]:
# Self-contained catchment + map cell (replaces external script call)
import warnings
warnings.filterwarnings('ignore')

from pathlib import Path
from datetime import datetime, timedelta
import pandas as pd
import geopandas as gpd
import numpy as np
from shapely.geometry import Point, Polygon, box, MultiPolygon
from shapely.ops import unary_union
import folium

# local helper functions already defined earlier in notebook are used where possible
# fallbacks / small helpers here to be safe

def two_week_segments(start_date, end_date):
    cur = start_date
    while cur <= end_date:
        seg_end = min(end_date, cur + timedelta(days=13))
        yield cur, seg_end
        cur = seg_end + timedelta(days=1)

# base paths
base = Path('c:/Users/qingl/OneDrive/Desktop/Monona Zhou/bassproj/BassConnectionsFireinmysoul')
acled_path = base / 'ACLED_May_09_25_Gaza.xlsx'
hospitals_path = base / 'Hospitals_OpenCloseoverTime.xlsx'
gaza_geo = base / 'gaza_boundary.geojson'

print('Loading ACLED and hospitals...')
ac = load_acled(acled_path)
# load_hospitals from earlier cell returns (df, name_col, lat_col, lon_col)
hosp_df, name_col, lat_col, lon_col = load_hospitals(hospitals_path)

# read schedule from Excel using extract_timelines() defined earlier
timelines_from_excel = extract_timelines(hosp_df, name_col)
print(f'Parsed timelines for {len(timelines_from_excel)} hospitals from Excel')

# Define hospital-of-interest timelines (use the dates you provided)
hosp_interest_periods = {
    'Al Shifa Medical Hospital': (datetime(2023,10,7).date(), datetime(2023,11,3).date()),
    'European Hospital': (datetime(2023,12,11).date(), datetime(2024,4,28).date()),
    'Nasser Hospital': (datetime(2024,11,11).date(), datetime(2025,2,2).date()),
}

# load Gaza boundary
gaza = gpd.read_file(gaza_geo)
if gaza.crs is None:
    gaza = gaza.set_crs('EPSG:4326')
gaza = gaza.to_crs('EPSG:4326')
gaza_union = gaza.unary_union

# helper: get hospitals open during any day in period using parsed Excel timelines
def get_open_hospitals_for_period(hosp_df, name_col, lat_col, lon_col, timelines_excel, seg_start, seg_end):
    open_list = []
    for _, row in hosp_df.iterrows():
        hn = row[name_col]
        if hn not in timelines_excel:
            continue
        periods = timelines_excel[hn]
        # periods are (start, end) date-like
        for s,e in periods:
            s_date = pd.to_datetime(s).date()
            e_date = pd.to_datetime(e).date()
            # overlap?
            if s_date <= seg_end and e_date >= seg_start:
                lat = float(row[lat_col])
                lon = float(row[lon_col])
                open_list.append((hn, lat, lon))
                break
    return open_list

# Voronoi builder: try scipy, else grid-approx fallback
def build_voronoi_polygons(open_hospitals, clip_geom):
    # open_hospitals: list of (name, lat, lon)
    if len(open_hospitals) == 0:
        return {}

    try:
        from scipy.spatial import Voronoi
        use_scipy = True
    except Exception:
        use_scipy = False

    hosp_points = [(h[2], h[1]) for h in open_hospitals]  # (lon, lat)
    names = [h[0] for h in open_hospitals]

    gdf_pts = gpd.GeoDataFrame({'Hospital': names}, geometry=[Point(xy) for xy in hosp_points], crs='EPSG:4326')
    proj_pts = gdf_pts.to_crs('EPSG:3857')
    clip_proj = gpd.GeoSeries([clip_geom], crs='EPSG:4326').to_crs('EPSG:3857').iloc[0]

    if use_scipy and len(proj_pts) >= 2:
        coords = np.array([(p.x, p.y) for p in proj_pts.geometry])
        vor = Voronoi(coords)
        polys = {}
        # build projected bounding box to clip infinite regions
        minx, miny, maxx, maxy = clip_proj.bounds
        bbox = box(minx-10000, miny-10000, maxx+10000, maxy+10000)

        for i, region_index in enumerate(vor.point_region):
            region = vor.regions[region_index]
            if not region or -1 in region:
                poly = bbox
            else:
                verts = [vor.vertices[v] for v in region]
                poly = Polygon(verts)
            poly = poly.intersection(bbox)
            # convert back to WGS84
            poly_wgs = gpd.GeoSeries([poly], crs='EPSG:3857').to_crs('EPSG:4326').iloc[0]
            poly_clipped = poly_wgs.intersection(clip_geom)
            if not poly_clipped.is_empty:
                polys[names[i]] = poly_clipped

        # assign leftover to nearest
        covered = unary_union([p for p in polys.values() if p is not None and not p.is_empty]) if polys else None
        leftover = clip_geom.difference(covered) if covered is not None else clip_geom
        if leftover and not leftover.is_empty:
            pieces = [leftover] if isinstance(leftover, Polygon) else list(leftover.geoms)
            for piece in pieces:
                centroid = piece.representative_point()
                dists = [centroid.distance(Point(lon,lat)) for _,lat,lon in open_hospitals]
                idx = int(np.argmin(dists))
                name = names[idx]
                polys[name] = polys.get(name, Polygon()).union(piece)

        return polys

    # Fallback grid approximation: rasterize clip_geom, assign nearest hospital
    minx, miny, maxx, maxy = clip_geom.bounds
    # build a modest grid (~300m resolution) to keep runtime reasonable
    res = 0.003  # degrees ~ ~300m at mid-lat (approx)
    xs = np.arange(minx, maxx, res)
    ys = np.arange(miny, maxy, res)
    grid_pts = []
    owner = {}
    for x in xs:
        for y in ys:
            pt = Point(x,y)
            if not pt.within(clip_geom):
                continue
            # find nearest
            dmin = 1e9
            idx = None
            for i, (_, lat, lon) in enumerate(open_hospitals):
                d = (lat - y)**2 + (lon - x)**2
                if d < dmin:
                    dmin = d
                    idx = i
            owner.setdefault(idx, []).append(pt)
    polys = {}
    for i, pts in owner.items():
        mp = unary_union([p.buffer(res*0.7) for p in pts]).buffer(0)
        poly = mp.intersection(clip_geom)
        if not poly.is_empty:
            polys[names[i]] = poly
    return polys

# function to compute area_km2 from polygon using projected area
def area_km2(poly):
    if poly.is_empty:
        return 0.0
    geod_gdf = gpd.GeoSeries([poly], crs='EPSG:4326').to_crs('EPSG:3857')
    area_m2 = geod_gdf.iloc[0].area
    return float(area_m2) / 1e6

# Process function for one segment: build voronoi for open hospitals, count attacks
from collections import defaultdict

def process_segment(seg_start, seg_end, focus_hospital_name=None):
    open_hosps = get_open_hospitals_for_period(hosp_df, name_col, lat_col, lon_col, timelines_from_excel, seg_start, seg_end)
    print(f'Open hospitals during {seg_start} to {seg_end}: {[h[0] for h in open_hosps]}')
    polys = build_voronoi_polygons(open_hosps, gaza_union)

    # filter ACLED events in segment
    mask = (ac['_date'] >= seg_start) & (ac['_date'] <= seg_end)
    events_seg = ac[mask].copy()
    events_seg['geometry'] = events_seg.apply(lambda r: Point(r['_lon'], r['_lat']), axis=1)
    events_gdf = gpd.GeoDataFrame(events_seg, geometry='geometry', crs='EPSG:4326')

    # count attacks per hospital polygon
    counts = defaultdict(int)
    events_assigned = []
    for hosp_name, poly in polys.items():
        if poly.is_empty:
            continue
        within = events_gdf[events_gdf.within(poly)]
        counts[hosp_name] = len(within)
        events_assigned.append((hosp_name, within))

    # collect areas
    areas = {h: area_km2(polys[h]) for h in polys}

    return {'polys': polys, 'areas': areas, 'counts': dict(counts), 'events_assigned': events_assigned, 'open_hospitals': open_hosps}

# --- Generate Al Nasser first two-week map ---
nasser_start, nasser_end = hosp_interest_periods['Nasser Hospital']
# first segment
nasser_seg_start, nasser_seg_end = list(two_week_segments(nasser_start, nasser_end))[0]
print('\nBuilding Al Nasser first segment map for', nasser_seg_start, 'to', nasser_seg_end)
res_nasser = process_segment(nasser_seg_start, nasser_seg_end, focus_hospital_name='Nasser Hospital')

# Save Excel summary for three hospitals (all their segments)
rows = []
for hosp_name, (start_dt, end_dt) in hosp_interest_periods.items():
    for seg_start, seg_end in two_week_segments(start_dt, end_dt):
        r = process_segment(seg_start, seg_end)
        area = r['areas'].get(hosp_name, 0.0)
        attacks = r['counts'].get(hosp_name, 0)
        total_attacks = len(ac[(ac['_date'] >= seg_start) & (ac['_date'] <= seg_end)])
        pct = (attacks / total_attacks * 100) if total_attacks>0 else 0.0
        rows.append({
            'Hospital': hosp_name,
            'Segment Start': seg_start,
            'Segment End': seg_end,
            'Open Hospitals': ', '.join([h[0] for h in r['open_hospitals']]),
            'Catchment Area (km2)': round(area,2),
            'Attacks in Catchment': attacks,
            'Total Attacks in Segment': total_attacks,
            'Percent of Attacks': round(pct,2)
        })

df_summary = pd.DataFrame(rows)
out_file = base / 'perc_attacks_incatchmentareas_recomputed.xlsx'
df_summary.to_excel(out_file, index=False)
print('\nSaved results to', out_file.name)

# Build and save Folium map for Al Nasser first segment
m = folium.Map(location=[31.9,35.2], zoom_start=11)
# add Gaza outline
folium.GeoJson(gaza.__geo_interface__, name='Gaza boundary', style_function=lambda x: {'fill':False,'color':'black','weight':2}).add_to(m)

# colors
color_map = ['#e41a1c','#377eb8','#4daf4a','#984ea3','#ff7f00']
for i,(hname, lat, lon) in enumerate(res_nasser['open_hospitals']):
    c = color_map[i % len(color_map)]
    poly = res_nasser['polys'].get(hname)
    if poly is None:
        continue
    # handle multipolygon
    if isinstance(poly, MultiPolygon):
        polys_iter = poly.geoms
    else:
        polys_iter = [poly]
    for p in polys_iter:
        coords = [[pt[1], pt[0]] for pt in list(p.exterior.coords)]
        folium.Polygon(locations=coords, color=c, fill=True, fillOpacity=0.3, popup=f"{hname}: {res_nasser['areas'].get(hname,0):.2f} km2 | attacks: {res_nasser['counts'].get(hname,0)}").add_to(m)
    folium.CircleMarker(location=[lat,lon], radius=6, color=c, fill=True, fillColor=c, popup=hname).add_to(m)

# add attack points that were assigned to Nasser Hospital in this segment (if any)
nasser_events = [ev for name, ev in res_nasser['events_assigned'] if name == 'Nasser Hospital']
if nasser_events:
    events_gdf = nasser_events[0]
    for _, row in events_gdf.iterrows():
        folium.CircleMarker(location=[row['_lat'], row['_lon']], radius=3, color='black', fill=True, fillColor='red').add_to(m)

map_out = base / 'catchment_visualization_Nasser_first_segment.html'
m.save(str(map_out))
print('Saved Al Nasser first-segment map to', map_out.name)

# display summary head
print('\nSummary (first 10 rows):')
display(df_summary.head(10))

print('\nDone.')


Loading ACLED and hospitals...
Parsed timelines for 4 hospitals from Excel

Building Al Nasser first segment map for 2024-11-11 to 2024-11-24
Open hospitals during 2024-11-11 to 2024-11-24: []
Open hospitals during 2023-10-07 to 2023-10-20: ['Al Shifa Medical Hospital', 'Al-Quds Hospital', 'Nasser Hospital', 'European Hospital']
Open hospitals during 2023-10-21 to 2023-11-03: ['Al Shifa Medical Hospital', 'Al-Quds Hospital', 'Nasser Hospital', 'European Hospital']
Open hospitals during 2023-12-11 to 2023-12-24: ['Al Shifa Medical Hospital', 'Nasser Hospital', 'European Hospital']
Open hospitals during 2023-12-25 to 2024-01-07: ['Al Shifa Medical Hospital', 'Nasser Hospital', 'European Hospital']
Open hospitals during 2024-01-08 to 2024-01-21: ['Al Shifa Medical Hospital', 'Nasser Hospital', 'European Hospital']
Open hospitals during 2024-01-22 to 2024-02-04: ['Al Shifa Medical Hospital', 'Nasser Hospital', 'European Hospital']
Open hospitals during 2024-02-05 to 2024-02-18: ['Al Shifa 

QhullError: QH6214 qhull input error: not enough points(2) to construct initial simplex (need 4)

While executing:  | qhull v Qz Qc Qbb
Options selected for Qhull 2020.2.r 2020/08/31:
  run-id 767283449  voronoi  Qz-infinity-point  Qcoplanar-keep  Qbbound-last
  _pre-merge  _zero-centrum  Qinterior-keep  _maxoutside  0
