# Percent Attacks in Catchment Areas

This notebook replicates `perc_attacks_incatchmentareas.py`. It: 
- Loads ACLED and catchment/hospital Excel files.
- Builds two-week segments for the hospital timelines.
- Counts ACLED events within hospital catchment areas (the catchment is approximated by a 5 km radius; each event is assigned to its nearest open hospital within that radius).
- Writes results to `perc_attacks_incatchmentareas_results.xlsx`.

In [24]:
import math
from datetime import datetime, timedelta
from pathlib import Path
import pandas as pd
import numpy as np

def parse_date(x):
    """Convert a spreadsheet cell value to a ``datetime.date``.

    Parameters
    ----------
    x : object
        A missing value, a ``datetime`` instance, or anything whose string
        form is a date.

    Returns
    -------
    datetime.date or None
        ``None`` for missing values (``None``/NaN/NaT), blank strings, and
        values that pandas parses to ``NaT``; otherwise the parsed date.

    Raises
    ------
    ValueError
        If the value cannot be interpreted as a date.
    """
    if pd.isna(x):
        return None
    if isinstance(x, datetime):
        return x.date()

    text = str(x).strip()
    if not text:
        # A blank cell is "no date", not a parse failure.
        return None

    # Explicit formats are tried first, in this fixed priority order.
    for fmt in ("%m/%d/%Y", "%Y-%m-%d", "%d-%m-%Y"):
        try:
            return datetime.strptime(text, fmt).date()
        except ValueError:
            continue

    # Fall back to pandas' general parser for anything else.
    try:
        parsed = pd.to_datetime(x)
        if pd.isna(parsed):
            # Callers test the result with `is not None`, so NaT must not leak out.
            return None
        return parsed.date()
    except Exception as exc:
        raise ValueError(f"Unrecognized date format: {x}") from exc

def haversine_km(lat1, lon1, lat2, lon2):
    """Great-circle distance between two points, using the haversine formula.

    Parameters
    ----------
    lat1, lon1, lat2, lon2 : float
        Coordinates of the two points in decimal degrees.

    Returns
    -------
    float
        Distance along the sphere of radius ``R``, in the same unit as ``R``
        (kilometres). NaN inputs produce a NaN result.
    """
    R = 6371.0088  # sphere radius used for the distance, in km
    phi1 = math.radians(lat1)
    phi2 = math.radians(lat2)
    dphi = math.radians(lat2 - lat1)
    dlambda = math.radians(lon2 - lon1)
    a = math.sin(dphi/2)**2 + math.cos(phi1)*math.cos(phi2)*math.sin(dlambda/2)**2
    # Rounding can push `a` a hair above 1 for near-antipodal points, which
    # would make sqrt(1 - a) raise. The explicit comparison leaves NaN untouched.
    if a > 1.0:
        a = 1.0
    return R * 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))

def guess_cols(df, candidates):
    """Find the column of ``df`` that best matches a list of candidate names.

    Candidates are tried in the order given. For each candidate, a column
    whose (case-insensitive, whitespace-stripped) name equals the candidate
    wins; only if there is none does a column merely *containing* the
    candidate match. This keeps a short candidate such as ``"lat"`` from
    picking ``"Population"`` when a real ``"lat"`` column exists.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are searched.
    candidates : iterable of str
        Candidate names, highest priority first.

    Returns
    -------
    object or None
        The original column label of the first match, or ``None`` if no
        candidate matches any column.
    """
    labels = [(col, str(col).strip().lower()) for col in df.columns]
    for cand in candidates:
        needle = cand.lower()
        # Pass 1: exact name match.
        for col, label in labels:
            if label == needle:
                return col
        # Pass 2: substring match, first column in frame order.
        for col, label in labels:
            if needle in label:
                return col
    return None

def load_acled(path):
    """Load the ACLED workbook and add normalised date/coordinate columns.

    The date, latitude and longitude columns are located with ``guess_cols``.
    Three helper columns are added: ``_date`` (``datetime.date`` objects),
    ``_lat`` and ``_lon`` (numeric). Rows where any of the three is missing
    or could not be parsed are dropped.

    Parameters
    ----------
    path : str or path-like
        Location of the Excel file, passed to ``pandas.read_excel``.

    Returns
    -------
    pandas.DataFrame
        The loaded rows with the three helper columns added.

    Raises
    ------
    RuntimeError
        If a date, latitude or longitude column cannot be identified.
    """
    ac = pd.read_excel(path)
    date_col = guess_cols(ac, ["event_date", "date", "iso_date", "eventdate"])
    lat_col = guess_cols(ac, ["latitude", "lat", "y"])
    lon_col = guess_cols(ac, ["longitude", "lon", "long", "x"])
    if date_col is None or lat_col is None or lon_col is None:
        raise RuntimeError("Could not find date/lat/lon columns in ACLED file")
    # Coerce unparseable dates to NaT, mirroring the lat/lon handling, so a
    # single bad cell is dropped below instead of aborting the whole load.
    ac["_date"] = pd.to_datetime(ac[date_col], errors="coerce").dt.date
    ac["_lat"] = pd.to_numeric(ac[lat_col], errors="coerce")
    ac["_lon"] = pd.to_numeric(ac[lon_col], errors="coerce")
    ac = ac.dropna(subset=["_lat", "_lon", "_date"]).copy()
    return ac

def load_hospitals(path):
    """Load the hospital workbook and identify its key columns.

    Parameters
    ----------
    path : str or path-like
        Location of the Excel file, passed to ``pandas.read_excel``.

    Returns
    -------
    tuple
        ``(frame, name_col, lat_col, lon_col)``: the loaded DataFrame and
        the labels of the hospital-name, latitude and longitude columns as
        found by ``guess_cols``.

    Raises
    ------
    RuntimeError
        If the latitude/longitude columns or the hospital-name column cannot
        be identified.
    """
    h = pd.read_excel(path)
    lat_col = guess_cols(h, ["latitude", "lat", "y"])
    lon_col = guess_cols(h, ["longitude", "lon", "long", "x"])
    name_col = guess_cols(h, ["hospital", "name"])
    if lat_col is None or lon_col is None:
        raise RuntimeError("Could not find lat/lon in hospitals file")
    if name_col is None:
        # Every consumer indexes rows by name_col; fail here with a clear
        # message rather than later with a KeyError on `row[None]`.
        raise RuntimeError("Could not find hospital name column in hospitals file")
    return h, name_col, lat_col, lon_col

def extract_timelines(hosp_df, name_col):
    """
    Extract open/close timeline periods for each hospital.

    Columns whose name contains "open" are paired, in order, with columns
    whose name contains "closed" (Open/Closed, Open.1/Closed.1, ...). A pair
    contributes a period only when both cells of the row parse to a date.
    If there are more columns of one kind than the other, the surplus
    columns are ignored. Hospitals with no complete period are left out.

    Parameters:
        hosp_df: DataFrame with one row per hospital.
        name_col: label of the column holding the hospital name.

    Returns dict: hospital_name -> list of (start_date, end_date) tuples
    """
    def pair_order(col):
        # Order by dot count, then by the numeric ".N" suffix (so "Open.10"
        # sorts after "Open.2"), then by the label itself.
        label = str(col)
        _, sep, suffix = label.rpartition('.')
        number = int(suffix) if sep and suffix.isdecimal() else -1
        return (label.count('.'), number, label)

    # The column pairing depends only on the header, so compute it once.
    # str() guards against non-string column labels.
    open_cols_sorted = sorted(
        (col for col in hosp_df.columns if 'open' in str(col).lower()), key=pair_order
    )
    closed_cols_sorted = sorted(
        (col for col in hosp_df.columns if 'closed' in str(col).lower()), key=pair_order
    )
    column_pairs = list(zip(open_cols_sorted, closed_cols_sorted))

    timelines = {}
    for _, row in hosp_df.iterrows():
        hosp_name = row[name_col]
        periods = []

        for open_col, closed_col in column_pairs:
            start = parse_date(row[open_col])
            end = parse_date(row[closed_col])

            if start is not None and end is not None:
                periods.append((start, end))

        if periods:
            timelines[hosp_name] = periods

    return timelines

def segments(start_date, end_date, delta_days=14):
    """Generate consecutive segments covering start_date..end_date (inclusive).

    Each segment spans ``delta_days`` days (two weeks by default), except the
    last one, which is cut short at ``end_date``. Nothing is yielded when
    ``start_date`` is after ``end_date``.

    Parameters:
        start_date, end_date: date-like bounds supporting ``timedelta`` arithmetic.
        delta_days: length of a full segment in days; must be at least 1.

    Yields:
        (segment_start, segment_end) tuples.

    Raises:
        ValueError: if ``delta_days`` is less than 1 (raised when iteration
            starts). Without this check the loop would never advance.
    """
    if delta_days < 1:
        raise ValueError(f"delta_days must be at least 1, got {delta_days}")
    span = timedelta(days=delta_days - 1)
    one_day = timedelta(days=1)
    cur = start_date
    while cur <= end_date:
        seg_end = min(end_date, cur + span)
        yield cur, seg_end
        cur = seg_end + one_day

def get_open_hospitals(hosp_df, name_col, lat_col, lon_col, timelines, seg_date):
    """Return ``(name, lat, lon)`` for every hospital open on ``seg_date``.

    A hospital counts as open when ``seg_date`` falls inside (inclusive) at
    least one of its periods in ``timelines``. Hospitals without a timeline
    entry are skipped. Results follow the row order of ``hosp_df``.
    """
    def is_open(periods):
        # Inclusive on both ends; stops at the first covering period.
        return any(first <= seg_date <= last for first, last in periods)

    currently_open = []
    for _, record in hosp_df.iterrows():
        name = record[name_col]
        if name in timelines and is_open(timelines[name]):
            currently_open.append((name, float(record[lat_col]), float(record[lon_col])))
    return currently_open

def calculate_catchment_areas(ac_df, hosp_df, name_col, lat_col, lon_col, timelines, seg_start, seg_end, max_distance_km=5.0):
    """
    Assign each ACLED event in a segment to its nearest open hospital.

    Events dated within [seg_start, seg_end] are considered. The set of open
    hospitals is taken on seg_start. An event is credited to the closest of
    those hospitals, provided that distance does not exceed max_distance_km;
    otherwise it is not counted.

    Returns dict: hospital_name -> number of events assigned to it. The dict
    is empty when the segment has no events or no hospital is open.
    """
    dates = ac_df["_date"]
    seg_events = ac_df.loc[(dates >= seg_start) & (dates <= seg_end)]
    if seg_events.empty:
        return {}

    candidates = get_open_hospitals(hosp_df, name_col, lat_col, lon_col, timelines, seg_start)
    if not candidates:
        return {}

    # Every open hospital starts at zero so it appears in the result even
    # when no event lands in its catchment.
    tallies = dict.fromkeys((name for name, _, _ in candidates), 0)

    for ev_lat, ev_lon in zip(seg_events["_lat"], seg_events["_lon"]):
        nearest_name = None
        nearest_km = float('inf')
        for name, h_lat, h_lon in candidates:
            km = haversine_km(ev_lat, ev_lon, h_lat, h_lon)
            # Strict '<' keeps the earlier hospital on ties.
            if km < nearest_km:
                nearest_name, nearest_km = name, km

        if nearest_name and nearest_km <= max_distance_km:
            tallies[nearest_name] += 1

    return tallies

def count_attacks_in_segment(ac_df, hosp_name, hosp_lat, hosp_lon, seg_start, seg_end, max_distance_km=5.0):
    """Count ACLED events in a segment lying within max_distance_km of a hospital.

    Only the distance to this one hospital is checked; whether another
    hospital is closer is not considered. ``hosp_name`` is accepted but not
    used in the computation.
    """
    dates = ac_df["_date"]
    seg_events = ac_df.loc[(dates >= seg_start) & (dates <= seg_end)]
    if seg_events.empty:
        return 0

    return sum(
        1
        for ev_lat, ev_lon in zip(seg_events["_lat"], seg_events["_lon"])
        if haversine_km(ev_lat, ev_lon, hosp_lat, hosp_lon) <= max_distance_km
    )

def process_for_hospital(hosp_name, hosp_lat, hosp_lon, ac_df, hosp_df, name_col, lat_col, lon_col, timelines, max_distance_km=5.0, area_scale_km2=2500.0):
    """Build the per-segment attack table for one hospital.

    Each open period of the hospital is cut into 14-day segments. For every
    segment the function records the total number of ACLED events, the
    number assigned to this hospital by ``calculate_catchment_areas``, and
    that number as a percentage of the segment total.

    Parameters:
        hosp_name, hosp_lat, hosp_lon: hospital identity and coordinates,
            copied into every output row.
        ac_df: ACLED frame with a ``_date`` column (plus ``_lat``/``_lon``
            used downstream).
        hosp_df, name_col, lat_col, lon_col, timelines: forwarded to
            ``calculate_catchment_areas``.
        max_distance_km: catchment cut-off distance, forwarded as well.
        area_scale_km2: multiplier for the ``catchment_area_km2`` proxy
            (previously a hard-coded 2500).

    Returns:
        DataFrame with one row per segment, or an empty DataFrame when the
        hospital has no timeline.

    NOTE(review): ``catchment_area_km2`` is the hospital's share of all
    assigned events times ``area_scale_km2``. It is not a geometric area;
    confirm that this proxy is the intended metric before relying on it.
    """
    if hosp_name not in timelines:
        return pd.DataFrame()

    event_dates = ac_df["_date"]
    rows = []

    for period_start, period_end in timelines[hosp_name]:
        for seg_start, seg_end in segments(period_start, period_end, 14):
            # All events in the segment, regardless of location.
            in_segment = (event_dates >= seg_start) & (event_dates <= seg_end)
            total_attacks_in_seg = int(in_segment.sum())

            # Nearest-open-hospital assignment for this segment.
            catchment_counts = calculate_catchment_areas(
                ac_df, hosp_df, name_col, lat_col, lon_col,
                timelines, seg_start, seg_end, max_distance_km,
            )
            attacks_in_catchment = catchment_counts.get(hosp_name, 0)

            # Guard against empty segments to avoid dividing by zero.
            pct = (attacks_in_catchment / total_attacks_in_seg * 100) if total_attacks_in_seg > 0 else 0.0

            # Share-of-assigned-events proxy (see NOTE in the docstring).
            total_catchment_points = sum(catchment_counts.values())
            if total_catchment_points > 0:
                approx_area = (attacks_in_catchment / total_catchment_points) * area_scale_km2
            else:
                approx_area = 0.0

            rows.append({
                "hospital": hosp_name,
                "hosp_lat": hosp_lat,
                "hosp_lon": hosp_lon,
                "catchment_area_km2": approx_area,
                "seg_start": seg_start,
                "seg_end": seg_end,
                "attacks_in_catchment": attacks_in_catchment,
                "pct_of_attacks_in_catchment": pct,
                "total_attacks_in_segment": total_attacks_in_seg,
            })

    return pd.DataFrame(rows)

def _safe_sheet_name(name, used):
    """Return an Excel-safe sheet title for ``name`` that is not yet in ``used``.

    Characters Excel rejects in sheet titles are replaced with "_", the
    title is cut to 31 characters, and a numeric suffix is appended when the
    title (compared case-insensitively) was already handed out. ``used`` is
    updated in place.
    """
    cleaned = "".join("_" if ch in '[]:*?/\\' else ch for ch in str(name)).strip("'")
    cleaned = cleaned[:31] or "Sheet"
    candidate = cleaned
    counter = 1
    while candidate.lower() in used:
        suffix = f"_{counter}"
        candidate = cleaned[:31 - len(suffix)] + suffix
        counter += 1
    used.add(candidate.lower())
    return candidate


def main(base=None):
    """Run the full analysis and write one Excel sheet per hospital.

    Parameters:
        base: folder containing the two input workbooks and receiving the
            results workbook. When ``None``, the hard-coded project folder
            below is used.

    Returns:
        Path of the written results workbook, or ``None`` when no hospital
        produced any segment (in which case no workbook is written).
    """
    if base is None:
        base = Path('c:/Users/qingl/OneDrive/Desktop/Monona Zhou/bassproj/BassConnectionsFireinmysoul')
    else:
        base = Path(base)

    acled_path = base / "ACLED_May_09_25_Gaza.xlsx"
    hospitals_path = base / "Hospitals_OpenCloseoverTime.xlsx"

    print("Loading files...")
    ac = load_acled(acled_path)
    hosp_df, name_col, lat_col, lon_col = load_hospitals(hospitals_path)

    # Extract timelines from hospital file
    timelines = extract_timelines(hosp_df, name_col)
    print(f"Found {len(timelines)} hospitals with timelines:")
    for hosp, periods in timelines.items():
        print(f"  {hosp}: {periods}")

    out_path = base / "perc_attacks_incatchmentareas_results.xlsx"

    # Compute everything first; the workbook is only opened once there is
    # something to write, and is always closed by the `with` block.
    sheets = []
    used_sheet_names = set()
    for _, row in hosp_df.iterrows():
        hosp_name = row[name_col]
        if hosp_name not in timelines:
            print(f"Skipping {hosp_name} (no timeline data)")
            continue

        lat = float(row[lat_col])
        lon = float(row[lon_col])

        print(f"Processing {hosp_name}...")
        df_out = process_for_hospital(hosp_name, lat, lon, ac, hosp_df, name_col, lat_col, lon_col, timelines)

        if not df_out.empty:
            print(f"  Generated {len(df_out)} two-week segments")
            # Excel limits sheet titles to 31 chars and forbids some characters.
            sheets.append((_safe_sheet_name(hosp_name, used_sheet_names), df_out))
        else:
            print("  No segments generated")

    if not sheets:
        # A workbook with no sheets cannot be saved.
        print("No results to write")
        return None

    with pd.ExcelWriter(out_path, engine="openpyxl") as writer:
        for sheet_name, df_out in sheets:
            df_out.to_excel(writer, sheet_name=sheet_name, index=False)

    print(f"Results written to {out_path}")
    return out_path

In [25]:
# Run the analysis; this writes the results workbook into BASE_DIR.
BASE_DIR = Path('c:/Users/qingl/OneDrive/Desktop/Monona Zhou/bassproj/BassConnectionsFireinmysoul')
main(BASE_DIR)

# Preview the output: read every sheet from the folder main() wrote to
# (not from the current working directory) and list the sheet shapes.
result_sheets = pd.read_excel(BASE_DIR / 'perc_attacks_incatchmentareas_results.xlsx', sheet_name=None)
df = result_sheets  # original name for this dict, kept in case other cells use it
for sheet_name, frame in result_sheets.items():
    print(sheet_name, frame.shape)


Loading files...
Found 4 hospitals with timelines:
  Al Shifa Medical Hospital: [(datetime.date(2023, 10, 1), datetime.date(2023, 11, 23)), (datetime.date(2023, 11, 28), datetime.date(2024, 3, 18))]
  Al-Quds Hospital: [(datetime.date(2023, 10, 1), datetime.date(2023, 11, 5))]
  Nasser Hospital: [(datetime.date(2023, 10, 1), datetime.date(2024, 2, 20))]
  European Hospital: [(datetime.date(2023, 10, 1), datetime.date(2024, 7, 1))]
Processing Al Shifa Medical Hospital...
  Generated 12 two-week segments
Processing Al-Quds Hospital...
  Generated 3 two-week segments
Processing Nasser Hospital...
  Generated 11 two-week segments
Processing European Hospital...
  Generated 20 two-week segments
Skipping Kuwait Hospital (no timeline data)
Results written to c:\Users\qingl\OneDrive\Desktop\Monona Zhou\bassproj\BassConnectionsFireinmysoul\perc_attacks_incatchmentareas_results.xlsx
Al Shifa Medical Hospital (12, 9)
Al-Quds Hospital (3, 9)
Nasser Hospital (11, 9)
European Hospital (20, 9)


In [26]:
# Inspect the written workbook - Al Shifa sheet only
from pathlib import Path
base = Path('c:/Users/qingl/OneDrive/Desktop/Monona Zhou/bassproj/BassConnectionsFireinmysoul')
results_path = base / 'perc_attacks_incatchmentareas_results.xlsx'
df_shifa = pd.read_excel(results_path, sheet_name='Al Shifa Medical Hospital')

# Each report is a heading line followed by the object's plain-text repr.
shifa_reports = (
    ("Al Shifa Medical Hospital - first 5 rows:", df_shifa.head()),
    ("\nColumn names:", df_shifa.columns.tolist()),
    ("\nCatchment area values:", df_shifa['catchment_area_km2'].describe()),
)
for heading, payload in shifa_reports:
    print(heading)
    print(payload)

Al Shifa Medical Hospital - first 5 rows:
                    hospital   hosp_lat   hosp_lon  catchment_area_km2  \
0  Al Shifa Medical Hospital  31.523691  34.442744         1459.627329   
1  Al Shifa Medical Hospital  31.523691  34.442744         1352.201258   
2  Al Shifa Medical Hospital  31.523691  34.442744         1449.275362   
3  Al Shifa Medical Hospital  31.523691  34.442744         1724.683544   
4  Al Shifa Medical Hospital  31.523691  34.442744         1425.438596   

   seg_start    seg_end  attacks_in_catchment  pct_of_attacks_in_catchment  \
0 2023-10-01 2023-10-14                    94                    35.338346   
1 2023-10-15 2023-10-28                   172                    32.887189   
2 2023-10-29 2023-11-11                   240                    32.432432   
3 2023-11-12 2023-11-23                   218                    39.279279   
4 2023-11-28 2023-12-11                   195                    33.913043   

   total_attacks_in_segment  
0             