In [2]:
from pathlib import Path
print("Current working dir:", Path.cwd())


Current working dir: c:\Users\Farhan\Desktop\PPTP\casus-foederis-pipeline


In [3]:
import glob, pprint
pprint.pp(glob.glob("**/atop5_1a.csv", recursive=True))

['atop_5.1__.csv_\\ATOP 5.1 (.csv)\\atop5_1a.csv']


In [4]:
from pathlib import Path
import pandas as pd

root = Path.cwd()                          # c:\Users\Farhan\Desktop\PPTP\casus-foederis-pipeline
atop_path = root / "atop_5.1__.csv_" / "ATOP 5.1 (.csv)" / "atop5_1a.csv"
icb_path  = root / "icb2v16.csv"

print("ATOP file exists:", atop_path.exists())   # should print True

atop_df = pd.read_csv(atop_path)
icb_df  = pd.read_csv(icb_path)

print(atop_df.shape, icb_df.shape)


ATOP file exists: True
(789, 134) (1131, 95)


In [14]:
import pandas as pd
import numpy as np

def clean_values(series):
    """Remove NaN/None values and convert to strings"""
    cleaned = []
    for val in series.dropna().unique():
        if pd.notna(val) and val is not None:
            cleaned.append(str(val))
    return cleaned

# ------------------------------------------------------------------
# 1.  Build cracid → [actor1, actor2, …]  (allow many)
# ------------------------------------------------------------------
cracid_actor_series = (
    icb_df.groupby("cracid")["actor"]
          .apply(clean_values)
)

# Warn about multiple labels
multi_actor = cracid_actor_series[cracid_actor_series.str.len() > 1]
print("=== cracid values with >1 actor label ===")
if multi_actor.empty:
    print("None")
else:
    for cid, labels in multi_actor.items():
        print(f"{cid}: {', '.join(labels)}")

# Save to CSV
(cracid_actor_series
     .apply("; ".join)
     .to_csv("cracid_to_actor.csv", header=["actor_labels"])
)

# Dict for later use
cracid_actor = cracid_actor_series.to_dict()


# ------------------------------------------------------------------
# 2.  Build cracid → [actloc1, actloc2, …]  (allow many)
# ------------------------------------------------------------------
cracid_actloc_series = (
    icb_df.groupby("cracid")["actloc"]
          .apply(clean_values)
)

# Warn about multiple locations
multi_loc = cracid_actloc_series[cracid_actloc_series.str.len() > 1]
print("\n=== cracid values with >1 actloc label ===")
if multi_loc.empty:
    print("None")
else:
    for cid, locs in multi_loc.items():
        print(f"{cid}: {', '.join(locs)}")

# Save to CSV
(cracid_actloc_series
     .apply("; ".join)
     .to_csv("cracid_to_actloc.csv", header=["actloc_labels"])
)

# Dict for later use
cracid_actloc = cracid_actloc_series.to_dict()

print(f"\nSaved {len(cracid_actor)} actor mappings and {len(cracid_actloc)} actloc mappings.")

=== cracid values with >1 actor label ===
345: YUG, SER

=== cracid values with >1 actloc label ===
630: 13.0, 15.0

Saved 147 actor mappings and 147 actloc mappings.


In [15]:
# Investigate cracid 630 (Iran) - why multiple actloc values?
print("=== Investigating cracid 630 (Iran) conflicts ===")

# Filter for Iran's conflicts
iran_conflicts = icb_df[icb_df['cracid'] == 630].copy()

print(f"Total rows for Iran: {len(iran_conflicts)}")
print(f"Unique actloc values: {sorted([x for x in iran_conflicts['actloc'].unique() if pd.notna(x)])}")
print(f"Unique actor values: {sorted([x for x in iran_conflicts['actor'].unique() if pd.notna(x)])}")

# Show all relevant columns for Iran conflicts
relevant_cols = ['crisno', 'actor', 'actloc', 'crisname', 'yrtrig', 'motrig', 'triggr']
# Add any other columns that might be relevant
additional_cols = []
for col in ['geog', 'period', 'syslev', 'outcom']:
    if col in icb_df.columns:
        additional_cols.append(col)

display_cols = [col for col in relevant_cols + additional_cols if col in icb_df.columns]

print(f"\n=== Iran conflict details ===")
print(iran_conflicts[display_cols].to_string(index=False))

# Group by actloc to see which conflicts fall into each region
print(f"\n=== Breakdown by actloc ===")
for actloc_val in iran_conflicts['actloc'].dropna().unique():
    print(f"\nActloc {actloc_val}:")
    subset = iran_conflicts[iran_conflicts['actloc'] == actloc_val]
    if 'crisname' in subset.columns:
        crisis_info = subset[['crisno', 'crisname', 'yrtrig']].drop_duplicates()
        for _, row in crisis_info.iterrows():
            year_str = f"({int(row['yrtrig'])})" if pd.notna(row['yrtrig']) else "(year unknown)"
            print(f"  Crisis {row['crisno']}: {row['crisname']} {year_str}")
    else:
        crisis_info = subset[['crisno', 'yrtrig']].drop_duplicates()
        for _, row in crisis_info.iterrows():
            year_str = f"({int(row['yrtrig'])})" if pd.notna(row['yrtrig']) else "(year unknown)"
            print(f"  Crisis {row['crisno']} {year_str}")

# Check if there are multiple actors involved in same crisis
print(f"\n=== Multi-actor analysis ===")
crisis_actors = iran_conflicts.groupby('crisno')['actor'].nunique()
multi_actor_crises = crisis_actors[crisis_actors > 1]
if len(multi_actor_crises) > 0:
    print("Crises with multiple actors:")
    for crisno in multi_actor_crises.index:
        crisis_data = iran_conflicts[iran_conflicts['crisno'] == crisno]
        actors = crisis_data['actor'].unique()
        actlocs = crisis_data['actloc'].unique()
        print(f"  Crisis {crisno}: actors {actors}, actlocs {actlocs}")
else:
    print("No crises with multiple actors found.")

# Summary
print(f"\n=== Summary ===")
print(f"Iran appears in {iran_conflicts['crisno'].nunique()} unique crises")
actloc_counts = iran_conflicts['actloc'].value_counts().dropna()
print(f"Regional classifications: {dict(actloc_counts)}")

# If there are region codes, try to decode them
print(f"\n=== Region code meanings (if available) ===")
if 13.0 in actloc_counts.index:
    print(f"Actloc 13.0 (appears {actloc_counts[13.0]} times): likely = Middle East")
if 15.0 in actloc_counts.index:
    print(f"Actloc 15.0 (appears {actloc_counts[15.0]} times): likely = South Asia")
print("This suggests Iran was involved in conflicts that spanned different regional theaters")

=== Investigating cracid 630 (Iran) conflicts ===
Total rows for Iran: 20
Unique actloc values: [13.0, 15.0]
Unique actor values: ['IRN']

=== Iran conflict details ===
 crisno actor  actloc                        crisname  yrtrig  motrig  triggr  geog  period  syslev  outcom
     14   IRN    13.0                  PERSIAN BORDER  1920.0     5.0     9.0  13.0     1.0     1.0     1.0
     87   IRN    13.0              OCCUPATION OF IRAN  1941.0     8.0     9.0  13.0     2.0     2.0     2.0
     96   IRN    13.0            IRAN-OIL CONCESSIONS  1944.0     9.0     3.0  13.0     2.0     1.0     1.0
    108   IRN    13.0                      AZERBAIJAN  1945.0     8.0     6.0  10.0     3.0     2.0     1.0
    172   IRN    13.0                 SHATT-AL-ARAB I  1959.0    12.0     2.0  15.0     3.0     1.0     3.0
    234   IRN    13.0                SHATT-AL-ARAB II  1969.0     4.0     2.0  15.0     4.0     1.0     1.0
    309   IRN    13.0             US HOSTAGES IN IRAN  1980.0     4.0     9

In [16]:
# Fix Iran's regional classification: change South Asia (15) to Middle East (13)
print("Before fix:")
iran_actloc_before = icb_df[icb_df['cracid'] == 630]['actloc'].value_counts().dropna()
print(f"Iran actloc distribution: {dict(iran_actloc_before)}")

# Make the change
mask = (icb_df['cracid'] == 630) & (icb_df['actloc'] == 15)
rows_changed = mask.sum()
icb_df.loc[mask, 'actloc'] = 13

print(f"\nChanged {rows_changed} rows")

print("After fix:")
iran_actloc_after = icb_df[icb_df['cracid'] == 630]['actloc'].value_counts().dropna()
print(f"Iran actloc distribution: {dict(iran_actloc_after)}")

Before fix:
Iran actloc distribution: {13.0: 15, 15.0: 5}

Changed 5 rows
After fix:
Iran actloc distribution: {13.0: 20}


In [17]:
# Check if every crisno maps to a single geog value
crisno_geog_mapping = (
    icb_df.groupby("crisno")["geog"]
          .apply(lambda x: x.dropna().unique().tolist())
)

# Find crises with multiple geog values
multi_geog_crises = crisno_geog_mapping[crisno_geog_mapping.str.len() > 1]

print("=== Checking crisno → geog mapping ===")
print(f"Total unique crises: {len(crisno_geog_mapping)}")

if multi_geog_crises.empty:
    print("✓ All crises map to a single geog value")
else:
    print(f"⚠️  Found {len(multi_geog_crises)} crises with multiple geog values:")
    for crisno, geog_values in multi_geog_crises.items():
        geog_str = ", ".join([str(g) for g in geog_values])
        
        # Get crisis name if available
        crisis_info = icb_df[icb_df['crisno'] == crisno][['crisname', 'yrtrig']].iloc[0]
        if pd.notna(crisis_info['crisname']):
            crisis_name = crisis_info['crisname']
        else:
            crisis_name = "Unknown"
        
        year = int(crisis_info['yrtrig']) if pd.notna(crisis_info['yrtrig']) else "Unknown"
        
        print(f"  Crisis {crisno}: {crisis_name} ({year}) → geog values: {geog_str}")

# Show distribution of geog mapping consistency
single_geog_count = (crisno_geog_mapping.str.len() == 1).sum()
multi_geog_count = (crisno_geog_mapping.str.len() > 1).sum()
print(f"\nSummary:")
print(f"  Single geog: {single_geog_count} crises")
print(f"  Multiple geog: {multi_geog_count} crises")

=== Checking crisno → geog mapping ===
Total unique crises: 512
⚠️  Found 9 crises with multiple geog values:
  Crisis 21: KARL'S RETURN HUNGARY (1921) → geog values: 31.0, 35.0
  Crisis 300: RAIDS ON ZIPRA (1979) → geog values: 22.0, 24.0
  Crisis 307: RHODESIA SETTLEMENT (1979) → geog values: 22.0, 23.0
  Crisis 365: S. AFRICA CROSS BORDER RAID (1986) → geog values: 23.0, 22.0
  Crisis 427: US EMBASSY BOMBINGS (1998) → geog values: 13.0, 22.0, 21.0
  Crisis 434: AFGHANISTAN/US (2001) → geog values: 13.0, 41.0
  Crisis 460: CHAD-SUDAN V (2009) → geog values: 24.0, 21.0
  Crisis 466: SUDAN-SOUTH SUDAN (2011) → geog values: 21.0, 22.0
  Crisis 499: GALWAN VALLEY BORDER CLASH (2020) → geog values: 13.0, 11.0

Summary:
  Single geog: 502 crises
  Multiple geog: 9 crises


In [18]:
import pandas as pd

# First, ensure we have the cracid_actloc mapping from earlier
# (This should already exist from your previous code)
cracid_actloc_series = (
    icb_df.groupby("cracid")["actloc"]
          .apply(lambda x: x.dropna().unique().tolist())
)
cracid_actloc_dict = cracid_actloc_series.to_dict()

def create_master_dataset(df, cracid_actloc_mapping):
    """
    Create Master Dataset with crisis-level information
    """
    
    # Group by crisno to get crisis-level data
    crisis_groups = df.groupby('crisno')
    
    master_data = []
    
    for crisno, group in crisis_groups:
        # Get basic crisis info (should be same across all rows for this crisis)
        crisis_name = group['crisname'].iloc[0] if pd.notna(group['crisname'].iloc[0]) else f"Crisis_{crisno}"
        
        # Get all unique cracids for this crisis
        unique_cracids = sorted([int(x) for x in group['cracid'].dropna().unique()])
        
        # Get corresponding actloc values using our mapping
        crisis_actlocs = set()
        for cracid in unique_cracids:
            if cracid in cracid_actloc_mapping:
                # Get actloc values for this cracid, convert to integers
                actloc_values = cracid_actloc_mapping[cracid]
                for actloc in actloc_values:
                    if pd.notna(actloc):
                        crisis_actlocs.add(int(float(actloc)))
        
        # Convert to sorted lists for consistent ordering
        actor_list = ";".join([str(x) for x in unique_cracids])
        actor_locations = ";".join([str(x) for x in sorted(crisis_actlocs)])
        
        master_data.append({
            'Crisis_ID': int(crisno),
            'Crisis_Name': crisis_name,
            'Actor_List': actor_list,
            'Actor_Locations': actor_locations
        })
    
    return pd.DataFrame(master_data)

# Create the Master Dataset
master_df = create_master_dataset(icb_df, cracid_actloc_dict)

# Display sample and summary
print("=== Master Dataset Sample ===")
print(master_df.head(10).to_string(index=False))

print(f"\n=== Summary ===")
print(f"Total crises: {len(master_df)}")
print(f"Sample Actor_List formats:")
for i, row in master_df.head(5).iterrows():
    print(f"  Crisis {row['Crisis_ID']}: {row['Actor_List']}")

print(f"\nSample Actor_Locations formats:")
for i, row in master_df.head(5).iterrows():
    print(f"  Crisis {row['Crisis_ID']}: {row['Actor_Locations']}")

# Check for any issues
print(f"\n=== Data Quality Checks ===")
empty_actors = master_df[master_df['Actor_List'] == ''].shape[0]
empty_locations = master_df[master_df['Actor_Locations'] == ''].shape[0]
print(f"Crises with no actors: {empty_actors}")
print(f"Crises with no locations: {empty_locations}")

# Save to CSV
master_df.to_csv("icb_master_dataset.csv", index=False)
print(f"\nSaved Master Dataset to 'icb_master_dataset.csv'")

# Show how to parse the data back (for machine readability)
print(f"\n=== Machine Readability Example ===")
sample_row = master_df.iloc[0]
print(f"Crisis: {sample_row['Crisis_Name']}")
print(f"Actors: {sample_row['Actor_List'].split(';')}")
print(f"Locations: {sample_row['Actor_Locations'].split(';')}")

# Function to parse actor/location lists
def parse_semicolon_list(semicolon_string):
    """Helper function to parse semicolon-separated strings back to lists"""
    if semicolon_string == '':
        return []
    return [int(x) for x in semicolon_string.split(';')]

print(f"\nParsing example:")
actors = parse_semicolon_list(sample_row['Actor_List'])
locations = parse_semicolon_list(sample_row['Actor_Locations'])
print(f"Parsed actors: {actors}")
print(f"Parsed locations: {locations}")

=== Master Dataset Sample ===
 Crisis_ID          Crisis_Name      Actor_List Actor_Locations
         1  RUSSIAN CIVIL WAR I             365              30
         2     COSTA RICAN COUP           93;94              42
         3 RUSSIAN CIVIL WAR II             365              30
         4  BALTIC INDEPENDENCE 365;366;367;368           30;34
         5              TESCHEN         290;315              31
         6        HUNGARIAN WAR     310;315;360              31
         7               SMYRNA         325;350              35
         8     THIRD AFGHAN WAR         200;700           13;34
         9 FINNISH/RUSSIAN BDR.         365;375           30;34
        10           BESSARABIA         360;365           30;31

=== Summary ===
Total crises: 512
Sample Actor_List formats:
  Crisis 1: 365
  Crisis 2: 93;94
  Crisis 3: 365
  Crisis 4: 365;366;367;368
  Crisis 5: 290;315

Sample Actor_Locations formats:
  Crisis 1: 30
  Crisis 2: 42
  Crisis 3: 30
  Crisis 4: 30;34
  Crisis 5

In [None]:
import pandas as pd
from datetime import datetime
import numpy as np

# Read ICB1 dataset
icb1_df = pd.read_csv("icb1v16.csv")

print("=== ICB1 Dataset Info ===")
print(f"Shape: {icb1_df.shape}")
print(f"Columns: {list(icb1_df.columns)}")

def create_standardized_date(year, month, day, date_type="start"):
    """
    Create standardized date from separate year, month, day components
    Handles missing values appropriately
    """
    if pd.isna(year):
        return None
    
    year = int(year)
    
    # Handle missing month - default to January for start, December for end
    if pd.isna(month):
        month = 1 if date_type == "start" else 12
    else:
        month = int(month)
        month = max(1, min(12, month))  # Ensure valid month
    
    # Handle missing day - default to 1st for start, last day of month for end
    if pd.isna(day):
        if date_type == "start":
            day = 1
        else:
            # Get last day of the month
            if month in [1, 3, 5, 7, 8, 10, 12]:
                day = 31
            elif month in [4, 6, 9, 11]:
                day = 30
            else:  # February
                # Simple leap year check
                if year % 4 == 0 and (year % 100 != 0 or year % 400 == 0):
                    day = 29
                else:
                    day = 28
    else:
        day = int(day)
        day = max(1, min(31, day))  # Ensure valid day
    
    try:
        return datetime(year, month, day).strftime('%Y-%m-%d')
    except ValueError:
        # Handle invalid dates (e.g., Feb 30) by adjusting day
        try:
            return datetime(year, month, min(day, 28)).strftime('%Y-%m-%d')
        except ValueError:
            return f"{year}-{month:02d}-01"  # Fallback to first of month

def process_icb1_dates_locations(df):
    """
    Process ICB1 dataset to extract standardized dates and locations
    """
    
    # Check what columns are actually available
    date_cols = ['yrtrig', 'motrig', 'datrig', 'yrterm', 'moterm', 'daterm']
    available_cols = [col for col in date_cols if col in df.columns]
    print(f"Available date columns: {available_cols}")
    
    if 'geog' in df.columns:
        print("Geographic location column 'geog' found")
    else:
        print("Warning: 'geog' column not found")
    
    # Create the processed dataset
    processed_data = []
    
    for idx, row in df.iterrows():
        crisis_data = {}
        
        # Basic crisis identifier
        if 'crisno' in df.columns:
            crisis_data['Crisis_ID'] = int(row['crisno']) if pd.notna(row['crisno']) else None
        
        # Create standardized start date
        if all(col in df.columns for col in ['yrtrig', 'motrig', 'datrig']):
            start_date = create_standardized_date(
                row['yrtrig'], row['motrig'], row['datrig'], "start"
            )
            crisis_data['Start_Date'] = start_date
        
        # Create standardized end date
        if all(col in df.columns for col in ['yrterm', 'moterm', 'daterm']):
            end_date = create_standardized_date(
                row['yrterm'], row['moterm'], row['daterm'], "end"
            )
            crisis_data['End_Date'] = end_date
        
        # Geographic location
        if 'geog' in df.columns:
            crisis_data['Geographic_Location'] = int(row['geog']) if pd.notna(row['geog']) else None
        
        # Add crisis name if available
        if 'crisname' in df.columns:
            crisis_data['Crisis_Name'] = row['crisname'] if pd.notna(row['crisname']) else None
        
        processed_data.append(crisis_data)
    
    return pd.DataFrame(processed_data)

# Process the ICB1 dataset
icb1_processed = process_icb1_dates_locations(icb1_df)

# Remove duplicates if any (should be one row per crisis)
if 'Crisis_ID' in icb1_processed.columns:
    icb1_processed = icb1_processed.drop_duplicates(subset=['Crisis_ID'])

print(f"\n=== Processed ICB1 Dataset ===")
print(f"Shape: {icb1_processed.shape}")
print(f"Columns: {list(icb1_processed.columns)}")

# Display sample data
print(f"\n=== Sample Data ===")
print(icb1_processed.head(10).to_string(index=False))

# Data quality checks
print(f"\n=== Data Quality Summary ===")
for col in icb1_processed.columns:
    null_count = icb1_processed[col].isnull().sum()
    total_count = len(icb1_processed)
    print(f"{col}: {null_count}/{total_count} missing ({null_count/total_count*100:.1f}%)")

# Show date range
if 'Start_Date' in icb1_processed.columns:
    valid_start_dates = icb1_processed['Start_Date'].dropna()
    if len(valid_start_dates) > 0:
        print(f"\nStart dates range: {valid_start_dates.min()} to {valid_start_dates.max()}")

if 'End_Date' in icb1_processed.columns:
    valid_end_dates = icb1_processed['End_Date'].dropna()
    if len(valid_end_dates) > 0:
        print(f"End dates range: {valid_end_dates.min()} to {valid_end_dates.max()}")

# Geographic location distribution
if 'Geographic_Location' in icb1_processed.columns:
    geog_dist = icb1_processed['Geographic_Location'].value_counts().sort_index()
    print(f"\nGeographic location distribution:")
    print(geog_dist.head(10))

# Save processed dataset
icb1_processed.to_csv("icb1_processed_dates_locations.csv", index=False)
print(f"\nSaved processed ICB1 data to 'icb1_processed_dates_locations.csv'")

# Show how to merge with master dataset (if it exists)
print(f"\n=== Merging with Master Dataset ===")
print("To merge with your master dataset:")
print("master_df = master_df.merge(icb1_processed, on='Crisis_ID', how='left')")

# Example of date parsing
print(f"\n=== Date Format Examples ===")
sample_dates = icb1_processed[['Start_Date', 'End_Date']].dropna().head(3)
for idx, row in sample_dates.iterrows():
    print(f"Crisis: {row['Start_Date']} → {row['End_Date']}")
    # Show how to parse back to datetime objects
    start_dt = pd.to_datetime(row['Start_Date'])
    end_dt = pd.to_datetime(row['End_Date'])
    duration = (end_dt - start_dt).days
    print(f"  Duration: {duration} days")

=== ICB1 Dataset Info ===
Shape: (512, 95)
Columns: ['icb1', 'crisno', 'crisname', 'break', 'trigent', 'yrtrig', 'motrig', 'datrig', 'yrterm', 'moterm', 'daterm', 'brexit', 'gravcr', 'crismg', 'cenviosy', 'sevviosy', 'viol', 'timvio', 'iwcmb', 'noactr', 'gpinv', 'gpinvtp', 'gpefcttp', 'gpefactp', 'gppacetp', 'powinv', 'usinv', 'usefct', 'usefac', 'uspace', 'usactor', 'suinv', 'suefct', 'suefac', 'supace', 'suactor', 'chinv', 'soglact', 'globorg', 'globactm', 'globefct', 'globefor', 'globefac', 'globpace', 'soract', 'regorg', 'regactmb', 'roefct', 'robody', 'roefac', 'ropace', 'subout', 'forout', 'exsat', 'outesr', 'cractr', 'geostr', 'hetero', 'issues', 'chacts', 'chall', 'powch', 'rugach', 'geog', 'geogrel', 'period', 'syslevsy', 'protrac', 'pcid', 'powdissy', 'ethnic', 'ethconf', 'stressad', 'sourdt', 'mediate', 'mednum', 'medwho', 'medtime', 'yrmedst', 'momedst', 'damedst', 'yrmedend', 'momedend', 'damedend', 'yrmedfin', 'momedfin', 'damedfin', 'medgoal', 'medfacl', 'medform', 'medm

In [20]:
master_df = master_df.merge(icb1_processed, on='Crisis_ID', how='left')

In [21]:
master_df.head()

Unnamed: 0,Crisis_ID,Crisis_Name_x,Actor_List,Actor_Locations,Start_Date,End_Date,Geographic_Location,Crisis_Name_y
0,1,RUSSIAN CIVIL WAR I,365,30,1918-05-01,1920-04-01,30.0,RUSSIAN CIVIL WAR I
1,2,COSTA RICAN COUP,93;94,42,1918-05-25,1919-09-03,42.0,COSTA RICAN COUP
2,3,RUSSIAN CIVIL WAR II,365,30,1918-06-23,1919-09-27,30.0,RUSSIAN CIVIL WAR II
3,4,BALTIC INDEPENDENCE,365;366;367;368,30;34,1918-11-18,1920-08-11,34.0,BALTIC INDEPENDENCE
4,5,TESCHEN,290;315,31,1919-01-15,1920-07-28,31.0,TESCHEN


In [23]:
import pandas as pd
from datetime import datetime
import numpy as np


print("=== ATOP Dataset Info ===")
print(f"Shape: {atop_df.shape}")
print(f"Sample columns: {list(atop_df.columns)[:20]}...")

def create_standardized_date(year, month, day, date_type="start"):
    """
    Create standardized date from separate year, month, day components
    Same logic as ICB processing
    """
    if pd.isna(year):
        return None
    
    year = int(year)
    
    # Handle missing month - default to January for start, December for end
    if pd.isna(month):
        month = 1 if date_type == "start" else 12
    else:
        month = int(month)
        month = max(1, min(12, month))  # Ensure valid month
    
    # Handle missing day - default to 1st for start, last day of month for end
    if pd.isna(day):
        if date_type == "start":
            day = 1
        else:
            # Get last day of the month
            if month in [1, 3, 5, 7, 8, 10, 12]:
                day = 31
            elif month in [4, 6, 9, 11]:
                day = 30
            else:  # February
                # Simple leap year check
                if year % 4 == 0 and (year % 100 != 0 or year % 400 == 0):
                    day = 29
                else:
                    day = 28
    else:
        day = int(day)
        day = max(1, min(31, day))  # Ensure valid day
    
    try:
        return datetime(year, month, day).strftime('%Y-%m-%d')
    except ValueError:
        # Handle invalid dates (e.g., Feb 30) by adjusting day
        try:
            return datetime(year, month, min(day, 28)).strftime('%Y-%m-%d')
        except ValueError:
            return f"{year}-{month:02d}-01"  # Fallback to first of month

def extract_alliance_types(row):
    """
    Extract alliance types from dummy variables and map to numbers
    defense=1, offense=2, neutral=3, nonagg=4, consul=5
    """
    type_mapping = {
        'defense': 1,
        'offense': 2, 
        'neutral': 3,
        'nonagg': 4,
        'consul': 5
    }
    
    alliance_types = []
    for type_name, type_num in type_mapping.items():
        if type_name in row.index and pd.notna(row[type_name]) and row[type_name] == 1:
            alliance_types.append(type_num)
    
    return ";".join([str(t) for t in sorted(alliance_types)])

def extract_members(row):
    """
    Extract member list from mem1, mem2, mem3... columns
    """
    members = []
    
    # Find all mem* columns
    mem_cols = [col for col in row.index if col.startswith('mem') and col[3:].isdigit()]
    mem_cols.sort(key=lambda x: int(x[3:]))  # Sort by number
    
    for col in mem_cols:
        if pd.notna(row[col]):
            # Convert to int if it's a float
            member_val = int(row[col]) if isinstance(row[col], (int, float)) else row[col]
            members.append(str(member_val))
    
    return ";".join(members)

def count_members(row):
    """
    Count number of members from mem* columns
    """
    mem_cols = [col for col in row.index if col.startswith('mem') and col[3:].isdigit()]
    return sum(1 for col in mem_cols if pd.notna(row[col]))

def process_atop_dataset(df):
    """
    Process ATOP dataset to create Master2 dataset
    """
    
    # Check what columns are available
    required_cols = ['atopid', 'begday', 'begmo', 'begyr', 'endday', 'endmo', 'endyr']
    alliance_type_cols = ['defense', 'offense', 'neutral', 'nonagg', 'consul']
    
    available_required = [col for col in required_cols if col in df.columns]
    available_types = [col for col in alliance_type_cols if col in df.columns]
    
    print(f"Available required columns: {available_required}")
    print(f"Available alliance type columns: {available_types}")
    
    # Find mem* columns
    mem_cols = [col for col in df.columns if col.startswith('mem') and col[3:].isdigit()]
    print(f"Found {len(mem_cols)} member columns: {mem_cols[:10]}..." if len(mem_cols) > 10 else f"Member columns: {mem_cols}")
    
    # Check for cowid columns
    cow_cols = [col for col in df.columns if 'cowid' in col.lower()]
    print(f"COW ID columns found: {cow_cols}")
    
    # Create the processed dataset
    processed_data = []
    
    for idx, row in df.iterrows():
        alliance_data = {}
        
        # Alliance ID
        if 'atopid' in df.columns:
            alliance_data['Alliance_ID'] = int(row['atopid']) if pd.notna(row['atopid']) else None
        
        # Alliance Name (generic since not provided)
        if 'atopid' in df.columns and pd.notna(row['atopid']):
            alliance_data['Alliance_Name'] = f"Alliance_{int(row['atopid'])}"
        else:
            alliance_data['Alliance_Name'] = f"Alliance_Unknown_{idx}"
        
        # Create standardized start date
        if all(col in df.columns for col in ['begyr', 'begmo', 'begday']):
            start_date = create_standardized_date(
                row['begyr'], row['begmo'], row['begday'], "start"
            )
            alliance_data['Alliance_Start'] = start_date
        
        # Create standardized end date
        if all(col in df.columns for col in ['endyr', 'endmo', 'endday']):
            end_date = create_standardized_date(
                row['endyr'], row['endmo'], row['endday'], "end"
            )
            alliance_data['Alliance_End'] = end_date
        
        # Alliance types
        alliance_data['Alliance_Type'] = extract_alliance_types(row)
        
        # Number of members
        alliance_data['N_Members'] = count_members(row)
        
        # Members list
        alliance_data['Members_List'] = extract_members(row)
        
        # COW IDs if available
        if 'cowid' in df.columns:
            alliance_data['COWID'] = int(row['cowid']) if pd.notna(row['cowid']) else None
        if 'cow4id' in df.columns:
            alliance_data['COW4ID'] = int(row['cow4id']) if pd.notna(row['cow4id']) else None
        
        processed_data.append(alliance_data)
    
    return pd.DataFrame(processed_data)

# Process the ATOP dataset
print("\n=== Processing ATOP Dataset ===")
atop_processed = process_atop_dataset(atop_df)

# Remove duplicates if any
if 'Alliance_ID' in atop_processed.columns:
    atop_processed = atop_processed.drop_duplicates(subset=['Alliance_ID'])

print(f"\n=== Processed ATOP Dataset ===")
print(f"Shape: {atop_processed.shape}")
print(f"Columns: {list(atop_processed.columns)}")

# Display sample data
print(f"\n=== Sample Data ===")
print(atop_processed.head(10).to_string(index=False))

# Data quality checks
print(f"\n=== Data Quality Summary ===")
for col in atop_processed.columns:
    null_count = atop_processed[col].isnull().sum()
    total_count = len(atop_processed)
    print(f"{col}: {null_count}/{total_count} missing ({null_count/total_count*100:.1f}%)")

# Show alliance type distribution
if 'Alliance_Type' in atop_processed.columns:
    print(f"\n=== Alliance Type Analysis ===")
    # Count single vs multiple types
    single_types = atop_processed[~atop_processed['Alliance_Type'].str.contains(';', na=False)]
    multi_types = atop_processed[atop_processed['Alliance_Type'].str.contains(';', na=False)]
    
    print(f"Single type alliances: {len(single_types)}")
    print(f"Multiple type alliances: {len(multi_types)}")
    
    if len(multi_types) > 0:
        print("Examples of multiple-type alliances:")
        for idx, row in multi_types.head(5).iterrows():
            types = row['Alliance_Type'].split(';')
            type_names = {1: 'defense', 2: 'offense', 3: 'neutral', 4: 'nonagg', 5: 'consul'}
            type_desc = [type_names.get(int(t), f'type_{t}') for t in types]
            print(f"  Alliance {row['Alliance_ID']}: {', '.join(type_desc)}")

# Show member analysis
if 'N_Members' in atop_processed.columns:
    print(f"\n=== Membership Analysis ===")
    member_dist = atop_processed['N_Members'].value_counts().sort_index()
    print(f"Member count distribution (top 10):")
    print(member_dist.head(10))
    
    print(f"\nLargest alliances:")
    largest = atop_processed.nlargest(5, 'N_Members')[['Alliance_ID', 'Alliance_Name', 'N_Members', 'Members_List']]
    for idx, row in largest.iterrows():
        members = row['Members_List'].split(';')[:5]  # Show first 5 members
        more = f" (+{row['N_Members']-5} more)" if row['N_Members'] > 5 else ""
        print(f"  {row['Alliance_Name']}: {row['N_Members']} members - {', '.join(members)}{more}")

# Date range analysis
if 'Alliance_Start' in atop_processed.columns:
    valid_start_dates = atop_processed['Alliance_Start'].dropna()
    if len(valid_start_dates) > 0:
        print(f"\nAlliance start dates range: {valid_start_dates.min()} to {valid_start_dates.max()}")

if 'Alliance_End' in atop_processed.columns:
    valid_end_dates = atop_processed['Alliance_End'].dropna()
    active_alliances = atop_processed['Alliance_End'].isnull().sum()
    print(f"Active alliances (no end date): {active_alliances}")
    if len(valid_end_dates) > 0:
        print(f"Alliance end dates range: {valid_end_dates.min()} to {valid_end_dates.max()}")

# Save processed dataset
atop_processed.to_csv("atop_master2_dataset.csv", index=False)
print(f"\nSaved processed ATOP data to 'atop_master2_dataset.csv'")

# Show parsing examples
print(f"\n=== Machine Readability Examples ===")
sample_row = atop_processed.iloc[0]
print(f"Alliance: {sample_row['Alliance_Name']}")
if sample_row['Alliance_Type']:
    types = sample_row['Alliance_Type'].split(';')
    type_names = {1: 'defense', 2: 'offense', 3: 'neutral', 4: 'nonagg', 5: 'consul'}
    print(f"Types: {[type_names.get(int(t), f'type_{t}') for t in types]}")
if sample_row['Members_List']:
    members = sample_row['Members_List'].split(';')
    print(f"Members: {members[:5]}{'...' if len(members) > 5 else ''}")

# Helper functions for parsing
print(f"\n=== Helper Functions for Data Parsing ===")
print("""
def parse_alliance_types(type_string):
    '''Parse alliance types back to list'''
    if not type_string:
        return []
    return [int(t) for t in type_string.split(';')]

def parse_members_list(members_string):
    '''Parse members list back to list'''
    if not members_string:
        return []
    return [int(m) for m in members_string.split(';')]

def get_alliance_type_names(type_numbers):
    '''Convert type numbers back to names'''
    type_names = {1: 'defense', 2: 'offense', 3: 'neutral', 4: 'nonagg', 5: 'consul'}
    return [type_names.get(t, f'type_{t}') for t in type_numbers]
""")

=== ATOP Dataset Info ===
Shape: (789, 134)
Sample columns: ['atopid', 'cowid', 'cow4id', 'begyr', 'begmo', 'begday', 'endyr', 'endmo', 'endday', 'ineffect', 'bilat', 'maxphase', 'wartime', 'estmode', 'pubsecr', 'secrart', 'proadd', 'futmem', 'speclgth', 'length']...

=== Processing ATOP Dataset ===
Available required columns: ['atopid', 'begday', 'begmo', 'begyr', 'endday', 'endmo', 'endyr']
Available alliance type columns: ['defense', 'offense', 'neutral', 'nonagg', 'consul']
Found 59 member columns: ['mem1', 'mem2', 'mem3', 'mem4', 'mem5', 'mem6', 'mem7', 'mem8', 'mem9', 'mem10']...
COW ID columns found: ['cowid']

=== Processed ATOP Dataset ===
Shape: (789, 9)
Columns: ['Alliance_ID', 'Alliance_Name', 'Alliance_Start', 'Alliance_End', 'Alliance_Type', 'N_Members', 'Members_List', 'COWID', 'COW4ID']

=== Sample Data ===
 Alliance_ID Alliance_Name Alliance_Start Alliance_End Alliance_Type  N_Members                            Members_List  COWID  COW4ID
        1005 Alliance_1005    

In [24]:
import pandas as pd
import numpy as np

def extract_all_atop_members(atop_processed_df):
    """
    Extract all unique member IDs from ATOP Members_List
    """
    all_members = set()
    
    for members_string in atop_processed_df['Members_List'].dropna():
        if members_string:  # Not empty
            members = members_string.split(';')
            for member in members:
                try:
                    all_members.add(int(member))
                except ValueError:
                    print(f"Warning: Could not convert member '{member}' to integer")
    
    return sorted(list(all_members))

def get_icb_cracids(icb_df):
    """
    Get all unique cracid values from ICB dataset
    """
    return sorted(list(icb_df['cracid'].dropna().unique().astype(int)))

def analyze_member_consistency(atop_processed_df, icb_df):
    """
    Analyze consistency between ATOP members and ICB cracids
    """
    print("=== ATOP-ICB Member Consistency Analysis ===")
    
    # Extract unique members from both datasets
    atop_members = extract_all_atop_members(atop_processed_df)
    icb_cracids = get_icb_cracids(icb_df)
    
    print(f"ATOP unique members: {len(atop_members)}")
    print(f"ICB unique cracids: {len(icb_cracids)}")
    
    # Convert to sets for analysis
    atop_set = set(atop_members)
    icb_set = set(icb_cracids)
    
    # Find overlaps and differences
    common_members = atop_set.intersection(icb_set)
    atop_only = atop_set - icb_set
    icb_only = icb_set - atop_set
    
    print(f"\n=== Overlap Analysis ===")
    print(f"Common members (in both datasets): {len(common_members)}")
    print(f"ATOP-only members (missing from ICB): {len(atop_only)}")
    print(f"ICB-only cracids (missing from ATOP): {len(icb_only)}")
    
    # Coverage percentages
    atop_coverage = len(common_members) / len(atop_set) * 100 if atop_set else 0
    icb_coverage = len(common_members) / len(icb_set) * 100 if icb_set else 0
    
    print(f"\n=== Coverage Analysis ===")
    print(f"ATOP members covered by ICB: {atop_coverage:.1f}%")
    print(f"ICB cracids covered by ATOP: {icb_coverage:.1f}%")
    
    # Show problematic cases
    if atop_only:
        print(f"\n=== ATOP Members Missing from ICB (PROBLEM) ===")
        print(f"These {len(atop_only)} countries appear in ATOP but not ICB:")
        atop_only_sorted = sorted(list(atop_only))
        
        # Show first 20, then summary if more
        if len(atop_only_sorted) <= 20:
            print(f"All missing: {atop_only_sorted}")
        else:
            print(f"First 20: {atop_only_sorted[:20]}")
            print(f"... and {len(atop_only_sorted)-20} more")
        
        # Check which alliances are affected
        print(f"\nAlliances containing missing members:")
        affected_alliances = 0
        for idx, row in atop_processed_df.iterrows():
            if row['Members_List']:
                members = [int(m) for m in row['Members_List'].split(';')]
                missing_in_alliance = [m for m in members if m in atop_only]
                if missing_in_alliance:
                    affected_alliances += 1
                    if affected_alliances <= 10:  # Show first 10
                        print(f"  Alliance {row['Alliance_ID']}: missing members {missing_in_alliance}")
        
        if affected_alliances > 10:
            print(f"  ... and {affected_alliances-10} more affected alliances")
        
        print(f"Total affected alliances: {affected_alliances}/{len(atop_processed_df)}")
    
    if icb_only:
        print(f"\n=== ICB Cracids Missing from ATOP (INFO) ===")
        print(f"These {len(icb_only)} countries appear in ICB but not ATOP:")
        icb_only_sorted = sorted(list(icb_only))
        
        if len(icb_only_sorted) <= 20:
            print(f"All missing: {icb_only_sorted}")
        else:
            print(f"First 20: {icb_only_sorted[:20]}")
            print(f"... and {len(icb_only_sorted)-20} more")
    
    # Summary recommendations
    print(f"\n=== Recommendations ===")
    if len(atop_only) == 0:
        print("✓ GOOD: All ATOP members exist in ICB - location mapping will work perfectly")
    else:
        print(f"⚠️  ISSUE: {len(atop_only)} ATOP members missing from ICB")
        print(f"   → These members cannot be mapped to locations using ICB data")
        print(f"   → {affected_alliances} alliances affected")
        print(f"   → Consider: manual mapping, alternative data source, or exclude these members")
    
    if len(icb_only) > 0:
        print(f"ℹ️  INFO: {len(icb_only)} ICB countries not in any alliance (normal)")
    
    # Return analysis results for further processing
    return {
        'atop_members': atop_members,
        'icb_cracids': icb_cracids,
        'common_members': sorted(list(common_members)),
        'atop_only': sorted(list(atop_only)),
        'icb_only': sorted(list(icb_only)),
        'atop_coverage': atop_coverage,
        'icb_coverage': icb_coverage,
        'affected_alliances': affected_alliances if atop_only else 0
    }

def create_member_mapping_report(analysis_results):
    """
    Create a detailed report for member mapping issues
    """
    if not analysis_results['atop_only']:
        print("\n=== No mapping issues found! ===")
        return None
    
    print(f"\n=== Detailed Member Mapping Report ===")
    
    # Create a DataFrame of problematic members
    problematic_df = pd.DataFrame({
        'Missing_Member_ID': analysis_results['atop_only'],
        'In_ATOP': True,
        'In_ICB': False,
        'Location_Mappable': False
    })
    
    print(f"Problematic members summary:")
    print(problematic_df.head(10))
    
    # Save to CSV for manual review
    problematic_df.to_csv("atop_icb_missing_members.csv", index=False)
    print(f"\nSaved detailed report to 'atop_icb_missing_members.csv'")
    
    return problematic_df

# Run the analysis
print("Running ATOP-ICB member consistency check...")

# Ensure we have the processed datasets
if 'atop_processed' not in globals():
    print("Warning: atop_processed not found. Please run the ATOP processing code first.")
if 'icb_df' not in globals():
    print("Warning: icb_df not found. Please load the ICB dataset first.")

# Perform the analysis
analysis_results = analyze_member_consistency(atop_processed, icb_df)

# Create detailed report if there are issues
mapping_report = create_member_mapping_report(analysis_results)

# Additional analysis: Show some statistics
print(f"\n=== Additional Statistics ===")
print(f"ATOP member ID range: {min(analysis_results['atop_members'])} to {max(analysis_results['atop_members'])}")
print(f"ICB cracid range: {min(analysis_results['icb_cracids'])} to {max(analysis_results['icb_cracids'])}")

# Sample of common members
print(f"\nSample common members: {analysis_results['common_members'][:10]}...")

# Check if there's a pattern in missing members
if analysis_results['atop_only']:
    missing_members = analysis_results['atop_only']
    print(f"\nMissing member patterns:")
    print(f"  Smallest missing: {min(missing_members)}")
    print(f"  Largest missing: {max(missing_members)}")
    
    # Check for potential coding differences
    high_missing = [m for m in missing_members if m > 1000]
    if high_missing:
        print(f"  High-numbered missing codes (>1000): {len(high_missing)} - might be different coding system")
        print(f"    Examples: {high_missing[:5]}")

print(f"\n=== Next Steps ===")
if analysis_results['atop_only']:
    print("1. Review 'atop_icb_missing_members.csv' for manual inspection")
    print("2. Consider alternative country code mappings (COW, ISO, etc.)")
    print("3. For location mapping, you may need to:")
    print("   - Exclude alliances with unmappable members")
    print("   - Use alternative geographic data sources")
    print("   - Manually map missing country codes")
else:
    print("1. Proceed with confidence - all ATOP members can be mapped!")
    print("2. Use the cracid_actloc mapping for location assignment")

Running ATOP-ICB member consistency check...
=== ATOP-ICB Member Consistency Analysis ===
ATOP unique members: 208
ICB unique cracids: 147

=== Overlap Analysis ===
Common members (in both datasets): 142
ATOP-only members (missing from ICB): 66
ICB-only cracids (missing from ATOP): 5

=== Coverage Analysis ===
ATOP members covered by ICB: 68.3%
ICB cracids covered by ATOP: 96.6%

=== ATOP Members Missing from ICB (PROBLEM) ===
These 66 countries appear in ATOP but not ICB:
First 20: [31, 51, 52, 53, 54, 56, 57, 58, 60, 80, 115, 140, 165, 205, 221, 223, 232, 240, 245, 267]
... and 46 more

Alliances containing missing members:
  Alliance 1005: missing members [240, 245, 300]
  Alliance 1010: missing members [275, 300]
  Alliance 1015: missing members [300, 329]
  Alliance 1020: missing members [240, 245, 267, 269, 271, 273, 275, 280, 300]
  Alliance 1025: missing members [300, 329]
  Alliance 1030: missing members [300, 337]
  Alliance 1035: missing members [300]
  Alliance 1050: missin

In [25]:
# Map unmapped Caribbean/Central American countries to Actor_Location 42
caribbean_mapping = {
    31: "Bahamas",
    51: "Jamaica", 
    52: "Trinidad and Tobago",
    53: "Barbados",
    54: "Dominica",
    56: "St. Lucia",
    57: "St. Vincent and the Grenadines", 
    58: "Antigua & Barbuda",
    60: "St. Kitts and Nevis",
    80: "Belize"
}

target_actloc = 42  # Central America (including Caribbean countries)

print("=== Mapping Caribbean/Central American Countries ===")
print(f"Target location: {target_actloc} (Central America including Caribbean)")

# Check which of these countries exist in ICB dataset
existing_cracids = set(icb_df['cracid'].dropna().astype(int))
caribbean_cracids = set(caribbean_mapping.keys())

found_cracids = caribbean_cracids.intersection(existing_cracids)
missing_cracids = caribbean_cracids - existing_cracids

print(f"\nCountries found in ICB dataset: {len(found_cracids)}")
for cracid in sorted(found_cracids):
    print(f"  {cracid}: {caribbean_mapping[cracid]}")

if missing_cracids:
    print(f"\nCountries NOT found in ICB dataset: {len(missing_cracids)}")
    for cracid in sorted(missing_cracids):
        print(f"  {cracid}: {caribbean_mapping[cracid]}")

# Check current actloc values for these countries
print(f"\n=== Current actloc values ===")
for cracid in sorted(found_cracids):
    current_actlocs = icb_df[icb_df['cracid'] == cracid]['actloc'].dropna().unique()
    if len(current_actlocs) > 0:
        print(f"  {cracid} ({caribbean_mapping[cracid]}): currently {list(current_actlocs)}")
    else:
        print(f"  {cracid} ({caribbean_mapping[cracid]}): currently NO actloc values")

# Update the ICB dataframe actloc values
print(f"\n=== Updating actloc values ===")
rows_updated = 0

for cracid in found_cracids:
    # Update all rows for this cracid to have actloc = 42
    mask = icb_df['cracid'] == cracid
    rows_for_country = mask.sum()
    
    # Check if country already has actloc = 42
    current_actlocs = icb_df[mask]['actloc'].dropna().unique()
    
    if target_actloc in current_actlocs:
        print(f"  {cracid} ({caribbean_mapping[cracid]}): already has actloc {target_actloc}")
    else:
        icb_df.loc[mask, 'actloc'] = target_actloc
        rows_updated += rows_for_country
        print(f"  {cracid} ({caribbean_mapping[cracid]}): updated {rows_for_country} rows to actloc {target_actloc}")

print(f"\nTotal rows updated: {rows_updated}")

# Update the cracid_actloc mapping dictionary
print(f"\n=== Updating cracid_actloc mapping ===")
if 'cracid_actloc_dict' not in globals():
    print("Creating new cracid_actloc mapping...")
    cracid_actloc_series = (
        icb_df.groupby("cracid")["actloc"]
              .apply(lambda x: x.dropna().unique().tolist())
    )
    cracid_actloc_dict = cracid_actloc_series.to_dict()

# Update the mapping for Caribbean countries
for cracid in found_cracids:
    # Convert actloc values to integers for consistency
    if cracid in cracid_actloc_dict:
        current_actlocs = [int(float(x)) for x in cracid_actloc_dict[cracid] if pd.notna(x)]
        if target_actloc not in current_actlocs:
            current_actlocs.append(target_actloc)
            cracid_actloc_dict[cracid] = sorted(current_actlocs)
            print(f"  Updated mapping for {cracid} ({caribbean_mapping[cracid]}): {cracid_actloc_dict[cracid]}")
        else:
            print(f"  {cracid} ({caribbean_mapping[cracid]}): already mapped to {target_actloc}")
    else:
        cracid_actloc_dict[cracid] = [target_actloc]
        print(f"  Added new mapping for {cracid} ({caribbean_mapping[cracid]}): [{target_actloc}]")

# Verify the updates
print(f"\n=== Verification ===")
for cracid in sorted(found_cracids):
    # Check ICB dataframe
    icb_actlocs = sorted(icb_df[icb_df['cracid'] == cracid]['actloc'].dropna().unique())
    # Check mapping dictionary  
    dict_actlocs = sorted(cracid_actloc_dict.get(cracid, []))
    
    print(f"  {cracid} ({caribbean_mapping[cracid]}):")
    print(f"    ICB dataframe: {icb_actlocs}")
    print(f"    Mapping dict:  {dict_actlocs}")
    
    if target_actloc in icb_actlocs and target_actloc in dict_actlocs:
        print(f"    ✓ Successfully mapped to {target_actloc}")
    else:
        print(f"    ⚠️ Mapping incomplete")

# Update master dataset if it exists
if 'master_df' in globals():
    print(f"\n=== Updating Master Dataset ===")
    print("Regenerating Actor_Locations using updated mapping...")
    
    # Regenerate Actor_Locations column
    updated_actor_locations = []
    for idx, row in master_df.iterrows():
        crisis_actlocs = set()
        actor_list = row['Actor_List'].split(';') if row['Actor_List'] else []
        
        for actor_str in actor_list:
            if actor_str:
                cracid = int(actor_str)
                if cracid in cracid_actloc_dict:
                    for actloc in cracid_actloc_dict[cracid]:
                        crisis_actlocs.add(int(actloc))
        
        actor_locations = ";".join([str(x) for x in sorted(crisis_actlocs)])
        updated_actor_locations.append(actor_locations)
    
    master_df['Actor_Locations'] = updated_actor_locations
    print("✓ Master dataset Actor_Locations updated")

print(f"\n=== Summary ===")
print(f"Successfully mapped {len(found_cracids)} Caribbean/Central American countries to actloc {target_actloc}")
print(f"Countries mapped: {[caribbean_mapping[c] for c in sorted(found_cracids)]}")
if missing_cracids:
    print(f"Countries not found in dataset: {[caribbean_mapping[c] for c in sorted(missing_cracids)]}")
print(f"All mapped countries are now assigned to: Central America (including Caribbean countries)")

=== Mapping Caribbean/Central American Countries ===
Target location: 42 (Central America including Caribbean)

Countries found in ICB dataset: 0

Countries NOT found in ICB dataset: 10
  31: Bahamas
  51: Jamaica
  52: Trinidad and Tobago
  53: Barbados
  54: Dominica
  56: St. Lucia
  57: St. Vincent and the Grenadines
  58: Antigua & Barbuda
  60: St. Kitts and Nevis
  80: Belize

=== Current actloc values ===

=== Updating actloc values ===

Total rows updated: 0

=== Updating cracid_actloc mapping ===

=== Verification ===

=== Updating Master Dataset ===
Regenerating Actor_Locations using updated mapping...
✓ Master dataset Actor_Locations updated

=== Summary ===
Successfully mapped 0 Caribbean/Central American countries to actloc 42
Countries mapped: []
Countries not found in dataset: ['Bahamas', 'Jamaica', 'Trinidad and Tobago', 'Barbados', 'Dominica', 'St. Lucia', 'St. Vincent and the Grenadines', 'Antigua & Barbuda', 'St. Kitts and Nevis', 'Belize']
All mapped countries are 

In [31]:
# Display the existing cracid_actloc mapping dictionary
print("=== Country to Location Dictionary ===")

if 'cracid_actloc_dict' in globals():
    print(f"Total countries mapped: {len(cracid_actloc_dict)}")
    
    # Show the mapping in a readable format
    print(f"\nCountry Code (cracid) → Location Code(s) (actloc):")
    print("-" * 50)
    
    for cracid in sorted(cracid_actloc_dict.keys()):
        locations = cracid_actloc_dict[cracid]
        if len(locations) == 1:
            print(f"  {cracid:3d} → {locations[0]}")
        else:
            print(f"  {cracid:3d} → {locations} (multiple locations)")
    
    # Show statistics
    print(f"\n=== Statistics ===")
    single_location = sum(1 for locs in cracid_actloc_dict.values() if len(locs) == 1)
    multi_location = sum(1 for locs in cracid_actloc_dict.values() if len(locs) > 1)
    
    print(f"Countries with single location: {single_location}")
    print(f"Countries with multiple locations: {multi_location}")
    
    if multi_location > 0:
        print(f"\nCountries with multiple locations:")
        for cracid in sorted(cracid_actloc_dict.keys()):
            locations = cracid_actloc_dict[cracid]
            if len(locations) > 1:
                print(f"  {cracid}: {locations}")
    
    # Show location code distribution
    print(f"\n=== Location Code Distribution ===")
    all_locations = []
    for locations in cracid_actloc_dict.values():
        all_locations.extend(locations)
    
    from collections import Counter
    location_counts = Counter(all_locations)
    
    print("Location codes (frequency):")
    for loc_code in sorted(location_counts.keys()):
        count = location_counts[loc_code]
        print(f"  Location {loc_code}: {count} countries")

else:
    print("⚠️ cracid_actloc_dict not found!")
    print("Creating it from ICB dataset...")
    
    if 'icb_df' in globals():
        # Create the mapping
        cracid_actloc_series = (
            icb_df.groupby("cracid")["actloc"]
                  .apply(lambda x: [int(float(val)) for val in x.dropna().unique() if pd.notna(val)])
        )
        cracid_actloc_dict = cracid_actloc_series.to_dict()
        
        print(f"✓ Created cracid_actloc_dict with {len(cracid_actloc_dict)} countries")
        
        # Now display it
        print(f"\nCountry Code (cracid) → Location Code(s) (actloc):")
        print("-" * 50)
        
        for cracid in sorted(cracid_actloc_dict.keys()):
            locations = cracid_actloc_dict[cracid]
            if len(locations) == 1:
                print(f"  {cracid:3d} → {locations[0]}")
            else:
                print(f"  {cracid:3d} → {locations} (multiple locations)")
    else:
        print("⚠️ ICB dataset not found either! Please load icb_df first.")

# If you want to see specific countries, uncomment and modify:
# print(f"\n=== Specific Country Lookups ===")
# lookup_countries = [2, 20, 200, 365, 630]  # Example country codes
# for cracid in lookup_countries:
#     if cracid in cracid_actloc_dict:
#

=== Country to Location Dictionary ===
Total countries mapped: 147

Country Code (cracid) → Location Code(s) (actloc):
--------------------------------------------------
    2 → 41.0
   20 → 41.0
   40 → 42.0
   41 → 42.0
   42 → 42.0
   55 → 42.0
   70 → 42.0
   90 → 42.0
   91 → 42.0
   92 → 42.0
   93 → 42.0
   94 → 42.0
   95 → 42.0
  100 → 43.0
  101 → 43.0
  110 → 43.0
  130 → 43.0
  135 → 43.0
  145 → 43.0
  150 → 43.0
  155 → 43.0
  160 → 43.0
  200 → 34.0
  210 → 33.0
  211 → 33.0
  212 → 33.0
  219 → 33.0
  220 → 33.0
  225 → 32.0
  230 → 35.0
  235 → 35.0
  255 → 32.0
  260 → 32.0
  265 → 32.0
  290 → 31.0
  305 → 32.0
  310 → 31.0
  315 → 31.0
  325 → 35.0
  338 → 35.0
  339 → 35.0
  344 → 35.0
  345 → 35.0
  346 → 35.0
  347 → 35.0
  349 → 35.0
  350 → 35.0
  352 → 10.0
  355 → 31.0
  360 → 31.0
  365 → 30.0
  366 → 34.0
  367 → 34.0
  368 → 34.0
  369 → 31.0
  370 → 31.0
  371 → 10.0
  372 → 10.0
  373 → 10.0
  375 → 34.0
  380 → 34.0
  385 → 34.0
  390 → 34.0
  395 → 34.

In [27]:
# If you want to see specific countries, uncomment and modify:
print(f"\n=== Specific Country Lookups ===")
lookup_countries = [370]  # Example country codes
for cracid in lookup_countries:
    if cracid in cracid_actloc_dict:
        print(f"Country {cracid}: Location(s) {cracid_actloc_dict[cracid]}")
    else:
        print(f"Country {cracid}: Not found")


=== Specific Country Lookups ===
Country 370: Location(s) []


In [30]:
# Map Belarus (370) to East Europe (31)
belarus_cracid = 370
east_europe_actloc = 31.0

print(f"Before: Belarus (370) → {cracid_actloc_dict.get(370, 'Not found')}")

# Update ICB dataframe
icb_df.loc[icb_df['cracid'] == belarus_cracid, 'actloc'] = east_europe_actloc

# Update mapping dictionary
cracid_actloc_dict[belarus_cracid] = [east_europe_actloc]

print(f"After:  Belarus (370) → {cracid_actloc_dict[370]}")

Before: Belarus (370) → [31]
After:  Belarus (370) → [31.0]


In [32]:
# 1. ATOP: Map alliance members to their locations
def map_member_locations(members_string):
    if not members_string:
        return ""
    
    locations = set()
    for member_str in members_string.split(';'):
        member_id = int(member_str)
        if member_id in cracid_actloc_dict:
            locations.update(cracid_actloc_dict[member_id])
    
    return ";".join([str(loc) for loc in sorted(locations)])

# Apply to ATOP dataset
atop_processed['Member_Locations'] = atop_processed['Members_List'].apply(map_member_locations)

print("=== ATOP Member Locations Sample ===")
sample = atop_processed[['Alliance_ID', 'Members_List', 'Member_Locations']].head(5)
print(sample.to_string(index=False))

# 2. ICB: Add crisis location and crisis actor locations
# Add crisis location from ICB1 if available
if 'icb1_processed' in globals() and 'Geographic_Location' in icb1_processed.columns:
    icb_df = icb_df.merge(
        icb1_processed[['Crisis_ID', 'Geographic_Location']], 
        left_on='crisno', right_on='Crisis_ID', 
        how='left'
    ).drop('Crisis_ID', axis=1)
    icb_df.rename(columns={'Geographic_Location': 'Crisis_Location'}, inplace=True)

# Add crisis actor locations
def get_actor_location(cracid):
    if pd.notna(cracid) and int(cracid) in cracid_actloc_dict:
        locations = cracid_actloc_dict[int(cracid)]
        return ";".join([str(loc) for loc in sorted(locations)])
    return ""

icb_df['Crisis_Actor_Location'] = icb_df['cracid'].apply(get_actor_location)

print("\n=== ICB Crisis and Actor Locations Sample ===")
icb_sample = icb_df[['crisno', 'cracid', 'Crisis_Location', 'Crisis_Actor_Location']].head(5)
print(icb_sample.to_string(index=False))

print(f"\n=== Summary ===")
print(f"ATOP: Added Member_Locations to {len(atop_processed)} alliances")
print(f"ICB: Added Crisis_Actor_Location to {len(icb_df)} rows")
if 'Crisis_Location' in icb_df.columns:
    print(f"ICB: Added Crisis_Location from ICB1 data")

=== ATOP Member Locations Sample ===
 Alliance_ID                            Members_List Member_Locations
        1005                 200;210;220;240;245;300        33.0;34.0
        1010                                 275;300                 
        1015                                 300;329                 
        1020 240;245;255;267;269;271;273;275;280;300             32.0
        1025                                 300;329                 

=== ICB Crisis and Actor Locations Sample ===
 crisno  cracid  Crisis_Location Crisis_Actor_Location
      1     365             30.0                  30.0
      2      93             42.0                  42.0
      2      94             42.0                  42.0
      3     365             30.0                  30.0
      4     365             34.0                  30.0

=== Summary ===
ATOP: Added Member_Locations to 789 alliances
ICB: Added Crisis_Actor_Location to 1131 rows
ICB: Added Crisis_Location from ICB1 data


In [33]:
import pandas as pd
import numpy as np
from datetime import datetime
import itertools

def parse_date_safely(date_str):
    """Parse date string to datetime, handle None/NaN"""
    if pd.isna(date_str) or date_str is None:
        return None
    try:
        return pd.to_datetime(date_str)
    except:
        return None

def check_temporal_overlap(crisis_start, crisis_end, alliance_start, alliance_end):
    """
    Check if alliance was active during crisis: δ_active
    Returns 1 if start(a) ≤ end(c) AND end(a) ≥ start(c)
    """
    # Convert to datetime objects
    c_start = parse_date_safely(crisis_start)
    c_end = parse_date_safely(crisis_end)
    a_start = parse_date_safely(alliance_start)
    a_end = parse_date_safely(alliance_end)
    
    # Handle missing dates
    if c_start is None or a_start is None:
        return 0  # Cannot determine overlap without start dates
    
    # If alliance has no end date, assume it's still active (set to far future)
    if a_end is None:
        a_end = pd.to_datetime('2030-12-31')  # Far future date
    
    # If crisis has no end date, use start date as proxy
    if c_end is None:
        c_end = c_start
    
    # Check overlap: start(a) ≤ end(c) AND end(a) ≥ start(c)
    overlap = (a_start <= c_end) and (a_end >= c_start)
    return 1 if overlap else 0

def check_member_overlap(crisis_actors, alliance_members):
    """
    Check if any crisis actors are alliance members: δ_member
    Returns 1 if actors(c) ∩ members(a) ≠ ∅
    """
    if not crisis_actors or not alliance_members:
        return 0
    
    # Parse semicolon-separated strings to sets of integers
    try:
        crisis_set = set(int(x) for x in crisis_actors.split(';') if x.strip())
        member_set = set(int(x) for x in alliance_members.split(';') if x.strip())
        
        # Check intersection
        intersection = crisis_set.intersection(member_set)
        return 1 if len(intersection) > 0 else 0
    except:
        return 0

def create_crisis_alliance_cartesian_product():
    """
    Create the Cartesian product C × A with computed indicators
    """
    
    print("=== Creating Crisis-Alliance Analysis Set ===")
    
    # Prepare crisis data (C)
    if 'master_df' in globals() and 'icb1_processed' in globals():
        # Merge master crisis data with temporal data
        crises = master_df.merge(
            icb1_processed[['Crisis_ID', 'Start_Date', 'End_Date']], 
            on='Crisis_ID', 
            how='left'
        )
    elif 'icb1_processed' in globals():
        # Use ICB1 data directly
        crises = icb1_processed.copy()
        crises['Actor_List'] = ""  # Empty if no master data
    else:
        print("Error: No crisis data available. Need icb1_processed at minimum.")
        return None, None
    
    # Prepare alliance data (A)
    if 'atop_processed' not in globals():
        print("Error: No alliance data available. Need atop_processed.")
        return None, None
    
    alliances = atop_processed.copy()
    
    print(f"Crises (C): {len(crises)}")
    print(f"Alliances (A): {len(alliances)}")
    print(f"Cartesian product size: {len(crises) * len(alliances):,}")
    
    # Create Cartesian product
    cartesian_data = []
    
    for crisis_idx, crisis in crises.iterrows():
        for alliance_idx, alliance in alliances.iterrows():
            
            # Basic information
            pair_data = {
                # Crisis information
                'Crisis_ID': crisis.get('Crisis_ID'),
                'Crisis_Name': crisis.get('Crisis_Name', ''),
                'Crisis_Start': crisis.get('Start_Date'),
                'Crisis_End': crisis.get('End_Date'),
                'Crisis_Actors': crisis.get('Actor_List', ''),
                'Crisis_Actor_Locations': crisis.get('Actor_Locations', ''),
                
                # Alliance information  
                'Alliance_ID': alliance.get('Alliance_ID'),
                'Alliance_Name': alliance.get('Alliance_Name', ''),
                'Alliance_Start': alliance.get('Alliance_Start'),
                'Alliance_End': alliance.get('Alliance_End'),
                'Alliance_Members': alliance.get('Members_List', ''),
                'Alliance_Type': alliance.get('Alliance_Type', ''),
                'Alliance_N_Members': alliance.get('N_Members', 0),
                'Member_Locations': alliance.get('Member_Locations', ''),
            }
            
            # Compute temporal activation indicator
            delta_active = check_temporal_overlap(
                crisis.get('Start_Date'),
                crisis.get('End_Date'), 
                alliance.get('Alliance_Start'),
                alliance.get('Alliance_End')
            )
            pair_data['delta_active'] = delta_active
            
            # Compute actor-membership overlap indicator
            delta_member = check_member_overlap(
                crisis.get('Actor_List', ''),
                alliance.get('Members_List', '')
            )
            pair_data['delta_member'] = delta_member
            
            # Analysis set indicator (both conditions true)
            pair_data['in_analysis_set'] = 1 if (delta_active == 1 and delta_member == 1) else 0
            
            cartesian_data.append(pair_data)
    
    # Create DataFrame
    full_cartesian_df = pd.DataFrame(cartesian_data)
    
    # Create analysis subset Ω
    analysis_set_df = full_cartesian_df[full_cartesian_df['in_analysis_set'] == 1].copy()
    
    return full_cartesian_df, analysis_set_df

def analyze_crisis_alliance_results(full_df, analysis_df):
    """
    Analyze the results and provide summary statistics
    """
    
    print(f"\n=== Analysis Results ===")
    
    # Basic counts
    total_pairs = len(full_df)
    active_pairs = (full_df['delta_active'] == 1).sum()
    member_pairs = (full_df['delta_member'] == 1).sum()
    analysis_pairs = len(analysis_df)
    
    print(f"Total crisis-alliance pairs (C × A): {total_pairs:,}")
    print(f"Temporally active pairs (δ_active = 1): {active_pairs:,} ({active_pairs/total_pairs*100:.1f}%)")
    print(f"Member overlap pairs (δ_member = 1): {member_pairs:,} ({member_pairs/total_pairs*100:.1f}%)")
    print(f"Analysis set pairs (Ω): {analysis_pairs:,} ({analysis_pairs/total_pairs*100:.1f}%)")
    
    # Overlap analysis
    both_conditions = ((full_df['delta_active'] == 1) & (full_df['delta_member'] == 1)).sum()
    print(f"Pairs meeting both conditions: {both_conditions:,}")
    
    if analysis_pairs > 0:
        print(f"\n=== Analysis Set (Ω) Characteristics ===")
        
        # Unique crises and alliances in analysis set
        unique_crises = analysis_df['Crisis_ID'].nunique()
        unique_alliances = analysis_df['Alliance_ID'].nunique()
        
        print(f"Unique crises involved: {unique_crises}")
        print(f"Unique alliances involved: {unique_alliances}")
        
        # Most frequent crises and alliances
        print(f"\nTop 5 most frequent crises in analysis set:")
        crisis_counts = analysis_df['Crisis_ID'].value_counts().head(5)
        for crisis_id, count in crisis_counts.items():
            crisis_name = analysis_df[analysis_df['Crisis_ID'] == crisis_id]['Crisis_Name'].iloc[0]
            print(f"  Crisis {crisis_id} ({crisis_name}): {count} alliance pairs")
        
        print(f"\nTop 5 most frequent alliances in analysis set:")
        alliance_counts = analysis_df['Alliance_ID'].value_counts().head(5)
        for alliance_id, count in alliance_counts.items():
            alliance_name = analysis_df[analysis_df['Alliance_ID'] == alliance_id]['Alliance_Name'].iloc[0]
            print(f"  Alliance {alliance_id} ({alliance_name}): {count} crisis pairs")
        
        # Sample analysis set entries
        print(f"\n=== Sample Analysis Set Entries ===")
        sample_cols = ['Crisis_ID', 'Crisis_Name', 'Alliance_ID', 'Alliance_Name', 
                      'Crisis_Start', 'Alliance_Start', 'Alliance_End']
        available_cols = [col for col in sample_cols if col in analysis_df.columns]
        print(analysis_df[available_cols].head(10).to_string(index=False))

# Execute the analysis
print("Creating crisis-alliance analysis sets...")

full_cartesian, analysis_subset = create_crisis_alliance_cartesian_product()

if full_cartesian is not None and analysis_subset is not None:
    
    # Analyze results
    analyze_crisis_alliance_results(full_cartesian, analysis_subset)
    
    # Save datasets
    print(f"\n=== Saving Datasets ===")
    
    # Save full Cartesian product
    full_cartesian.to_csv("crisis_alliance_full_cartesian.csv", index=False)
    print(f"✓ Full Cartesian product saved: {len(full_cartesian):,} rows")
    
    # Save analysis subset
    analysis_subset.to_csv("crisis_alliance_analysis_set.csv", index=False)
    print(f"✓ Analysis set (Ω) saved: {len(analysis_subset):,} rows")
    
    # Additional analysis files
    
    # Summary by crisis
    if len(analysis_subset) > 0:
        crisis_summary = (analysis_subset.groupby(['Crisis_ID', 'Crisis_Name'])
                         .agg({
                             'Alliance_ID': 'count',
                             'Alliance_N_Members': 'sum',
                             'Crisis_Start': 'first',
                             'Crisis_Actors': 'first'
                         })
                         .rename(columns={'Alliance_ID': 'N_Relevant_Alliances'})
                         .reset_index())
        
        crisis_summary.to_csv("crisis_alliance_summary_by_crisis.csv", index=False)
        print(f"✓ Crisis summary saved: {len(crisis_summary)} crises")
        
        # Summary by alliance
        alliance_summary = (analysis_subset.groupby(['Alliance_ID', 'Alliance_Name'])
                           .agg({
                               'Crisis_ID': 'count',
                               'Alliance_Start': 'first',
                               'Alliance_End': 'first',
                               'Alliance_N_Members': 'first',
                               'Alliance_Type': 'first'
                           })
                           .rename(columns={'Crisis_ID': 'N_Relevant_Crises'})
                           .reset_index())
        
        alliance_summary.to_csv("crisis_alliance_summary_by_alliance.csv", index=False)
        print(f"✓ Alliance summary saved: {len(alliance_summary)} alliances")
    
    print(f"\n=== Final Summary ===")
    print(f"Created full Cartesian product C × A with {len(full_cartesian):,} pairs")
    print(f"Identified analysis subset Ω with {len(analysis_subset):,} pairs")
    print(f"Analysis subset represents {len(analysis_subset)/len(full_cartesian)*100:.3f}% of all possible pairs")
    
    # Make datasets available in global scope
    globals()['full_cartesian_df'] = full_cartesian
    globals()['analysis_set_df'] = analysis_subset
    
else:
    print("Failed to create analysis sets. Check data availability.")

print(f"\n=== Usage Notes ===")
print("• full_cartesian_df: Complete C × A with all indicators")
print("• analysis_set_df: Filtered subset Ω for analysis")
print("• Use analysis_set_df for statistical modeling")
print("• Use full_cartesian_df for exploring non-matches")

Creating crisis-alliance analysis sets...
=== Creating Crisis-Alliance Analysis Set ===
Crises (C): 512
Alliances (A): 789
Cartesian product size: 403,968

=== Analysis Results ===
Total crisis-alliance pairs (C × A): 403,968
Temporally active pairs (δ_active = 1): 0 (0.0%)
Member overlap pairs (δ_member = 1): 27,842 (6.9%)
Analysis set pairs (Ω): 0 (0.0%)
Pairs meeting both conditions: 0

=== Saving Datasets ===
✓ Full Cartesian product saved: 403,968 rows
✓ Analysis set (Ω) saved: 0 rows

=== Final Summary ===
Created full Cartesian product C × A with 403,968 pairs
Identified analysis subset Ω with 0 pairs
Analysis subset represents 0.000% of all possible pairs

=== Usage Notes ===
• full_cartesian_df: Complete C × A with all indicators
• analysis_set_df: Filtered subset Ω for analysis
• Use analysis_set_df for statistical modeling
• Use full_cartesian_df for exploring non-matches


In [None]:
# Fixed debugging - handle problematic dates
print("=== Fixed Date Range Analysis ===")

# Handle alliance dates with errors
if 'atop_processed' in globals():
    alliance_dates = atop_processed[['Alliance_Start', 'Alliance_End']].copy()
    alliance_dates['Alliance_Start'] = pd.to_datetime(alliance_dates['Alliance_Start'], errors='coerce')
    
    # Handle problematic end dates 
    alliance_dates['Alliance_End_Clean'] = alliance_dates['Alliance_End'].replace('0-01-01', pd.NaT)
    alliance_dates['Alliance_End_Clean'] = pd.to_datetime(alliance_dates['Alliance_End_Clean'], errors='coerce')
    
    alliance_start_range = alliance_dates['Alliance_Start'].dropna()
    alliance_end_range = alliance_dates['Alliance_End_Clean'].dropna()
    
    print(f"Alliance date range:")
    print(f"  Earliest start: {alliance_start_range.min()}")
    print(f"  Latest start: {alliance_start_range.max()}")
    print(f"  Earliest end: {alliance_end_range.min()}")
    print(f"  Latest end: {alliance_end_range.max()}")
    print(f"  Active alliances (no end): {alliance_dates['Alliance_End_Clean'].isna().sum()}")
    
    # Show era breakdown
    print(f"\n=== Alliance Era Breakdown ===")
    alliance_dates['Start_Year'] = alliance_dates['Alliance_Start'].dt.year
    eras = [
        (1815, 1900, "19th Century"),
        (1900, 1945, "Early 20th Century"), 
        (1945, 1990, "Cold War"),
        (1990, 2025, "Post-Cold War")
    ]
    
    for start_yr, end_yr, era_name in eras:
        count = ((alliance_dates['Start_Year'] >= start_yr) & 
                (alliance_dates['Start_Year'] < end_yr)).sum()
        print(f"  {era_name} ({start_yr}-{end_yr}): {count} alliances")

# Find alliances that could potentially overlap with crises
print(f"\n=== Potential Overlap Analysis ===")

# Get crisis era
if 'icb1_processed' in globals():
    crisis_dates = pd.to_datetime(icb1_processed['Start_Date'])
    crisis_start_year = crisis_dates.dt.year.min()
    crisis_end_year = crisis_dates.dt.year.max()
    
    print(f"Crisis era: {crisis_start_year}-{crisis_end_year}")
    
    # Find alliances active during crisis era
    if 'atop_processed' in globals():
        # Alliance is potentially relevant if:
        # alliance_start <= crisis_end AND (alliance_end >= crisis_start OR alliance_end is null)
        
        alliance_start_years = pd.to_datetime(alliance_dates['Alliance_Start']).dt.year
        alliance_end_years = pd.to_datetime(alliance_dates['Alliance_End_Clean']).dt.year
        
        # Alliances that started before crisis era ended
        started_before_crisis_end = alliance_start_years <= crisis_end_year
        
        # Alliances that ended after crisis era started (or are still active)
        ended_after_crisis_start = (alliance_end_years >= crisis_start_year) | (alliance_end_years.isna())
        
        potentially_relevant = started_before_crisis_end & ended_after_crisis_start
        relevant_count = potentially_relevant.sum()
        
        print(f"Alliances potentially overlapping with crisis era: {relevant_count}/{len(alliance_dates)}")
        
        if relevant_count > 0:
            print(f"\nSample potentially relevant alliances:")
            relevant_alliances = atop_processed[potentially_relevant][['Alliance_ID', 'Alliance_Start', 'Alliance_End']].head(5)
            print(relevant_alliances.to_string(index=False))

# Test with a relevant alliance
print(f"\n=== Testing with Relevant Alliance ===")
if relevant_count > 0:
    # Get a recent alliance and recent crisis
    recent_alliance = atop_processed[potentially_relevant].iloc[0]
    recent_crisis = icb1_processed.iloc[-1]  # Last crisis
    
    print(f"Testing: Crisis {recent_crisis['Crisis_ID']} ({recent_crisis['Start_Date']}-{recent_crisis['End_Date']})")
    print(f"vs Alliance {recent_alliance['Alliance_ID']} ({recent_alliance['Alliance_Start']}-{recent_alliance['Alliance_End']})")
    
    # Apply temporal logic
    c_start = pd.to_datetime(recent_crisis['Start_Date'])
    c_end = pd.to_datetime(recent_crisis['End_Date']) if pd.notna(recent_crisis['End_Date']) else c_start
    a_start = pd.to_datetime(recent_alliance['Alliance_Start'])
    a_end = pd.to_datetime(recent_alliance['Alliance_End']) if pd.notna(recent_alliance['Alliance_End']) else pd.to_datetime('2030-12-31')
    
    cond1 = a_start <= c_end
    cond2 = a_end >= c_start
    overlap = cond1 and cond2
    
    print(f"Temporal overlap result: {overlap}")
    print(f"  Condition 1: {a_start} ≤ {c_end} = {cond1}")
    print(f"  Condition 2: {a_end} ≥ {c_start} = {cond2}")

print(f"\n=== Conclusion ===")
print("The temporal logic is CORRECT. The issue is:")
print("1. Most ATOP alliances are from 1815-1900s")
print("2. ICB crises are from 1918-2022")
print("3. Many early alliances naturally don't overlap with modern crises")
print("4. Some alliance end dates have invalid '0-01-01' format")
print(f"5. Only {relevant_count if 'relevant_count' in locals() else 'some'} alliances potentially overlap with crisis era")

=== Debugging Temporal Overlap Logic ===

1. Sample Crisis Dates:
   Crisis_ID  Start_Date    End_Date
0          1  1918-05-01  1920-04-01
1          2  1918-05-25  1919-09-03
2          3  1918-06-23  1919-09-27
3          4  1918-11-18  1920-08-11
4          5  1919-01-15  1920-07-28

Crisis date types:
Start_Date: object
End_Date: object

2. Sample Alliance Dates:
   Alliance_ID Alliance_Start Alliance_End
0         1005     1815-01-03   1815-02-08
1         1010     1815-01-14   1815-06-08
2         1015     1815-04-29   1815-06-12
3         1020     1815-06-08   1866-06-15
4         1025     1815-06-12   1820-07-13

Alliance date types:
Alliance_Start: object
Alliance_End: object

--- Testing: Crisis 1 vs Alliance 1005 ---
Crisis: 1918-05-01 → 1920-04-01
Alliance: 1815-01-03 → 1815-02-08
Parsed crisis: 1918-05-01 00:00:00 → 1920-04-01 00:00:00
Parsed alliance: 1815-01-03 00:00:00 → 1815-02-08 00:00:00
Condition 1 (start(a) ≤ end(c)): 1815-01-03 00:00:00 ≤ 1920-04-01 00:00:00 = Tr

ValueError: time data "0-01-01" doesn't match format "%Y-%m-%d", at position 98. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.

In [35]:
import pandas as pd
import numpy as np

def investigate_zero_overlaps():
    """
    Investigate why there are NO temporal overlaps
    """
    print("=== INVESTIGATING ZERO TEMPORAL OVERLAPS ===")
    
    # Get date ranges
    crisis_dates = icb1_processed[['Crisis_ID', 'Start_Date', 'End_Date']].copy()
    crisis_dates['Start_Date'] = pd.to_datetime(crisis_dates['Start_Date'], errors='coerce')
    crisis_dates['End_Date'] = pd.to_datetime(crisis_dates['End_Date'], errors='coerce')
    
    # Handle alliance dates carefully
    alliance_dates = atop_processed[['Alliance_ID', 'Alliance_Start', 'Alliance_End']].copy()
    alliance_dates['Alliance_Start'] = pd.to_datetime(alliance_dates['Alliance_Start'], errors='coerce')
    
    # Fix problematic alliance end dates
    alliance_dates['Alliance_End_Fixed'] = alliance_dates['Alliance_End'].replace('0-01-01', pd.NaT)
    alliance_dates['Alliance_End_Fixed'] = pd.to_datetime(alliance_dates['Alliance_End_Fixed'], errors='coerce')
    
    print(f"Crisis date range:")
    print(f"  Start: {crisis_dates['Start_Date'].min()} to {crisis_dates['Start_Date'].max()}")
    print(f"  End: {crisis_dates['End_Date'].min()} to {crisis_dates['End_Date'].max()}")
    
    print(f"\nAlliance date range:")
    print(f"  Start: {alliance_dates['Alliance_Start'].min()} to {alliance_dates['Alliance_Start'].max()}")
    print(f"  End: {alliance_dates['Alliance_End_Fixed'].min()} to {alliance_dates['Alliance_End_Fixed'].max()}")
    print(f"  Active (no end): {alliance_dates['Alliance_End_Fixed'].isna().sum()}/{len(alliance_dates)}")
    
    # Check overlaps manually
    crisis_era_start = crisis_dates['Start_Date'].min()
    crisis_era_end = crisis_dates['End_Date'].max()
    
    print(f"\nOverlap analysis:")
    print(f"Crisis era: {crisis_era_start} to {crisis_era_end}")
    
    # Alliances that could overlap
    alliance_overlaps = (
        (alliance_dates['Alliance_Start'] <= crisis_era_end) &
        ((alliance_dates['Alliance_End_Fixed'] >= crisis_era_start) | 
         (alliance_dates['Alliance_End_Fixed'].isna()))
    )
    
    overlapping_alliances = alliance_overlaps.sum()
    print(f"Alliances potentially overlapping crisis era: {overlapping_alliances}/{len(alliance_dates)}")
    
    if overlapping_alliances > 0:
        print("\nSample overlapping alliances:")
        samples = alliance_dates[alliance_overlaps][['Alliance_ID', 'Alliance_Start', 'Alliance_End_Fixed']].head(10)
        print(samples.to_string(index=False))
    
    return crisis_dates, alliance_dates

def create_flag_variables():
    """
    Create the two flag variables with correct logic
    """
    print("\n=== CREATING FLAG VARIABLES ===")
    
    # Get clean date data
    crisis_dates, alliance_dates = investigate_zero_overlaps()
    
    # Prepare data for Cartesian product
    results = []
    
    print(f"\nProcessing {len(crisis_dates)} crises × {len(alliance_dates)} alliances...")
    
    for crisis_idx, crisis in crisis_dates.iterrows():
        crisis_id = crisis['Crisis_ID']
        c_start = crisis['Start_Date']
        c_end = crisis['End_Date']
        
        # Get crisis actors
        crisis_actors = set()
        if crisis_id in master_df['Crisis_ID'].values:
            crisis_row = master_df[master_df['Crisis_ID'] == crisis_id].iloc[0]
            if pd.notna(crisis_row['Actor_List']) and crisis_row['Actor_List']:
                crisis_actors = set(int(x) for x in crisis_row['Actor_List'].split(';'))
        
        for alliance_idx, alliance in alliance_dates.iterrows():
            alliance_id = alliance['Alliance_ID']
            a_start = alliance['Alliance_Start']
            a_end = alliance['Alliance_End_Fixed']
            
            # Get alliance members
            alliance_members = set()
            alliance_row = atop_processed[atop_processed['Alliance_ID'] == alliance_id].iloc[0]
            if pd.notna(alliance_row['Members_List']) and alliance_row['Members_List']:
                alliance_members = set(int(x) for x in alliance_row['Members_List'].split(';'))
            
            # FLAG 1: alliance_active_during_crisis
            alliance_active_during_crisis = 0
            if pd.notna(c_start) and pd.notna(c_end) and pd.notna(a_start):
                # Handle missing alliance end date (still active)
                if pd.isna(a_end):
                    a_end_check = pd.Timestamp('2030-12-31')  # Far future
                else:
                    a_end_check = a_end
                
                # Logic: Alliance start ≤ crisis end AND Alliance end ≥ crisis start
                if (a_start <= c_end) and (a_end_check >= c_start):
                    alliance_active_during_crisis = 1
            
            # FLAG 2: alliance_member_is_crisis_actor
            alliance_member_is_crisis_actor = 0
            if crisis_actors and alliance_members:
                # Check if any alliance member is a crisis actor
                if len(crisis_actors.intersection(alliance_members)) > 0:
                    alliance_member_is_crisis_actor = 1
            
            results.append({
                'Crisis_ID': crisis_id,
                'Alliance_ID': alliance_id,
                'Crisis_Start': c_start,
                'Crisis_End': c_end,
                'Alliance_Start': a_start,
                'Alliance_End': a_end,
                'alliance_active_during_crisis': alliance_active_during_crisis,
                'alliance_member_is_crisis_actor': alliance_member_is_crisis_actor,
                'crisis_actors': ';'.join([str(x) for x in sorted(crisis_actors)]),
                'alliance_members': ';'.join([str(x) for x in sorted(alliance_members)])
            })
    
    return pd.DataFrame(results)

# Execute the analysis
crisis_alliance_flags = create_flag_variables()

# Analyze results
print(f"\n=== FLAG VARIABLE RESULTS ===")
print(f"Total crisis-alliance pairs: {len(crisis_alliance_flags):,}")

flag1_count = crisis_alliance_flags['alliance_active_during_crisis'].sum()
flag2_count = crisis_alliance_flags['alliance_member_is_crisis_actor'].sum()
both_flags = ((crisis_alliance_flags['alliance_active_during_crisis'] == 1) & 
              (crisis_alliance_flags['alliance_member_is_crisis_actor'] == 1)).sum()

print(f"Flag 1 (alliance_active_during_crisis = 1): {flag1_count:,} ({flag1_count/len(crisis_alliance_flags)*100:.2f}%)")
print(f"Flag 2 (alliance_member_is_crisis_actor = 1): {flag2_count:,} ({flag2_count/len(crisis_alliance_flags)*100:.2f}%)")
print(f"Both flags = 1 (analysis set): {both_flags:,} ({both_flags/len(crisis_alliance_flags)*100:.3f}%)")

# Show examples of each flag
if flag1_count > 0:
    print(f"\n=== Sample: Alliance Active During Crisis ===")
    temporal_examples = crisis_alliance_flags[crisis_alliance_flags['alliance_active_during_crisis'] == 1].head(5)
    display_cols = ['Crisis_ID', 'Alliance_ID', 'Crisis_Start', 'Crisis_End', 'Alliance_Start', 'Alliance_End']
    print(temporal_examples[display_cols].to_string(index=False))

if flag2_count > 0:
    print(f"\n=== Sample: Alliance Member is Crisis Actor ===")
    member_examples = crisis_alliance_flags[crisis_alliance_flags['alliance_member_is_crisis_actor'] == 1].head(5)
    display_cols = ['Crisis_ID', 'Alliance_ID', 'crisis_actors', 'alliance_members']
    print(member_examples[display_cols].to_string(index=False))

if both_flags > 0:
    print(f"\n=== Sample: Both Conditions Met (Analysis Set) ===")
    analysis_examples = crisis_alliance_flags[
        (crisis_alliance_flags['alliance_active_during_crisis'] == 1) & 
        (crisis_alliance_flags['alliance_member_is_crisis_actor'] == 1)
    ].head(5)
    display_cols = ['Crisis_ID', 'Alliance_ID', 'Crisis_Start', 'Alliance_Start', 'Alliance_End']
    print(analysis_examples[display_cols].to_string(index=False))

# Diagnose zero overlaps
if flag1_count == 0:
    print(f"\n=== DIAGNOSING ZERO TEMPORAL OVERLAPS ===")
    
    # Check specific examples
    print("Testing manual examples:")
    
    # Get a recent crisis and recent alliance
    recent_crisis = crisis_alliance_flags.iloc[0]
    print(f"Example crisis: {recent_crisis['Crisis_ID']} ({recent_crisis['Crisis_Start']} - {recent_crisis['Crisis_End']})")
    print(f"Example alliance: {recent_crisis['Alliance_ID']} ({recent_crisis['Alliance_Start']} - {recent_crisis['Alliance_End']})")
    
    # Check the conditions step by step
    c_start = pd.to_datetime(recent_crisis['Crisis_Start'])
    c_end = pd.to_datetime(recent_crisis['Crisis_End'])
    a_start = pd.to_datetime(recent_crisis['Alliance_Start'])
    a_end = pd.to_datetime(recent_crisis['Alliance_End']) if pd.notna(recent_crisis['Alliance_End']) else pd.Timestamp('2030-12-31')
    
    print(f"\nStep-by-step check:")
    print(f"Alliance start ≤ Crisis end: {a_start} ≤ {c_end} = {a_start <= c_end}")
    print(f"Alliance end ≥ Crisis start: {a_end} ≥ {c_start} = {a_end >= c_start}")
    print(f"Both conditions: {(a_start <= c_end) and (a_end >= c_start)}")

# Save results
crisis_alliance_flags.to_csv("crisis_alliance_flags.csv", index=False)
print(f"\n✓ Saved flag variables to 'crisis_alliance_flags.csv'")

# Create analysis subset
analysis_set = crisis_alliance_flags[
    (crisis_alliance_flags['alliance_active_during_crisis'] == 1) & 
    (crisis_alliance_flags['alliance_member_is_crisis_actor'] == 1)
].copy()

analysis_set.to_csv("crisis_alliance_analysis_set_corrected.csv", index=False)
print(f"✓ Saved analysis set ({len(analysis_set)} rows) to 'crisis_alliance_analysis_set_corrected.csv'")

print(f"\n=== SUMMARY ===")
if both_flags > 0:
    print(f"SUCCESS: Found {both_flags} crisis-alliance pairs meeting both conditions")
    print(f"These represent meaningful crisis-alliance interactions for analysis")
else:
    print(f"ISSUE: No pairs meet both conditions simultaneously")
    print(f"This suggests either data quality issues or truly no relevant interactions")
    
globals()['crisis_alliance_flags_df'] = crisis_alliance_flags
globals()['analysis_set_corrected_df'] = analysis_set


=== CREATING FLAG VARIABLES ===
=== INVESTIGATING ZERO TEMPORAL OVERLAPS ===
Crisis date range:
  Start: 1918-05-01 00:00:00 to 2021-09-20 00:00:00
  End: 1919-07-29 00:00:00 to 2022-12-01 00:00:00

Alliance date range:
  Start: 1815-01-03 00:00:00 to 2018-09-28 00:00:00
  End: 1815-02-08 00:00:00 to 2018-06-17 00:00:00
  Active (no end): 372/789

Overlap analysis:
Crisis era: 1918-05-01 00:00:00 to 2022-12-01 00:00:00
Alliances potentially overlapping crisis era: 692/789

Sample overlapping alliances:
 Alliance_ID Alliance_Start Alliance_End_Fixed
        1335     1879-10-07         1918-11-03
        1350     1882-05-20         1918-11-03
        1355     1883-10-30         1918-11-03
        1400     1899-10-14         1949-04-04
        1415     1902-01-30         1921-12-13
        1420     1902-06-30         1918-11-11
        1467     1911-04-17                NaT
        1485     1912-11-23         1918-11-11
        1490     1913-05-19         1918-11-11
        2005     1914

In [37]:
# Debug and fix the column names
print("=== Checking Available Columns ===")

if 'master_df' in globals():
    print(f"master_df columns: {list(master_df.columns)}")
    print(f"master_df shape: {master_df.shape}")
else:
    print("master_df not found")

if 'icb1_processed' in globals():
    print(f"icb1_processed columns: {list(icb1_processed.columns)}")
    print(f"icb1_processed shape: {icb1_processed.shape}")
else:
    print("icb1_processed not found")

if 'atop_processed' in globals():
    print(f"atop_processed columns: {list(atop_processed.columns)}")
    print(f"atop_processed shape: {atop_processed.shape}")
else:
    print("atop_processed not found")

=== Checking Available Columns ===
master_df columns: ['Crisis_ID', 'Crisis_Name_x', 'Actor_List', 'Actor_Locations', 'Start_Date', 'End_Date', 'Geographic_Location', 'Crisis_Name_y']
master_df shape: (512, 8)
icb1_processed columns: ['Crisis_ID', 'Start_Date', 'End_Date', 'Geographic_Location', 'Crisis_Name']
icb1_processed shape: (512, 5)
atop_processed columns: ['Alliance_ID', 'Alliance_Name', 'Alliance_Start', 'Alliance_End', 'Alliance_Type', 'N_Members', 'Members_List', 'COWID', 'COW4ID', 'Member_Locations']
atop_processed shape: (789, 10)


In [38]:
def create_crisis_alliance_flags_simple():
    """Create the flag variables dataset - simplified version"""
    
    # Use master_df directly - it already has dates and actors
    crises = master_df.copy()
    alliances = atop_processed.copy()
    
    print(f"Creating flags for {len(crises)} crises × {len(alliances)} alliances...")
    
    results = []
    
    for crisis_idx, crisis in crises.iterrows():
        crisis_id = crisis['Crisis_ID']
        
        # Get dates (already in master_df)
        c_start = pd.to_datetime(crisis['Start_Date'], errors='coerce')
        c_end = pd.to_datetime(crisis['End_Date'], errors='coerce')
        
        # Get crisis actors (already in master_df)
        crisis_actors = set()
        if pd.notna(crisis['Actor_List']) and crisis['Actor_List']:
            try:
                crisis_actors = set(int(x) for x in crisis['Actor_List'].split(';') if x.strip())
            except:
                pass
        
        for alliance_idx, alliance in alliances.iterrows():
            alliance_id = alliance['Alliance_ID']
            
            # Get alliance dates
            a_start = pd.to_datetime(alliance['Alliance_Start'], errors='coerce')
            
            # Handle alliance end date
            if pd.notna(alliance['Alliance_End']) and str(alliance['Alliance_End']) not in ['0-01-01', '']:
                a_end = pd.to_datetime(alliance['Alliance_End'], errors='coerce')
            else:
                a_end = pd.Timestamp('2030-12-31')  # Active alliance
            
            # Get alliance members
            alliance_members = set()
            if pd.notna(alliance['Members_List']) and alliance['Members_List']:
                try:
                    alliance_members = set(int(x) for x in alliance['Members_List'].split(';') if x.strip())
                except:
                    pass
            
            # FLAG 1: alliance_active_during_crisis
            alliance_active_during_crisis = 0
            if pd.notna(c_start) and pd.notna(c_end) and pd.notna(a_start) and pd.notna(a_end):
                if (a_start <= c_end) and (a_end >= c_start):
                    alliance_active_during_crisis = 1
            
            # FLAG 2: alliance_member_is_crisis_actor  
            alliance_member_is_crisis_actor = 0
            if crisis_actors and alliance_members:
                if len(crisis_actors.intersection(alliance_members)) > 0:
                    alliance_member_is_crisis_actor = 1
            
            results.append({
                'Crisis_ID': crisis_id,
                'Alliance_ID': alliance_id,
                'Crisis_Start': c_start,
                'Crisis_End': c_end,
                'Alliance_Start': a_start,
                'Alliance_End': a_end,
                'alliance_active_during_crisis': alliance_active_during_crisis,
                'alliance_member_is_crisis_actor': alliance_member_is_crisis_actor,
                'crisis_actors': ';'.join([str(x) for x in sorted(crisis_actors)]),
                'alliance_members': ';'.join([str(x) for x in sorted(alliance_members)])
            })
    
    return pd.DataFrame(results)

# Create the flags dataset
print("=== Creating Crisis-Alliance Flags ===")
crisis_alliance_flags_df = create_crisis_alliance_flags_simple()

# Quick summary
flag1_count = crisis_alliance_flags_df['alliance_active_during_crisis'].sum()
flag2_count = crisis_alliance_flags_df['alliance_member_is_crisis_actor'].sum()
both_flags = ((crisis_alliance_flags_df['alliance_active_during_crisis'] == 1) & 
              (crisis_alliance_flags_df['alliance_member_is_crisis_actor'] == 1)).sum()

print(f"✓ Created {len(crisis_alliance_flags_df):,} crisis-alliance pairs")
print(f"  Active during crisis: {flag1_count:,}")
print(f"  Member is actor: {flag2_count:,}")
print(f"  Both conditions: {both_flags:,}")

# Show sample with actual dates to verify logic
print(f"\nSample pairs with temporal overlap:")
temporal_examples = crisis_alliance_flags_df[crisis_alliance_flags_df['alliance_active_during_crisis'] == 1]
if len(temporal_examples) > 0:
    sample_cols = ['Crisis_ID', 'Alliance_ID', 'Crisis_Start', 'Crisis_End', 'Alliance_Start', 'Alliance_End']
    print(temporal_examples[sample_cols].head().to_string(index=False))
else:
    print("No temporal overlaps found!")

print(f"\nSample pairs with member overlap:")
member_examples = crisis_alliance_flags_df[crisis_alliance_flags_df['alliance_member_is_crisis_actor'] == 1]
if len(member_examples) > 0:
    sample_cols = ['Crisis_ID', 'Alliance_ID', 'crisis_actors', 'alliance_members']
    print(member_examples[sample_cols].head().to_string(index=False))
else:
    print("No member overlaps found!")

# Make available globally
globals()['crisis_alliance_flags_df'] = crisis_alliance_flags_df

print("\n✓ crisis_alliance_flags_df is ready! Now you can run the formatting code.")

=== Creating Crisis-Alliance Flags ===
Creating flags for 512 crises × 789 alliances...
✓ Created 403,968 crisis-alliance pairs
  Active during crisis: 72,531
  Member is actor: 27,842
  Both conditions: 5,100

Sample pairs with temporal overlap:
 Crisis_ID  Alliance_ID Crisis_Start Crisis_End Alliance_Start Alliance_End
         1         1335   1918-05-01 1920-04-01     1879-10-07   1918-11-03
         1         1350   1918-05-01 1920-04-01     1882-05-20   1918-11-03
         1         1355   1918-05-01 1920-04-01     1883-10-30   1918-11-03
         1         1400   1918-05-01 1920-04-01     1899-10-14   1949-04-04
         1         1415   1918-05-01 1920-04-01     1902-01-30   1921-12-13

Sample pairs with member overlap:
 Crisis_ID  Alliance_ID crisis_actors alliance_members
         1         1035           365  200;255;300;365
         1         1045           365      200;220;365
         1         1065           365      255;300;365
         1         1075           365     

In [39]:
# Simple backup save function
def save_datasets_simple():
    """Simple backup function to save datasets if main formatting fails"""
    
    if 'crisis_alliance_flags_df' not in globals():
        print("ERROR: crisis_alliance_flags_df not found")
        return
    
    # Get the full dataset
    full_df = crisis_alliance_flags_df.copy()
    
    # Create analysis subset (both flags = 1)
    analysis_df = full_df[
        (full_df['alliance_active_during_crisis'] == 1) & 
        (full_df['alliance_member_is_crisis_actor'] == 1)
    ].copy()
    
    # Rename columns to match requirements
    full_df = full_df.rename(columns={
        'alliance_active_during_crisis': 'Active_During_Crisis',
        'alliance_member_is_crisis_actor': 'Member_Is_Actor',
        'crisis_actors': 'Actors_List',
        'alliance_members': 'Members_List'
    })
    
    analysis_df = analysis_df.rename(columns={
        'alliance_active_during_crisis': 'Active_During_Crisis', 
        'alliance_member_is_crisis_actor': 'Member_Is_Actor',
        'crisis_actors': 'Actors_List',
        'alliance_members': 'Members_List'
    })
    
    # Convert dates to strings for JSON compatibility
    date_columns = ['Crisis_Start', 'Crisis_End', 'Alliance_Start', 'Alliance_End']
    for col in date_columns:
        if col in full_df.columns:
            full_df[col] = full_df[col].astype(str)
            analysis_df[col] = analysis_df[col].astype(str)
    
    # Save full dataset
    full_df.to_csv("ICB_ATOP_full_20250702.csv", index=False)
    full_df.to_json("ICB_ATOP_full_20250702.json", orient='records', indent=2)
    
    # Save analysis dataset
    analysis_df.to_csv("ICB_ATOP_merged_20250702.csv", index=False)
    analysis_df.to_json("ICB_ATOP_merged_20250702.json", orient='records', indent=2)
    
    print(f"✓ SAVED DATASETS:")
    print(f"  Full dataset: {len(full_df):,} rows")
    print(f"    - ICB_ATOP_full_20250702.csv")
    print(f"    - ICB_ATOP_full_20250702.json")
    print(f"  Analysis dataset: {len(analysis_df):,} rows")
    print(f"    - ICB_ATOP_merged_20250702.csv") 
    print(f"    - ICB_ATOP_merged_20250702.json")
    
    return full_df, analysis_df

# Run the simple save function
print("=== SAVING DATASETS (SIMPLE VERSION) ===")
saved_full, saved_analysis = save_datasets_simple()

# Show basic stats
if len(saved_analysis) > 0:
    print(f"\nAnalysis dataset sample:")
    print(saved_analysis.head().to_string(index=False))
else:
    print(f"\nWARNING: Analysis dataset is empty (no pairs meet both conditions)")

=== SAVING DATASETS (SIMPLE VERSION) ===
✓ SAVED DATASETS:
  Full dataset: 403,968 rows
    - ICB_ATOP_full_20250702.csv
    - ICB_ATOP_full_20250702.json
  Analysis dataset: 5,100 rows
    - ICB_ATOP_merged_20250702.csv
    - ICB_ATOP_merged_20250702.json

Analysis dataset sample:
 Crisis_ID  Alliance_ID Crisis_Start Crisis_End Alliance_Start Alliance_End  Active_During_Crisis  Member_Is_Actor Actors_List        Members_List
         1         2015   1918-05-01 1920-04-01     1914-09-05   1918-11-11                     1                1         365 200;220;325;365;740
         1         2025   1918-05-01 1920-04-01     1915-04-26   1918-11-11                     1                1         365     200;220;325;365
         1         2040   1918-05-01 1920-04-01     1916-08-17   1918-11-11                     1                1         365 200;220;325;360;365
         3         2015   1918-06-23 1919-09-27     1914-09-05   1918-11-11                     1                1         365 20

In [40]:
# Check missing end dates in original datasets
print("=== MISSING END DATES ANALYSIS ===")

# 1. Crisis End Dates
if 'master_df' in globals():
    print("1. CRISIS END DATES:")
    total_crises = len(master_df)
    missing_crisis_end = master_df['End_Date'].isna().sum()
    
    print(f"   Total crises: {total_crises}")
    print(f"   Missing end dates: {missing_crisis_end}")
    print(f"   Percentage missing: {missing_crisis_end/total_crises*100:.1f}%")
    
    if missing_crisis_end > 0:
        print(f"\n   Sample crises with missing end dates:")
        missing_sample = master_df[master_df['End_Date'].isna()][['Crisis_ID', 'Crisis_Name_x', 'Start_Date', 'End_Date']].head(10)
        print(missing_sample.to_string(index=False))
    
    # Check date range for non-missing end dates
    valid_end_dates = master_df['End_Date'].dropna()
    if len(valid_end_dates) > 0:
        print(f"\n   Crisis date range (valid end dates):")
        print(f"   Earliest end: {pd.to_datetime(valid_end_dates).min()}")
        print(f"   Latest end: {pd.to_datetime(valid_end_dates).max()}")

else:
    print("master_df not found")

print("\n" + "="*50)

# 2. Alliance End Dates  
if 'atop_processed' in globals():
    print("2. ALLIANCE END DATES:")
    total_alliances = len(atop_processed)
    
    # Check for various types of missing/invalid end dates
    missing_alliance_end = atop_processed['Alliance_End'].isna().sum()
    zero_dates = (atop_processed['Alliance_End'] == '0-01-01').sum()
    empty_strings = (atop_processed['Alliance_End'] == '').sum()
    
    print(f"   Total alliances: {total_alliances}")
    print(f"   Truly missing (NaN): {missing_alliance_end}")
    print(f"   Invalid dates ('0-01-01'): {zero_dates}")
    print(f"   Empty strings: {empty_strings}")
    
    total_missing = missing_alliance_end + zero_dates + empty_strings
    print(f"   Total effectively missing: {total_missing}")
    print(f"   Percentage missing: {total_missing/total_alliances*100:.1f}%")
    
    if total_missing > 0:
        print(f"\n   Sample alliances with missing/invalid end dates:")
        
        # Show samples of each type
        if missing_alliance_end > 0:
            print("   NaN end dates:")
            nan_sample = atop_processed[atop_processed['Alliance_End'].isna()][['Alliance_ID', 'Alliance_Start', 'Alliance_End']].head(5)
            print(nan_sample.to_string(index=False))
        
        if zero_dates > 0:
            print("   '0-01-01' end dates:")
            zero_sample = atop_processed[atop_processed['Alliance_End'] == '0-01-01'][['Alliance_ID', 'Alliance_Start', 'Alliance_End']].head(5)
            print(zero_sample.to_string(index=False))
    
    # Check date range for valid end dates
    valid_alliance_ends = atop_processed[
        (atop_processed['Alliance_End'].notna()) & 
        (atop_processed['Alliance_End'] != '0-01-01') &
        (atop_processed['Alliance_End'] != '')
    ]['Alliance_End']
    
    if len(valid_alliance_ends) > 0:
        print(f"\n   Alliance date range (valid end dates):")
        valid_end_parsed = pd.to_datetime(valid_alliance_ends, errors='coerce').dropna()
        if len(valid_end_parsed) > 0:
            print(f"   Earliest end: {valid_end_parsed.min()}")
            print(f"   Latest end: {valid_end_parsed.max()}")
    
    # Show active alliances (those without valid end dates)
    active_alliances = total_missing
    print(f"\n   Active alliances (no valid end date): {active_alliances} ({active_alliances/total_alliances*100:.1f}%)")

else:
    print("atop_processed not found")

print("\n" + "="*50)

# 3. Summary Impact
print("3. IMPACT ANALYSIS:")
if 'master_df' in globals() and 'atop_processed' in globals():
    crisis_missing_pct = (master_df['End_Date'].isna().sum() / len(master_df)) * 100
    alliance_missing_pct = (total_missing / len(atop_processed)) * 100
    
    print(f"   % Crises with missing end dates: {crisis_missing_pct:.1f}%")
    print(f"   % Alliances with missing end dates: {alliance_missing_pct:.1f}%")
    
    if crisis_missing_pct > 0 or alliance_missing_pct > 0:
        print(f"\n   CONCLUSION:")
        if crisis_missing_pct > 0:
            print(f"   - {crisis_missing_pct:.1f}% of crises need end date imputation")
        if alliance_missing_pct > 0:
            print(f"   - {alliance_missing_pct:.1f}% of alliances are likely still active")
        print(f"   - Missing end dates could significantly affect temporal overlap calculations")
        print(f"   - Previous zero temporal overlaps likely due to strict missing data handling")

=== MISSING END DATES ANALYSIS ===
1. CRISIS END DATES:
   Total crises: 512
   Missing end dates: 2
   Percentage missing: 0.4%

   Sample crises with missing end dates:
 Crisis_ID               Crisis_Name_x Start_Date End_Date
       161                WEST IRIAN I 1957-12-01     None
       510 RUSSIAN INVASION OF UKRAINE       None     None

   Crisis date range (valid end dates):
   Earliest end: 1919-07-29 00:00:00
   Latest end: 2022-12-01 00:00:00

2. ALLIANCE END DATES:
   Total alliances: 789
   Truly missing (NaN): 0
   Invalid dates ('0-01-01'): 372
   Empty strings: 0
   Total effectively missing: 372
   Percentage missing: 47.1%

   Sample alliances with missing/invalid end dates:
   '0-01-01' end dates:
 Alliance_ID Alliance_Start Alliance_End
        1467     1911-04-17      0-01-01
        2340     1934-05-20      0-01-01
        2423     1938-07-21      0-01-01
        2495     1939-12-17      0-01-01
        2565     1944-01-21      0-01-01

   Alliance date range (

In [41]:
# Comprehensive analysis of alliance end dates
print("=== COMPREHENSIVE ALLIANCE END DATE ANALYSIS ===")

if 'atop_processed' in globals():
    
    # Get all unique alliance end date values
    unique_end_dates = atop_processed['Alliance_End'].value_counts(dropna=False)
    
    print(f"Total unique end date values: {len(unique_end_dates)}")
    print(f"\nAll unique values (showing frequency):")
    print(unique_end_dates.head(20))  # Show top 20 most common
    
    # Look for specific invalid patterns
    print(f"\n=== INVALID DATE PATTERNS ===")
    
    # 1. The known '0-01-01' pattern
    zero_pattern_1 = (atop_processed['Alliance_End'] == '0-01-01').sum()
    print(f"'0-01-01' pattern: {zero_pattern_1}")
    
    # 2. Other zero-year patterns
    zero_patterns = atop_processed['Alliance_End'].str.contains('^0+[-/]', na=False).sum()
    print(f"Starts with zeros: {zero_patterns}")
    
    # 3. Year 1 patterns
    year_1_patterns = atop_processed['Alliance_End'].str.contains('^1[-/]', na=False).sum()
    print(f"Starts with '1-': {year_1_patterns}")
    
    # 4. Very early dates (before 1800)
    early_dates = 0
    for date_str in atop_processed['Alliance_End'].dropna():
        try:
            parsed_date = pd.to_datetime(date_str, errors='coerce')
            if pd.notna(parsed_date) and parsed_date.year < 1800:
                early_dates += 1
        except:
            pass
    print(f"Dates before 1800: {early_dates}")
    
    # 5. Check for other suspicious patterns
    print(f"\n=== SUSPICIOUS PATTERNS ===")
    
    # Find all values that start with 0
    starts_with_zero = atop_processed[atop_processed['Alliance_End'].str.startswith('0', na=False)]['Alliance_End'].unique()
    print(f"All values starting with '0': {list(starts_with_zero)}")
    
    # Find all values that start with 1- (might be year 1)
    starts_with_one = atop_processed[atop_processed['Alliance_End'].str.startswith('1-', na=False)]['Alliance_End'].unique()
    if len(starts_with_one) > 0:
        print(f"All values starting with '1-': {list(starts_with_one)}")
    
    # Check for non-date strings (anything that can't be parsed)
    unparseable = []
    for date_str in atop_processed['Alliance_End'].dropna().unique():
        try:
            parsed = pd.to_datetime(date_str, errors='coerce')
            if pd.isna(parsed):
                unparseable.append(date_str)
        except:
            unparseable.append(date_str)
    
    if len(unparseable) > 0:
        print(f"Unparseable date strings: {unparseable}")
    
    # 6. Show sample alliances with each invalid pattern
    print(f"\n=== SAMPLE ALLIANCES WITH INVALID DATES ===")
    
    if zero_pattern_1 > 0:
        print(f"Sample alliances with '0-01-01':")
        zero_sample = atop_processed[atop_processed['Alliance_End'] == '0-01-01'][['Alliance_ID', 'Alliance_Start', 'Alliance_End', 'N_Members']].head(5)
        print(zero_sample.to_string(index=False))
    
    # Show other patterns if they exist
    for pattern in starts_with_zero:
        if pattern != '0-01-01':
            print(f"\nSample with pattern '{pattern}':")
            pattern_sample = atop_processed[atop_processed['Alliance_End'] == pattern][['Alliance_ID', 'Alliance_Start', 'Alliance_End']].head(3)
            print(pattern_sample.to_string(index=False))
    
    # 7. Summary of all invalid categories
    print(f"\n=== SUMMARY OF INVALID DATES ===")
    
    total_alliances = len(atop_processed)
    truly_missing = atop_processed['Alliance_End'].isna().sum()
    zero_dates = (atop_processed['Alliance_End'] == '0-01-01').sum()
    other_invalid = len(unparseable) - (1 if '0-01-01' in unparseable else 0)
    
    print(f"Total alliances: {total_alliances}")
    print(f"Missing (NaN): {truly_missing}")
    print(f"'0-01-01' pattern: {zero_dates}")
    print(f"Other invalid patterns: {other_invalid}")
    print(f"Total invalid/missing: {truly_missing + zero_dates + other_invalid}")
    
    # Calculate valid alliances
    valid_alliances = total_alliances - (truly_missing + zero_dates + other_invalid)
    print(f"Valid end dates: {valid_alliances}")
    print(f"Percentage with valid end dates: {valid_alliances/total_alliances*100:.1f}%")

else:
    print("atop_processed not found")

=== COMPREHENSIVE ALLIANCE END DATE ANALYSIS ===
Total unique end date values: 331

All unique values (showing frequency):
Alliance_End
0-01-01       372
1990-10-03     13
1856-03-30      7
1918-11-11      7
1990-05-22      7
1940-06-16      5
1938-09-30      4
1939-09-27      4
1918-09-30      4
1941-04-20      4
1949-09-30      4
1917-11-08      4
1945-08-14      3
1866-07-26      3
1937-07-08      3
1918-11-03      3
1939-03-15      3
1939-09-03      3
1940-06-22      3
1872-05-10      2
Name: count, dtype: int64

=== INVALID DATE PATTERNS ===
'0-01-01' pattern: 372
Starts with zeros: 372
Starts with '1-': 0
Dates before 1800: 0

=== SUSPICIOUS PATTERNS ===
All values starting with '0': ['0-01-01']
Unparseable date strings: ['0-01-01']

=== SAMPLE ALLIANCES WITH INVALID DATES ===
Sample alliances with '0-01-01':
 Alliance_ID Alliance_Start Alliance_End  N_Members
        1467     1911-04-17      0-01-01          2
        2340     1934-05-20      0-01-01          3
        2423     

In [42]:
# Check if we have the INEFFECT variable in our data
print("=== CHECKING FOR INEFFECT VARIABLE ===")

if 'atop_processed' in globals():
    print(f"atop_processed columns: {list(atop_processed.columns)}")
    
    if 'ineffect' in atop_processed.columns:
        print("✓ INEFFECT variable found in atop_processed")
        ineffect_count = atop_processed['ineffect'].sum()
        print(f"Alliances still in effect (ineffect=1): {ineffect_count}")
    else:
        print("⚠️ INEFFECT variable NOT found in atop_processed")
        print("Need to load from original ATOP dataset")

# Load INEFFECT from original ATOP data if needed
if 'atop_df' in globals():
    print(f"\nChecking original atop_df columns...")
    if 'ineffect' in atop_df.columns:
        print("✓ INEFFECT found in original atop_df")
        ineffect_data = atop_df[['atopid', 'ineffect']].drop_duplicates()
        print(f"Alliances in effect: {ineffect_data['ineffect'].sum()}/{len(ineffect_data)}")
        
        # Merge with atop_processed
        print("Merging INEFFECT into atop_processed...")
        atop_processed_updated = atop_processed.merge(
            ineffect_data.rename(columns={'atopid': 'Alliance_ID'}), 
            on='Alliance_ID', 
            how='left'
        )
        atop_processed_updated['ineffect'] = atop_processed_updated['ineffect'].fillna(0)
        
        # Update global variable
        globals()['atop_processed'] = atop_processed_updated
        print("✓ Updated atop_processed with INEFFECT variable")
    else:
        print("⚠️ INEFFECT not found in atop_df either")
        print("Available columns:", list(atop_df.columns)[:20])

# Function to fix alliance end dates and add ineffect variable
def fix_alliance_end_dates():
    """Fix alliance end dates based on INEFFECT variable"""
    
    print("\n=== FIXING ALLIANCE END DATES ===")
    
    if 'crisis_alliance_flags_df' not in globals():
        print("ERROR: crisis_alliance_flags_df not found")
        return None
    
    # Work with the flags dataset
    fixed_df = crisis_alliance_flags_df.copy()
    
    # Add ineffect information
    if 'atop_processed' in globals() and 'ineffect' in atop_processed.columns:
        # Get ineffect data
        ineffect_data = atop_processed[['Alliance_ID', 'ineffect']].drop_duplicates()
        
        # Merge with our dataset
        fixed_df = fixed_df.merge(ineffect_data, on='Alliance_ID', how='left')
        fixed_df['alliance_ineffect'] = fixed_df['ineffect'].fillna(0).astype(int)
        
        # Fix end dates for alliances still in effect
        ineffect_mask = fixed_df['alliance_ineffect'] == 1
        ineffect_count = ineffect_mask.sum()
        
        print(f"Found {ineffect_count:,} rows with alliances still in effect")
        print(f"Setting their end dates to NaN...")
        
        # Set Alliance_End to NaN for ineffect alliances
        fixed_df.loc[ineffect_mask, 'Alliance_End'] = pd.NaT
        
        # Verify the fix

=== CHECKING FOR INEFFECT VARIABLE ===
atop_processed columns: ['Alliance_ID', 'Alliance_Name', 'Alliance_Start', 'Alliance_End', 'Alliance_Type', 'N_Members', 'Members_List', 'COWID', 'COW4ID', 'Member_Locations']
⚠️ INEFFECT variable NOT found in atop_processed
Need to load from original ATOP dataset

Checking original atop_df columns...
✓ INEFFECT found in original atop_df
Alliances in effect: 372/789
Merging INEFFECT into atop_processed...
✓ Updated atop_processed with INEFFECT variable


In [44]:
# Complete code with all function definitions
import pandas as pd
import numpy as np
import json

def load_cow_country_codes():
    """Load COW country codes and add additional countries"""
    
    # Load COW dataset
    cow_df = pd.read_csv("COW-country-codes.csv")
    
    # Create mapping from CCode to StateNme
    cow_mapping = dict(zip(cow_df['CCode'], cow_df['StateNme']))
    
    # Add additional countries not in COW
    additional_countries = {
        219: "Vichy France",
        671: "Hejaz", 
        672: "Najd"
    }
    
    cow_mapping.update(additional_countries)
    
    print(f"Loaded {len(cow_df)} COW countries + {len(additional_countries)} additional = {len(cow_mapping)} total")
    
    return cow_mapping

def format_country_list(country_codes_str, country_mapping):
    """Format country list as 'CountryName(Code);CountryName(Code)'"""
    
    if not country_codes_str or pd.isna(country_codes_str):
        return ""
    
    formatted_countries = []
    for code_str in country_codes_str.split(';'):
        if code_str.strip():
            try:
                code = int(code_str.strip())
                country_name = country_mapping.get(code, f"Unknown_{code}")
                formatted_countries.append(f"{country_name}({code})")
            except ValueError:
                continue
    
    return ";".join(formatted_countries)

def get_alliance_type_names(type_codes_str):
    """Convert alliance type codes to names"""
    
    type_mapping = {
        1: "Defense",
        2: "Offense", 
        3: "Neutral",
        4: "NonAggression",
        5: "Consultation"
    }
    
    if not type_codes_str or pd.isna(type_codes_str):
        return ""
    
    type_names = []
    for code_str in type_codes_str.split(';'):
        if code_str.strip():
            try:
                code = int(code_str.strip())
                type_names.append(type_mapping.get(code, f"Type_{code}"))
            except ValueError:
                continue
    
    return ";".join(type_names)

def calculate_geographic_match(crisis_location, member_locations):
    """Calculate if crisis location matches any member locations"""
    
    if pd.isna(crisis_location) or pd.isna(member_locations):
        return 0
    
    try:
        crisis_loc = int(crisis_location)
        member_locs = [int(x.strip()) for x in str(member_locations).split(';') if x.strip()]
        return 1 if crisis_loc in member_locs else 0
    except (ValueError, TypeError):
        return 0

def count_member_actors(crisis_actors_str, alliance_members_str):
    """Count how many alliance members are crisis actors"""
    
    if not crisis_actors_str or not alliance_members_str:
        return 0
    
    try:
        crisis_actors = set(int(x.strip()) for x in crisis_actors_str.split(';') if x.strip())
        alliance_members = set(int(x.strip()) for x in alliance_members_str.split(';') if x.strip())
        return len(crisis_actors.intersection(alliance_members))
    except (ValueError, TypeError):
        return 0

def create_final_formatted_dataset_updated(flags_df, country_mapping):
    """Create the final formatted dataset with alliance_ineffect variable"""
    
    print("Creating updated formatted dataset...")
    
    # Start with the fixed flags dataset
    dataset = flags_df.copy()
    
    # Add crisis names from master_df
    if 'master_df' in globals():
        crisis_info = master_df[['Crisis_ID', 'Crisis_Name_x']].drop_duplicates()
        crisis_info = crisis_info.rename(columns={'Crisis_Name_x': 'Crisis_Name'})
        dataset = dataset.merge(crisis_info, on='Crisis_ID', how='left')
    else:
        dataset['Crisis_Name'] = ""
    
    # Add alliance info from atop_processed
    if 'atop_processed' in globals():
        alliance_cols = ['Alliance_ID', 'Alliance_Type', 'N_Members', 'Members_List']
        if 'Member_Locations' in atop_processed.columns:
            alliance_cols.append('Member_Locations')
        
        alliance_info = atop_processed[alliance_cols].drop_duplicates()
        dataset = dataset.merge(alliance_info, on='Alliance_ID', how='left', suffixes=('', '_atop'))
        
        # Use the ATOP data for members list and N_Members
        if 'Members_List_atop' in dataset.columns:
            dataset['Members_List'] = dataset['Members_List_atop'].fillna(dataset['alliance_members'])
        dataset['N_Members'] = dataset['N_Members'].fillna(0)
    
    # Add crisis location from master_df (it already has Geographic_Location)
    if 'Geographic_Location' in master_df.columns:
        crisis_location = master_df[['Crisis_ID', 'Geographic_Location']].drop_duplicates()
        dataset = dataset.merge(crisis_location, on='Crisis_ID', how='left')
        dataset['Crisis_Location'] = dataset['Geographic_Location']
    else:
        dataset['Crisis_Location'] = pd.NaT
    
    # Create the final formatted dataset
    formatted_data = []
    
    for idx, row in dataset.iterrows():
        
        # Calculate N_Members_Actors
        n_members_actors = count_member_actors(row['crisis_actors'], row.get('Members_List', ''))
        
        # Format country lists
        members_formatted = format_country_list(row.get('Members_List', ''), country_mapping)
        actors_formatted = format_country_list(row['crisis_actors'], country_mapping)
        
        # Format alliance type
        alliance_type_formatted = get_alliance_type_names(row.get('Alliance_Type', ''))
        
        # Calculate geographic match
        geographic_match = calculate_geographic_match(
            row.get('Crisis_Location'), 
            row.get('Member_Locations', '')
        )
        
        formatted_row = {
            'Crisis_ID': row['Crisis_ID'],
            'Crisis_Name': row.get('Crisis_Name', ''),
            'Crisis_Start': row['Crisis_Start'],
            'Crisis_End': row['Crisis_End'],
            'Alliance_ID': row['Alliance_ID'],
            'Alliance_Name': "",  # As requested - should be empty
            'Alliance_Start': row['Alliance_Start'],
            'Alliance_End': row['Alliance_End'],
            'Alliance_Type': alliance_type_formatted,
            'Active_During_Crisis': row['alliance_active_during_crisis'],
            'Member_Is_Actor': row['alliance_member_is_crisis_actor'],
            'Alliance_InEffect': row.get('alliance_ineffect', 0),  # NEW VARIABLE
            'N_Members': int(row.get('N_Members', 0)) if pd.notna(row.get('N_Members', 0)) else 0,
            'N_Members_Actors': n_members_actors,
            'Members_List': members_formatted,
            'Actors_List': actors_formatted,
            'Crisis_Location': row.get('Crisis_Location', ''),
            'Geographic_Match': geographic_match
        }
        
        formatted_data.append(formatted_row)
    
    return pd.DataFrame(formatted_data)

def save_updated_datasets(full_df, analysis_df):
    """Save updated datasets with alliance_ineffect variable"""
    
    print(f"\n=== SAVING UPDATED DATASETS ===")
    
    # Convert date columns to strings for JSON compatibility
    date_columns = ['Crisis_Start', 'Crisis_End', 'Alliance_Start', 'Alliance_End']
    
    for df in [full_df, analysis_df]:
        for col in date_columns:
            if col in df.columns:
                df[col] = df[col].astype(str)
    
    # Save full dataset
    full_df.to_csv("ICB_ATOP_full_20250702.csv", index=False)
    full_df.to_json("ICB_ATOP_full_20250702.json", orient='records', indent=2)
    
    # Save analysis dataset  
    analysis_df.to_csv("ICB_ATOP_merged_20250702.csv", index=False)
    analysis_df.to_json("ICB_ATOP_merged_20250702.json", orient='records', indent=2)
    
    print(f"✓ Updated full dataset saved: {len(full_df):,} rows")
    print(f"  - ICB_ATOP_full_20250702.csv")
    print(f"  - ICB_ATOP_full_20250702.json")
    
    print(f"✓ Updated analysis dataset saved: {len(analysis_df):,} rows") 
    print(f"  - ICB_ATOP_merged_20250702.csv")
    print(f"  - ICB_ATOP_merged_20250702.json")

# Now run the main code
print("=== UPDATING FINAL DATASETS WITH ALLIANCE_INEFFECT ===")

# Load country mapping
country_mapping = load_cow_country_codes()

# Check if we have the fixed flags dataset
if 'crisis_alliance_flags_df' not in globals():
    print("ERROR: crisis_alliance_flags_df not found. Please run the fix code first.")
else:
    # Create updated formatted datasets
    print("Creating updated formatted datasets...")
    formatted_full_updated = create_final_formatted_dataset_updated(crisis_alliance_flags_df, country_mapping)
    
    # Create analysis subset (both flags = 1)
    formatted_analysis_updated = formatted_full_updated[
        (formatted_full_updated['Active_During_Crisis'] == 1) & 
        (formatted_full_updated['Member_Is_Actor'] == 1)
    ].copy()
    
    print(f"\n=== UPDATED DATASET STATISTICS ===")
    print(f"Full dataset: {len(formatted_full_updated):,} crisis-alliance pairs")
    print(f"Analysis dataset: {len(formatted_analysis_updated):,} crisis-alliance pairs")
    
    # Show alliance status breakdown
    if len(formatted_full_updated) > 0:
        ineffect_stats = formatted_full_updated['Alliance_InEffect'].value_counts()
        print(f"\nAlliance status in full dataset:")
        print(f"  Terminated alliances (InEffect=0): {ineffect_stats.get(0, 0):,}")
        print(f"  Active alliances (InEffect=1): {ineffect_stats.get(1, 0):,}")
    
    if len(formatted_analysis_updated) > 0:
        ineffect_stats_analysis = formatted_analysis_updated['Alliance_InEffect'].value_counts()
        print(f"\nAlliance status in analysis dataset:")
        print(f"  Terminated alliances (InEffect=0): {ineffect_stats_analysis.get(0, 0):,}")
        print(f"  Active alliances (InEffect=1): {ineffect_stats_analysis.get(1, 0):,}")
        
        print(f"\n=== SAMPLE UPDATED ANALYSIS DATASET ===")
        sample_cols = ['Crisis_ID', 'Crisis_Name', 'Alliance_ID', 'Alliance_Type', 
                      'Active_During_Crisis', 'Member_Is_Actor', 'Alliance_InEffect', 
                      'N_Members', 'N_Members_Actors']
        print(formatted_analysis_updated[sample_cols].head().to_string(index=False))
    
    # Save updated datasets
    save_updated_datasets(formatted_full_updated, formatted_analysis_updated)
    
    # Make available in global scope
    globals()['formatted_full_df'] = formatted_full_updated
    globals()['formatted_analysis_df'] = formatted_analysis_updated
    
    print(f"\n=== SUCCESS ===")
    print(f"Updated datasets saved with Alliance_InEffect variable!")

=== UPDATING FINAL DATASETS WITH ALLIANCE_INEFFECT ===
Loaded 243 COW countries + 3 additional = 220 total
Creating updated formatted datasets...
Creating updated formatted dataset...

=== UPDATED DATASET STATISTICS ===
Full dataset: 403,968 crisis-alliance pairs
Analysis dataset: 5,100 crisis-alliance pairs

Alliance status in full dataset:
  Terminated alliances (InEffect=0): 403,968
  Active alliances (InEffect=1): 0

Alliance status in analysis dataset:
  Terminated alliances (InEffect=0): 5,100
  Active alliances (InEffect=1): 0

=== SAMPLE UPDATED ANALYSIS DATASET ===
 Crisis_ID          Crisis_Name  Alliance_ID                Alliance_Type  Active_During_Crisis  Member_Is_Actor  Alliance_InEffect  N_Members  N_Members_Actors
         1  RUSSIAN CIVIL WAR I         2015                 Consultation                     1                1                  0          5                 1
         1  RUSSIAN CIVIL WAR I         2025 Defense;Offense;Consultation                     1  

In [45]:
# Fix Alliance_InEffect for alliances with end date 2030-12-31
print("=== Fixing Alliance_InEffect for placeholder end dates ===")

# Update in formatted_full_df
if 'formatted_full_df' in globals():
    # Check for 2030-12-31 dates (our placeholder for active alliances)
    placeholder_mask = formatted_full_df['Alliance_End'].astype(str).str.contains('2030-12-31', na=False)
    count_full = placeholder_mask.sum()
    
    # Update Alliance_InEffect to 1 for these cases
    formatted_full_df.loc[placeholder_mask, 'Alliance_InEffect'] = 1
    
    print(f"Updated {count_full:,} rows in full dataset")

# Update in formatted_analysis_df  
if 'formatted_analysis_df' in globals():
    placeholder_mask = formatted_analysis_df['Alliance_End'].astype(str).str.contains('2030-12-31', na=False)
    count_analysis = placeholder_mask.sum()
    
    formatted_analysis_df.loc[placeholder_mask, 'Alliance_InEffect'] = 1
    
    print(f"Updated {count_analysis:,} rows in analysis dataset")

# Re-save the datasets
if 'formatted_full_df' in globals() and 'formatted_analysis_df' in globals():
    formatted_full_df.to_csv("ICB_ATOP_full_20250702.csv", index=False)
    formatted_full_df.to_json("ICB_ATOP_full_20250702.json", orient='records', indent=2)
    
    formatted_analysis_df.to_csv("ICB_ATOP_merged_20250702.csv", index=False)
    formatted_analysis_df.to_json("ICB_ATOP_merged_20250702.json", orient='records', indent=2)
    
    print("✓ Re-saved datasets with corrected Alliance_InEffect values")

# Show updated statistics
if 'formatted_analysis_df' in globals():
    ineffect_stats = formatted_analysis_df['Alliance_InEffect'].value_counts()
    print(f"\nUpdated Alliance_InEffect distribution in analysis dataset:")
    print(f"  InEffect=0 (terminated): {ineffect_stats.get(0, 0):,}")
    print(f"  InEffect=1 (active): {ineffect_stats.get(1, 0):,}")

=== Fixing Alliance_InEffect for placeholder end dates ===
Updated 190,464 rows in full dataset
Updated 3,433 rows in analysis dataset
✓ Re-saved datasets with corrected Alliance_InEffect values

Updated Alliance_InEffect distribution in analysis dataset:
  InEffect=0 (terminated): 1,667
  InEffect=1 (active): 3,433


In [47]:
# Fixed version - handle float values in actloc_labels
import pandas as pd

def load_and_update_cracid_actloc_fixed():
    """Load and update the cracid to actloc mapping - handles float values"""
    
    print("=== LOADING AND UPDATING CRACID TO ACTLOC MAPPING (FIXED) ===")
    
    # Load existing mapping
    try:
        cracid_actloc_df = pd.read_csv("cracid_to_actloc.csv")
        print(f"Loaded existing cracid_to_actloc.csv with {len(cracid_actloc_df)} entries")
        print(f"Columns: {list(cracid_actloc_df.columns)}")
        print(f"Sample data:")
        print(cracid_actloc_df.head())
    except FileNotFoundError:
        print("cracid_to_actloc.csv not found - creating from existing data")
        if 'cracid_actloc_dict' in globals():
            cracid_actloc_df = pd.DataFrame([
                {'cracid': k, 'actloc_labels': ';'.join([str(x) for x in v])} 
                for k, v in cracid_actloc_dict.items()
            ])
        else:
            print("ERROR: No cracid mapping available")
            return None
    
    # Convert to dictionary for easy lookup
    cracid_to_location = {}
    
    for idx, row in cracid_actloc_df.iterrows():
        try:
            cracid = int(row['cracid'])
            
            # Handle multiple locations separated by semicolon
            # Convert float strings to int properly
            locations = []
            for x in str(row['actloc_labels']).split(';'):
                if x.strip():
                    try:
                        # Handle float strings like '41.0'
                        location = int(float(x.strip()))
                        locations.append(location)
                    except ValueError:
                        print(f"Warning: Could not parse location '{x.strip()}' for cracid {cracid}")
                        continue
            
            cracid_to_location[cracid] = locations
            
        except Exception as e:
            print(f"Error processing row {idx}: {e}")
            continue
    
    print(f"Successfully parsed {len(cracid_to_location)} country mappings")
    
    # Add missing Caribbean countries → region 42
    caribbean_countries = {
        31: 42,   # Bahamas
        51: 42,   # Jamaica  
        52: 42,   # Trinidad and Tobago
        53: 42,   # Barbados
        54: 42,   # Dominica
        56: 42,   # St. Lucia
        57: 42,   # St. Vincent and the Grenadines
        58: 42,   # Antigua & Barbuda
        60: 42,   # St. Kitts and Nevis
        80: 42    # Belize
    }
    
    # Add Belarus → region 31 (East Europe)
    belarus_mapping = {370: 31}
    
    # Update mappings
    updates_made = 0
    for cracid, location in {**caribbean_countries, **belarus_mapping}.items():
        if cracid not in cracid_to_location:
            cracid_to_location[cracid] = [location]
            updates_made += 1
            print(f"Added mapping: {cracid} → {location}")
        else:
            # Check if location is already in the list
            if location not in cracid_to_location[cracid]:
                cracid_to_location[cracid].append(location)
                updates_made += 1
                print(f"Added location {location} to existing cracid {cracid}")
            else:
                print(f"Country {cracid} already mapped to {cracid_to_location[cracid]}")
    
    print(f"Made {updates_made} new mappings")
    
    # Convert back to DataFrame and save
    updated_df = pd.DataFrame([
        {'cracid': k, 'actloc_labels': ';'.join([str(x) for x in v])} 
        for k, v in cracid_to_location.items()
    ]).sort_values('cracid')
    
    updated_df.to_csv("cracid_to_actloc.csv", index=False)
    print(f"✓ Saved updated cracid_to_actloc.csv with {len(updated_df)} entries")
    
    # Show sample of updated mapping
    print(f"\nSample updated mappings:")
    print(updated_df.head(10).to_string(index=False))
    
    return cracid_to_location

def calculate_geographic_match_fixed(crisis_location, alliance_members_str, cracid_mapping):
    """
    Fixed geographic match: 1 if ANY alliance member location matches crisis location
    """
    
    if pd.isna(crisis_location) or not alliance_members_str or pd.isna(alliance_members_str):
        return 0
    
    try:
        crisis_loc = int(float(crisis_location))  # Handle float strings
        
        # Get all alliance member codes
        member_codes = []
        for x in str(alliance_members_str).split(';'):
            if x.strip():
                try:
                    member_codes.append(int(float(x.strip())))
                except ValueError:
                    continue
        
        # Get all locations for all alliance members
        all_member_locations = set()
        for member_code in member_codes:
            if member_code in cracid_mapping:
                member_locations = cracid_mapping[member_code]
                all_member_locations.update(member_locations)
        
        # Check if crisis location matches any member location
        return 1 if crisis_loc in all_member_locations else 0
        
    except (ValueError, TypeError):
        return 0

def update_geographic_matches_in_datasets(cracid_mapping):
    """Update geographic matches in final datasets"""
    
    print(f"\n=== UPDATING GEOGRAPHIC MATCHES IN DATASETS ===")
    
    def extract_member_codes_from_formatted(members_formatted):
        """Extract codes from 'CountryName(123);CountryName(456)' format"""
        if not members_formatted or pd.isna(members_formatted):
            return ""
        
        codes = []
        for item in str(members_formatted).split(';'):
            if '(' in item and ')' in item:
                try:
                    code = item.split('(')[1].split(')')[0]
                    codes.append(code)
                except:
                    continue
        return ';'.join(codes)
    
    # Update formatted_full_df
    if 'formatted_full_df' in globals():
        print("Updating full dataset...")
        
        # Extract member codes
        formatted_full_df['member_codes'] = formatted_full_df['Members_List'].apply(extract_member_codes_from_formatted)
        
        # Recalculate geographic matches
        formatted_full_df['Geographic_Match'] = formatted_full_df.apply(
            lambda row: calculate_geographic_match_fixed(
                row['Crisis_Location'], 
                row['member_codes'], 
                cracid_mapping
            ), axis=1
        )
        
        # Remove temporary column
        formatted_full_df.drop('member_codes', axis=1, inplace=True)
        
        geo_matches_full = formatted_full_df['Geographic_Match'].sum()
        print(f"Full dataset: {geo_matches_full:,}/{len(formatted_full_df):,} geographic matches ({geo_matches_full/len(formatted_full_df)*100:.1f}%)")
    
    # Update formatted_analysis_df
    if 'formatted_analysis_df' in globals():
        print("Updating analysis dataset...")
        
        # Extract member codes
        formatted_analysis_df['member_codes'] = formatted_analysis_df['Members_List'].apply(extract_member_codes_from_formatted)
        
        # Recalculate geographic matches
        formatted_analysis_df['Geographic_Match'] = formatted_analysis_df.apply(
            lambda row: calculate_geographic_match_fixed(
                row['Crisis_Location'], 
                row['member_codes'], 
                cracid_mapping
            ), axis=1
        )
        
        # Remove temporary column
        formatted_analysis_df.drop('member_codes', axis=1, inplace=True)
        
        geo_matches_analysis = formatted_analysis_df['Geographic_Match'].sum()
        print(f"Analysis dataset: {geo_matches_analysis:,}/{len(formatted_analysis_df):,} geographic matches ({geo_matches_analysis/len(formatted_analysis_df)*100:.1f}%)")
    
    # Re-save all datasets
    print(f"\n=== RE-SAVING DATASETS WITH FIXED GEOGRAPHIC MATCHES ===")
    
    if 'formatted_full_df' in globals():
        formatted_full_df_save = formatted_full_df.copy()
        date_columns = ['Crisis_Start', 'Crisis_End', 'Alliance_Start', 'Alliance_End']
        for col in date_columns:
            if col in formatted_full_df_save.columns:
                formatted_full_df_save[col] = formatted_full_df_save[col].astype(str)
        
        formatted_full_df_save.to_csv("ICB_ATOP_full_20250702.csv", index=False)
        formatted_full_df_save.to_json("ICB_ATOP_full_20250702.json", orient='records', indent=2)
        print(f"✓ Updated full dataset saved")
    
    if 'formatted_analysis_df' in globals():
        formatted_analysis_df_save = formatted_analysis_df.copy()
        for col in date_columns:
            if col in formatted_analysis_df_save.columns:
                formatted_analysis_df_save[col] = formatted_analysis_df_save[col].astype(str)
        
        formatted_analysis_df_save.to_csv("ICB_ATOP_merged_20250702.csv", index=False)
        formatted_analysis_df_save.to_json("ICB_ATOP_merged_20250702.json", orient='records', indent=2)
        print(f"✓ Updated analysis dataset saved")

# Execute the fixed geographic update
print("=== FIXING GEOGRAPHIC MATCH VARIABLE (FIXED) ===")

# Step 1: Load and update country-to-location mapping
cracid_mapping = load_and_update_cracid_actloc_fixed()

if cracid_mapping is not None:
    # Step 2: Update geographic matches in datasets
    update_geographic_matches_in_datasets(cracid_mapping)
    
    print(f"\n=== SUCCESS ===")
    print(f"✓ Fixed float parsing in cracid_to_actloc.csv")
    print(f"✓ Updated geographic match logic")
    print(f"✓ Updated all 4 final datasets")

else:
    print("Failed to load country-to-location mapping")

=== FIXING GEOGRAPHIC MATCH VARIABLE (FIXED) ===
=== LOADING AND UPDATING CRACID TO ACTLOC MAPPING (FIXED) ===
Loaded existing cracid_to_actloc.csv with 147 entries
Columns: ['cracid', 'actloc_labels']
Sample data:
   cracid actloc_labels
0       2          41.0
1      20          41.0
2      40          42.0
3      41          42.0
4      42          42.0
Successfully parsed 147 country mappings
Added mapping: 31 → 42
Added mapping: 51 → 42
Added mapping: 52 → 42
Added mapping: 53 → 42
Added mapping: 54 → 42
Added mapping: 56 → 42
Added mapping: 57 → 42
Added mapping: 58 → 42
Added mapping: 60 → 42
Added mapping: 80 → 42
Added location 31 to existing cracid 370
Made 11 new mappings
✓ Saved updated cracid_to_actloc.csv with 157 entries

Sample updated mappings:
 cracid actloc_labels
      2            41
     20            41
     31            42
     40            42
     41            42
     42            42
     51            42
     52            42
     53            42
     54 

In [48]:
# Create geographic location mappings and update final datasets
import pandas as pd

def create_geographic_location_mapping():
    """Create and save geographic location code to name mapping"""
    
    print("=== CREATING GEOGRAPHIC LOCATION MAPPING ===")
    
    # Define the mappings
    geo_mappings = {
        9: "Central Asia",
        10: "West Asia",
        11: "East Asia", 
        12: "South-East Asia",
        13: "South Asia",
        15: "Middle East",
        20: "West Africa",
        21: "North Africa", 
        22: "East Africa",
        23: "Southern Africa",
        24: "Central Africa",
        30: "Euro-Asia",
        31: "East Europe",
        32: "Central Europe", 
        33: "West Europe",
        34: "North Europe",
        35: "South Europe",
        41: "North America",
        42: "Central America",
        43: "South America",
        51: "Australasia"
    }
    
    # Create DataFrame and save
    geo_mapping_df = pd.DataFrame([
        {'location_code': code, 'location_name': name} 
        for code, name in geo_mappings.items()
    ]).sort_values('location_code')
    
    geo_mapping_df.to_csv("geographic_location_mapping.csv", index=False)
    print(f"✓ Saved geographic_location_mapping.csv with {len(geo_mapping_df)} entries")
    print(geo_mapping_df.to_string(index=False))
    
    return geo_mappings

def format_crisis_location(location_code, geo_mappings):
    """Format crisis location as 'Location Name(code)'"""
    
    if pd.isna(location_code):
        return ""
    
    try:
        code = int(float(location_code))
        location_name = geo_mappings.get(code, f"Unknown_{code}")
        return f"{location_name}({code})"
    except (ValueError, TypeError):
        return ""

def update_final_datasets_with_location_names(geo_mappings):
    """Update final datasets with formatted location names"""
    
    print(f"\n=== UPDATING FINAL DATASETS WITH LOCATION NAMES ===")
    
    # Update formatted_full_df
    if 'formatted_full_df' in globals():
        print("Updating full dataset...")
        
        # Create new dataset without Alliance_InEffect (not in required format)
        updated_full = formatted_full_df.copy()
        
        # Remove Alliance_InEffect if it exists
        if 'Alliance_InEffect' in updated_full.columns:
            updated_full = updated_full.drop('Alliance_InEffect', axis=1)
        
        # Update Crisis_Location format
        updated_full['Crisis_Location'] = updated_full['Crisis_Location'].apply(
            lambda x: format_crisis_location(x, geo_mappings)
        )
        
        # Reorder columns to match required format
        required_columns = [
            'Crisis_ID', 'Crisis_Name', 'Crisis_Start', 'Crisis_End', 'Alliance_ID', 
            'Alliance_Name', 'Alliance_Start', 'Alliance_End', 'Alliance_Type', 
            'Active_During_Crisis', 'Member_Is_Actor', 'N_Members', 'N_Members_Actors', 
            'Members_List', 'Actors_List', 'Crisis_Location', 'Geographic_Match'
        ]
        
        # Keep only required columns that exist
        available_columns = [col for col in required_columns if col in updated_full.columns]
        updated_full = updated_full[available_columns]
        
        print(f"Full dataset columns: {list(updated_full.columns)}")
        globals()['formatted_full_df'] = updated_full
    
    # Update formatted_analysis_df
    if 'formatted_analysis_df' in globals():
        print("Updating analysis dataset...")
        
        # Create new dataset without Alliance_InEffect
        updated_analysis = formatted_analysis_df.copy()
        
        # Remove Alliance_InEffect if it exists
        if 'Alliance_InEffect' in updated_analysis.columns:
            updated_analysis = updated_analysis.drop('Alliance_InEffect', axis=1)
        
        # Update Crisis_Location format
        updated_analysis['Crisis_Location'] = updated_analysis['Crisis_Location'].apply(
            lambda x: format_crisis_location(x, geo_mappings)
        )
        
        # Reorder columns to match required format
        available_columns = [col for col in required_columns if col in updated_analysis.columns]
        updated_analysis = updated_analysis[available_columns]
        
        print(f"Analysis dataset columns: {list(updated_analysis.columns)}")
        globals()['formatted_analysis_df'] = updated_analysis
    
    # Re-save datasets
    print(f"\n=== RE-SAVING FINAL DATASETS ===")
    
    if 'formatted_full_df' in globals():
        # Convert dates to strings for JSON
        full_save = formatted_full_df.copy()
        date_columns = ['Crisis_Start', 'Crisis_End', 'Alliance_Start', 'Alliance_End']
        for col in date_columns:
            if col in full_save.columns:
                full_save[col] = full_save[col].astype(str)
        
        full_save.to_csv("ICB_ATOP_full_20250702.csv", index=False)
        full_save.to_json("ICB_ATOP_full_20250702.json", orient='records', indent=2)
        print(f"✓ Updated full dataset saved: {len(full_save):,} rows")
    
    if 'formatted_analysis_df' in globals():
        # Convert dates to strings for JSON
        analysis_save = formatted_analysis_df.copy()
        for col in date_columns:
            if col in analysis_save.columns:
                analysis_save[col] = analysis_save[col].astype(str)
        
        analysis_save.to_csv("ICB_ATOP_merged_20250702.csv", index=False)
        analysis_save.to_json("ICB_ATOP_merged_20250702.json", orient='records', indent=2)
        print(f"✓ Updated analysis dataset saved: {len(analysis_save):,} rows")

def show_sample_final_format():
    """Show sample of final formatted data"""
    
    print(f"\n=== SAMPLE FINAL FORMAT ===")
    
    if 'formatted_analysis_df' in globals() and len(formatted_analysis_df) > 0:
        
        print("Sample row from analysis dataset:")
        sample_row = formatted_analysis_df.iloc[0]
        
        print(f"Crisis_ID: {sample_row['Crisis_ID']}")
        print(f"Crisis_Name: {sample_row['Crisis_Name']}")
        print(f"Crisis_Start: {sample_row['Crisis_Start']}")
        print(f"Crisis_End: {sample_row['Crisis_End']}")
        print(f"Alliance_ID: {sample_row['Alliance_ID']}")
        print(f"Alliance_Name: {sample_row['Alliance_Name']}")
        print(f"Alliance_Start: {sample_row['Alliance_Start']}")
        print(f"Alliance_End: {sample_row['Alliance_End']}")
        print(f"Alliance_Type: {sample_row['Alliance_Type']}")
        print(f"Active_During_Crisis: {sample_row['Active_During_Crisis']}")
        print(f"Member_Is_Actor: {sample_row['Member_Is_Actor']}")
        print(f"N_Members: {sample_row['N_Members']}")
        print(f"N_Members_Actors: {sample_row['N_Members_Actors']}")
        print(f"Members_List: {sample_row['Members_List']}")
        print(f"Actors_List: {sample_row['Actors_List']}")
        print(f"Crisis_Location: {sample_row['Crisis_Location']}")
        print(f"Geographic_Match: {sample_row['Geographic_Match']}")
        
        print(f"\nFirst 3 rows of analysis dataset:")
        display_cols = ['Crisis_ID', 'Crisis_Name', 'Alliance_ID', 'Alliance_Type', 'Crisis_Location', 'Geographic_Match']
        available_display_cols = [col for col in display_cols if col in formatted_analysis_df.columns]
        print(formatted_analysis_df[available_display_cols].head(3).to_string(index=False))
        
        # Show crisis location distribution
        print(f"\nCrisis location distribution:")
        location_dist = formatted_analysis_df['Crisis_Location'].value_counts().head(10)
        print(location_dist.to_string())

# Execute the final update
print("=== FINAL DATASET UPDATE WITH LOCATION NAMES ===")

# Step 1: Create geographic mapping
geo_mappings = create_geographic_location_mapping()

# Step 2: Update datasets with location names
update_final_datasets_with_location_names(geo_mappings)

# Step 3: Show sample results
show_sample_final_format()

print(f"\n=== FINAL SUCCESS ===")
print(f"✅ Created geographic_location_mapping.csv")
print(f"✅ Updated Crisis_Location format to 'Region Name(code)'")
print(f"✅ Removed Alliance_InEffect (not in required format)")
print(f"✅ Reordered columns to match specification")
print(f"✅ Saved final datasets:")
print(f"   - ICB_ATOP_full_20250702.csv")
print(f"   - ICB_ATOP_full_20250702.json") 
print(f"   - ICB_ATOP_merged_20250702.csv")
print(f"   - ICB_ATOP_merged_20250702.json")
print(f"✅ Final format matches specification exactly!")

=== FINAL DATASET UPDATE WITH LOCATION NAMES ===
=== CREATING GEOGRAPHIC LOCATION MAPPING ===
✓ Saved geographic_location_mapping.csv with 21 entries
 location_code   location_name
             9    Central Asia
            10       West Asia
            11       East Asia
            12 South-East Asia
            13      South Asia
            15     Middle East
            20     West Africa
            21    North Africa
            22     East Africa
            23 Southern Africa
            24  Central Africa
            30       Euro-Asia
            31     East Europe
            32  Central Europe
            33     West Europe
            34    North Europe
            35    South Europe
            41   North America
            42 Central America
            43   South America
            51     Australasia

=== UPDATING FINAL DATASETS WITH LOCATION NAMES ===
Updating full dataset...
Full dataset columns: ['Crisis_ID', 'Crisis_Name', 'Crisis_Start', 'Crisis_End', 'Alliance

In [5]:
##CORRECTION CODE FOR TIME INVARIANCE IN MEMBERSHIP

import pandas as pd
from pathlib import Path

# ── 1. load the member-level file ──────────────────────────────
mfile = Path(r"atop_5.1__.csv_\ATOP 5.1 (.csv)\atop5_1m.csv")      # adjust if the file sits elsewhere
cols  = ["member", "atopid",
         "yrent", "moent", "dayent",
         "yrexit", "moexit", "dayexit"]

df = pd.read_csv(mfile, usecols=cols)

# ── 2. helper: combine Y/M/D into one string ───────────────────
def ymd_to_str(y, m, d, as_end=False):
    """
    Build YYYY-MM-DD; blank parts default to Jan/01 (start)
    or Dec/31 (end)                                         """
    if pd.isna(y):                                    # no year ⇒ missing date
        return pd.NA

    y = int(y)
    m = int(m) if pd.notna(m) else (12 if as_end else 1)
    d = int(d) if pd.notna(d) else (31 if as_end else 1)
    return f"{y:04d}-{m:02d}-{d:02d}"

# ── 3. build Start_Date / End_Date columns ─────────────────────
df["Start_Date"] = df.apply(
    lambda r: ymd_to_str(r.yrent,  r.moent,  r.dayent,  as_end=False), axis=1)

df["End_Date"]   = df.apply(
    lambda r: ymd_to_str(r.yrexit, r.moexit, r.dayexit, as_end=True),  axis=1)

# ── 4. keep only the requested four columns and save / inspect ─
tidy = df.rename(columns={"member": "Country",
                          "atopid": "Alliance_ID"})[
             ["Country", "Alliance_ID", "Start_Date", "End_Date"]]

print(tidy.head())
tidy.to_csv("atop_member_dates.csv", index=False)   # optional save


   Country  Alliance_ID  Start_Date    End_Date
0      200         1005  1815-01-03  1815-02-08
1      220         1005  1815-01-03  1815-02-08
2      300         1005  1815-01-03  1815-02-08
3      245         1005  1815-01-13  1815-02-08
4      240         1005  1815-01-19  1815-02-08


In [7]:
# ------------------------------------------------------------
# CONFIG
# ------------------------------------------------------------
from pathlib import Path
import pandas as pd, re, json

FULL_PATH       = "ICB_ATOP_full_20250702.csv"   # time-invariant file
ROSTER_PATH     = "atop_member_dates.csv"        # Country|Alliance_ID|Start_Date|End_Date
CRACID_LOC_PATH = "cracid_to_actloc.csv"         # optional geo dictionary
OUT_DIR         = Path("Corrected_Dataset")
OUT_DIR.mkdir(exist_ok=True, parents=True)

# ------------------------------------------------------------
# 1. LOAD DATA
# ------------------------------------------------------------
full   = pd.read_csv(FULL_PATH)
roster = pd.read_csv(ROSTER_PATH)      # per-member join/exit dates

# optional geographic dictionary  {cracid: [actloc,…]}
try:
    loc_df = pd.read_csv(CRACID_LOC_PATH)
    cracid_loc = {int(r.cracid): [int(float(x)) for x in str(r.actloc_labels).split(";") if x]
                  for r in loc_df.itertuples(index=False)}
except FileNotFoundError:
    cracid_loc = {}

# parse dates once
to_dt = pd.to_datetime
full["Crisis_Start_dt"] = to_dt(full["Crisis_Start"], errors="coerce")
full["Crisis_End_dt"]   = to_dt(full["Crisis_End"],   errors="coerce")

roster["Start_dt"]      = to_dt(roster["Start_Date"], errors="coerce")
roster["End_dt"]        = to_dt(roster["End_Date"],   errors="coerce").fillna(
                          pd.Timestamp("2030-12-31"))

# ------------------------------------------------------------
# 2. BUILD LOOK-UP: alliance_id → dataframe of active members
# ------------------------------------------------------------
roster_grp = roster.groupby("Alliance_ID", sort=False)

def active_members(aid, c_start, c_end):
    """return list[int] of members whose tenure overlaps the crisis window"""
    try:
        block = roster_grp.get_group(aid)
    except KeyError:
        return []
    mask = (block["Start_dt"] <= c_end) & (block["End_dt"] >= c_start)
    return block.loc[mask, "Country"].astype(int).tolist()

# helper: pull numeric codes out of 'Name(123)' or '123'
_code_re = re.compile(r"(\d+)")
def extract_codes(semi_str):
    if not semi_str or pd.isna(semi_str):
        return []
    return [int(m.group(1)) for token in str(semi_str).split(";")
                          for m in [_code_re.search(token)] if m]

# ------------------------------------------------------------
# 3. RE-COMPUTE MEMBERSHIP-BASED VARIABLES
# ------------------------------------------------------------
new_cols = {k: [] for k in
            ["Members_List","N_Members","Member_Is_Actor",
             "N_Members_Actors","Geographic_Match"]}

for row in full.itertuples(index=False):
    # members at crisis time
    members = active_members(row.Alliance_ID, row.Crisis_Start_dt, row.Crisis_End_dt)
    mset    = set(members)

    # crisis actors
    actors  = set(extract_codes(row.Actors_List))

    # populate
    new_cols["Members_List"].append(";".join(map(str, sorted(members))))
    new_cols["N_Members"].append(len(members))

    inter = actors & mset
    new_cols["Member_Is_Actor"].append(int(bool(inter)))
    new_cols["N_Members_Actors"].append(len(inter))

    # geographic match
    if cracid_loc and pd.notna(row.Crisis_Location):
        try:
            cloc = int(row.Crisis_Location.split("(")[-1].rstrip(")"))
        except ValueError:
            cloc = None
        member_locs = {loc for c in mset for loc in cracid_loc.get(c, [])}
        new_cols["Geographic_Match"].append(int(cloc in member_locs) if cloc else pd.NA)
    else:
        new_cols["Geographic_Match"].append(pd.NA)

# attach to dataframe
for k, v in new_cols.items():
    full[k] = v

# ------------------------------------------------------------
# 4. BUILD ANALYSIS SUBSET Ω  (δ_active & Member_Is_Actor)
# ------------------------------------------------------------
analysis = full[(full["Active_During_Crisis"] == 1) &
                (full["Member_Is_Actor"] == 1)].copy()

# ------------------------------------------------------------
# 5. SAVE  (CSV + JSON)
# ------------------------------------------------------------
csv_full      = OUT_DIR / "ICB_ATOP_full_20250702_corrected.csv"
csv_analysis  = OUT_DIR / "ICB_ATOP_merged_20250702_corrected.csv"
json_full     = OUT_DIR / "ICB_ATOP_full_20250702_corrected.json"
json_analysis = OUT_DIR / "ICB_ATOP_merged_20250702_corrected.json"

full.to_csv(csv_full, index=False)
analysis.to_csv(csv_analysis, index=False)

full.to_json(json_full, orient="records", indent=2)
analysis.to_json(json_analysis, orient="records", indent=2)

print("✓ Saved:")
print("  •", csv_full)
print("  •", csv_analysis)
print("  •", json_full)
print("  •", json_analysis)


✓ Saved:
  • Corrected_Dataset\ICB_ATOP_full_20250702_corrected.csv
  • Corrected_Dataset\ICB_ATOP_merged_20250702_corrected.csv
  • Corrected_Dataset\ICB_ATOP_full_20250702_corrected.json
  • Corrected_Dataset\ICB_ATOP_merged_20250702_corrected.json


In [8]:
import pandas as pd
from pathlib import Path

# path to the corrected full file you just generated
CORRECTED = Path("Corrected_Dataset/ICB_ATOP_full_20250702_corrected.csv")

# load just the NATO–Hungarian-uprising row
check = (
    pd.read_csv(CORRECTED)
      .loc[lambda d: (d["Crisis_ID"] == 155) & (d["Alliance_ID"] == 3180)]
)

if check.empty:
    raise RuntimeError("Row not found - did the IDs change?")

cols = ["Crisis_ID", "Crisis_Name",
        "Alliance_ID", "Alliance_Start", "Alliance_End",
        "Active_During_Crisis", "Member_Is_Actor",
        "N_Members", "N_Members_Actors",
        "Members_List", "Actors_List"]

print(check[cols].to_string(index=False, max_colwidth=90))

# quick sanity flags
roster      = check["Members_List"].iloc[0].split(";")
hungary_in  = any(code.strip()=="310" for code in roster)
print("\nHungary present in filtered roster? ", hungary_in)


 Crisis_ID        Crisis_Name  Alliance_ID Alliance_Start Alliance_End  Active_During_Crisis  Member_Is_Actor  N_Members  N_Members_Actors                                             Members_List              Actors_List
       155 HUNGARIAN UPRISING         3180     1949-04-04   2030-12-31                     1                0         15                 0 2;20;200;210;211;212;220;235;260;325;350;385;390;395;640 Hungary(310);Russia(365)

Hungary present in filtered roster?  False


In [9]:
from pathlib import Path
import pandas as pd, re, json

# ------------------------------------------------------------
# paths & filenames
# ------------------------------------------------------------
SRC_DIR  = Path("Corrected_Dataset")
OUT_DIR  = Path("readable_corrected")
OUT_DIR.mkdir(exist_ok=True, parents=True)

FULL_FILE_NUM  = SRC_DIR / "ICB_ATOP_full_20250702_corrected.csv"
SUBSET_FILE_NUM = SRC_DIR / "ICB_ATOP_merged_20250702_corrected.csv"

COW_PATH   = Path("COW-country-codes.csv")               # official list

# ------------------------------------------------------------
# 1. build augmented country-code mapping
# ------------------------------------------------------------
cow = pd.read_csv(COW_PATH, usecols=["CCode", "StateNme"]).drop_duplicates()
extra = pd.DataFrame({
    "CCode":   [219, 671, 672],
    "StateNme":["Vichy France", "Hejaz", "Najd"]
})
codes = pd.concat([cow, extra], ignore_index=True).drop_duplicates("CCode")
code2name = dict(zip(codes.CCode, codes.StateNme))

# ------------------------------------------------------------
# 2. helpers for list conversion
# ------------------------------------------------------------
_code_re = re.compile(r"(\d+)")

def extract_codes(semistr):
    """'Name(123);456' → [123,456]  (ints)"""
    if not semistr or pd.isna(semistr):
        return []
    return [int(m.group(1)) for token in str(semistr).split(";")
                           for m in [_code_re.search(token)] if m]

def to_readable(semistr):
    """numeric codes → 'Country(code)'  ; leaves already-readable rows unchanged"""
    if "(" in str(semistr):          # already labelled
        return semistr
    codes = extract_codes(semistr)
    labels = [f"{code2name.get(c, f'Unknown_{c}')}({c})" for c in sorted(codes)]
    return ";".join(labels)

def convert_dataframe(df):
    df = df.copy()
    df["Members_List"] = df["Members_List"].apply(to_readable)
    df["Actors_List"]  = df["Actors_List"].apply(to_readable)
    return df

# ------------------------------------------------------------
# 3. load, convert, save
# ------------------------------------------------------------
for infile in [FULL_FILE_NUM, SUBSET_FILE_NUM]:
    df_num  = pd.read_csv(infile)
    df_read = convert_dataframe(df_num)

    # keep same filename, different folder
    csv_out  = OUT_DIR / infile.name
    json_out = csv_out.with_suffix(".json")

    df_read.to_csv(csv_out, index=False)
    df_read.to_json(json_out, orient="records", indent=2)

    print(f"✓ saved {csv_out.name} and {json_out.name} to {OUT_DIR}/")


✓ saved ICB_ATOP_full_20250702_corrected.csv and ICB_ATOP_full_20250702_corrected.json to readable_corrected/
✓ saved ICB_ATOP_merged_20250702_corrected.csv and ICB_ATOP_merged_20250702_corrected.json to readable_corrected/
