In [5]:
import pandas as pd
import plotly.express as px
import json
import os
import requests # Import requests for loading GeoJSON
import re # Import regex for string cleaning

# === CONFIGURATION ===
FILE_PATH = '/content/cleaned_data.xlsx'
# Set GeoJSON URL to point to a LOCAL GeoJSON file you will download and upload
GEOJSON_URL = '/content/philippines_regions.json'
# IMPORTANT: You MUST manually download the GeoJSON for Philippine regions (Admin Areas level 1)
# from https://simplemaps.com/gis/country/ph and upload it to /content/ as 'philippines_regions.json'

TIME_COLUMN = 'Start_Year'
# Column in your Excel data that contains the geographic names (Region or Province)
LOCATION_NAME_COLUMN_FOR_MAP = 'Location_Name'
VALUE_COLUMN = 'Value'
METRIC_COLUMN = 'Metric'
TARGET_METRIC = 'GDP'
# Set the geographic level you want to map ('Region' or 'Province')
GEOGRAPHIC_LEVEL_TYPE = 'Region' # Set for regional map, matching your data's Location_Type
# =====================

def load_data(file_path):
    """Loads data from CSV or Excel based on file extension."""
    try:
        file_extension = os.path.splitext(file_path)[1].lower()
        if file_extension == '.csv':
            df = pd.read_csv(file_path)
            print(f"Loading data from CSV: {file_path}")
        elif file_extension == '.xlsx':
            df = pd.read_excel(file_path)
            print(f"Loading data from Excel: {file_path}")
        else:
            raise ValueError(f"Unsupported file type '{file_extension}'. Please provide a .csv or .xlsx file.")
        df.columns = df.columns.str.strip() # Clean column names
        print(f"\nSuccessfully loaded data from: '{file_path}'")
        return df
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found. Please ensure it's uploaded to /content/.")
        print("If you just restarted the Colab runtime, you might need to re-upload the file.")
        print("You can upload it using: `from google.colab import files; uploaded = files.upload()`")
        return None
    except Exception as e:
        print(f"An unexpected error occurred during data loading: {e}")
        return None

def load_geojson(path_or_url):
    """
    Loads GeoJSON data from a URL or local file.
    This version expects a single GeoJSON file.
    Includes enhanced error reporting for JSONDecodeError.
    """
    try:
        if os.path.exists(path_or_url) and os.path.isfile(path_or_url): # Check if it's a local file path
            with open(path_or_url, 'r') as f:
                geojson_data = json.load(f)
            print(f"Successfully loaded local GeoJSON from: {path_or_url}")
        else: # Assume it's a URL
            response = requests.get(path_or_url)
            response.raise_for_status() # Raise an HTTPError for bad responses (4xx or 5xx)
            geojson_data = response.json()
            print(f"Successfully loaded GeoJSON from URL: {path_or_url}")

        # --- NEW DEBUGGING: Print properties of first few features ---
        if 'features' in geojson_data and isinstance(geojson_data['features'], list) and geojson_data['features']:
            print(f"\n--- Sample Properties from GeoJSON features (first 5 from {path_or_url}) ---")
            for i, feature in enumerate(geojson_data['features'][:5]):
                print(f"Feature {i} properties: {feature.get('properties', {})}")
            print("--------------------------------------------------")
        else:
            print(f"  Warning: GeoJSON from '{path_or_url}' does not contain a 'features' list or it's not a list.")

        return geojson_data
    except FileNotFoundError:
        print(f"Error: Local GeoJSON file '{path_or_url}' not found. Please ensure it's uploaded to /content/ if it's a local path.")
        return None
    except requests.exceptions.RequestException as e:
        print(f"Error loading GeoJSON from URL: {e}")
        print(f"URL attempted: {path_or_url}")
        print("Please check the URL or try providing a local GeoJSON file.")
        return None
    except json.JSONDecodeError as e:
        print(f"Error decoding GeoJSON: {e}. The file might not be a valid JSON.")
        print(f"Content received (first 500 chars): {response.text[:500] if 'response' in locals() else 'N/A'}")
        print(f"URL attempted: {path_or_url}")
        print("This often means the content is not valid JSON (e.g., it's an HTML error page).")
        print("Please verify the GeoJSON URL points to a RAW JSON file, not a GitHub folder page.")
        return None
    except Exception as e:
        print(f"An unexpected error occurred during GeoJSON loading: {e}")
        return None

def prepare_gdp_data_for_map(df, time_col, location_name_col, value_col, metric_col, target_metric, geographic_level_type):
    """
    Filters and aggregates GDP data for mapping based on the specified geographic level.
    Ensures data is at the correct level and ready for merging.
    Includes cleaning of Location_Name for matching.
    """
    # Filter for GDP metric and the specified geographic level (Region or Province)
    df_gdp_level = df[
        (df[METRIC_COLUMN] == TARGET_METRIC) &
        (df['Location_Type'] == geographic_level_type)
    ].copy()

    if df_gdp_level.empty:
        print(f"Warning: No {geographic_level_type} GDP data found for metric '{TARGET_METRIC}'.")
        print(f"Available metrics: {df[METRIC_COLUMN].unique().tolist()}")
        print(f"Available Location_Types for GDP: {df[df[METRIC_COLUMN] == TARGET_METRIC]['Location_Type'].unique().tolist()}")
        return pd.DataFrame()

    # --- DEBUGGING: Print original unique Location_Names ---
    print(f"\n--- Original Unique '{location_name_col}' values in your data (before cleaning/mapping): ---")
    print(df_gdp_level[location_name_col].unique().tolist())
    print("--------------------------------------------------------------------------------")

    # --- CRITICAL CLEANING STEP FOR LOCATION_NAME ---
    # 1. Remove text in parentheses and strip whitespace
    df_gdp_level[location_name_col] = df_gdp_level[location_name_col].apply(
        lambda name: re.sub(r'\s*\(.*\)\s*', '', str(name)).strip()
    )

    # 2. Create a temporary uppercase column for robust matching
    df_gdp_level['Temp_Location_Name_Upper'] = df_gdp_level[location_name_col].str.upper()

    # 3. Define a comprehensive mapping from the UPPERCASE version of your data's names
    # to the EXACT casing of the GeoJSON names for REGIONS.
    # These are based on simplemaps.com GeoJSON region names.
    name_replacements_upper_to_geojson = {
        'REGION I': 'Ilocos',
        'REGION II': 'Cagayan Valley',
        'REGION III': 'Central Luzon',
        'REGION IV-A': 'Calabarzon', # Corrected to match GeoJSON casing
        'REGION IV-B': 'Mimaropa',   # Corrected to match GeoJSON casing
        'REGION V': 'Bicol',
        'REGION VI': 'Western Visayas',
        'REGION VII': 'Central Visayas',
        'REGION VIII': 'Eastern Visayas',
        'REGION IX': 'Zamboanga Peninsula',
        'REGION X': 'Northern Mindanao',
        'REGION XI': 'Davao',
        'REGION XII': 'Soccsksargen', # Corrected to match GeoJSON casing
        'REGION XIII': 'Caraga',
        'NATIONAL CAPITAL REGION': 'National Capital Region',
        'CORDILLERA ADMINISTRATIVE REGION': 'Cordillera Administrative Region',
        # Simplemaps GeoJSON uses 'Autonomous Region in Muslim Mindanao' (lowercase 'in')
        'BANGSAMORO AUTONOMOUS REGION IN MUSLIM MINDANAO': 'Autonomous Region in Muslim Mindanao',
        'AUTONOMOUS REGION IN MUSLIM MINDANAO': 'Autonomous Region in Muslim Mindanao',
        # Add common abbreviations/variations if present in your data's uppercase form
        'NCR': 'National Capital Region',
        'CAR': 'Cordillera Administrative Region',
        'ARMM': 'Autonomous Region in Muslim Mindanao',
        'BARMM': 'Autonomous Region in Muslim Mindanao',
        # Explicitly handle common numeric variations if they appear in your data
        'REGION 4A': 'Calabarzon', # Corrected to match GeoJSON casing
        'REGION 4B': 'Mimaropa',   # Corrected to match GeoJSON casing
        'REGION 12': 'Soccsksargen', # Corrected to match GeoJSON casing
    }

    # Apply the replacements using the temporary uppercase column.
    df_gdp_level[location_name_col] = df_gdp_level['Temp_Location_Name_Upper'].apply(
        lambda x: name_replacements_upper_to_geojson.get(x, x) # If not found, keep the uppercase name
    )

    # Drop the temporary column
    df_gdp_level.drop(columns=['Temp_Location_Name_Upper'], inplace=True)

    # Filter out non-geographical entries like 'EXISTING DATA BUT WITHHELD'
    df_gdp_level = df_gdp_level[df_gdp_level[location_name_col] != 'EXISTING DATA BUT WITHHELD'].copy()
    if df_gdp_level.empty:
        print("Warning: No valid geographical GDP data found after filtering out non-geographical entries.")
        return pd.DataFrame()

    # --- DEBUGGING: Print unique Location_Names after initial cleaning and replacement ---
    print(f"\n--- Unique '{location_name_col}' values in your data (after cleaning and initial replacements): ---")
    print(df_gdp_level[location_name_col].unique().tolist())
    print("--------------------------------------------------------------------------------")

    # Aggregate GDP by Location Name (Region) and Year
    # Summing 'Value' to get total GDP for each location per year
    df_gdp_aggregated = df_gdp_level.groupby([time_col, location_name_col]).agg(
        Total_GDP=(value_col, 'sum')
    ).reset_index()

    # Ensure 'Total_GDP' is numeric
    df_gdp_aggregated['Total_GDP'] = pd.to_numeric(df_gdp_aggregated['Total_GDP'], errors='coerce')
    df_gdp_aggregated.dropna(subset=['Total_GDP'], inplace=True) # Drop rows with NaN GDP

    # Convert 'Start_Year' to string for animation_frame
    df_gdp_aggregated[time_col] = df_gdp_aggregated[time_col].astype(str)

    print(f"\nPrepared GDP data for mapping. Number of entries: {len(df_gdp_aggregated)}")
    print(f"Years available: {df_gdp_aggregated[time_col].unique().tolist()}")

    # --- DEBUGGING: Print unique location names from your data (after full cleaning and aggregation) ---
    print(f"\n--- Unique '{location_name_col}' values in your data (for {geographic_level_type}s with GDP, AFTER FULL CLEANING AND AGGREGATION): ---")
    print(df_gdp_aggregated[location_name_col].unique().tolist())
    print("--------------------------------------------------------------------------------")

    return df_gdp_aggregated

def create_gdp_map(df_gdp, geojson_data, location_name_col, time_col, geographic_level_type):
    """
    Creates an interactive choropleth map of regional/provincial GDP.
    """
    if df_gdp.empty or geojson_data is None:
        print("Cannot create map: Missing GDP data or GeoJSON data.")
        return

    # For simplemaps.com GeoJSONs, the name is in 'properties.name' and ID is in 'id'
    name_property_key = 'name'
    id_property_key = 'id' # simplemaps GeoJSON features usually have a top-level 'id'

    # Map GeoJSON feature properties to a unique ID for merging
    try:
        # The simplemaps GeoJSON is a FeatureCollection, so features is a list of dicts.
        # Each dict has 'properties' and 'id'. 'properties' is a dict.
        geojson_id_map = {feature['properties'][name_property_key]: feature[id_property_key] for feature in geojson_data['features']}
    except KeyError as e:
        print(f"Error: GeoJSON feature properties do not contain the expected key '{name_property_key}' or feature does not have top-level '{id_property_key}'.")
        print("Please check the structure of your GeoJSON file(s).")
        print(f"Example feature properties from GeoJSON: {geojson_data['features'][0]['properties'] if geojson_data['features'] else 'N/A'}")
        print(f"Example feature top-level keys: {geojson_data['features'][0].keys() if geojson_data['features'] else 'N/A'}")
        return

    # --- DEBUGGING: Print unique names from GeoJSON ---
    print(f"\n--- Unique '{name_property_key}' properties from GeoJSON features (from the loaded GeoJSON file): ---")
    print(list(geojson_id_map.keys()))
    print("--------------------------------------------------------------------------------")

    # Create a 'geojson_id' column in your DataFrame for merging
    # This assumes LOCATION_NAME_COLUMN_FOR_MAP in your data matches the name_property_key in GeoJSON
    df_gdp['geojson_id'] = df_gdp[location_name_col].map(geojson_id_map)

    # Filter out locations that don't have a match in the GeoJSON
    df_gdp_mapped = df_gdp.dropna(subset=['geojson_id']).copy()

    # --- NEW DEBUGGING: Identify and print missing locations ---
    mapped_locations = set(df_gdp_mapped[location_name_col].unique())
    all_gdp_locations = set(df_gdp[location_name_col].unique())
    missing_locations = sorted(list(all_gdp_locations - mapped_locations))

    if missing_locations:
        print(f"\n--- WARNING: The following {len(missing_locations)} {geographic_level_type}(s) from your GDP data did NOT find a match in the GeoJSON: ---")
        print(missing_locations)
        print("--------------------------------------------------------------------------------")
    else:
        print(f"\n✅ All {geographic_level_type}(s) from your GDP data found a match in the GeoJSON!")
    # --- END NEW DEBUGGING ---

    if df_gdp_mapped.empty:
        print(f"Warning: No matching {geographic_level_type}s found between your GDP data and the GeoJSON file after mapping.")
        print(f"Please check if {geographic_level_type} names in your data's '{location_name_col}' column match '{name_property_key}' in the GeoJSON.")
        return

    # Create the choropleth map
    fig = px.choropleth_mapbox(
        df_gdp_mapped,
        geojson=geojson_data,
        locations='geojson_id', # Column in df_gdp_mapped that contains the GeoJSON feature IDs
        color='Total_GDP', # Column to color the map by
        featureidkey=f"id", # Key in GeoJSON features to match 'locations' (simplemaps uses top-level 'id')
        animation_frame=time_col, # Column for animation (years)
        color_continuous_scale="Viridis", # Color scale for GDP values
        range_color=(df_gdp_mapped['Total_GDP'].min(), df_gdp_mapped['Total_GDP'].max()), # Consistent color range
        mapbox_style="carto-positron", # Map style
        zoom=5, # Initial zoom level for Philippines
        center={"lat": 12.8797, "lon": 121.7740}, # Center of the Philippines
        opacity=0.7,
        labels={'Total_GDP': 'GDP (PHP Billions)', time_col: 'Year'},
        title=f'Philippine {geographic_level_type}al GDP by Year',
        hover_name=location_name_col # Added hover_name for better interactivity
    )

    fig.update_layout(
        margin={"r":0,"t":50,"l":0,"b":0},
        height=700,
        coloraxis_colorbar=dict(
            title="GDP (PHP Billions)",
            thicknessmode="pixels", thickness=30,
            lenmode="pixels", len=300,
            yanchor="middle", y=0.5,
            xanchor="right", x=0.95
        )
    )

    # Save the interactive map to an HTML file
    output_filename = f"Philippine_{geographic_level_type}_GDP_Map.html"
    fig.write_html(output_filename)
    print(f"✅ Interactive map saved as {output_filename}")

# --- Main Execution Block ---
if __name__ == "__main__":
    # 1. Load your cleaned data
    df_cleaned = load_data(FILE_PATH)
    if df_cleaned is None:
        exit() # Exit if data loading failed

    # 2. Load GeoJSON data (from local file or URL)
    geojson_data = load_geojson(GEOJSON_URL)
    if geojson_data is None:
        exit() # Exit if GeoJSON loading failed

    # 3. Prepare GDP data for mapping
    df_gdp_for_map = prepare_gdp_data_for_map(
        df_cleaned, TIME_COLUMN, LOCATION_NAME_COLUMN_FOR_MAP, VALUE_COLUMN, METRIC_COLUMN, TARGET_METRIC, GEOGRAPHIC_LEVEL_TYPE
    )
    if df_gdp_for_map.empty:
        print("No suitable GDP data found for mapping. Exiting.")
        exit()

    # 4. Create and save the interactive map
    create_gdp_map(df_gdp_for_map, geojson_data, LOCATION_NAME_COLUMN_FOR_MAP, TIME_COLUMN, GEOGRAPHIC_LEVEL_TYPE)


Loading data from Excel: /content/cleaned_data.xlsx

Successfully loaded data from: '/content/cleaned_data.xlsx'
Successfully loaded local GeoJSON from: /content/philippines_regions.json

--- Sample Properties from GeoJSON features (first 5 from /content/philippines_regions.json) ---
Feature 0 properties: {'source': 'https://simplemaps.com', 'id': 'PH11', 'name': 'Davao'}
Feature 1 properties: {'source': 'https://simplemaps.com', 'id': 'PH13', 'name': 'Caraga'}
Feature 2 properties: {'source': 'https://simplemaps.com', 'id': 'PH10', 'name': 'Northern Mindanao'}
Feature 3 properties: {'source': 'https://simplemaps.com', 'id': 'PH14', 'name': 'Autonomous Region in Muslim Mindanao'}
Feature 4 properties: {'source': 'https://simplemaps.com', 'id': 'PH09', 'name': 'Zamboanga Peninsula'}
--------------------------------------------------

--- Original Unique 'Location_Name' values in your data (before cleaning/mapping): ---
['National Capital Region (NCR)', 'Cordillera Administrative Region 

provinces


In [None]:
import pandas as pd
import plotly.express as px
import json
import os
import requests # Import requests for loading GeoJSON
import re # Import regex for string cleaning

# === CONFIGURATION ===
FILE_PATH = '/content/cleaned_data.xlsx'
# Set GeoJSON URL to point to a LOCAL GeoJSON file for provinces
GEOJSON_URL = '/content/philippines_provinces.json'
# IMPORTANT: You MUST manually download the GeoJSON for Philippine provinces (Admin Areas level 2)
# from https://simplemaps.com/gis/country/ph and upload it to /content/ as 'philippines_provinces.json'

TIME_COLUMN = 'Start_Year'
# Column in your Excel data that contains the geographic names (Region or Province)
LOCATION_NAME_COLUMN_FOR_MAP = 'Location_Name'
VALUE_COLUMN = 'Value'
METRIC_COLUMN = 'Metric'
TARGET_METRIC = 'GDP'
# Set the geographic level you want to map ('Region' or 'Province')
GEOGRAPHIC_LEVEL_TYPE = 'Province' # Changed to Province for provincial map
# =====================

def load_data(file_path):
    """Loads data from CSV or Excel based on file extension."""
    try:
        file_extension = os.path.splitext(file_path)[1].lower()
        if file_extension == '.csv':
            df = pd.read_csv(file_path)
            print(f"Loading data from CSV: {file_path}")
        elif file_extension == '.xlsx':
            df = pd.read_excel(file_path)
            print(f"Loading data from Excel: {file_path}")
        else:
            raise ValueError(f"Unsupported file type '{file_extension}'. Please provide a .csv or .xlsx file.")
        df.columns = df.columns.str.strip() # Clean column names
        print(f"\nSuccessfully loaded data from: '{file_path}'")
        return df
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found. Please ensure it's uploaded to /content/.")
        print("If you just restarted the Colab runtime, you might need to re-upload the file.")
        print("You can upload it using: `from google.colab import files; uploaded = files.upload()`")
        return None
    except Exception as e:
        print(f"An unexpected error occurred during data loading: {e}")
        return None

def load_geojson(path_or_url):
    """
    Loads GeoJSON data from a URL or local file.
    This version expects a single GeoJSON file.
    Includes enhanced error reporting for JSONDecodeError.
    """
    try:
        if os.path.exists(path_or_url) and os.path.isfile(path_or_url): # Check if it's a local file path
            with open(path_or_url, 'r') as f:
                geojson_data = json.load(f)
            print(f"Successfully loaded local GeoJSON from: {path_or_url}")
        else: # Assume it's a URL
            response = requests.get(path_or_url)
            response.raise_for_status() # Raise an HTTPError for bad responses (4xx or 5xx)
            geojson_data = response.json()
            print(f"Successfully loaded GeoJSON from URL: {path_or_url}")
        return geojson_data
    except FileNotFoundError:
        print(f"Error: Local GeoJSON file '{path_or_url}' not found. Please ensure it's uploaded to /content/ if it's a local path.")
        return None
    except requests.exceptions.RequestException as e:
        print(f"Error loading GeoJSON from URL: {e}")
        print(f"URL attempted: {path_or_url}")
        print("Please check the URL or try providing a local GeoJSON file.")
        return None
    except json.JSONDecodeError as e:
        print(f"Error decoding GeoJSON: {e}. The file might not be a valid JSON.")
        print(f"Content received (first 500 chars): {response.text[:500] if 'response' in locals() else 'N/A'}")
        print(f"URL attempted: {path_or_url}")
        print("This often means the content is not valid JSON (e.g., it's an HTML error page).")
        print("Please verify the GeoJSON URL points to a RAW JSON file, not a GitHub folder page.")
        return None
    except Exception as e:
        print(f"An unexpected error occurred during GeoJSON loading: {e}")
        return None

def prepare_gdp_data_for_map(df, time_col, location_name_col, value_col, metric_col, target_metric, geographic_level_type):
    """
    Filters and aggregates GDP data for mapping based on the specified geographic level.
    Ensures data is at the correct level and ready for merging.
    Includes cleaning of Location_Name for matching.
    """
    # Filter for GDP metric and the specified geographic level (Province)
    # Modified to include 'City' and 'Province/City' as Location_Type
    df_gdp_level = df[
        (df[METRIC_COLUMN] == TARGET_METRIC) &
        (df['Location_Type'].isin(['Province', 'City', 'Province/City'])) # Include relevant types
    ].copy()

    if df_gdp_level.empty:
        print(f"Warning: No {geographic_level_type} GDP data found for metric '{TARGET_METRIC}'.")
        print(f"Available metrics: {df[METRIC_COLUMN].unique().tolist()}")
        print(f"Available Location_Types for GDP: {df[df[METRIC_COLUMN] == TARGET_METRIC]['Location_Type'].unique().tolist()}")
        return pd.DataFrame()

    # --- CRITICAL CLEANING STEP FOR LOCATION_NAME ---
    # 1. Remove text in parentheses and strip whitespace
    df_gdp_level[location_name_col] = df_gdp_level[location_name_col].apply(
        lambda name: re.sub(r'\s*\(.*\)\s*', '', str(name)).strip()
    )

    # 2. Filter out non-geographical entries like 'EXISTING DATA BUT WITHHELD'
    df_gdp_level = df_gdp_level[df_gdp_level[location_name_col] != 'EXISTING DATA BUT WITHHELD'].copy()
    if df_gdp_level.empty:
        print("Warning: No valid geographical GDP data found after filtering out non-geographical entries.")
        return pd.DataFrame()

    # 3. Create a temporary column with uppercase names for robust matching
    df_gdp_level['Temp_Location_Name_Upper'] = df_gdp_level[location_name_col].str.upper()

    # 4. Define a comprehensive mapping from the UPPERCASE version of your data's names
    # to the EXACT casing of the GeoJSON names for PROVINCES/CITIES.
    name_replacements_upper_to_geojson = {
        # --- PROVINCE MAPPINGS (from your data's uppercase to GeoJSON's exact name) ---
        'ABRA': 'Abra',
        'AGUSAN DEL NORTE': 'Agusan del Norte',
        'AGUSAN DEL SUR': 'Agusan del Sur',
        'AKLAN': 'Aklan',
        'ALBAY': 'Albay',
        'ANTIQUE': 'Antique',
        'APAYAO': 'Apayao',
        'AURORA': 'Aurora',
        'BATANES': 'Batanes',
        'BASILAN': 'Basilan',
        'BATAAN': 'Bataan',
        'BATANGAS': 'Batangas',
        'BENGUET': 'Benguet',
        'BILIRAN': 'Biliran', # This province is missing in simplemaps.com GeoJSON
        'BOHOL': 'Bohol',
        'BUKIDNON': 'Bukidnon',
        'BULACAN': 'Bulacan',
        'CAGAYAN': 'Cagayan',
        'CAMARINES NORTE': 'Camarines Norte',
        'CAMARINES SUR': 'Camarines Sur',
        'CAMIGUIN': 'Camiguin',
        'CAPIZ': 'Capiz',
        'CATANDUANES': 'Catanduanes', # This province is missing in simplemaps.com GeoJSON
        'CAVITE': 'Cavite',
        'CEBU': 'Cebu',
        'COMPOSTELA VALLEY': 'Davao de Oro', # Compostela Valley renamed to Davao de Oro
        'COTABATO': 'Cotabato', # Also known as North Cotabato
        'DAVAO DEL NORTE': 'Davao del Norte',
        'DAVAO DEL SUR': 'Davao del Sur',
        'DAVAO OCCIDENTAL': 'Davao Occidental',
        'DAVAO ORIENTAL': 'Davao Oriental',
        'DINAGAT ISLANDS': 'Dinagat Islands', # This province is missing in simplemaps.com GeoJSON
        'EASTERN SAMAR': 'Eastern Samar',
        'GUIMARAS': 'Guimaras', # This province is missing in simplemaps.com GeoJSON
        'IFUGAO': 'Ifugao',
        'ILOCOS NORTE': 'Ilocos Norte',
        'ILOCOS SUR': 'Ilocos Sur',
        'ILOILO': 'Iloilo',
        'ISABELA': 'Isabela',
        'KALINGA': 'Kalinga',
        'LA UNION': 'La Union',
        'LAGUNA': 'Laguna',
        'LANAO DEL NORTE': 'Lanao del Norte',
        'LANAO DEL SUR': 'Lanao del Sur',
        'LEYTE': 'Leyte',
        'MAGUINDANAO DEL NORTE': 'Maguindanao', # Map to old Maguindanao (if GeoJSON lacks new splits)
        'MAGUINDANAO DEL SUR': 'Maguindanao', # Map to old Maguindanao (if GeoJSON lacks new splits)
        'MARINDUQUE': 'Marinduque', # This province is missing in simplemaps.com GeoJSON
        'MASBATE': 'Masbate', # This province is missing in simplemaps.com GeoJSON
        'MISAMIS OCCIDENTAL': 'Misamis Occidental',
        'MISAMIS ORIENTAL': 'Misamis Oriental',
        'MOUNTAIN PROVINCE': 'Mountain Province',
        'NEGROS OCCIDENTAL': 'Negros Occidental',
        'NEGROS ORIENTAL': 'Negros Oriental',
        'NORTHERN SAMAR': 'Northern Samar',
        'NUEVA ECIJA': 'Nueva Ecija',
        'NUEVA VIZCAYA': 'Nueva Vizcaya',
        'OCCIDENTAL MINDORO': 'Mindoro Occidental',
        'ORIENTAL MINDORO': 'Mindoro Oriental',
        'PALAWAN': 'Palawan',
        'PAMPANGA': 'Pampanga',
        'PANGASINAN': 'Pangasinan',
        'QUEZON': 'Quezon',
        'QUIRINO': 'Quirino',
        'RIZAL': 'Rizal',
        'ROMBLON': 'Romblon', # This province is missing in simplemaps.com GeoJSON
        'SAMAR': 'Samar',
        'SARANGANI': 'Sarangani',
        'SIQUIJOR': 'Siquijor', # This province is missing in simplemaps.com GeoJSON
        'SORSOGON': 'Sorsogon', # This province is missing in simplemaps.com GeoJSON
        'SOUTH COTABATO': 'South Cotabato',
        'SOUTHERN LEYTE': 'Southern Leyte',
        'SULTAN KUDARAT': 'Sultan Kudarat',
        'SULU': 'Sulu', # This province is missing in simplemaps.com GeoJSON
        'SURIGAO DEL NORTE': 'Surigao del Norte',
        'SURIGAO DEL SUR': 'Surigao del Sur',
        'TARLAC': 'Tarlac',
        'TAWI-TAWI': 'Tawi-Tawi', # This province is missing in simplemaps.com GeoJSON
        'ZAMBALES': 'Zambales',
        'ZAMBOANGA DEL NORTE': 'Zamboanga del Norte',
        'ZAMBOANGA DEL SUR': 'Zamboanga del Sur',
        'ZAMBOANGA SIBUGAY': 'Zamboanga Sibugay',

        # --- CITY MAPPINGS (from your data's uppercase to GeoJSON's exact name) ---
        # These are based on the GeoJSON's 'name' list.
        'CITY OF ANGELES': 'Angeles City',
        'CITY OF BACOLOD': 'Bacolod City',
        'CITY OF BAGUIO': 'Baguio City',
        'CITY OF BUTUAN': 'Butuan City',
        'CITY OF CAGAYAN DE ORO': 'Cagayan de Oro City',
        'CITY OF CALOOCAN': 'Caloocan',
        'CITY OF CEBU': 'Cebu City',
        'CITY OF DAVAO': 'Davao City',
        'CITY OF GENERAL SANTOS': 'General Santos City',
        'CITY OF ILIGAN': 'Iligan City',
        'CITY OF ILOILO': 'Iloilo City',
        'CITY OF ISABELA': 'Isabela City',
        'CITY OF LAPU-LAPU': 'Lapu-Lapu City',
        'CITY OF LAS PIÑAS': 'Las Piñas',
        'CITY OF LUCENA': 'Lucena City',
        'CITY OF MAKATI': 'Makati',
        'CITY OF MALABON': 'Malabon City',
        'CITY OF MANDALUYONG': 'Mandaluyong',
        'CITY OF MANDAUE': 'Mandaue City',
        'CITY OF MANILA': 'Manila',
        'CITY OF MARIKINA': 'Marikina',
        'CITY OF MUNTINLUPA': 'Muntinlupa',
        'CITY OF NAVOTAS': 'Navotas',
        'CITY OF OLONGAPO': 'Olongapo City',
        'CITY OF PARAÃ±AQUE': 'Parañaque City', # Correcting encoding issue
        'CITY OF PASIG': 'Pasig',
        'CITY OF PUERTO PRINCESA': 'Puerto Princesa City',
        'CITY OF SAN JUAN': 'San Juan',
        'CITY OF TACLOBAN': 'Tacloban City',
        'CITY OF TAGUIG': 'Taguig',
        'CITY OF VALENZUELA': 'Valenzuela',
        'CITY OF ZAMBOANGA': 'Zamboanga City',
        'PASAY CITY': 'Pasay City',
        'PATEROS': 'Pateros',
        'QUEZON CITY': 'Quezon City',

        # --- Regional entities that might appear in province data ---
        'NATIONAL CAPITAL REGION': 'Metropolitan Manila',
        'CORDILLERA ADMINISTRATIVE REGION': 'Cordillera Administrative Region',
        'AUTONOMOUS REGION IN MUSLIM MINDANAO': 'Autonomous Region in Muslim Mindanao',
        'BANGSAMORO AUTONOMOUS REGION IN MUSLIM MINDANAO': 'Autonomous Region in Muslim Mindanao',
    }

    # Apply the replacements using the temporary uppercase column.
    df_gdp_level[location_name_col] = df_gdp_level['Temp_Location_Name_Upper'].apply(
        lambda x: name_replacements_upper_to_geojson.get(x, x) # If not found, keep the uppercase name
    )

    # Drop the temporary column
    df_gdp_level.drop(columns=['Temp_Location_Name_Upper'], inplace=True)

    # Handle any remaining NaN values in Location_Name if mapping failed for some reason
    # This will drop entries that did not find a match in the GeoJSON.
    initial_rows = len(df_gdp_level)
    df_gdp_level.dropna(subset=[location_name_col], inplace=True)
    rows_dropped = initial_rows - len(df_gdp_level)
    if rows_dropped > 0:
        print(f"Warning: Dropped {rows_dropped} rows because their '{location_name_col}' did not match any GeoJSON feature name after cleaning.")
        # Optionally, print the names that were dropped for further debugging
        # print(f"Dropped names: {set(df_gdp_level_original[~df_gdp_level_original[location_name_col].isin(df_gdp_level[location_name_col])]['Temp_Location_Name_Upper'].unique())}")
    # --- END CRITICAL CLEANING STEP ---


    # Aggregate GDP by Location Name (Province/City) and Year
    # Summing 'Value' to get total GDP for each location per year
    df_gdp_aggregated = df_gdp_level.groupby([time_col, location_name_col]).agg(
        Total_GDP=(value_col, 'sum')
    ).reset_index()

    # Ensure 'Total_GDP' is numeric
    df_gdp_aggregated['Total_GDP'] = pd.to_numeric(df_gdp_aggregated['Total_GDP'], errors='coerce')
    df_gdp_aggregated.dropna(subset=['Total_GDP'], inplace=True) # Drop rows with NaN GDP

    # Convert 'Start_Year' to string for animation_frame
    df_gdp_aggregated[time_col] = df_gdp_aggregated[time_col].astype(str)

    print(f"\nPrepared GDP data for mapping. Number of entries: {len(df_gdp_aggregated)}")
    print(f"Years available: {df_gdp_aggregated[time_col].unique().tolist()}")

    # --- DEBUGGING: Print unique location names from your data (after cleaning) ---
    print(f"\nUnique '{location_name_col}' values in your data (for {geographic_level_type}s with GDP, AFTER CLEANING):")
    print(df_gdp_aggregated[location_name_col].unique().tolist())
    # --- END DEBUGGING ---

    return df_gdp_aggregated

def create_gdp_map(df_gdp, geojson_data, location_name_col, time_col, geographic_level_type):
    """
    Creates an interactive choropleth map of regional/provincial GDP.
    """
    if df_gdp.empty or geojson_data is None:
        print("Cannot create map: Missing GDP data or GeoJSON data.")
        return

    # For simplemaps.com GeoJSONs, the name is in 'properties.name'
    name_property_key = 'name'

    # Map GeoJSON feature properties to a unique ID for merging
    try:
        # The simplemaps GeoJSON is a FeatureCollection, so features is a list of dicts.
        # Each dict has 'properties' and 'id'. 'properties' is a dict.
        geojson_id_map = {feature['properties'][name_property_key]: feature['id'] for feature in geojson_data['features']}
    except KeyError as e:
        print(f"Error: GeoJSON feature properties do not contain the expected key '{name_property_key}'.")
        print("Please check the structure of your GeoJSON file(s).")
        print(f"Example feature properties from GeoJSON: {geojson_data['features'][0]['properties'] if geojson_data['features'] else 'N/A'}")
        return

    # --- DEBUGGING: Print unique names from GeoJSON ---
    print(f"\nUnique '{name_property_key}' properties from GeoJSON features (from the loaded GeoJSON file):")
    print(list(geojson_id_map.keys()))
    # --- DEBUGGING: Print the geojson_id_map being created ---
    print(f"\nGenerated geojson_id_map: {geojson_id_map}")
    # --- END DEBUGGING ---

    # Create a 'geojson_id' column in your DataFrame for merging
    # This assumes LOCATION_NAME_COLUMN_FOR_MAP in your data matches the name_property_key in GeoJSON
    df_gdp['geojson_id'] = df_gdp[location_name_col].map(geojson_id_map)

    # Filter out locations that don't have a match in the GeoJSON
    df_gdp_mapped = df_gdp.dropna(subset=['geojson_id']).copy()

    if df_gdp_mapped.empty:
        print(f"Warning: No matching {geographic_level_type}s found between your GDP data and the GeoJSON file.")
        print(f"Please check if {geographic_level_type} names in your data's '{location_name_col}' column match '{name_property_key}' in the GeoJSON.")
        print(f"{geographic_level_type}s in your data: {df_gdp[location_name_col].unique().tolist()}")
        print(f"{geographic_level_type}s in GeoJSON ({name_property_key} property): {list(geojson_id_map.keys())}")
        return

    # Create the choropleth map
    fig = px.choropleth_mapbox(
        df_gdp_mapped,
        geojson=geojson_data,
        locations='geojson_id', # Column in df_gdp_mapped that contains the GeoJSON feature IDs
        color='Total_GDP', # Column to color the map by
        featureidkey="id", # Key in GeoJSON features to match 'locations'
        animation_frame=time_col, # Column for animation (years)
        color_continuous_scale="Viridis", # Color scale for GDP values
        range_color=(df_gdp_mapped['Total_GDP'].min(), df_gdp_mapped['Total_GDP'].max()), # Consistent color range
        mapbox_style="carto-positron", # Map style
        zoom=5, # Initial zoom level for Philippines
        center={"lat": 12.8797, "lon": 121.7740}, # Center of the Philippines
        opacity=0.7,
        labels={'Total_GDP': 'GDP (PHP Billions)', time_col: 'Year'},
        title=f'Philippine {geographic_level_type}al GDP by Year'
    )

    fig.update_layout(
        margin={"r":0,"t":50,"l":0,"b":0},
        height=700,
        coloraxis_colorbar=dict(
            title="GDP (PHP Billions)",
            thicknessmode="pixels", thickness=30,
            lenmode="pixels", len=300,
            yanchor="middle", y=0.5,
            xanchor="right", x=0.95
        )
    )

    # Save the interactive map to an HTML file
    output_filename = f"Philippine_{geographic_level_type}_GDP_Map.html"
    fig.write_html(output_filename)
    print(f"✅ Interactive map saved as {output_filename}")

# --- Main Execution Block ---
if __name__ == "__main__":
    # 1. Load your cleaned data
    df_cleaned = load_data(FILE_PATH)
    if df_cleaned is None:
        exit() # Exit if data loading failed

    # 2. Load GeoJSON data (from local file or URL)
    geojson_data = load_geojson(GEOJSON_URL)
    if geojson_data is None:
        exit() # Exit if GeoJSON loading failed

    # 3. Prepare GDP data for mapping
    df_gdp_for_map = prepare_gdp_data_for_map(
        df_cleaned, TIME_COLUMN, LOCATION_NAME_COLUMN_FOR_MAP, VALUE_COLUMN, METRIC_COLUMN, TARGET_METRIC, GEOGRAPHIC_LEVEL_TYPE
    )
    if df_gdp_for_map.empty:
        print("No suitable GDP data found for mapping. Exiting.")
        exit()

    # 4. Create and save the interactive map
    create_gdp_map(df_gdp_for_map, geojson_data, LOCATION_NAME_COLUMN_FOR_MAP, TIME_COLUMN, GEOGRAPHIC_LEVEL_TYPE)


Loading data from Excel: /content/cleaned_data.xlsx

Successfully loaded data from: '/content/cleaned_data.xlsx'
Successfully loaded local GeoJSON from: /content/philippines_provinces.json

Prepared GDP data for mapping. Number of entries: 702
Years available: ['2018', '2019', '2020', '2021', '2022', '2023']

Unique 'Location_Name' values in your data (for Provinces with GDP, AFTER CLEANING):
['Abra', 'Agusan del Norte', 'Agusan del Sur', 'Aklan', 'Albay', 'Angeles City', 'Antique', 'Apayao', 'Aurora', 'Bacolod City', 'Baguio City', 'Basilan', 'Bataan', 'Batanes', 'Batangas', 'Benguet', 'Biliran', 'Bohol', 'Bukidnon', 'Bulacan', 'Butuan City', 'Cagayan', 'Cagayan de Oro City', 'Caloocan', 'Camarines Norte', 'Camarines Sur', 'Camiguin', 'Capiz', 'Catanduanes', 'Cavite', 'Cebu', 'Cebu City', 'Cotabato', 'DAVAO DE ORO', 'Davao City', 'Davao Occidental', 'Davao Oriental', 'Davao del Norte', 'Davao del Sur', 'Dinagat Islands', 'EXISTING DATA BUT WITHHELD', 'Eastern Samar', 'General Santos C

In [None]:
import pandas as pd
import plotly.express as px
import json
import os
import requests # Import requests for loading GeoJSON
import re # Import regex for string cleaning

# === CONFIGURATION ===
FILE_PATH = '/content/cleaned_data.xlsx'
# Set GeoJSON URL to point to a LOCAL GeoJSON file for provinces
# IMPORTANT: Updated to use the new file: '/content/gadm41_PHL_3.json'
# This is a GADM Level 3 file, which might contain municipalities/cities.
# We will need to identify the correct property keys for provinces from its structure.
PROVINCE_GEOJSON_FOLDER = '' # No longer dynamically loading from a folder
GEOJSON_URLS = ['/content/gadm41_PHL_3.json'] # New GeoJSON file path

if not GEOJSON_URLS:
    print("Error: No GeoJSON files specified in GEOJSON_URLS. Please update the configuration.")


TIME_COLUMN = 'Start_Year'
# Column in your Excel data that contains the geographic names (Region or Province)
LOCATION_NAME_COLUMN_FOR_MAP = 'Location_Name'
VALUE_COLUMN = 'Value'
METRIC_COLUMN = 'Metric'
TARGET_METRIC = 'GDP'
# Set the geographic level you want to map ('Region' or 'Province')
GEOGRAPHIC_LEVEL_TYPE = 'Province' # Changed to Province for provincial map

# --- NEW: Define the property keys for name and ID in your GeoJSON files ---
# These are now generic placeholders. They will need to be adjusted once the
# structure of '/content/gadm41_PHL_3.json' is understood from its sample properties.
GEOJSON_NAME_PROPERTY_KEY = 'name_property_in_geojson' # Placeholder for actual name key (e.g., 'NAME_2', 'province', 'name')
GEOJSON_ID_PROPERTY_KEY = 'id_property_in_geojson'     # Placeholder for actual ID key (e.g., 'GID_2', 'id', 'code')
# =====================

def load_data(file_path):
    """Loads data from CSV or Excel based on file extension."""
    try:
        file_extension = os.path.splitext(file_path)[1].lower()
        if file_extension == '.csv':
            df = pd.read_csv(file_path)
            print(f"Loading data from CSV: {file_path}")
        elif file_extension == '.xlsx':
            df = pd.read_excel(file_path)
            print(f"Loading data from Excel: {file_path}")
        else:
            raise ValueError(f"Unsupported file type '{file_extension}'. Please provide a .csv or .xlsx file.")
        df.columns = df.columns.str.strip() # Clean column names
        print(f"\nSuccessfully loaded data from: '{file_path}'")
        return df
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found. Please ensure it's uploaded to /content/.")
        print("If you just restarted the Colab runtime, you might need to re-upload the file.")
        print("You can upload it using: `from google.colab import files; uploaded = files.upload()`")
        return None
    except Exception as e:
        print(f"An unexpected error occurred during data loading: {e}")
        return None

def load_geojson(paths_or_urls, name_prop_key, id_prop_key):
    """
    Loads GeoJSON data from a list of URLs or local files and merges them into a single FeatureCollection.
    Uses specified name and ID property keys.
    """
    merged_geojson_data = {
        "type": "FeatureCollection",
        "features": []
    }

    # Keep track of unique names found in GeoJSON features for debugging
    unique_geojson_names = set()

    if not paths_or_urls:
        print("No GeoJSON files specified to load.")
        return None

    for path_or_url in paths_or_urls:
        try:
            print(f"Attempting to load GeoJSON from: '{path_or_url}'")
            # Debugging print to check if the file is recognized as local
            print(f"  Checking if '{path_or_url}' exists locally: {os.path.exists(path_or_url)} and is a file: {os.path.isfile(path_or_url)}")

            if os.path.exists(path_or_url) and os.path.isfile(path_or_url):
                with open(path_or_url, 'r') as f:
                    geojson_part = json.load(f)
                print(f"  Successfully loaded local GeoJSON from: {path_or_url}")
            else:
                response = requests.get(path_or_url)
                response.raise_for_status()
                geojson_part = response.json()
                print(f"  Successfully loaded GeoJSON from URL: {path_or_url}")

            # --- CRITICAL CHECK FOR VALID GEOJSON STRUCTURE ---
            # A valid GeoJSON FeatureCollection must have a 'features' key which is a list.
            if 'features' in geojson_part and isinstance(geojson_part['features'], list):
                # --- NEW DEBUGGING: Print properties of first few features ---
                print(f"\n--- Sample Properties from GeoJSON features (first 5 from {path_or_url}) ---")
                for i, feature in enumerate(geojson_part['features'][:5]):
                    print(f"Feature {i} properties: {feature.get('properties', {})}")
                print("--------------------------------------------------")
                # --- END NEW DEBUGGING ---

                for feature in geojson_part['features']:
                    # Only add features that have the required name and ID properties
                    properties = feature.get('properties', {})
                    if name_prop_key in properties and id_prop_key in properties:
                        # Ensure the 'id' field of the feature (used by Plotly's featureidkey)
                        # is correctly set to the GeoJSON_ID_PROPERTY_KEY value.
                        feature['id'] = properties[id_prop_key]
                        merged_geojson_data['features'].append(feature)
                        unique_geojson_names.add(properties[name_prop_key])
                    else:
                        # This warning can be verbose if many features are missing keys, but useful for debugging
                        print(f"  Warning: Feature in GeoJSON missing expected property keys ('{name_prop_key}' or '{id_prop_key}'). Skipping feature.")
                        print(f"  Feature properties: {properties}")
            else:
                print(f"  Error: GeoJSON from '{path_or_url}' does not contain a valid 'features' list (it's either missing or not a list).")
                print("  A valid GeoJSON FeatureCollection must have a top-level 'features' key whose value is a list of features.")

        except FileNotFoundError:
            print(f"  Error: Local GeoJSON file '{path_or_url}' not found. Please ensure it's uploaded to /content/ if it's a local path.")
        except requests.exceptions.RequestException as e:
            print(f"  Error loading GeoJSON from URL: {e}")
            print(f"  URL attempted: {path_or_url}")
            print("  Please check the URL or try providing a local GeoJSON file.")
        except json.JSONDecodeError as e:
            print(f"  Error decoding GeoJSON from '{path_or_url}': {e}. The file might not be a valid JSON.")
            if os.path.exists(path_or_url) and os.path.isfile(path_or_url):
                with open(path_or_url, 'r', errors='ignore') as f:
                    content_preview = f.read(500)
                print(f"  Content preview (first 500 chars): {content_preview}")
            else:
                print(f"  Content preview: N/A (File not found locally)")
        except Exception as e:
            print(f"  An unexpected error occurred during GeoJSON loading from '{path_or_url}': {e}")

    if not merged_geojson_data['features']:
        print("Error: No GeoJSON features were successfully loaded from any of the provided paths/URLs.")
        return None

    # Add unique names to the merged_geojson_data for debugging in create_gdp_map
    merged_geojson_data['unique_names_for_debug'] = sorted(list(unique_geojson_names))
    return merged_geojson_data

def prepare_gdp_data_for_map(df, time_col, location_name_col, value_col, metric_col, target_metric, geographic_level_type):
    """
    Filters and aggregates GDP data for mapping based on the specified geographic level.
    Ensures data is at the correct level and ready for merging.
    Includes cleaning of Location_Name for matching.
    """
    # Filter for GDP metric and the specified geographic level (Province)
    # Modified to include 'City' and 'Province/City' as Location_Type
    df_gdp_level = df[
        (df[METRIC_COLUMN] == TARGET_METRIC) &
        (df['Location_Type'].isin(['Province', 'City', 'Province/City'])) # Include relevant types
    ].copy()

    if df_gdp_level.empty:
        print(f"Warning: No {geographic_level_type} GDP data found for metric '{TARGET_METRIC}'.")
        print(f"Available metrics: {df[METRIC_COLUMN].unique().tolist()}")
        print(f"Available Location_Types for GDP: {df[df[METRIC_COLUMN] == TARGET_METRIC]['Location_Type'].unique().tolist()}")
        return pd.DataFrame()

    # --- DEBUGGING: Print original unique Location_Names ---
    print(f"\n--- Original Unique '{location_name_col}' values in your data (before cleaning/mapping): ---")
    print(df_gdp_level[location_name_col].unique().tolist())
    print("--------------------------------------------------------------------------------")

    # --- CRITICAL CLEANING STEP FOR LOCATION_NAME ---
    # 1. Remove text in parentheses and strip whitespace
    df_gdp_level[location_name_col] = df_gdp_level[location_name_col].apply(
        lambda name: re.sub(r'\s*\(.*\)\s*', '', str(name)).strip()
    )

    # 2. Filter out non-geographical entries like 'EXISTING DATA BUT WITHHELD'
    df_gdp_level = df_gdp_level[df_gdp_level[location_name_col] != 'EXISTING DATA BUT WITHHELD'].copy()
    if df_gdp_level.empty:
        print("Warning: No valid geographical GDP data found after filtering out non-geographical entries.")
        return pd.DataFrame()

    # 3. Create a temporary column with uppercase names for robust matching
    df_gdp_level['Temp_Location_Name_Upper'] = df_gdp_level[location_name_col].str.upper()

    # 4. Define a comprehensive mapping from the UPPERCASE version of your data's names
    # to the EXACT casing of the GeoJSON names for PROVINCES/CITIES.
    # This dictionary is now EMPTY as we need a valid GeoJSON first to determine its names.
    name_replacements_upper_to_geojson = {
        # This dictionary will be populated once a complete and valid GeoJSON is provided.
        # Example for GADM 'NAME_2' and common data variations:
        # 'ABRA': 'Abra',
        # 'CITY OF ANGELES': 'Angeles City',
        # 'PARAÑAQUE CITY': 'Parañaque City', # Corrected encoding
        # ... and so on for all 81 provinces and HUCs
    }

    # Apply the replacements using the temporary uppercase column.
    df_gdp_level[location_name_col] = df_gdp_level['Temp_Location_Name_Upper'].apply(
        lambda x: name_replacements_upper_to_geojson.get(x, x) # If not found, keep the uppercase name
    )

    # Drop the temporary column
    df_gdp_level.drop(columns=['Temp_Location_Name_Upper'], inplace=True)

    # Aggregate GDP by Location Name (Province/City) and Year
    # Summing 'Value' to get total GDP for each location per year
    df_gdp_aggregated = df_gdp_level.groupby([time_col, location_name_col]).agg(
        Total_GDP=(value_col, 'sum')
    ).reset_index()

    # Ensure 'Total_GDP' is numeric
    df_gdp_aggregated['Total_GDP'] = pd.to_numeric(df_gdp_aggregated['Total_GDP'], errors='coerce')
    df_gdp_aggregated.dropna(subset=['Total_GDP'], inplace=True) # Drop rows with NaN GDP

    # Convert 'Start_Year' to string for animation_frame
    df_gdp_aggregated[time_col] = df_gdp_aggregated[time_col].astype(str)

    print(f"\nPrepared GDP data for mapping. Number of entries: {len(df_gdp_aggregated)}")
    print(f"Years available: {df_gdp_aggregated[time_col].unique().tolist()}")

    # --- DEBUGGING: Print unique location names from your data (after cleaning and mapping) ---
    print(f"\n--- Unique '{location_name_col}' values in your data (for {geographic_level_type}s with GDP, AFTER CLEANING AND MAPPING): ---")
    print(df_gdp_aggregated[location_name_col].unique().tolist())
    print("--------------------------------------------------------------------------------")

    return df_gdp_aggregated

def create_gdp_map(df_gdp, geojson_data, location_name_col, time_col, geographic_level_type, geojson_name_prop_key, geojson_id_prop_key):
    """
    Creates an interactive choropleth map of regional/provincial GDP.
    Uses configurable GeoJSON name and ID property keys.
    """
    if df_gdp.empty or geojson_data is None:
        print("Cannot create map: Missing GDP data or GeoJSON data.")
        return

    # Map GeoJSON feature properties to a unique ID for merging
    try:
        # Create geojson_id_map using the configurable keys
        geojson_id_map = {}
        for feature in geojson_data['features']:
            properties = feature.get('properties', {})
            if geojson_name_prop_key in properties and geojson_id_prop_key in properties:
                geojson_id_map[properties[geojson_name_prop_key]] = properties[geojson_id_prop_key]
            else:
                # This warning can be verbose if many features are missing keys, but useful for debugging
                print(f"  Warning: Feature in GeoJSON missing expected property keys ('{geojson_name_prop_key}' or '{geojson_id_prop_key}'). Skipping feature.")
                print(f"  Feature properties: {properties}")
                pass

    except Exception as e:
        print(f"Error creating geojson_id_map: {e}.")
        print(f"Please ensure your GeoJSON features have '{geojson_name_prop_key}' and '{geojson_id_prop_key}' in their 'properties'.")
        # Example feature properties from GeoJSON if available
        if geojson_data and 'features' in geojson_data and geojson_data['features']:
            print(f"Example feature properties from GeoJSON: {geojson_data['features'][0].get('properties', 'N/A')}")
        return

    # --- DEBUGGING: Print unique names from GeoJSON ---
    # This now uses the 'unique_names_for_debug' key added in load_geojson for combined names
    print(f"\n--- Unique '{geojson_name_prop_key}' properties from GeoJSON features (from the loaded GeoJSON file): ---")
    print(geojson_data.get('unique_names_for_debug', 'No unique names found for debugging.'))
    print("--------------------------------------------------------------------------------")
    # --- DEBUGGING: Print the geojson_id_map being created ---
    print(f"\n--- Generated geojson_id_map (partial view): ---")
    # Print only first 10 entries to avoid overwhelming output
    for i, (k, v) in enumerate(geojson_id_map.items()):
        if i >= 10:
            print("...")
            break
        print(f"  '{k}': '{v}'")
    print("--------------------------------------------------------------------------------")
    # --- END DEBUGGING ---

    # Create a 'geojson_id' column in your DataFrame for merging
    # This assumes LOCATION_NAME_COLUMN_FOR_MAP in your data matches the name_property_key in GeoJSON
    df_gdp['geojson_id'] = df_gdp[location_name_col].map(geojson_id_map)

    # Filter out locations that don't have a match in the GeoJSON
    df_gdp_mapped = df_gdp.dropna(subset=['geojson_id']).copy()

    # --- NEW DEBUGGING: Identify and print missing locations ---
    mapped_locations = set(df_gdp_mapped[location_name_col].unique())
    all_gdp_locations = set(df_gdp[location_name_col].unique())
    missing_locations = sorted(list(all_gdp_locations - mapped_locations))

    if missing_locations:
        print(f"\n--- WARNING: The following {len(missing_locations)} {geographic_level_type}(s) from your GDP data did NOT find a match in the GeoJSON: ---")
        print(missing_locations)
        print("--------------------------------------------------------------------------------")
    else:
        print(f"\n✅ All {geographic_level_type}(s) from your GDP data found a match in the GeoJSON!")
    # --- END NEW DEBUGGING ---


    if df_gdp_mapped.empty:
        print(f"Warning: No matching {geographic_level_type}s found between your GDP data and the GeoJSON file after mapping.")
        print(f"Please check if {geographic_level_type} names in your data's '{location_name_col}' column match '{geojson_name_prop_key}' in the GeoJSON.")
        return

    # Create the choropleth map
    fig = px.choropleth_mapbox(
        df_gdp_mapped,
        geojson=geojson_data,
        locations='geojson_id', # Column in df_gdp_mapped that contains the GeoJSON feature IDs
        color='Total_GDP', # Column to color the map by
        featureidkey=f"properties.{geojson_id_prop_key}", # Key in GeoJSON features to match 'locations'
        animation_frame=time_col, # Column for animation (years)
        color_continuous_scale="Viridis", # Color scale for GDP values
        range_color=(df_gdp_mapped['Total_GDP'].min(), df_gdp_mapped['Total_GDP'].max()), # Consistent color range
        mapbox_style="carto-positron", # Map style
        zoom=5, # Initial zoom level for Philippines
        center={"lat": 12.8797, "lon": 121.7740}, # Center of the Philippines
        opacity=0.7,
        labels={'Total_GDP': 'GDP (PHP Billions)', time_col: 'Year'},
        title=f'Philippine {geographic_level_type}al GDP by Year',
        hover_name=location_name_col # Explicitly set hover_name to the location name column
    )

    fig.update_layout(
        margin={"r":0,"t":50,"l":0,"b":0},
        height=700,
        coloraxis_colorbar=dict(
            title="GDP (PHP Billions)",
            thicknessmode="pixels", thickness=30,
            lenmode="pixels", len=300,
            yanchor="middle", y=0.5,
            xanchor="right", x=0.95
        )
    )

    # Save the interactive map to an HTML file
    output_filename = f"Philippine_{geographic_level_type}_GDP_Map.html"
    fig.write_html(output_filename)
    print(f"✅ Interactive map saved as {output_filename}")

# --- Main Execution Block ---
if __name__ == "__main__":
    # 1. Load your cleaned data
    df_cleaned = load_data(FILE_PATH)
    if df_cleaned is None:
        exit() # Exit if data loading failed

    # 2. Load GeoJSON data (from local file or URL)
    # Pass GEOJSON_URLS (list) and the new property keys to load_geojson
    geojson_data = load_geojson(GEOJSON_URLS, GEOJSON_NAME_PROPERTY_KEY, GEOJSON_ID_PROPERTY_KEY)
    if geojson_data is None:
        exit() # Exit if GeoJSON loading failed

    # 3. Prepare GDP data for mapping
    df_gdp_for_map = prepare_gdp_data_for_map(
        df_cleaned, TIME_COLUMN, LOCATION_NAME_COLUMN_FOR_MAP, VALUE_COLUMN, METRIC_COLUMN, TARGET_METRIC, GEOGRAPHIC_LEVEL_TYPE
    )
    if df_gdp_for_map.empty:
        print("No suitable GDP data found for mapping. Exiting.")
        exit()

    # 4. Create and save the interactive map
    # Pass the new property keys to create_gdp_map
    create_gdp_map(df_gdp_for_map, geojson_data, LOCATION_NAME_COLUMN_FOR_MAP, TIME_COLUMN, GEOGRAPHIC_LEVEL_TYPE, GEOJSON_NAME_PROPERTY_KEY, GEOJSON_ID_PROPERTY_KEY)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  Feature properties: {'GID_3': 'PHL.76.11.7_1', 'GID_0': 'PHL', 'COUNTRY': 'Philippines', 'GID_1': 'PHL.76_1', 'NAME_1': 'Tarlac', 'NL_NAME_1': 'NA', 'GID_2': 'PHL.76.11_1', 'NAME_2': 'Pura', 'NL_NAME_2': 'NA', 'NAME_3': 'Matindeg', 'VARNAME_3': 'NA', 'NL_NAME_3': 'NA', 'TYPE_3': 'Barangay', 'ENGTYPE_3': 'Village', 'CC_3': 'NA', 'HASC_3': 'NA'}
  Feature properties: {'GID_3': 'PHL.76.11.8_1', 'GID_0': 'PHL', 'COUNTRY': 'Philippines', 'GID_1': 'PHL.76_1', 'NAME_1': 'Tarlac', 'NL_NAME_1': 'NA', 'GID_2': 'PHL.76.11_1', 'NAME_2': 'Pura', 'NL_NAME_2': 'NA', 'NAME_3': 'Maungib', 'VARNAME_3': 'NA', 'NL_NAME_3': 'NA', 'TYPE_3': 'Barangay', 'ENGTYPE_3': 'Village', 'CC_3': 'NA', 'HASC_3': 'NA'}
  Feature properties: {'GID_3': 'PHL.76.11.9_1', 'GID_0': 'PHL', 'COUNTRY': 'Philippines', 'GID_1': 'PHL.76_1', 'NAME_1': 'Tarlac', 'NL_NAME_1': 'NA', 'GID_2': 'PHL.76.11_1', 'NAME_2': 'Pura', 'NL_NAME_2': 'NA', 'NAME_3': 'Naya', 'VARNAME_3