In [1]:
import geopandas as gpd
import h3
import numpy as np
import pandas as pd
from shapely.geometry import Polygon

In [2]:
CITY = "budapest"
VERSION = "20250428"

In [3]:
districts = gpd.read_file(
    "../../data/osm/budapest/budapest_districts_without_margaret_island.geojson"
)

In [4]:
OUTPUT = f"../../output/{CITY}/{VERSION}"

# Read the GeoJSON file where for each stop id we know the multimodal area (5 min walk + 10 min BKK) + ellipticity of this shape + size of the area
# and multipolygons describing the 5 min walk from each such station
multimodal = gpd.read_file(
    f"{OUTPUT}/stop_geometries_from_walk.geojson", engine="pyogrio"
)
# Read GeoJSON file for 15 minute walking distance
walking = gpd.read_file(f"{OUTPUT}/isochrones.geojson", engine="pyogrio")
# read centrality csv
centrality = pd.read_csv(f"{OUTPUT}/merged.csv", dtype={"stop_id": str})

# read in geojson for city boundary
bp = gpd.read_file(f"../../data/osm/{CITY}/boundary.geojson", engine="pyogrio")
# Read the pickle with telekom data for socioecon info
szk_nap = pd.read_pickle("../../data/telekom/tkom_sept21.pkl")
# income = pd.read_csv("/mnt/common-ssd/zadorzsofi/telekom/BKK/data/socioecon/stadat-jov0003-14.8.1.2_income_deciles_total.csv", header=1, index_col=0, delimiter=";", thousands=" ")
# distrcit gross income
dist_income = pd.read_csv(f"../../data/statistics/{CITY}/gross_mean_income_2024.csv")
# gini from housing price
gini_house_multi = pd.read_csv(f"{OUTPUT}/multimodal_stop_gini.csv")
gini_house_walk = pd.read_csv(f"{OUTPUT}/walk15_stop_gini.csv")

In [5]:
# Gross labour income per capita by income deciles for Hungary
income = (
    pd.read_csv(
        "../../data/statistics/hungary/stadat-jov0003-14.8.1.2-en.csv",
        delimiter=";",
        thousands=" ",
        skiprows=1,
        skipfooter=26,
        engine="python",
    )
    .drop(0)
    .reset_index(drop=True)
)
income.set_index("Denomination", inplace=True)

# Pipeline

0. Filter walking to 15min walk
1. match hexagons & other small shapes to walking15 shapes and multimodal
2. Append to socioecon data
3. For multimodal, count average income, nr poor households/total households (poor household %) within area
4. For walking15 count within area
5. Calculate a) low income ratio b) Gini, c) income entropy -- however how? income level at each location? and characterise low/mid/high income for Gini?

### Filter walking

In [6]:
walking15 = walking[(walking["costing"] == "walk") & (walking["range"] == 15)]
walking15 = walking15.copy()
walking15.drop(columns=["costing", "range"], inplace=True)

### 1. Match hex to shapes

In [7]:
def add_hexagon_ids(geo_df, hex_resolution=10):
    """
    Adds a column of H3 hexagon IDs to a GeoDataFrame based on its geometries.

    Parameters:
    geo_df (GeoDataFrame): The input GeoDataFrame with geometries.
    hex_resolution (int): The H3 resolution level to use.

    Returns:
    GeoDataFrame: The GeoDataFrame with a new column 'hexagon_ids' containing the hexagon IDs.
    """
    hexagon_lists = []  # List to store hexagon IDs for each geometry

    # Loop through each geometry in the GeoDataFrame
    for idx, row in geo_df.iterrows():
        # Ensure the geometry is valid
        if row.geometry.is_valid:
            # Initialize an empty set to collect hex IDs
            hex_ids = set()

            # Check if the geometry is a MultiPolygon
            if row.geometry.geom_type == "MultiPolygon":
                # Loop through each Polygon in the MultiPolygon
                for polygon in row.geometry.geoms:
                    geom_dict = polygon.__geo_interface__  # Get GeoJSON representation
                    hex_ids.update(
                        h3.polyfill(geom_dict, hex_resolution, geo_json_conformant=True)
                    )
            else:
                # Handle Polygon geometries
                geom_dict = row.geometry.__geo_interface__  # Get GeoJSON representation
                hex_ids.update(
                    h3.polyfill(geom_dict, hex_resolution, geo_json_conformant=True)
                )

            # Add the hex IDs to the list
            hexagon_lists.append(list(hex_ids))
        else:
            # Add an empty list for invalid geometries
            hexagon_lists.append([])

    # Add the hexagon IDs as a new column to the GeoDataFrame
    geo_df["hexagon_ids"] = hexagon_lists

    # Drop rows where 'hexagon_ids' column is empty or contains NaN
    geo_df = geo_df[geo_df["hexagon_ids"].apply(lambda x: bool(x))]

In [8]:
# match hexagons to 15min walking areas
add_hexagon_ids(walking15, hex_resolution=10)
# Explode the hexagon_ids column in the walking GeoDataFrame
walking15_exploded = walking15.explode("hexagon_ids").rename(
    columns={"hexagon_ids": "h3_id"}
)

In [9]:
# match hexagons to Multimodal areas
add_hexagon_ids(multimodal, hex_resolution=10)
# Explode the hexagon_ids column in the walking GeoDataFrame
multimodal_exploded = multimodal.explode("hexagon_ids").rename(
    columns={"hexagon_ids": "h3_id"}
)

### # 2. Append to socioecon data

#### 0. Assign district level gross mean income to each hex in the district

In [10]:
# Merge the dataframes based on the district number
districts3 = pd.merge(
    districts,
    dist_income[["district", "gross_mean_income_2024"]],
    left_on="district_number",
    right_on="district",
    how="left",
)


def h3_to_polygon(h3_id):
    """Convert a H3 hexagon ID to a Shapely Polygon."""
    # Get the vertices of the hexagon using h3 library
    boundary = h3.h3_to_geo_boundary(h3_id, geo_json=True)
    # Convert the list of boundary points into a Polygon
    return Polygon(boundary)


# Apply the conversion for each raster_id in szk_nap
szk_nap["geometry"] = szk_nap["raster_id"].apply(h3_to_polygon)

# Now create a GeoDataFrame
szk_nap_gdf = gpd.GeoDataFrame(szk_nap, geometry="geometry", crs="EPSG:4326")

# Perform a spatial join to map hexagons to districts
szk_nap_gdf2 = gpd.sjoin(szk_nap_gdf, districts3, how="left")

szk_nap_gdf3 = szk_nap_gdf2.drop(
    columns=["geometry", "index_right", "name", "short_name", "district_number"]
)

In [11]:
szk_nap_gdf3.head()

Unnamed: 0,raster_id,traffic,arpu_low,arpu_mid,arpu_high,osm_id,district,gross_mean_income_2024
0,8a1e037ac61ffff,590.142857,298.571429,228.714286,45.714286,1605916.0,13.0,906000.0
1,8a1e037a8b0ffff,120.0,59.0,29.142857,2.428571,1605916.0,13.0,906000.0
1,8a1e037a8b0ffff,120.0,59.0,29.142857,2.428571,1606043.0,14.0,865000.0
2,8a1e036ad4f7fff,30.857143,5.571429,4.714286,0.0,1550598.0,18.0,741000.0
3,8a1e036a0927fff,21.857143,0.0,5.571429,0.0,1550597.0,23.0,688000.0


In [12]:
# Merge 15 min walk with df_szk on the H3 hexagon IDs
stop_walk15 = pd.merge(
    walking15_exploded, szk_nap_gdf3, left_on="h3_id", right_on="raster_id", how="left"
)

In [13]:
# Merge with df_szk on the H3 hexagon IDs
stop_multimodal = pd.merge(
    multimodal_exploded, szk_nap_gdf2, left_on="h3_id", right_on="raster_id", how="left"
)

### 3. and 4. Get socioecon information for each multimodal and walking area

In [14]:
# Define the columns to sum and the column to average
columns_to_sum = [
    "traffic",
    "arpu_low",
    "arpu_mid",
    "arpu_high",
]

column_to_avg = "gross_mean_income_2024"

# For 15-min walking area
nr_ppl_per_stop_walk15 = (
    stop_walk15.groupby("stop_id")
    .agg(
        {
            **{col: "sum" for col in columns_to_sum},  # Sum columns
            column_to_avg: "mean",  # Average for income - so it takes into account if area is in several districts
        }
    )
    .reset_index()
)

# For multimodal area
nr_ppl_per_stop_multimodal = (
    stop_multimodal.groupby("stop_id")
    .agg(
        {
            **{col: "sum" for col in columns_to_sum},  # Sum  columns
            column_to_avg: "mean",  # Average for 'Gross mean income monthly 2024'
        }
    )
    .reset_index()
)

### Calculate socio-econ changes

#### c) Gini coefficient

##### 1. Calculate % difference from Hungary income deciles to low-income, medium income and high income

In [15]:
# For Helsinki I used these deciles, so I'll continue using them for consistency

# In the postal code data low-income is nr of people in decile 1 and 2
# High-income is nr of people in decile 9-10
# So I want to calculate % difference in income from median for low and high income groups and use that in the GINI

In [16]:
# Compute Bp's Income Differences between median (5th decile) and low/high income
median_bp = income.loc[income.index == "5th income decile", "2020"]

low_deciles_avg = income.loc[
    income.index.isin(["1st income decile", "2nd income decile"]), "2020"
].mean()
high_deciles_avg = income.loc[
    income.index.isin(["9th income decile", "10th income decile"]), "2020"
].mean()

low_income_diff = (low_deciles_avg - median_bp) / median_bp
high_income_diff = (high_deciles_avg - median_bp) / median_bp

In [17]:
# Estimate Representative Incomes for walking
nr_ppl_per_stop_walk15["Low_Income_Rep"] = nr_ppl_per_stop_walk15[
    "gross_mean_income_2024"
] * (1 + float(low_income_diff.iloc[0]))
nr_ppl_per_stop_walk15["High_Income_Rep"] = nr_ppl_per_stop_walk15[
    "gross_mean_income_2024"
] * (1 + float(high_income_diff.iloc[0]))

# Estimate Representative Incomes for multimodal
nr_ppl_per_stop_multimodal["Low_Income_Rep"] = nr_ppl_per_stop_multimodal[
    "gross_mean_income_2024"
] * (1 + float(low_income_diff.iloc[0]))
nr_ppl_per_stop_multimodal["High_Income_Rep"] = nr_ppl_per_stop_multimodal[
    "gross_mean_income_2024"
] * (1 + float(high_income_diff.iloc[0]))

##### 2. Apply % difference to walking and multimodal areas

In [18]:
# creating an array (column), where each income category is repeated the houshold nr times
# this will be used for gini calculation
def create_income_distribution(row):
    low_income = [row["Low_Income_Rep"]] * int(row["arpu_low"])
    middle_income = [row["gross_mean_income_2024"]] * int(row["arpu_mid"])
    high_income = [row["High_Income_Rep"]] * int(row["arpu_high"])
    return low_income + middle_income + high_income

In [19]:
# define gini
def gini(array):
    """Calculate the Gini coefficient of a numpy array.
    Based on code from: https://github.com/oliviaguest/gini"""
    array = np.array(
        array, dtype=float
    )  # change from Olivia: Convert to NumPy array and flatten
    if (
        array.size == 0 or np.sum(array) == 0
    ):  # change from Olivia, as I have empty arrays
        return np.nan  # handle 0 divisions
    array = np.sort(array)
    n = array.size
    index = np.arange(1, n + 1)
    return (np.sum((2 * index - n - 1) * array)) / (n * np.sum(array))

In [20]:
nr_ppl_per_stop_walk15["Income_Distribution"] = nr_ppl_per_stop_walk15.apply(
    create_income_distribution, axis=1
)
nr_ppl_per_stop_walk15["gini"] = nr_ppl_per_stop_walk15["Income_Distribution"].apply(
    gini
)

In [21]:
nr_ppl_per_stop_multimodal["Income_Distribution"] = nr_ppl_per_stop_multimodal.apply(
    create_income_distribution, axis=1
)
nr_ppl_per_stop_multimodal["gini"] = nr_ppl_per_stop_multimodal[
    "Income_Distribution"
].apply(gini)

##### 2. Merge Gini from housing prices

In [22]:
# Rename column in gini_house_multi
gini_house_multi = gini_house_multi.rename(columns={"multimodal_gini": "gini_house"})

# Rename column in gini_house_walk
gini_house_walk = gini_house_walk.rename(columns={"walk15_gini": "gini_house"})

# Merge for multimodal
nr_ppl_per_stop_multimodal = pd.merge(
    gini_house_multi,
    nr_ppl_per_stop_multimodal,
    on="stop_id",
    how="left",  # or 'inner' depending on your goal
)

# Merge for walk
nr_ppl_per_stop_walk15 = pd.merge(
    gini_house_walk, nr_ppl_per_stop_walk15, on="stop_id", how="left"
)

#### Calculate socioecon ratios

In [23]:
# nr_ppl_per_stop_walk15["fem_ratio"] = (
#     nr_ppl_per_stop_walk15["sex_female"] / nr_ppl_per_stop_walk15["traffic"]
# )
nr_ppl_per_stop_walk15["arpu_low_ratio"] = (
    nr_ppl_per_stop_walk15["arpu_low"] / nr_ppl_per_stop_walk15["traffic"]
)
nr_ppl_per_stop_walk15["arpu_high_ratio"] = (
    nr_ppl_per_stop_walk15["arpu_high"] / nr_ppl_per_stop_walk15["traffic"]
)
# nr_ppl_per_stop_walk15["young_ratio"] = (
#     nr_ppl_per_stop_walk15["age_young"] / nr_ppl_per_stop_walk15["traffic"]
# )
# nr_ppl_per_stop_walk15["old_ratio"] = (
#     nr_ppl_per_stop_walk15["age_old"] / nr_ppl_per_stop_walk15["traffic"]
# )

In [24]:
# nr_ppl_per_stop_multimodal["fem_ratio"] = (
#     nr_ppl_per_stop_multimodal["sex_female"] / nr_ppl_per_stop_multimodal["traffic"]
# )
nr_ppl_per_stop_multimodal["arpu_low_ratio"] = (
    nr_ppl_per_stop_multimodal["arpu_low"] / nr_ppl_per_stop_multimodal["traffic"]
)
nr_ppl_per_stop_multimodal["arpu_high_ratio"] = (
    nr_ppl_per_stop_multimodal["arpu_high"] / nr_ppl_per_stop_multimodal["traffic"]
)
# nr_ppl_per_stop_multimodal["young_ratio"] = (
#     nr_ppl_per_stop_multimodal["age_young"] / nr_ppl_per_stop_multimodal["traffic"]
# )
# nr_ppl_per_stop_multimodal["old_ratio"] = (
#     nr_ppl_per_stop_multimodal["age_old"] / nr_ppl_per_stop_multimodal["traffic"]
# )

### Merge different datasets

In [25]:
# Merge the DataFrames on stop_id with suffixes but remove Income_Dist before merging as I won't need this in the final df
nr_ppl_per_stop_comparison = pd.merge(
    nr_ppl_per_stop_multimodal.drop(columns=["Income_Distribution"]),
    nr_ppl_per_stop_walk15.drop(columns=["Income_Distribution"]),
    on="stop_id",
    suffixes=("_multimodal", "_walk15"),
)

In [26]:
# calculate change of some variables
nr_ppl_comparison = nr_ppl_per_stop_comparison.dropna()
nr_ppl_comparison = nr_ppl_comparison.copy()

# nr_ppl_comparison.loc[:, "fem_change"] = (
#     nr_ppl_comparison["fem_ratio_multimodal"] - nr_ppl_comparison["fem_ratio_walk15"]
# )
nr_ppl_comparison.loc[:, "arpu_low_change"] = (
    nr_ppl_comparison["arpu_low_ratio_multimodal"]
    - nr_ppl_comparison["arpu_low_ratio_walk15"]
)
# nr_ppl_comparison.loc[:, "young_change"] = (
#     nr_ppl_comparison["young_ratio_multimodal"]
#     - nr_ppl_comparison["young_ratio_walk15"]
# )
# nr_ppl_comparison.loc[:, "old_change"] = (
#     nr_ppl_comparison["old_ratio_multimodal"] - nr_ppl_comparison["old_ratio_walk15"]
# )

In [27]:
# Calculate percentage change
nr_ppl_comparison["percent_change_gini"] = np.where(
    nr_ppl_comparison["gini_walk15"] == 0,
    np.nan,  # because division by 0 is not meaningful, or can be flag value as there is more inequality in the multimodal -- what should be the value?
    (nr_ppl_comparison["gini_multimodal"] - nr_ppl_comparison["gini_walk15"])
    / nr_ppl_comparison["gini_walk15"]
    * 100,
)

nr_ppl_comparison["percent_change_house_gini"] = np.where(
    nr_ppl_comparison["gini_house_walk15"] == 0,
    np.nan,  # because division by 0 is not meaningful, or can be flag value as there is more inequality in the multimodal -- what should be the value?
    (nr_ppl_comparison["gini_house_multimodal"] - nr_ppl_comparison["gini_walk15"])
    / nr_ppl_comparison["gini_house_walk15"]
    * 100,
)

In [28]:
# Gini log change
nr_ppl_comparison["log_change_gini"] = np.where(
    (nr_ppl_comparison["gini_multimodal"] > 0) & (nr_ppl_comparison["gini_walk15"] > 0),
    np.log(nr_ppl_comparison["gini_multimodal"] / nr_ppl_comparison["gini_walk15"]),
    np.nan,
)

# Gini (households) log change
nr_ppl_comparison["log_change_house_gini"] = np.where(
    (nr_ppl_comparison["gini_house_multimodal"] > 0)
    & (nr_ppl_comparison["gini_house_walk15"] > 0),
    np.log(
        nr_ppl_comparison["gini_house_multimodal"]
        / nr_ppl_comparison["gini_house_walk15"]
    ),
    np.nan,
)

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [29]:
centrality.drop_duplicates(subset=["stop_id"], inplace=True)

In [30]:
# Merge the DataFrames
nr_ppl_comparison2 = pd.merge(nr_ppl_comparison, centrality, on="stop_id", how="inner")

In [31]:
nr_ppl_comparison2.to_csv(f"../../output/{CITY}/bp_socioecon_merged5c.csv", index=False)

In [None]:
nr_ppl_comparison2[["stop_id", "gini_walk15", "gini_multimodal"]].to_csv(
    f"../../output/{CITY}/{VERSION}/stop_gini_from_mobility.csv", index=False
)

In [None]:
nr_ppl_comparison2[
    ["stop_id", "arpu_low_ratio_walk15", "arpu_low_ratio_multimodal"]
].to_csv(f"../../output/{CITY}/{VERSION}/stop_income_from_mobility.csv", index=False)

Unnamed: 0,stop_id,arpu_low_ratio_walk15,arpu_low_ratio_multimodal
0,007877,0.278032,0.360259
1,007878,0.274220,0.373276
2,007879,0.276003,0.344026
3,007881,0.379244,0.371556
4,007883,0.358126,0.354507
...,...,...,...
4024,F04573,0.306491,0.317397
4025,F04574,0.257719,0.321150
4026,F04575,0.259534,0.300188
4027,F04576,0.268701,0.298702
