In [1]:
import pandas as pd
import geopandas as gpd
import json
import shutil
import os
from shapely.geometry import mapping
import tqdm

## 1. Convert raw population file to population.csv

In [2]:
file_path = 'DECENNIALDHC2020.P1_population/DECENNIALDHC2020.P1-Data.csv'
raw_population = pd.read_csv(file_path)

# Remove the prefix '1400000US' from the GEO_ID column
raw_population['GEO_ID'] = raw_population['GEO_ID'].str.replace('1400000US', '', regex=False)
raw_population = raw_population[raw_population['GEO_ID'] != 'Geography']

# Rename columns
raw_population = raw_population.rename(columns={
    'NAME': 'label',
    'GEO_ID': 'geoid',
    'P1_001N': 'total_population'
})

raw_population = raw_population[['label', 'geoid', 'total_population']]


output_file_path = 'population.csv'
raw_population.to_csv(output_file_path, index=False)

print(f"Population data saved to {output_file_path}")

Population data saved to population.csv


## 2. Combining shapefile and population data into tessellation.geojson

In [7]:
shapefile_path = "tl_2020_53_tract20/tl_2020_53_tract20.shp"
gdf = gpd.read_file(shapefile_path)
gdf["GEOID"] = gdf["GEOID20"].astype(str)
population_csv_path = "population.csv"
population_df = pd.read_csv(population_csv_path)
population_df["geoid"] = population_df["geoid"].astype(str)
population_df.rename(columns={"geoid": "GEOID"}, inplace=True)
gdf = gdf.merge(population_df[["GEOID", "total_population"]], on="GEOID", how="left")

geojson = {
    "type": "FeatureCollection",
    "features": []
}

# Iterate through the rows of the GeoDataFrame and populate the GeoJSON features
for _, row in gdf.iterrows():
    feature = {
        "type": "Feature",
        "properties": {
            "GEOID": row["GEOID"],
            "lng": float(row["INTPTLON20"]),
            "lat": float(row["INTPTLAT20"]),
            "total_population": row["total_population"]
        },
        "geometry": mapping(row["geometry"])
    }
    geojson["features"].append(feature)

output_path = 'tessellation_wip.geojson'
with open(output_path, 'w') as f:
    json.dump(geojson, f, indent=2)

print(f"GeoJSON file saved to {output_path}")

GeoJSON file saved to tessellation_wip.geojson


In [8]:
source = "tessellation_wip.geojson"
destination = "../tessellation.geojson"
shutil.copy(source, destination)

print(f"Tessellation copied from {source} to {destination}")

Tessellation copied from tessellation_wip.geojson to ../tessellation.geojson


## 3. Mapping geoids in flow.csv to polygon geoids in tessellation.geojson

In [9]:
flow = pd.read_csv("flow_original.csv")
tessellation = gpd.read_file("tessellation_wip.geojson")

# add columns to the tessellation dataframe callled new_geoid_o and new_geoid_d
flow["new_geoid_o"] = None
flow["new_geoid_d"] = None
from shapely.geometry import Point

points_sets = set()
progress_bar = tqdm.tqdm_notebook(total=len(flow))
for i, row in flow.iterrows():
    progress_bar.update(1)
    points_sets.add((row["lng_o"], row["lat_o"]))
    points_sets.add((row["lng_d"], row["lat_d"]))


points_to_new_geoid_mapping = {}
for pos in tqdm.tqdm_notebook(points_sets):
    for j, standard_row in tessellation.iterrows():
        if standard_row["geometry"].contains(Point(pos)):
            points_to_new_geoid_mapping[pos] = standard_row["GEOID"]
            break

progress = 0
progress_bar = tqdm.tqdm_notebook(total=len(flow))
for i, row in flow.iterrows():
    progress_bar.update(1)
    lng_o = row["lng_o"]
    lat_o = row["lat_o"]
    geoid_o_old = str(row["geoid_o"])
    lng_d = row["lng_d"]
    lat_d = row["lat_d"]
    geoid_d_old = str(row["geoid_d"])
    
    if (lng_o, lat_o) in points_to_new_geoid_mapping:
        flow.at[i, "new_geoid_o"] = points_to_new_geoid_mapping[(lng_o, lat_o)]
    if (lng_d, lat_d) in points_to_new_geoid_mapping:
        flow.at[i, "new_geoid_d"] = points_to_new_geoid_mapping[(lng_d, lat_d)]

flow.to_csv('flow_GEOIDadjusted.csv', index=False)


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  progress_bar = tqdm.tqdm_notebook(total=len(flow))


  0%|          | 0/139972 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for pos in tqdm.tqdm_notebook(points_sets):


  0%|          | 0/1449 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  progress_bar = tqdm.tqdm_notebook(total=len(flow))


  0%|          | 0/139972 [00:00<?, ?it/s]

In [13]:
# make a copy of flow
flow_final = flow.copy()

flow_final['geoid_o'] = flow_final['new_geoid_o'].astype(str)
flow_final['geoid_d'] = flow_final['new_geoid_d'].astype(str)

flow_final.drop(columns=['new_geoid_o', 'new_geoid_d'], inplace=True)

flow_final['geoid_o'] = flow_final['geoid_o'].astype(str)
flow_final['geoid_d'] = flow_final['geoid_d'].astype(str)

flow_final.to_csv('../flow.csv', index=False)

print("Transformation complete! The new file is saved as 'flow.csv'.")

Transformation complete! The new file is saved as 'flow.csv'.


## 4. Convert the boundary shapefile to boundary.geojson

In [11]:
shapefile_path = "tl_2020_53_puma20/tl_2020_53_puma20.shp"
tessellation = gpd.read_file(shapefile_path)

geojson_path = "tl_2020_53_puma20.geojson"
tessellation.to_file(geojson_path, driver="GeoJSON")

print(f"GeoJSON file saved at {geojson_path}")

GeoJSON file saved at tl_2020_53_puma20.geojson


In [12]:
source = "tl_2020_53_puma20.geojson"
destination = "../boundary.geojson"
shutil.copy(source, destination)

print(f"Tessellation copied from {source} to {destination}")

Tessellation copied from tl_2020_53_puma20.geojson to ../boundary.geojson
