# Analysis of 2010 to 2020 block transformations

In [None]:
from functools import cache
import pandas as pd
from psql_utils.epsql import Engine
import pandas as pd

@cache
def engine():
    """Return the shared ``Engine`` instance, creating it on first call.

    ``functools.cache`` memoizes this zero-argument function, so repeated
    calls hand back the same object instead of constructing a new one.
    """
    shared_engine = Engine()
    return shared_engine

In [None]:
# Load the 2010 -> 2020 block crosswalk CSV. Both GEOID columns are read as
# strings so the identifiers are kept as text rather than parsed as numbers.
c2010_2020 = pd.read_csv(
    "nhgis_blk2010_blk2020_ge/nhgis_blk2010_blk2020_ge.csv",
    dtype={
        "GEOID10": str,
        "GEOID20": str,
    },
)

# Presumably the database table holding the same crosswalk; not used in the
# cells visible here.
crosswalk_blk2010_blk2020_table = "nhgis_geo_wgs84.crosswalk_blk2010_blk2020"


In [None]:
# Keep only the crosswalk rows whose WEIGHT is strictly positive.
cnonzero2010_2020 = c2010_2020.loc[c2010_2020["WEIGHT"].gt(0)]

In [None]:
# Distinct block identifiers appearing on each side of the crosswalk.
blocks2010 = c2010_2020["GEOID10"].unique()
blocks2020 = c2010_2020["GEOID20"].unique()

# Report the headline sizes: distinct blocks per vintage, then row counts of
# the full and the nonzero-weight crosswalk.
for _label, _size in (
    ("Number of blocks in 2010: ", len(blocks2010)),
    ("Number of blocks in 2020: ", len(blocks2020)),
    ("Length of crosswalk:", len(c2010_2020)),
    ("Length of nonzerocrosswalk:", len(cnonzero2010_2020)),
):
    print(_label, _size)

In [None]:
# Show the crosswalk rows whose 2010 block identifier ends in "99".
c2010_2020.loc[c2010_2020["GEOID10"].str.endswith("99")]

In [None]:
# Show the crosswalk rows whose 2020 block identifier ends in "99".
c2010_2020.loc[c2010_2020["GEOID20"].str.endswith("99")]

In [None]:
# Fetch every 2020 block row whose geoid20 sorts between '42003' and '42003z',
# i.e. identifiers beginning with the prefix 42003 (presumably one county's
# FIPS code -- confirm), then call explore() on the result to view it.
gdf = engine().execute_returning_gdf(
    "select * from tiger_wgs84.tl_2020_tabblock20 "
    "where geoid20 between '42003' and '42003z'"
)
gdf.explore()

In [None]:
# NOTE(review): `waterq` is not assigned in any cell visible here, so on a
# fresh kernel this line raises NameError unless it is defined elsewhere --
# confirm where it is supposed to come from.
waterq.explore()

### Questions:
* How many blocks from 2010 map identically to 2020?
* How many blocks from 2010 map to multiple 2020 blocks?
* How many blocks from 2020 map to multiple 2010 blocks?

In [None]:
# How many 2010 blocks land in exactly one 2020 block, and how many do not?

# Crosswalk rows whose WEIGHT is exactly 1.0 (presumably: the whole 2010 block
# falls inside a single 2020 block).
blocks_not_split_2010_to_2020 = c2010_2020.loc[c2010_2020["WEIGHT"].eq(1.0)]

# Sanity checks on those rows: every 2010 block occurs once, while at least
# one 2020 block occurs more than once.
assert blocks_not_split_2010_to_2020["GEOID10"].is_unique
assert not blocks_not_split_2010_to_2020["GEOID20"].is_unique

_n_2010 = len(blocks2010)
_n_whole = len(blocks_not_split_2010_to_2020)
_n_rest = _n_2010 - _n_whole
print(f"2010 blocks represented in a single 2020 block: {_n_whole:,} ({_n_whole / _n_2010:.1%})")
print(f"2010 blocks represented in multiple 2020 blocks: {_n_rest:,} ({_n_rest / _n_2010:.1%})")

In [None]:
# Distinct GEOID10 values that occur on more than one nonzero-weight row.
duplicate_2010_blocks = (
    cnonzero2010_2020["GEOID10"]
    .loc[lambda geoids: geoids.duplicated(keep=False)]
    .unique()
)
print(f"2010 blocks represented in multiple 2020 blocks (alt): {len(duplicate_2010_blocks):,}")


In [None]:
# Distinct GEOID20 values that occur on more than one nonzero-weight row.
duplicate_2020_blocks = (
    cnonzero2010_2020["GEOID20"]
    .loc[lambda geoids: geoids.duplicated(keep=False)]
    .unique()
)
print(f"2020 blocks represented in multiple 2010 blocks (alt): {len(duplicate_2020_blocks):,}")

In [None]:
# duplicate_2010_blocks is the array of GEOID10 values produced by .unique(),
# not a DataFrame, so it has no "WEIGHT" column to index. Look the rows up in
# the full crosswalk instead: rows belonging to one of those 2010 blocks whose
# WEIGHT is exactly zero.
c2010_2020[
    c2010_2020["GEOID10"].isin(duplicate_2010_blocks)
    & (c2010_2020["WEIGHT"] == 0.0)
]

In [None]:
# Crosswalk rows whose GEOID10 occurs exactly once in the whole crosswalk.
single_2010_blocks = c2010_2020.drop_duplicates(subset="GEOID10", keep=False)

# Distribution of WEIGHT among those single-row 2010 blocks.
print(single_2010_blocks["WEIGHT"].value_counts())

# The identifiers themselves, as a set, and how many there are.
single_2010_geoids = set(single_2010_blocks["GEOID10"])
print(len(single_2010_geoids))

In [None]:
# Recompute the rows with WEIGHT exactly 1.0 and show how many there are.
blocks_not_split_2010_to_2020 = c2010_2020.loc[c2010_2020["WEIGHT"].eq(1.0)]
len(blocks_not_split_2010_to_2020)

In [None]:
# duplicate_2010_blocks is already the de-duplicated array of GEOID10 values
# (the result of .unique()), so it cannot be indexed by the "GEOID10" column
# label; its length is the number of distinct duplicated 2010 blocks.
len(duplicate_2010_blocks)