# FIPS code <-> Core-based statistical area (CBSA) membership 
The user advocates asked for the ability to see which CBSA each county belongs to. 
This is a fairly simple exercise in merging disparate data sources. We can generalize this functionality in some powerful ways.
- Mapping between different abstractions (spatial granularities)
- Using LLM semantic knowledge to merge disparate data sources

## Basic setup
Function definitions 
TODO: separate this into a utils .py file

In [None]:
import requests
import geopandas as gpd
import pandas as pd
import plotly.express as px
import io
import matplotlib.pyplot as plt

# functions for getting CBSA membership data
tmp_save_dir = "temporary_files"


def get_CBSA_geodataframe():
    """Join the CBSA shape table with the CBSA county-membership table.

    Downloads both inputs through ``get_CBSA_membership_df`` and
    ``get_CBSA_shape_df``, outer-joins them on the CBSA code
    ("CBSAFP" on the shape side, "CBSA Code" on the membership side) and
    then discards every row that is missing either join key or either
    county FIPS component.

    Returns:
        The merged table indexed by "CBSA Code", with "FIPS State Code"
        and "FIPS County Code" stored as zero-padded strings of width 2
        and 3 respectively.
    """
    membership = get_CBSA_membership_df()
    shapes = get_CBSA_shape_df()

    # Rows lacking any of these columns carry no usable county membership.
    required_columns = ["CBSAFP", "CBSA Code", "FIPS State Code", "FIPS County Code"]
    combined = (
        pd.merge(
            shapes, membership, left_on="CBSAFP", right_on="CBSA Code", how="outer"
        )
        .dropna(subset=required_columns)
        # The index is the CBSA code, not the county GeoFIPS.
        .set_index("CBSA Code")
    )

    # FIPS components as fixed-width strings (state: 2 chars, county: 3 chars).
    for fips_column, width in (("FIPS State Code", 2), ("FIPS County Code", 3)):
        combined[fips_column] = (
            combined[fips_column].astype(int).astype(str).str.zfill(width)
        )
    return combined


def get_CBSA_membership_df(timeout=60):
    """Download the 2023 CBSA delineation workbook and parse it with pandas.

    Args:
        timeout: Seconds handed to ``requests.get`` so that a stalled
            connection fails instead of blocking the notebook forever.

    Returns:
        The DataFrame produced by ``pd.read_excel`` with the first two
        rows of the sheet skipped.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    membership_url = "https://www2.census.gov/programs-surveys/metro-micro/geographies/reference-files/2023/delineation-files/list1_2023.xlsx"
    response = requests.get(membership_url, timeout=timeout)
    response.raise_for_status()  # Raise an exception for HTTP errors
    # Parse straight from memory; no temporary file on disk is needed.
    workbook_bytes = io.BytesIO(response.content)
    return pd.read_excel(workbook_bytes, skiprows=2)


def get_CBSA_shape_df(year=2021):
    """Read the Census TIGER/Line CBSA shapefile for one vintage.

    Args:
        year: TIGER vintage to download. Defaults to 2021, which was the
            most recent as of Oct 28, 2023. Other vintages are assumed to
            follow the same URL layout -- verify before relying on them.

    Returns:
        Whatever ``gpd.read_file`` yields for the zipped shapefile URL.
    """
    url = f"https://www2.census.gov/geo/tiger/TIGER{year}/CBSA/tl_{year}_us_cbsa.zip"
    return gpd.read_file(url)


def restrict_gdf_to_county_CBSAs(geodataframe, GeoFIPS):
    """Return the rows of ``geodataframe`` that belong to a single county.

    Args:
        geodataframe: Table with "FIPS State Code" and "FIPS County Code"
            columns holding zero-padded strings (2 and 3 characters), as
            produced by ``get_CBSA_geodataframe``.
        GeoFIPS: County identifier made of the state code followed by the
            county code, e.g. "36047". Integers and strings that lost
            their leading zeros are left-padded to five characters.

    Returns:
        The subset of ``geodataframe`` whose state and county codes both
        match; empty when the county appears in no row.
    """
    # Normalise first so that e.g. 6037 still splits into "06" and "037".
    GeoFIPS = str(GeoFIPS).zfill(5)
    FIPS_state, FIPS_county = GeoFIPS[:2], GeoFIPS[2:]
    in_county = (geodataframe["FIPS State Code"] == FIPS_state) & (
        geodataframe["FIPS County Code"] == FIPS_county
    )
    return geodataframe[in_county]

## Task 1
Merge two datasets, one with CBSA shape files, and one with CBSA membership info

In [None]:
# Load every CBSA polygon, draw the map, and preview the attribute table.
shape_df = get_CBSA_shape_df()
shape_df.plot()
display(shape_df.head())
# Distinct CBSA codes on the shape side, for comparison with the other tables.
shape_cbsa_codes = shape_df["CBSAFP"].unique()
print("num of unique CBSA codes in shape data:", len(shape_cbsa_codes))

In [None]:
# Load the county -> CBSA membership table and preview its first rows.
membership_df = get_CBSA_membership_df()
display(membership_df.head())
# Distinct CBSA codes on the membership side.
membership_cbsa_codes = membership_df["CBSA Code"].unique()
print("num of unique CBSA codes in membership data:", len(membership_cbsa_codes))

In [None]:
# Merge shapes with membership and inspect what survives the join.
merged_df = get_CBSA_geodataframe()
merged_df.plot()
display(merged_df.head())
# The merged table is indexed by CBSA code, so count distinct index values.
merged_cbsa_codes = merged_df.index.unique()
print("num of unique CBSA codes in merged data:", len(merged_cbsa_codes))

## Task 2
Given a county FIPS code, find what CBSA(s) the county belongs to, and plot their geometry 

In [None]:
# Example county: Kings County, NY. The GeoFIPS string is the 2-character
# state FIPS code ("36") followed by the 3-character county FIPS code ("047").
GeoFIPS = "36047"
# Keep only the merged rows whose state and county FIPS codes match this county,
# then draw the geometry of those rows.
my_membership_gdf = restrict_gdf_to_county_CBSAs(merged_df, GeoFIPS)
my_membership_gdf.plot()

In [None]:
# Interactive plotly map of the CBSA rows selected for the example county.
geodf_to_plot = my_membership_gdf.copy()

# Initial viewport: zoomed out and centred on these coordinates.
map_center = {"lat": 37.0902, "lon": -95.7129}
# Extra columns shown in the hover tooltip.
hover_columns = ["CBSA Title", "County/County Equivalent", "State Name"]

fig = px.choropleth_mapbox(
    geodf_to_plot,
    # Shapes come from the frame's geometry and are matched to rows through
    # the frame index.
    geojson=geodf_to_plot.geometry,
    locations=geodf_to_plot.index,
    color="CBSA Title",
    hover_data=hover_columns,
    labels={"CBSA Title": "CBSA Title"},
    mapbox_style="carto-positron",
    center=map_center,
    zoom=3,
    opacity=0.5,
)
fig.show()

## Task 3
For each CBSA, get a list of all the counties that belong to it

In [None]:
# Collapse the county-level rows into one row per CBSA. Each group's county
# names are joined into a single comma-separated string.
county_lists = merged_df.groupby(["CBSA Code", "CBSA Title", "geometry"])[
    "County/County Equivalent"
].apply(", ".join)

# Back to a flat table, with the aggregated column renamed for clarity.
unique_CBSA_df = county_lists.reset_index().rename(
    columns={"County/County Equivalent": "Counties"}
)
# Restore GeoDataFrame behaviour and index the result by CBSA code.
unique_CBSA_df = gpd.GeoDataFrame(unique_CBSA_df, geometry="geometry").set_index(
    "CBSA Code"
)

# NOTE: an interactive px.choropleth_mapbox plot of this frame (same settings
# as the single-county map, with hover_data=["CBSA Title", "Counties"]) was
# tried here but is left out because it is too slow to run.
unique_CBSA_df

In [None]:
# Static plot of the per-CBSA geometries built in the previous cell.
unique_CBSA_df.plot()

In [None]:
# Display the per-CBSA table (index "CBSA Code"; columns "CBSA Title",
# "geometry" and the comma-separated "Counties" list).
unique_CBSA_df

## Task 4

Automate dataset merging using an LLM. 

Can an LLM use semantic knowledge to infer how to map between datasets that are formatted slightly differently? 

### Prompt for GPT
I would like to merge two dataframes that have similar columns, but the columns may be named differently. Here is what they look like:

```python
dfA.head()
CSAFP	CBSAFP	GEOID	NAME	NAMELSAD	LSAD	MEMI	MTFCC	ALAND	AWATER	INTPTLAT	INTPTLON	geometry
0	122	12020	12020	Athens-Clarke County, GA	Athens-Clarke County, GA Metro Area	M1	1	G3110	2654607902	26109459	+33.9439840	-083.2138965	POLYGON ((-83.36003 34.04057, -83.36757 34.043...
1	122	12060	12060	Atlanta-Sandy Springs-Alpharetta, GA	Atlanta-Sandy Springs-Alpharetta, GA Metro Area	M1	1	G3110	22495873026	386782308	+33.6937280	-084.3999113	POLYGON ((-84.27014 32.99101, -84.27084 32.991...
2	428	12100	12100	Atlantic City-Hammonton, NJ	Atlantic City-Hammonton, NJ Metro Area	M1	1	G3110	1438775279	301270067	+39.4693555	-074.6337591	POLYGON ((-74.58640 39.30989, -74.58665 39.309...
3	426	12120	12120	Atmore, AL	Atmore, AL Micro Area	M2	2	G3110	2448595161	20024887	+31.1222867	-087.1684097	POLYGON ((-87.36388 30.99790, -87.36391 30.997...
4	258	12140	12140	Auburn, IN	Auburn, IN Micro Area	M2	2	G3110	939731961	2657419	+41.3967596	-085.0026969	POLYGON ((-85.07780 41.26560, -85.07850 41.265...

CBSA Code	Metropolitan Division Code	CSA Code	CBSA Title	Metropolitan/Micropolitan Statistical Area	Metropolitan Division Title	CSA Title	County/County Equivalent	State Name	FIPS State Code	FIPS County Code	Central/Outlying County
0	10100	NaN	NaN	Aberdeen, SD	Micropolitan Statistical Area	NaN	NaN	Brown County	South Dakota	46.0	13.0	Central
1	10100	NaN	NaN	Aberdeen, SD	Micropolitan Statistical Area	NaN	NaN	Edmunds County	South Dakota	46.0	45.0	Outlying
2	10140	NaN	NaN	Aberdeen, WA	Micropolitan Statistical Area	NaN	NaN	Grays Harbor County	Washington	53.0	27.0	Central
3	10180	NaN	101.0	Abilene, TX	Metropolitan Statistical Area	NaN	Abilene-Sweetwater, TX	Callahan County	Texas	48.0	59.0	Outlying
4	10180	NaN	101.0	Abilene, TX	Metropolitan Statistical Area	NaN	Abilene-Sweetwater, TX	Jones County	Texas	48.0	253.0	Outlying
```


This is the information you will share. Take your time and be careful to include as much reliable information as you can. 
```python
merged_columns = # a list of column names for a merged df. Only include variables that you think may be contained in both dataframes, just in different formats. Be careful not to include variables that are only contained in one of the dataframes -- these will be addressed separately. But if you think there's a way to convert between the dataframes and compare them, please try it. 
dict_dfA = # a dictionary of python code strings to compute the merging columns from dfA {'column1': 'dfA["var3"].astype(str)', ...}
dict_dfB = # a dictionary of python code strings to compute the merging columns from dfB {'column1': 'dfB["blah4"].astype(str)', ...}
Be very careful with your python strings to convert the data from dfA and dfB to the exact same format, so they can be merged later using pd.merge
redundant_cols_A = # a list of redundant columns for dfA. this should include all of the columns that were used to construct dict_dfA redundant_cols_A = ["var3", ...]
redundant_cols_B = # a list of redundant columns for dfB. this should include all of the columns that were used to construct dict_dfB redundant_cols_B = ["blah4", ...]
cols_to_aggregate = # a dictionary. Keys are columns that should be converted to numerical values (e.g. population), and entries are aggfunc that is appropriate (e.g. sum, median, mean)
```


In [None]:
# function to use LLM for merge
def LLM_assisted_merge(
    dfA,
    dfB,
    merged_columns,
    dict_dfA,
    dict_dfB,
    redundant_cols_A,
    redundant_cols_B,
    cols_to_aggregate,
):
    """Left-merge ``dfB`` onto ``dfA`` using key columns proposed by an LLM.

    Args:
        dfA: Left table; it is copied, never modified.
        dfB: Right table; it is copied, never modified.
        merged_columns: Names of the key columns to merge on.
        dict_dfA: Maps each key column name to a Python expression string
            that computes it; the expression may refer to ``dfA``.
        dict_dfB: Same as ``dict_dfA`` for the right table (``dfB``).
        redundant_cols_A: Source columns of ``dfA`` to drop after merging.
        redundant_cols_B: Source columns of ``dfB`` to drop after merging.
        cols_to_aggregate: Not used by the merge itself; accepted so the
            full set of LLM parameters can be passed through unchanged.

    Returns:
        The merged table with the key columns first, followed by every
        remaining column in its original order.

    Raises:
        KeyError: If a key column could not be computed for either table.
    """
    merged_columns = list(merged_columns)  # tolerate tuples / other iterables
    dfA_to_merge = dfA.copy()
    dfB_to_merge = dfB.copy()

    # SECURITY: the expressions are executed with eval(). They are expected to
    # be LLM output that a human has read; never feed untrusted text in here.
    for label, frame, expressions in (
        ("dfA", dfA_to_merge, dict_dfA),
        ("dfB", dfB_to_merge, dict_dfB),
    ):
        for merged_col, expression in expressions.items():
            try:
                frame[merged_col] = eval(expression)
            except Exception as e:
                # Report every broken expression before failing below.
                print(f"Error processing column {merged_col} for {label}: {e}")

    # Fail with a precise message instead of an opaque KeyError inside merge().
    for label, frame in (("dfA", dfA_to_merge), ("dfB", dfB_to_merge)):
        missing = [
            col
            for col in merged_columns
            if col not in frame.columns and col not in frame.index.names
        ]
        if missing:
            raise KeyError(
                f"Cannot merge: column(s) {missing} are not available in {label}"
            )

    merged_df = dfA_to_merge.merge(dfB_to_merge, on=merged_columns, how="left")

    # The redundant lists are suggestions: skip names that do not exist, and
    # never drop a merge key even if it is listed.
    droppable = [
        col
        for col in [*redundant_cols_A, *redundant_cols_B]
        if col not in merged_columns
    ]
    merged_df = merged_df.drop(columns=droppable, errors="ignore")

    # Move the merge keys to the front, keeping the other columns in order.
    remaining = [col for col in merged_df.columns if col not in merged_columns]
    return merged_df[merged_columns + remaining]

Putting it together, with output from LLM

In [None]:
#### START LLM PARAMS ####
# NOTE: two expressions below were corrected by hand relative to the raw LLM
# answer, because the two sides could never compare equal (judging by the
# sample rows shown in the prompt):
#   * Area_Type: dfB spells the type out ("Metropolitan Statistical Area"),
#     so the LSAD codes of dfA must map to the same full wording.
#   * CSA_Code: dfB holds floats with gaps (101.0 / NaN) while dfA shows plain
#     codes (122), so both go through a nullable integer before becoming text.
# Area_Name compares dfA "NAME" with dfB "CBSA Title"; the two files are from
# different years, so some names may still differ -- TODO confirm.

# A list of column names for the merged dataframe. These columns will be our merging columns.
merged_columns = ["CSA_Code", "CBSA_Code", "Area_Type", "Area_Name"]

dict_dfA = {
    "CSA_Code": 'pd.to_numeric(dfA["CSAFP"], errors="coerce").astype("Int64").astype(str)',
    "CBSA_Code": 'dfA["CBSAFP"].astype(str)',
    "Area_Type": 'dfA["LSAD"].map({"M1": "Metropolitan Statistical Area", "M2": "Micropolitan Statistical Area"})',
    "Area_Name": 'dfA["NAME"]',
}

dict_dfB = {
    "CSA_Code": 'pd.to_numeric(dfB["CSA Code"], errors="coerce").astype("Int64").astype(str)',
    "CBSA_Code": 'dfB["CBSA Code"].astype(str)',
    "Area_Type": 'dfB["Metropolitan/Micropolitan Statistical Area"]',
    "Area_Name": 'dfB["CBSA Title"]',
}

redundant_cols_A = ["CSAFP", "CBSAFP", "LSAD", "NAME", "NAMELSAD"]

redundant_cols_B = [
    "CSA Code",
    "CBSA Code",
    "Metropolitan/Micropolitan Statistical Area",
    "CBSA Title",
    "Metropolitan Division Code",
    "CSA Title",
    "Metropolitan Division Title",
]

cols_to_aggregate = {"ALAND": "sum", "AWATER": "sum"}

#### END LLM PARAMS ####

# using LLM params for merge: dfA is the shape table, dfB the membership table
dfA = shape_df
dfB = membership_df

merged_df = LLM_assisted_merge(
    dfA,
    dfB,
    merged_columns,
    dict_dfA,
    dict_dfB,
    redundant_cols_A,
    redundant_cols_B,
    cols_to_aggregate,
)

merged_df

## Task 5
Aggregate/dissolve/abstract the merged dataset by different variables. There's not always one canonical way to do it -- the columns give us options for possible relevant abstractions

In [None]:
# Every non-geometry column the LLM gave no aggregation for falls back to
# "first"; entries the LLM did provide are left untouched.
for column_name in merged_df.columns:
    if column_name != "geometry":
        cols_to_aggregate.setdefault(column_name, "first")

# Try each column in turn as the key to dissolve (aggregate geometries) by.
for dissolve_by in cols_to_aggregate:
    print("Try aggregating by", dissolve_by)
    # Rows without a value for the key cannot be assigned to a group.
    dissolve_me = merged_df.dropna(subset=[dissolve_by])
    # in the future, can set aggfunc to median, mean, etc.
    dissolved = dissolve_me.dissolve(by=dissolve_by, aggfunc=cols_to_aggregate)
    if dissolved.empty:
        continue  # nothing to draw for this key
    ax = dissolved.plot(column=dissolve_by)
    ax.set_title("Aggregated by " + dissolve_by)
    plt.show()