In [24]:
import pickle

import pandas as pd
import os
import geopandas as gpd
from cities.utils.cleaning_utils import find_repo_root

In [25]:
repo_root = find_repo_root()
with open(os.path.join(repo_root, "data", "raw", "exclusions.pkl"), "rb") as file:
    exclusions = pickle.load(file)
exclusions

{'transport': array([ 2282, 15901, 22051, 22071, 22075, 22087, 22089, 22095, 22103,
        51901, 51903, 51907, 51911, 51913, 51918, 51919, 51921, 51923,
        51929, 51931, 51933, 51939, 51941, 51942, 51944, 51945, 51947,
        51949, 51951, 51953, 51955, 51958])}

In [26]:
url = "https://www2.census.gov/geo/tiger/GENZ2021/shp/cb_2021_us_county_20m.zip"
counties = gpd.read_file(url)
counties.head()

Unnamed: 0,STATEFP,COUNTYFP,COUNTYNS,AFFGEOID,GEOID,NAME,NAMELSAD,STUSPS,STATE_NAME,LSAD,ALAND,AWATER,geometry
0,13,239,350547,0500000US13239,13239,Quitman,Quitman County,GA,Georgia,6,391703076,24158295,"POLYGON ((-85.14183 31.83926, -85.11403 31.893..."
1,18,111,450376,0500000US18111,18111,Newton,Newton County,IN,Indiana,6,1040539827,4349506,"POLYGON ((-87.52665 41.16609, -87.39380 41.162..."
2,19,53,465215,0500000US19053,19053,Decatur,Decatur County,IA,Iowa,6,1377569408,4138626,"POLYGON ((-94.01480 40.89703, -93.55654 40.898..."
3,24,13,1696228,0500000US24013,24013,Carroll,Carroll County,MD,Maryland,6,1159355859,13112464,"POLYGON ((-77.21702 39.72022, -76.99932 39.720..."
4,29,155,758532,0500000US29155,29155,Pemiscot,Pemiscot County,MO,Missouri,6,1275841039,53915406,"POLYGON ((-89.96131 36.38880, -89.75215 36.386..."


In [27]:
gdp = pd.read_csv(os.path.join(repo_root, "data", "processed", "gdp_wide.csv"))
gdp.head()

Unnamed: 0,GeoFIPS,GeoName,2001,2002,2003,2004,2005,2006,2007,2008,...,2011,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,1001,"Autauga, AL",59.839,61.996,63.508,73.73,75.307,80.459,81.836,73.87,...,86.679,93.06,93.155,99.931,104.09,99.798,100.854,97.233,96.115,94.638
1,1003,"Baldwin, AL",73.853,77.273,81.57,90.523,101.402,104.553,107.84,102.635,...,99.0,104.651,106.431,110.434,115.476,118.498,125.068,131.431,131.614,144.294
2,1005,"Barbour, AL",113.864,111.853,114.628,124.473,125.004,122.611,118.397,110.695,...,103.918,113.335,106.76,103.702,101.969,100.509,101.804,102.053,98.044,99.393
3,1007,"Bibb, AL",80.443,81.527,85.124,89.317,88.782,89.597,95.308,94.745,...,102.559,99.537,97.933,94.594,95.812,96.878,96.988,104.62,109.487,107.878
4,1009,"Blount, AL",92.104,92.593,95.469,98.129,100.918,97.428,96.72,97.077,...,91.938,99.318,101.584,106.506,98.394,104.331,109.56,106.565,100.422,113.455


I would like to merge two dataframes that have similar columns, but the columns may be named differently. Here is what they look like:

```python
dfA.head() = 
	STATEFP	COUNTYFP	COUNTYNS	AFFGEOID	GEOID	NAME	NAMELSAD	STUSPS	STATE_NAME	LSAD	ALAND	AWATER	geometry
0	13	239	00350547	0500000US13239	13239	Quitman	Quitman County	GA	Georgia	06	391703076	24158295	POLYGON ((-85.14183 31.83926, -85.11403 31.893...
1	18	111	00450376	0500000US18111	18111	Newton	Newton County	IN	Indiana	06	1040539827	4349506	POLYGON ((-87.52665 41.16609, -87.39380 41.162...
2	19	053	00465215	0500000US19053	19053	Decatur	Decatur County	IA	Iowa	06	1377569408	4138626	POLYGON ((-94.01480 40.89703, -93.55654 40.898...
3	24	013	01696228	0500000US24013	24013	Carroll	Carroll County	MD	Maryland	06	1159355859	13112464	POLYGON ((-77.21702 39.72022, -76.99932 39.720...
4	29	155	00758532	0500000US29155	29155	Pemiscot	Pemiscot County	MO	Missouri	06	1275841039	53915406	POLYGON ((-89.96131 36.38880, -89.75215 36.386..

dfB.head() = 
	GeoFIPS	GeoName	2001	2002	2003	2004	2005	2006	2007	2008	...	2011	2013	2014	2015	2016	2017	2018	2019	2020	2021
0	1001	Autauga, AL	59.839	61.996	63.508	73.730	75.307	80.459	81.836	73.870	...	86.679	93.060	93.155	99.931	104.090	99.798	100.854	97.233	96.115	94.638
1	1003	Baldwin, AL	73.853	77.273	81.570	90.523	101.402	104.553	107.840	102.635	...	99.000	104.651	106.431	110.434	115.476	118.498	125.068	131.431	131.614	144.294
2	1005	Barbour, AL	113.864	111.853	114.628	124.473	125.004	122.611	118.397	110.695	...	103.918	113.335	106.760	103.702	101.969	100.509	101.804	102.053	98.044	99.393
3	1007	Bibb, AL	80.443	81.527	85.124	89.317	88.782	89.597	95.308	94.745	...	102.559	99.537	97.933	94.594	95.812	96.878	96.988	104.620	109.487	107.878
4	1009	Blount, AL	92.104	92.593	95.469	98.129	100.918	97.428	96.720	97.077	...	91.938	99.318	101.584	106.506	98.394	104.3
```


This is the information you will share. Take your time and be careful to include as much reliable information as you can. 
```python
merged_columns = # a list of column names for a merged df. Only include variables that you think may contained in both dataframes, just in different formats. Be careful not to include variables that are only contained in one of the dataframes -- these will be addressed separately. But if you think there's a way to convert between the dataframes and compare them, please try it. 
dict_dfA = # a dictionary of python code strings to compute the merging columns from dfA {'column1': 'dfA["var3"].astype(str)', ...}
dict_dfB = # a dictionary of python code strings to compute the merging columns from dfB {'column1': 'dfB["blah4"].astype(str)', ...}
Be very careful with your python strings to convert the data from dfA and dfB to the exact same format, so they can be merged later using pd.merge
redundant_cols_A = # a list of redundant columns for dfA. this should include all of the columns that were used to construct dict dfB redundant_col_A = ["var3", ...]
redundant_cols_B = # a list of redundant columns for dfB. this should include all of the columns that were used to construct dict dfB redundant_col_B = ["blah4", ...]
cols_to_aggregate = # a dictionary. Keys are columns that should be converted to numerical values (e.g. population), and entries are aggfunc that is appropriate (e.g. sum, median, mean)
```



In [28]:
# function to use LLM for merge
def LLM_assisted_merge(dfA, dfB, merged_columns, dict_dfA, dict_dfB, redundant_cols_A, redundant_cols_B, cols_to_aggregate):
    # Using the dictionaries, create new DataFrames with the desired column names
    dfA_to_merge = dfA.copy()
    dfB_to_merge = dfB.copy()

    # Rename columns in dfA_to_merge using dict_dfA
    for merged_col, original_col in dict_dfA.items():
        try:
            dfA_to_merge[merged_col] = eval(original_col)
        except Exception as e:
            print(f"Error processing column {merged_col} for dfA: {e}")

    # Rename columns in dfB_to_merge using dict_dfB
    for merged_col, original_col in dict_dfB.items():
        try:
            dfB_to_merge[merged_col] = eval(original_col)
        except Exception as e:
            print(f"Error processing column {merged_col} for dfB: {e}")

    # Now, perform the merge on the desired columns from the merged_columns list
    merged_df = dfA_to_merge.merge(dfB_to_merge, on=merged_columns, how='left')

    # Drop the redundant columns
    merged_df.drop(columns=redundant_cols_A, inplace=True)
    merged_df.drop(columns=redundant_cols_B, inplace=True)

    # move the merged columns to the front
    # merged_df = merged_df[merged_columns + [col for col in merged_df.columns if col not in merged_columns]]

    return merged_df

In [30]:
#### START LLM PARAMS ####
# A list of column names for the merged dataframe. These columns will be our merging columns.
# merged_columns:
merged_columns = ['GeoFIPS', 'GeoName']

# dict_dfA:
dict_dfA = {
    'GeoFIPS': 'dfA["STATEFP"].astype(str) + dfA["COUNTYFP"].astype(str)', 
    'GeoName': 'dfA["NAME"] + ", " + dfA["STUSPS"]'
}

# dict_dfB:
dict_dfB = {
    'GeoFIPS': 'dfB["GeoFIPS"].astype(str)', 
    'GeoName': 'dfB["GeoName"]'
}

# redundant_cols_A:
redundant_cols_A = ['STATEFP', 'COUNTYFP', 'NAME', 'STUSPS']

# redundant_cols_B:
redundant_cols_B = ['GeoFIPS', 'GeoName']

# cols_to_aggregate:
# This is just a generic example since it's not clear which columns you want to aggregate.
cols_to_aggregate = {
    '2001': 'sum', 
    '2002': 'sum', 
    # ... you can continue this pattern for each year
    '2021': 'sum'
}

#### END LLM PARAMS ####

# using LLM params for merge
dfA = counties
dfB = gdp

merged_df = LLM_assisted_merge(dfA, dfB, merged_columns, dict_dfA, dict_dfB, redundant_cols_A, redundant_cols_B, cols_to_aggregate)

merged_df

Unnamed: 0,COUNTYNS,AFFGEOID,GEOID,NAMELSAD,STATE_NAME,LSAD,ALAND,AWATER,geometry,2001,...,2011,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,00350547,0500000US13239,13239,Quitman County,Georgia,06,391703076,24158295,"POLYGON ((-85.14183 31.83926, -85.11403 31.893...",93.749,...,96.487,104.040,95.060,99.568,112.433,96.579,98.178,89.801,87.329,83.995
1,00450376,0500000US18111,18111,Newton County,Indiana,06,1040539827,4349506,"POLYGON ((-87.52665 41.16609, -87.39380 41.162...",92.976,...,103.674,116.813,117.127,109.572,96.604,91.432,96.704,93.077,97.061,103.170
2,00465215,0500000US19053,19053,Decatur County,Iowa,06,1377569408,4138626,"POLYGON ((-94.01480 40.89703, -93.55654 40.898...",84.677,...,100.799,99.872,98.776,101.737,105.433,110.290,112.890,108.117,101.654,107.798
3,01696228,0500000US24013,24013,Carroll County,Maryland,06,1159355859,13112464,"POLYGON ((-77.21702 39.72022, -76.99932 39.720...",76.152,...,97.683,100.226,99.510,101.215,101.568,106.456,104.838,105.452,101.050,105.298
4,00758532,0500000US29155,29155,Pemiscot County,Missouri,06,1275841039,53915406,"POLYGON ((-89.96131 36.38880, -89.75215 36.386...",93.939,...,110.203,122.903,98.475,94.166,107.345,86.487,80.452,79.584,83.561,84.271
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3216,01155135,0500000US41063,41063,Wallowa County,Oregon,06,8147835333,14191752,"POLYGON ((-117.97766 46.00017, -117.71785 45.9...",84.127,...,120.107,94.471,97.099,92.511,97.843,108.082,97.018,102.353,106.468,116.035
3217,00345714,0500000US13163,13163,Jefferson County,Georgia,06,1363771357,8059597,"POLYGON ((-82.66192 33.12633, -82.57882 33.119...",104.949,...,102.334,103.129,97.550,99.282,104.701,106.457,99.390,107.122,109.003,113.129
3218,01419970,0500000US02100,02100,Haines Borough,Alaska,04,6069358955,1041214321,"POLYGON ((-136.46681 59.28425, -136.47433 59.4...",,...,,,,,,,,,,
3219,00069176,0500000US05117,05117,Prairie County,Arkansas,06,1677069877,72638193,"POLYGON ((-91.80251 35.03070, -91.70112 35.062...",,...,,,,,,,,,,


In [31]:
# Replace exclusions.pkl with a file included_counties.csv, with columns GeoFIPS, GeoName, Included, Explanation.
# Explanation is a string that explains why the county was excluded.
# Included is a boolean that is True if the county is included in the analysis.

# make empty df with columns GeoFIPS, GeoName, Included, Explanation
included_counties = pd.DataFrame(columns=["GeoFIPS", "GeoName", "Included", "Explanation"])
included_counties["GeoFIPS"] = merged_df["GeoFIPS"]
included_counties["GeoName"] = merged_df["GeoName"]
included_counties

KeyError: 'GeoFIPS'

In [32]:
merged_df.columns

Index(['COUNTYNS', 'AFFGEOID', 'GEOID', 'NAMELSAD', 'STATE_NAME', 'LSAD',
       'ALAND', 'AWATER', 'geometry', '2001', '2002', '2003', '2004', '2005',
       '2006', '2007', '2008', '2009', '2010', '2011', '2013', '2014', '2015',
       '2016', '2017', '2018', '2019', '2020', '2021'],
      dtype='object')