In [50]:
import pandas as pd
import numpy as np 
import re
import seaborn as sns
import matplotlib.pyplot as plt
import geopandas as gpd
from pathlib import Path
from shapely.ops import transform
from pyproj import Transformer

In [51]:
# NOTE(review): `df_regions` is not assigned in any cell visible in this notebook
# (later cells use `df_region` and `df`), so this cell will raise NameError on a
# fresh kernel unless the name is defined elsewhere — confirm, then fix or remove.
df_regions.head()

Unnamed: 0,Region,County Name
0,West Region,Shelby County
1,West Region,Fayette County
2,West Region,Tipton County
3,West Region,Haywood County
4,West Region,Lauderdale County


### West (13)

In [52]:
# Counties labelled 'West' when the region table is built
# (bare names, without the " County" suffix).
west = [
    "Obion", "Weakley", "Dyer", "Gibson", "Crockett",
    "Lauderdale", "Tipton", "Haywood", "Shelby", "Fayette",
    "Hardeman", "McNairy", "Lake",
]


In [53]:
# Full county names for the West list (bare name + " County").
# The rows cell refers to this list as `counties_with_suffix`.
counties_with_suffix = [name + " County" for name in west]

### Mid West (23)

In [54]:
# Counties labelled 'Mid West' when the region table is built
# (bare names, without the " County" suffix).
mid_west = [
    "Robertson", "Cheatham", "Williamson", "Maury", "Giles",
    "Montgomery", "Dickson", "Hickman", "Lewis", "Lawrence",
    "Stewart", "Houston", "Humphreys", "Perry", "Wayne",
    "Benton", "Decatur", "Henry", "Carroll", "Hardin",
    "Henderson", "Chester", "Madison",
]

In [55]:
# Full county names for the Mid West list (bare name + " County").
mid_west_with_suffix = [name + " County" for name in mid_west]

### Mid-State (14)

In [56]:
# Counties labelled 'Mid-State' when the region table is built
# (bare names, without the " County" suffix).
mid_state = [
    "Macon", "Sumner", "Trousdale", "Jackson", "Smith",
    "Wilson", "Davidson", "Rutherford", "Cannon", "Coffee",
    "Bedford", "Marshall", "Moore", "Lincoln",
]


In [57]:
# Full county names for the Mid-State list (bare name + " County").
mid_state_with_suffix = [name + " County" for name in mid_state]

### TN Valley (16)

In [58]:
# Counties labelled 'TN Valley' when the region table is built
# (bare names, without the " County" suffix).
tn_valley = [
    "Marion", "Franklin", "Hamilton", "Sequatchie", "Grundy",
    "Rhea", "Bledsoe", "Van Buren", "Warren", "Cumberland",
    "White", "DeKalb", "Putnam", "Overton", "Clay",
    "Pickett",
]

In [59]:
# Full county names for the TN Valley list (bare name + " County").
tn_valley_with_suffix = [name + " County" for name in tn_valley]

### East (15)

In [60]:
# Counties labelled 'East' when the region table is built
# (bare names, without the " County" suffix).
east = [
    "Scott", "Fentress", "Campbell", "Claiborne", "Morgan",
    "Anderson", "Union", "Roane", "Knox", "Loudon",
    "Meigs", "McMinn", "Monroe", "Bradley", "Polk",
]

In [61]:
# Full county names for the East list (bare name + " County").
east_with_suffix = [name + " County" for name in east]

### Northeast (14)

In [62]:
# Counties labelled 'Northeast' when the region table is built
# (bare names, without the " County" suffix).
northeast = [
    "Sevier", "Blount", "Jefferson", "Cocke", "Greene",
    "Hamblen", "Grainger", "Unicoi", "Washington", "Hawkins",
    "Hancock", "Carter", "Sullivan", "Johnson",
]

In [63]:
# Full county names for the Northeast list (bare name + " County").
northeast_with_suffix = [name + " County" for name in northeast]

## Dataframe

In [64]:
# One record per county, tagged with its region label. The mapping is walked
# in insertion order, so rows come out West -> ... -> Northeast.
region_to_counties = {
    'West': counties_with_suffix,
    'Mid West': mid_west_with_suffix,
    'Mid-State': mid_state_with_suffix,
    'TN Valley': tn_valley_with_suffix,
    'East': east_with_suffix,
    'Northeast': northeast_with_suffix,
}
rows = [
    {'County Name': county, 'Region': region}
    for region, counties in region_to_counties.items()
    for county in counties
]

In [65]:

# Turn the list of {'County Name', 'Region'} records into a two-column frame.
df_region = pd.DataFrame(data=rows)

In [66]:
# Re-apply the " County" suffix: trim whitespace, strip one trailing "County"
# (if present), then append " County".
county_names = df_region['County Name'].astype(str).str.strip()
county_stems = county_names.str.replace(r'\s*County$', '', regex=True).str.strip()
df_region['County Name'] = county_stems + ' County'

df_region

Unnamed: 0,County Name,Region
0,Obion County,West
1,Weakley County,West
2,Dyer County,West
3,Gibson County,West
4,Crockett County,West
...,...,...
90,Hawkins County,Northeast
91,Hancock County,Northeast
92,Carter County,Northeast
93,Sullivan County,Northeast


In [67]:
# Load the region/county CSV that later cells compare against df_region.
df = pd.read_csv(filepath_or_buffer='../data/TN_Regions.csv')

In [68]:
# Preview the first rows of the frame loaded from the CSV.
df.head()

Unnamed: 0,Region,County
0,West Region,Shelby
1,West Region,Fayette
2,West Region,Tipton
3,West Region,Haywood
4,West Region,Lauderdale


In [69]:
# Re-apply the " County" suffix on the CSV frame: trim whitespace, strip one
# trailing "County" (if present), then append " County".
csv_names = df['County'].astype(str).str.strip()
csv_stems = csv_names.str.replace(r'\s*County$', '', regex=True).str.strip()
df['County'] = csv_stems + ' County'

In [70]:
# Give the CSV frame the same county column name that df_region uses.
df = df.rename({'County': 'County Name'}, axis='columns')

In [71]:
# Normalize and compare county names between two DataFrames

def norm_county_series(s):
    """Return county names from ``s`` in a canonical "<name> County" form.

    Missing values are dropped; the remaining values are cast to ``str``,
    trimmed, have runs of whitespace collapsed to a single space, lose one
    trailing "County" (if present) and then get " County" appended.
    The surviving entries keep their original index labels.
    """
    stems = (
        s.dropna()
        .astype(str)
        .str.strip()
        .str.replace(r'\s+', ' ', regex=True)
        .str.replace(r'\s*County$', '', regex=True)
        .str.strip()
    )
    return stems + ' County'

# Find the county column in each frame, ignoring surrounding whitespace and
# letter case in the header text.
left_header_mask = df.columns.str.strip().str.lower() == 'county name'
right_header_mask = df_region.columns.str.strip().str.lower() == 'county name'
col_left = df.columns[left_header_mask][0]
col_right = df_region.columns[right_header_mask][0]

# Distinct normalized names on each side.
left_set = set(norm_county_series(df[col_left]))
right_set = set(norm_county_series(df_region[col_right]))

# Names exclusive to one side, and names shared by both, in sorted order.
only_in_left = sorted(left_set.difference(right_set))
only_in_right = sorted(right_set.difference(left_set))
in_both = sorted(left_set.intersection(right_set))

(only_in_left, only_in_right, in_both)

([],
 ['Williamson County'],
 ['Anderson County',
  'Bedford County',
  'Benton County',
  'Bledsoe County',
  'Blount County',
  'Bradley County',
  'Campbell County',
  'Cannon County',
  'Carroll County',
  'Carter County',
  'Cheatham County',
  'Chester County',
  'Claiborne County',
  'Clay County',
  'Cocke County',
  'Coffee County',
  'Crockett County',
  'Cumberland County',
  'Davidson County',
  'DeKalb County',
  'Decatur County',
  'Dickson County',
  'Dyer County',
  'Fayette County',
  'Fentress County',
  'Franklin County',
  'Gibson County',
  'Giles County',
  'Grainger County',
  'Greene County',
  'Grundy County',
  'Hamblen County',
  'Hamilton County',
  'Hancock County',
  'Hardeman County',
  'Hardin County',
  'Hawkins County',
  'Haywood County',
  'Henderson County',
  'Henry County',
  'Hickman County',
  'Houston County',
  'Humphreys County',
  'Jackson County',
  'Jefferson County',
  'Johnson County',
  'Knox County',
  'Lake County',
  'Lauderdale Coun

In [72]:
# Same comparison via merges: a left join with indicator=True tags rows that
# have no partner in the other frame as 'left_only'.
left_norm = norm_county_series(df[col_left]).to_frame(name='County Name')
right_norm = norm_county_series(df_region[col_right]).to_frame(name='County Name')

left_vs_right = pd.merge(
    left_norm, right_norm.drop_duplicates(),
    on='County Name', how='left', indicator=True,
)
left_unmatched = left_vs_right['_merge'] == 'left_only'
left_only_rows = left_vs_right.loc[left_unmatched, 'County Name'].unique().tolist()

right_vs_left = pd.merge(
    right_norm, left_norm.drop_duplicates(),
    on='County Name', how='left', indicator=True,
)
right_unmatched = right_vs_left['_merge'] == 'left_only'
right_only_rows = right_vs_left.loc[right_unmatched, 'County Name'].unique().tolist()

(left_only_rows, right_only_rows)

([], ['Williamson County'])

In [73]:
# Show the row(s) of the CSV-derived frame whose county is "Lake County".
df.loc[df['County Name'].str.strip() == 'Lake County']

Unnamed: 0,Region,County Name
6,West Region,Lake County


In [74]:
# Show the row(s) of df_region for "Williamson County" — the county the
# comparison cells reported as present only in df_region.
df_region.loc[df_region['County Name'].str.strip() == 'Williamson County']

Unnamed: 0,County Name,Region
15,Williamson County,Mid West


In [75]:
# Persist the region table. index=False keeps the default 0..n-1 row index out
# of the file, so reloading it does not produce a stray "Unnamed: 0" column.
df_region.to_csv('regions corrected.csv', index=False)

In [None]:
# Display the CSV-derived frame (with the " County" suffix applied and the
# county column renamed to 'County Name' by the earlier cells).
df