In [2]:
import pandas as pd
import numpy as np 
import re
import seaborn as sns
import matplotlib.pyplot as plt
import geopandas as gpd
from pathlib import Path
from shapely.ops import transform
from pyproj import Transformer


In [3]:
df_risk_data = pd.read_csv('../data/bdaic created tables/df_high_risk_buckets.csv', dtype={'PERSON ID': str}) 

In [4]:
# --- Load TN counties from TIGER/Line ---
shp = Path('../data/bdaic created tables/tl_2024_us_county/tl_2024_us_county.shp')
gdf = gpd.read_file(shp)
gdf = gdf[gdf['STATEFP'] == '47'].copy()

In [5]:
# Load TN counties base geometry (95 rows)
base = gpd.read_file("../data/bdaic created tables/tl_2024_us_county/tl_2024_us_county.shp")
tn = base[base["STATEFP"] == "47"][["GEOID","NAME","geometry"]].copy()
tn["GEOID"] = tn["GEOID"].astype(str).str.zfill(5)

In [7]:
df_risk_data.head()

Unnamed: 0,PERSON ID,CURRENT AGE,COMMITMENT COUNTY,RESPONSIBLE COUNTY,GENDER,REMOVAL ZIP CODE,PLACEMENT ZIP CODE,LOCATION BEGIN DATE,LOCATION END DATE,Mental_Health_Behavioral_Impulse_Control,...,Mental_Health_Sexual_Developmental_Concerns,Mental_Health_Trauma_Stress_Related,Supportive_Adult_Relationships_Family_Relationships,Supportive_Adult_Relationships_Social_Relationships,Supportive_Adult_Relationships_Barriers_to_Support,Housing_Instability_Skills_Readiness,Housing_Instability_Current_Living_Situation,Housing_Instability_Barriers_Risks,County,Region
0,353258,17,Madison,Madison,MALE,37040.0,37040.0,2023-06-13,2024-05-25,,...,0.0,,,1.0,1.0,0.0,,0.0,Madison,West
1,706512,17,Sullivan,Sullivan,FEMALE,37642.0,37642.0,2024-11-18,,,...,,,1.0,1.0,1.0,0.0,0.0,0.0,Sullivan,East
2,1348374,17,Warren,Warren,FEMALE,37110.0,37110.0,2023-06-02,2023-09-30,,...,0.0,1.0,,1.0,,,0.0,0.0,Warren,Middle
3,2470086,18,Hamilton,Hamilton,MALE,37218.0,37218.0,2024-09-17,,1.0,...,0.0,0.0,1.0,1.0,,1.0,1.0,,Hamilton,East
4,3965628,21,Davidson,Davidson,MALE,37725.0,37725.0,2021-12-22,2022-08-31,1.0,...,0.0,1.0,0.0,,1.0,0.0,0.0,0.0,Davidson,Middle


In [8]:
df_risk_data = df_risk_data.rename(columns={"County": "NAME"})

In [9]:
# If your attributes table has multiple rows per county, collapse to ONE row per NAME
# Choose the right aggregation per column: mean/first/sum/etc.
attrs = df_risk_data.copy()
attrs = attrs.groupby("NAME", as_index=False).first()  # or .mean(), .sum() — pick what makes sense

In [10]:
# Merge attributes onto the 95 TN polygons
gdf_merged = tn.merge(attrs, on="NAME", how="left")
print(len(gdf_merged))  # should be 95

95


In [11]:
transformer = Transformer.from_crs(gdf_merged.crs, "EPSG:4326", always_xy=True)
gdf_4326 = gdf_merged.copy()
gdf_4326["geometry"] = gdf_4326["geometry"].apply(lambda geom: transform(transformer.transform, geom))

gdf_4326.to_file("tn_counties_joined.geojson", driver="GeoJSON")


In [12]:
# simplify in a projected CRS for a meter-based tolerance
gdf_proj = gdf_merged.to_crs(3857)
gdf_proj["geometry"] = gdf_proj.geometry.simplify(tolerance=200, preserve_topology=True)
gdf_slim = gdf_proj.to_crs(4326)
gdf_slim.to_file("tn_counties_joined.geojson", driver="GeoJSON")

In [13]:
prefixes = ("Mental_Health", "Supportive_Adult_Relationships", "Housing_Instability")
pat = r"^(%s)_" % "|".join(map(re.escape, prefixes))
indicator_cols = [c for c in gdf_merged.columns if re.match(pat, c)]

cols_csv = ["GEOID","NAME"] + (["Region"] if "Region" in gdf_merged.columns else []) + indicator_cols
cols_csv = [c for c in cols_csv if c in gdf_merged.columns]
gdf_merged[cols_csv].to_csv("tn_risk_buckets_by_NAME.csv", index=False)


In [14]:
print("Rows:", len(gdf_merged))  # expect 95
print("CRS:", gdf_merged.crs)    # typically EPSG:4269 for TIGER
print("Any null geometry?", gdf_merged.geometry.isna().sum())
print("A few columns:", gdf_merged.columns[:10].tolist())


Rows: 95
CRS: EPSG:4269
Any null geometry? 0
A few columns: ['GEOID', 'NAME', 'geometry', 'PERSON ID', 'CURRENT AGE', 'COMMITMENT COUNTY', 'RESPONSIBLE COUNTY', 'GENDER', 'REMOVAL ZIP CODE', 'PLACEMENT ZIP CODE']


In [15]:
merge = gdf_merged[cols_csv]

In [16]:
def export_arcgis_csv(gdf,
                      join_col="GEOID",
                      indicator_prefixes=("Mental_Health", "Supportive_Adult_Relationships", "Housing_Instability"),
                      file_name="tn_high_risk_buckets.csv"):
    """
    Minimal function to create a single ArcGIS-ready CSV file.
    """
    df = gdf.copy()

    # Ensure GEOID is 5-digit string
    if join_col not in df.columns:
        raise ValueError(f"Join column '{join_col}' not found.")
    df[join_col] = df[join_col].astype(str).str.strip().str.zfill(5)

    # Select county/region columns if they exist
    keep_cols = [join_col]
    if "County" in df.columns:
        keep_cols.append("County")
    if "Region" in df.columns:
        keep_cols.append("Region")

    # Find indicator columns quickly
    pat = r"^(%s)_" % "|".join(map(re.escape, indicator_prefixes))
    indicator_cols = [c for c in df.columns if re.match(pat, c)]
    if not indicator_cols:
        raise ValueError("No indicator columns found.")

    # Keep just what ArcGIS needs
    df_out = df[keep_cols + indicator_cols].copy()

    # Make sure all indicators are numeric (ArcGIS likes numbers)
    for col in indicator_cols:
        df_out[col] = pd.to_numeric(df_out[col], errors="coerce")

    # Save
    df_out.to_csv(file_name, index=False)
    print(f"✅ CSV saved for ArcGIS: {file_name} ({len(df_out)} rows)")

    return df_out

In [17]:
export_arcgis_csv(gdf, file_name="tn_risk_buckets.csv")

ValueError: No indicator columns found.

In [18]:
keep_geo = cols_csv + ["geometry"]
keep_geo = [c for c in keep_geo if c in gdf_merged.columns]

gdf_merged[keep_geo].to_file("tn_counties_joined.geojson", driver="GeoJSON")
print("✅ GeoJSON saved (no reprojection): tn_counties_joined.geojson")


✅ GeoJSON saved (no reprojection): tn_counties_joined.geojson


In [19]:
g = gdf_merged[keep_geo].copy()
if g.crs and g.crs.to_epsg() != 4326:
    transformer = Transformer.from_crs(g.crs, "EPSG:4326", always_xy=True)
    g["geometry"] = g["geometry"].apply(lambda geom: transform(transformer.transform, geom))

g.to_file("tn_counties_joined.geojson", driver="GeoJSON")
print("✅ GeoJSON saved (row-wise reprojection): tn_counties_joined.geojson")

✅ GeoJSON saved (row-wise reprojection): tn_counties_joined.geojson


## Percents

In [20]:
# Load your file
gdf = gpd.read_file("tn_counties_joined.geojson")


In [21]:
print("Available columns:\n", sorted(gdf.columns.tolist()))

Available columns:
 ['GEOID', 'Housing_Instability_Barriers_Risks', 'Housing_Instability_Current_Living_Situation', 'Housing_Instability_Skills_Readiness', 'Mental_Health_Behavioral_Impulse_Control', 'Mental_Health_Cultural_Spiritual_and_Engagement_Factors', 'Mental_Health_Mood_Emotional_Regulation', 'Mental_Health_Physical_Medical', 'Mental_Health_Psychotic_Thought_Disturbances', 'Mental_Health_Risk_Safety', 'Mental_Health_Sexual_Developmental_Concerns', 'Mental_Health_Trauma_Stress_Related', 'NAME', 'Region', 'Supportive_Adult_Relationships_Barriers_to_Support', 'Supportive_Adult_Relationships_Family_Relationships', 'Supportive_Adult_Relationships_Social_Relationships', 'geometry']


In [22]:
# ---------- CONFIG ----------
in_path = "tn_counties_joined.geojson"   # your existing file with county geometries
out_path = "tn_counties_pct_ones.geojson"  # output file for ArcGIS
name_col = "NAME"
county_col = "GEOID"
weight_col = None   # e.g., "survey_weight" if/when you have one; else leave as None


In [23]:
# categories to evaluate (all your 0/1 columns)
category_cols = [
    "Housing_Instability_Barriers_Risks",
    "Housing_Instability_Current_Living_Situation",
    "Housing_Instability_Skills_Readiness",
    "Mental_Health_Behavioral_Impulse_Control",
    "Mental_Health_Cultural_Spiritual_and_Engagement_Factors",
    "Mental_Health_Mood_Emotional_Regulation",
    "Mental_Health_Physical_Medical",
    "Mental_Health_Psychotic_Thought_Disturbances",
    "Mental_Health_Risk_Safety",
    "Mental_Health_Sexual_Developmental_Concerns",
    "Mental_Health_Trauma_Stress_Related",
    "Supportive_Adult_Relationships_Barriers_to_Support",
    "Supportive_Adult_Relationships_Family_Relationships",
    "Supportive_Adult_Relationships_Social_Relationships",
]

In [24]:
# Load
gdf = gpd.read_file(in_path)

In [25]:
# 1) Normalize column names (trim spaces, keep original casing)
rename_norm = {c: c.strip() for c in gdf.columns}
if rename_norm != {c: c for c in gdf.columns}:
    gdf = gdf.rename(columns=rename_norm)

In [26]:
if name_col not in gdf.columns:
    raise ValueError(f"'{name_col}' not in columns. Available: {sorted(gdf.columns)}")
missing = [c for c in category_cols if c not in gdf.columns]
if missing:
    raise ValueError(f"These category columns are missing: {missing}")


In [27]:
# 2) Melt to long
id_vars = [name_col]
has_weight = weight_col is not None and (weight_col in gdf.columns)
if has_weight:
    id_vars.append(weight_col)

long = gdf.melt(
    id_vars=id_vars,
    value_vars=category_cols,
    var_name="category",
    value_name="resp_raw",
)

In [28]:
# 3) Build 0/1 indicator and weights
resp_num = pd.to_numeric(long["resp_raw"], errors="coerce")
long["is_one"] = (resp_num == 1).astype(float)

if has_weight:
    long["w"] = pd.to_numeric(long[weight_col], errors="coerce").fillna(0.0)
else:
    long["w"] = 1.0

long["w1"] = long["w"] * long["is_one"]

In [29]:
# 4) Aggregate by NAME + category
agg = (long.groupby([name_col, "category"], dropna=False)
           .agg(w_sum=("w", "sum"), w1_sum=("w1", "sum"))
           .reset_index())

agg["pct_ones"] = np.where(agg["w_sum"] > 0, 100.0 * agg["w1_sum"] / agg["w_sum"], np.nan)

# 5) Pivot back to wide with NAME as a real column
wide = (agg
        .pivot_table(index=name_col, columns="category", values="pct_ones", aggfunc="first")
        .reset_index())
wide.columns.name = None

In [30]:
# Optional: cleaner column names for ArcGIS
wide = wide.rename(columns={c: f"pct_{c}" for c in wide.columns if c != name_col})

# 6) Take one geometry per NAME (first if duplicates)
geom = (gdf[[name_col, gdf.geometry.name]]
        .drop_duplicates(subset=[name_col])
        .groupby(name_col, as_index=False)
        .first())

In [31]:
# 7) Merge attributes onto geometry (safe column merge, not index join)
merged = geom.merge(wide, on=name_col, how="left")

In [32]:
# Set CRS if missing (WGS84 is typical for GeoJSON)
if merged.crs is None:
    merged = merged.set_crs("EPSG:4326")

merged.to_file(out_path, driver="GeoJSON")


# Again

In [33]:
# -------- CONFIG --------
in_path  = "tn_counties_joined.geojson"       # input
out_path = "tn_counties_pct_ones.geojson"     # output
group_col = "NAME"                             # county key to group by (use "GEOID" if you prefer)
weight_col = None  

In [34]:
category_cols = [
    "Housing_Instability_Barriers_Risks",
    "Housing_Instability_Current_Living_Situation",
    "Housing_Instability_Skills_Readiness",
    "Mental_Health_Behavioral_Impulse_Control",
    "Mental_Health_Cultural_Spiritual_and_Engagement_Factors",
    "Mental_Health_Mood_Emotional_Regulation",
    "Mental_Health_Physical_Medical",
    "Mental_Health_Psychotic_Thought_Disturbances",
    "Mental_Health_Risk_Safety",
    "Mental_Health_Sexual_Developmental_Concerns",
    "Mental_Health_Trauma_Stress_Related",
    "Supportive_Adult_Relationships_Barriers_to_Support",
    "Supportive_Adult_Relationships_Family_Relationships",
    "Supportive_Adult_Relationships_Social_Relationships",
]

In [35]:
gdf = gpd.read_file(in_path)

# Basic checks
missing = [c for c in category_cols if c not in gdf.columns]
if missing:
    raise ValueError(f"Missing columns: {missing}")
if group_col not in gdf.columns:
    raise ValueError(f"Group column '{group_col}' not found. Available: {sorted(gdf.columns)}")

# Geometry per county (first geometry if multiple rows per county)
geom = (gdf[[group_col, gdf.geometry.name]]
        .drop_duplicates(subset=[group_col])
        .groupby(group_col, as_index=False)
        .first())

# Prepare result dataframe with one row per county
result = geom[[group_col]].copy()

# Weights: if no weight column, use 1.0 for every row
if weight_col and (weight_col in gdf.columns):
    w = pd.to_numeric(gdf[weight_col], errors="coerce").fillna(0.0)
else:
    w = pd.Series(1.0, index=gdf.index)

# For each category column: % of 1's within county
for col in category_cols:
    s = pd.to_numeric(gdf[col], errors="coerce")
    is_one = (s == 1).astype(float)

    w1 = is_one * w  # weighted ones
    # aggregate per county
    denom = w.groupby(gdf[group_col]).sum()
    numer = w1.groupby(gdf[group_col]).sum()

    pct = np.where(denom > 0, 100.0 * (numer / denom), np.nan)
    pct = pd.Series(pct, index=denom.index)

    result[f"pct_{col}"] = result[group_col].map(pct)

# Attach geometry back
result = result.merge(geom, on=group_col, how="left")
result = gpd.GeoDataFrame(result, geometry=gdf.geometry.name, crs=gdf.crs)

# (Optional) set a CRS if the input had none; GeoJSONs are commonly EPSG:4326
if result.crs is None:
    result = result.set_crs("EPSG:4326")

# Save
result.to_file(out_path, driver="GeoJSON")

print(f"Saved: {out_path}")
print("Rows in output (should equal # of counties):", len(result))
print("Unique counties:", result[group_col].nunique())


Saved: tn_counties_pct_ones.geojson
Rows in output (should equal # of counties): 95
Unique counties: 95


In [38]:
df_risk_buckets = pd.read_csv('../data/bdaic created tables/df_high_risk_buckets.csv')
df_risk_buckets.head()

Unnamed: 0,PERSON ID,CURRENT AGE,COMMITMENT COUNTY,RESPONSIBLE COUNTY,GENDER,REMOVAL ZIP CODE,PLACEMENT ZIP CODE,LOCATION BEGIN DATE,LOCATION END DATE,Mental_Health_Behavioral_Impulse_Control,...,Mental_Health_Sexual_Developmental_Concerns,Mental_Health_Trauma_Stress_Related,Supportive_Adult_Relationships_Family_Relationships,Supportive_Adult_Relationships_Social_Relationships,Supportive_Adult_Relationships_Barriers_to_Support,Housing_Instability_Skills_Readiness,Housing_Instability_Current_Living_Situation,Housing_Instability_Barriers_Risks,County,Region
0,353258,17,Madison,Madison,MALE,37040.0,37040.0,2023-06-13,2024-05-25,,...,0.0,,,1.0,1.0,0.0,,0.0,Madison,West
1,706512,17,Sullivan,Sullivan,FEMALE,37642.0,37642.0,2024-11-18,,,...,,,1.0,1.0,1.0,0.0,0.0,0.0,Sullivan,East
2,1348374,17,Warren,Warren,FEMALE,37110.0,37110.0,2023-06-02,2023-09-30,,...,0.0,1.0,,1.0,,,0.0,0.0,Warren,Middle
3,2470086,18,Hamilton,Hamilton,MALE,37218.0,37218.0,2024-09-17,,1.0,...,0.0,0.0,1.0,1.0,,1.0,1.0,,Hamilton,East
4,3965628,21,Davidson,Davidson,MALE,37725.0,37725.0,2021-12-22,2022-08-31,1.0,...,0.0,1.0,0.0,,1.0,0.0,0.0,0.0,Davidson,Middle


In [39]:

# ---------- CONFIG ----------
# Your in-memory table (already loaded): df_extreme_risk_buckets
county_key_in_df = "County"               # county column in df_extreme_risk_buckets
weight_col = None  

In [40]:
bucket_cols = [
    "Mental_Health_Behavioral_Impulse_Control",
    "Mental_Health_Cultural_Spiritual_and_Engagement_Factors",
    "Mental_Health_Mood_Emotional_Regulation",
    "Mental_Health_Physical_Medical",
    "Mental_Health_Psychotic_Thought_Disturbances",
    "Mental_Health_Risk_Safety",
    "Mental_Health_Sexual_Developmental_Concerns",
    "Mental_Health_Trauma_Stress_Related",
    "Supportive_Adult_Relationships_Family_Relationships",
    "Supportive_Adult_Relationships_Social_Relationships",
    "Supportive_Adult_Relationships_Barriers_to_Support",
    "Housing_Instability_Skills_Readiness",
    "Housing_Instability_Current_Living_Situation",
    "Housing_Instability_Barriers_Risks"
]

In [41]:
# Counties geometry to join (has polygons):
county_geo_path = "tn_counties_joined.geojson"   # existing GeoJSON with county polygons
county_name_in_geo = "NAME"                      # name column in the geometry file
out_geojson = "tn_counties_risk_pct.geojson"

In [42]:
# 1) Pick bucket columns (auto-detect 0/1 columns if list empty)
df = df_extreme_risk_buckets.copy()
if not bucket_cols:
    candidates = [c for c in df.columns if c not in {county_key_in_df}]
    bucket_cols = []
    for c in candidates:
        s = pd.to_numeric(df[c], errors="coerce").dropna()
        if s.size and set(s.unique()).issubset({0,1}):
            bucket_cols.append(c)
    if not bucket_cols:
        raise ValueError("No 0/1 columns found. Specify bucket_cols explicitly.")


In [43]:
# 2) Normalize county names to maximize join success
def norm(x):
    return (str(x).strip().upper() if pd.notna(x) else x)

df["_CountyNorm"] = df[county_key_in_df].map(norm)

# Weights
if weight_col and (weight_col in df.columns):
    w = pd.to_numeric(df[weight_col], errors="coerce").fillna(0.0)
else:
    w = pd.Series(1.0, index=df.index)

In [44]:
# 3) Compute % of 1's per county for each bucket
out = df[["_CountyNorm"]].drop_duplicates().copy()
for col in bucket_cols:
    s = pd.to_numeric(df[col], errors="coerce")
    is_one = (s == 1).astype(float)
    w1 = is_one * w

    denom = w.groupby(df["_CountyNorm"]).sum()
    numer = w1.groupby(df["_CountyNorm"]).sum()

    pct = (100.0 * numer / denom).replace([np.inf, -np.inf], np.nan)
    out[f"pct_{col}"] = out["_CountyNorm"].map(pct)

In [45]:
# 4) Read county polygons and normalize their names the same way
g = gpd.read_file(county_geo_path)
if county_name_in_geo not in g.columns:
    raise ValueError(f"'{county_name_in_geo}' not found in {county_geo_path}. Available: {list(g.columns)}")

g["_CountyNorm"] = g[county_name_in_geo].map(norm)

# Keep one polygon per county name
geom = (g[["_CountyNorm", g.geometry.name]]
        .drop_duplicates(subset=["_CountyNorm"])
        .groupby("_CountyNorm", as_index=False)
        .first())

# 5) Merge %s onto geometry
merged = geom.merge(out, on="_CountyNorm", how="left")

# Optional: set a CRS if missing (GeoJSONs are commonly WGS84)
if merged.crs is None:
    merged = merged.set_crs("EPSG:4326")

In [46]:
# 6) Save a single GeoJSON with all pct_* fields (one row per county)
merged.to_file(out_geojson, driver="GeoJSON")
print(f"Saved: {out_geojson}")
print("Rows (counties) =", len(merged))

Saved: tn_counties_risk_pct.geojson
Rows (counties) = 95


In [47]:
merged.head()

Unnamed: 0,_CountyNorm,geometry,pct_Mental_Health_Behavioral_Impulse_Control,pct_Mental_Health_Cultural_Spiritual_and_Engagement_Factors,pct_Mental_Health_Mood_Emotional_Regulation,pct_Mental_Health_Physical_Medical,pct_Mental_Health_Psychotic_Thought_Disturbances,pct_Mental_Health_Risk_Safety,pct_Mental_Health_Sexual_Developmental_Concerns,pct_Mental_Health_Trauma_Stress_Related,pct_Supportive_Adult_Relationships_Family_Relationships,pct_Supportive_Adult_Relationships_Social_Relationships,pct_Supportive_Adult_Relationships_Barriers_to_Support,pct_Housing_Instability_Skills_Readiness,pct_Housing_Instability_Current_Living_Situation,pct_Housing_Instability_Barriers_Risks
0,ANDERSON,"POLYGON ((-84.19551 35.9916, -84.19558 35.9913...",54.12844,81.651376,60.550459,14.678899,1.834862,26.605505,7.33945,59.633028,71.559633,85.321101,70.642202,15.59633,22.018349,35.779817
1,BEDFORD,"POLYGON ((-86.24383 35.48785, -86.24409 35.486...",62.0,64.0,40.0,2.0,6.0,12.0,6.0,36.0,64.0,88.0,54.0,10.0,32.0,28.0
2,BENTON,"POLYGON ((-88.00067 36.01744, -88.00066 36.017...",76.923077,61.538462,76.923077,0.0,7.692308,61.538462,7.692308,76.923077,76.923077,92.307692,76.923077,0.0,76.923077,38.461538
3,BLEDSOE,"POLYGON ((-84.9812 35.75686, -84.98101 35.7569...",33.333333,53.333333,46.666667,0.0,0.0,6.666667,13.333333,46.666667,53.333333,73.333333,26.666667,0.0,26.666667,13.333333
4,BLOUNT,"POLYGON ((-83.92084 35.47385, -83.92098 35.473...",71.111111,76.666667,77.777778,8.888889,3.333333,21.111111,11.111111,71.111111,88.888889,85.555556,76.666667,12.222222,32.222222,42.222222


In [48]:
merged.to_csv('tn_counties_risk_pct.csv', index=False)