In [14]:
import pandas as pd
import numpy as np
import os
import re

# Expenditure

In [15]:
us_states = {
    'AL': 'ALABAMA',    'AK': 'ALASKA',    'AZ': 'ARIZONA',    'AR': 'ARKANSAS',    'CA': 'CALIFORNIA',
    'CO': 'COLORADO',    'CT': 'CONNECTICUT',    'DE': 'DELAWARE',    'FL': 'FLORIDA',    'GA': 'GEORGIA',
    'HI': 'HAWAII',    'ID': 'IDAHO',    'IL': 'ILLINOIS',    'IN': 'INDIANA',    'IA': 'IOWA',
    'KS': 'KANSAS',    'KY': 'KENTUCKY',    'LA': 'LOUISIANA',    'ME': 'MAINE',    'MD': 'MARYLAND',
    'MA': 'MASSACHUSETTS',    'MI': 'MICHIGAN',    'MN': 'MINNESOTA',    'MS': 'MISSISSIPPI',    'MO': 'MISSOURI',
    'MT': 'MONTANA',    'NE': 'NEBRASKA',    'NV': 'NEVADA',    'NH': 'NEW HAMPSHIRE',    'NJ': 'NEW JERSEY',
    'NM': 'NEW MEXICO',    'NY': 'NEW YORK',    'NC': 'NORTH CAROLINA',    'ND': 'NORTH DAKOTA',    'OH': 'OHIO',
    'OK': 'OKLAHOMA',    'OR': 'OREGON',    'PA': 'PENNSYLVANIA',    'RI': 'RHODE ISLAND',    'SC': 'SOUTH CAROLINA',
    'SD': 'SOUTH DAKOTA',    'TN': 'TENNESSEE',    'TX': 'TEXAS',    'UT': 'UTAH',    'VT': 'VERMONT',
    'VA': 'VIRGINIA',    'WA': 'WASHINGTON',    'WV': 'WEST VIRGINIA',    'WI': 'WISCONSIN',    'WY': 'WYOMING'
}


In [16]:
folder_path = "./mfcu_charts"
target_data = "Total Medicaid Expenditures"
target_state = "MA" # Massachusetts

if target_state not in us_states:
    raise ValueError(f"Invalid target state: {target_state}. Must be one of {list(us_states.keys())}.")

In [17]:
results = []

# state aliases for matching
state_aliases = {target_state,us_states[target_state]}

for filename in os.listdir(folder_path):
    if filename.endswith(".csv") and filename.startswith("fy"):
        file_path = os.path.join(folder_path, filename)
        
        # extract year from filename
        year_match = re.search(r"fy(\d{4})", filename)
        if not year_match:
            continue
        year = int(year_match.group(1))

        # load csv file
        df = pd.read_csv(file_path, dtype=str)  # to avoid dtype issues
        
        # column name normalization
        df.columns = [col.strip().lower() for col in df.columns]

        # find the state column
        state_col_candidates = df.columns[df.columns.str.contains("state|location|jurisdiction", case=False)]
        if len(state_col_candidates) == 0:
            state_col = df.columns[0]  # fallback to the first column if no candidates found
        else:
            state_col = state_col_candidates[0]

        # match the target state
        match_row = df[df[state_col].str.strip().str.upper().isin(state_aliases)]

        if not match_row.empty:
            # find the target data column
            col_candidates = [col for col in df.columns if target_data.lower() == col.lower()]
            if col_candidates:
                value = match_row.iloc[0][col_candidates[0]]
                results.append({"Year": year, target_data: value})
            else:
                print(f"⚠️ file {filename} does not contain {target_data}")
        else:
            print(f"⚠️ file {filename} doses not contain data for {target_state}")



In [18]:
# Create a DataFrame from the results
if not results:
    raise ValueError(f"No data found for {target_state} in the specified files.")
final_df = pd.DataFrame(results).sort_values("Year")
final_df[target_data] = final_df[target_data].replace({'\$': '', ',': ''}, regex=True)
final_df[target_data] = pd.to_numeric(final_df[target_data], errors='coerce')

  final_df[target_data] = final_df[target_data].replace({'\$': '', ',': ''}, regex=True)


In [19]:
final_df.rename(columns={"Year": "ds"}, inplace=True)
final_df['ds'] = pd.to_datetime(final_df['ds'], format='%Y')
final_df.rename(columns={target_data: 'y'}, inplace=True)

In [20]:
final_df.to_csv(f"{target_state}_medicaid_expenditures.csv", index=False)

# Enrollment
Data Source: https://data.medicaid.gov/dataset/6165f45b-ca93-5bb5-9d06-db29c692a360/data

In [21]:
df_enrollment = pd.read_csv("Medicaid and CHIP.csv")

In [22]:
target_data1 = "Total Medicaid Enrollment"
target_state1 = "MA" # Massachusetts

if target_state1 not in us_states:
    raise ValueError(f"Invalid target state: {target_state1}. Must be one of {list(us_states.keys())}.")

if target_data1 not in df_enrollment.columns:
    raise ValueError(f"Invalid target data: {target_data1}. Must be one of {df_enrollment.columns.tolist()}.")

In [23]:
df_need = df_enrollment[['State Abbreviation','Reporting Period',target_data1]].copy()
df_need.query("`State Abbreviation` == @target_state1", inplace=True)
df_need['Reporting Period'] = pd.to_datetime(df_need['Reporting Period'], format='%Y%m')
df_need.dropna(subset=['Reporting Period', target_data1], inplace=True)
df_need.rename(columns={'Reporting Period': 'ds', target_data1: 'y'}, inplace=True)
df_need.drop(columns=['State Abbreviation'], inplace=True)

In [24]:
df_need.to_csv(f"{target_state1}_medicaid_enrollment.csv", index=False)