In [None]:
import random

import pandas as pd

# Load the COI data from 2_subdomains.csv into the DataFrame `coi_df`.
# Change `path` to the actual location of the file if it is not in the
# notebook's working directory.
path = "2_subdomains.csv"
coi_df = pd.read_csv(path)

In [None]:
# Average r_SE_WL_nat over every row that belongs to the same county.
# as_index=False keeps county_name as an ordinary column, giving one row per county.
df_avg_score = coi_df.groupby('county_name', as_index=False)['r_SE_WL_nat'].mean()

In [None]:
# These print statements give an idea of what the data looks like and its range.
# Series.min()/.max() skip NaN, whereas the builtin min()/max() give results that
# depend on where a NaN happens to sit in the column.
print(df_avg_score)
print(df_avg_score["r_SE_WL_nat"].min())
print(df_avg_score["r_SE_WL_nat"].max())

In [None]:
# Bucket every county by its mean score.
# The intended integer ranges are [1,33] -> low, [34,66] -> medium, [67,100] -> high,
# but the means are floats, so half-open ranges are used to leave no gaps:
#   score < 34        -> appended to "low",  level "Low"
#   34 <= score < 67  -> appended to "med",  level "Medium"
#   score >= 67       -> appended to "high", level "High"
# (Previously a mean such as 33.5 matched neither of the first two ranges and was
# labelled "High".)

low, med, high = [], [], []
level_dict = {}
zip_dict = {}  # not used by the cells shown here; kept in case later code relies on it

for county, cur_val in zip(df_avg_score["county_name"], df_avg_score["r_SE_WL_nat"]):
    # A county without a usable mean gets no level at all instead of silently
    # landing in the "High" bucket.
    if pd.isna(cur_val):
        continue

    if cur_val < 34:
        low.append(county)
        level_dict[county] = "Low"
    elif cur_val < 67:
        med.append(county)
        level_dict[county] = "Medium"
    else:
        high.append(county)
        level_dict[county] = "High"

In [None]:
# Create the "Level" column by looking up each row's county_name in level_dict.
# Counties that are not keys of level_dict end up as NaN.
coi_df["Level"] = coi_df["county_name"].map(level_dict)

In [None]:
# Read the ADMISSIONS.csv file into the DataFrame `mimic_df`.
# Note: this rebinds `path`, which until now held the COI csv location.
path = "ADMISSIONS.csv"
mimic_df = pd.read_csv(path)

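Example of the assignment chain: an admission with Private insurance is given the "High" level, and is then assigned a random county drawn from the list of high-level counties.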

In [None]:
# Insurance type -> county level. The level decides which pool of counties
# (and therefore zip codes) a MIMIC row draws from when the synthetic
# zip code column is built.
insurance_dict = {
    "Medicaid": "Low",
    "Government": "Low",
    "Self Pay": "Low",
    "Medicare": "Medium",
    "Private": "High",
}

In [None]:
# Create the "Level" column in the MIMIC dataset by mapping each INSURANCE value
# through insurance_dict; values with no entry there become NaN.
mimic_df["Level"] = mimic_df["INSURANCE"].map(insurance_dict)

In [None]:
def assign_county(row, county_pools=None):
    """Pick a random county whose level matches the row's "Level".

    Parameters
    ----------
    row : mapping (for example a DataFrame row)
        Must provide ``row["Level"]``.
    county_pools : dict, optional
        Maps a level name to the list of counties to draw from. When omitted,
        the notebook-level ``low``, ``med`` and ``high`` lists are used, so
        those must already be established before calling without this argument.

    Returns
    -------
    A county name chosen with ``random.choice`` from the matching pool, or
    ``None`` when the row's level has no pool (for example NaN from an
    insurance type that was not mapped to a level).
    """
    if county_pools is None:
        county_pools = {"Low": low, "Medium": med, "High": high}

    pool = county_pools.get(row["Level"])
    if pool is None:
        # Unknown or missing level: say so explicitly instead of falling off
        # the end of the function.
        return None
    return random.choice(pool)
    
# Seed Python's random module so the random county draws (and the later random
# zip code draws) come out the same on every Restart & Run All.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# This line of code assigns counties to the MIMIC dataset based on the level
mimic_df["county_name"] = mimic_df.apply(assign_county, axis=1)

In [None]:
# Display the first rows of mimic_df to check its contents, including the
# "Level" and "county_name" columns added above
mimic_df.head()

In [None]:
# The following json file was obtained from GitHub: https://github.com/AdmitHub/us-zcta-counties/tree/main

import json

# state_county_zip.json is expected in the same folder as this notebook; adjust
# the path otherwise. The encoding is passed explicitly so that decoding does not
# depend on the platform's default encoding.
# get_zip reads the result as data[state]["counties"][county]["zip_codes"].
with open('state_county_zip.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Full state name -> two-letter abbreviation, which is how the zip-code lookup is keyed.
# Built once here instead of on every get_zip call.
_STATE_ABBREVIATIONS = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    'District of Columbia': "DC",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
}

# Returned whenever no zip code can be determined; worth checking for in the output csv.
_FALLBACK_ZIP = '00000'


def get_zip(county_name, zip_data=None):
    """Return a randomly chosen zip code for a "County, State" string.

    Parameters
    ----------
    county_name : str
        County and full state name separated by ", ", e.g.
        'Armstrong County, Pennsylvania'. A name without ", " is treated as
        being in Alaska (this happens for some "census areas").
    zip_data : dict, optional
        Lookup laid out as zip_data[state_abbrev]["counties"][county]["zip_codes"].
        When omitted, the notebook-level ``data`` loaded from
        state_county_zip.json is used.

    Returns
    -------
    One entry of the county's zip code list, or '00000' when the input is not a
    string (None/NaN), the state name is not in the abbreviation table, the
    state/county is missing from the lookup, or the county has no zip codes.
    """
    if zip_data is None:
        zip_data = data

    # Rows that never received a county (None/NaN) cannot be looked up.
    if not isinstance(county_name, str):
        return _FALLBACK_ZIP

    # Assumes there will be a comma and space in between
    elements = county_name.split(', ')
    county = elements[0]

    if len(elements) > 1:
        # .get() so an unlisted state name falls through to the fallback
        # instead of raising KeyError.
        state = _STATE_ABBREVIATIONS.get(elements[1])
    else:
        # Sometimes "census areas" don't have a corresponding state, so splitting
        # on the comma yields a single element. They tend to be in Alaska.
        state = "AK"

    zip_codes = (
        zip_data.get(state, {})
        .get("counties", {})
        .get(county, {})
        .get("zip_codes")
    )
    if not zip_codes:
        return _FALLBACK_ZIP

    # Pick randomly rather than always taking the first zip code of the county.
    return random.choice(zip_codes)



# get_zip('Armstrong County, Pennsylvania')

In [None]:
# Actually create the synthetic zip code column.
# Series.map calls get_zip once per county_name value, in row order; unlike a
# row-wise DataFrame.apply it does not build a row object per call and it also
# works when mimic_df has no rows.
mimic_df['zip_code'] = mimic_df['county_name'].map(get_zip)

In [None]:
# Write the resulting csv file (can change the name here).
# index=False is the documented way to leave the DataFrame's row index out of
# the file; the previous index=None only worked because None is falsy.
mimic_df.to_csv("ADMISSIONS_SYNTHETIC.csv", index=False)