In [1]:
import numpy as np
from cities.utils.data_grabber import find_repo_root, MSADataGrabberCSV, DataGrabberCSV
import pandas as pd
import numpy as np
import requests
from us import states

root = find_repo_root()

data = DataGrabberCSV()
data.get_features_wide(["gdp"])
gdp = data.wide["gdp"]

dataMA = MSADataGrabberCSV()
dataMA.get_features_wide(["gdp_ma"])
gdp_ma = dataMA.wide["gdp_ma"]


# note that there is also a library for accessing the Census data:
# https://github.com/datamade/census


# description of the missing values in the data (it is sometimes caused by a mistake that can be corrected):
#https://www2.census.gov/geo/pdfs/reference/Geography_Notes.pdf

In [None]:
variables = "NAME,DP05_0072E,DP05_0074E,DP05_0075E,DP05_0076E,DP05_0077E,DP05_0079E,DP05_0080E,DP05_0081E,DP05_0082E,DP05_0083E,DP05_0084E,DP05_0085E"
county_fips = "*" # all counties
tract = "*" # all tracts
api_key = '077d857d6c12d5b9b3aeafa07d2c1916ba12a86c' # private api key required to access the data https://api.census.gov/data/key_signup.html
year = 2022

dfs = []

for x in range(0, len(states.STATES)): # in this call it's not possible to use the '*' wildcard to access all states, so we need to iterate over all states
    fips = states.STATES[x].fips

    url = f'https://api.census.gov/data/{year}/acs/acs5/profile?get={variables}&for=tract:{tract}&in=state:{fips}&in=county:{county_fips}&key={api_key}'

    response = requests.get(url)

    assert response.status_code == 200, 'The data retrieval went wrong'  # 200 means success

    print(f'{fips} fips done')

    data = response.json()

    df = pd.DataFrame(data[1:], columns=data[0])

    dfs.append(df)
    
    combined_df = pd.concat(dfs, ignore_index=True)



In [19]:
combined_df.head()

Unnamed: 0,NAME,DP05_0072E,DP05_0074E,DP05_0075E,DP05_0076E,DP05_0077E,DP05_0079E,DP05_0080E,DP05_0081E,DP05_0082E,DP05_0083E,DP05_0084E,DP05_0085E,state,county,tract
0,Census Tract 201; Autauga County; Alabama,1865,46,0,0,35,1428,208,0,45,0,0,103,1,1,20100
1,Census Tract 202; Autauga County; Alabama,1861,2,0,0,0,674,1042,0,0,0,8,135,1,1,20200
2,Census Tract 203; Autauga County; Alabama,3492,44,0,0,0,2413,876,0,11,0,0,148,1,1,20300
3,Census Tract 204; Autauga County; Alabama,3987,13,0,0,31,3500,297,7,23,0,0,116,1,1,20400
4,Census Tract 205.01; Autauga County; Alabama,4121,9,98,0,45,3209,620,0,97,0,0,43,1,1,20501


In [42]:
ethnic = combined_df.copy()

In [43]:
column_mapping = {
    'DP05_0072E': 'total_pop', # those variable names work for 2022, be aware that in other years their meaning may differ
    'DP05_0074E': 'mexican',
    'DP05_0075E': 'puerto_rican',
    'DP05_0076E': 'cuban',
    'DP05_0077E': 'other_hispanic_latino',
    'DP05_0079E': 'white',
    'DP05_0080E': 'black_african_american',
    'DP05_0081E': 'american_indian_alaska_native',
    'DP05_0082E': 'asian',
    'DP05_0083E': 'native_hawaiian_other_pacific_islander',
    'DP05_0084E': 'other_race',
    'DP05_0085E': 'two_or_more_sum'
}

ethnic.rename(columns=column_mapping, inplace=True)
ethnic.head()

Unnamed: 0,NAME,total_pop,mexican,puerto_rican,cuban,other_hispanic_latino,white,black_african_american,american_indian_alaska_native,asian,native_hawaiian_other_pacific_islander,other_race,two_or_more_sum,state,county,tract
0,Census Tract 201; Autauga County; Alabama,1865,46,0,0,35,1428,208,0,45,0,0,103,1,1,20100
1,Census Tract 202; Autauga County; Alabama,1861,2,0,0,0,674,1042,0,0,0,8,135,1,1,20200
2,Census Tract 203; Autauga County; Alabama,3492,44,0,0,0,2413,876,0,11,0,0,148,1,1,20300
3,Census Tract 204; Autauga County; Alabama,3987,13,0,0,31,3500,297,7,23,0,0,116,1,1,20400
4,Census Tract 205.01; Autauga County; Alabama,4121,9,98,0,45,3209,620,0,97,0,0,43,1,1,20501


In [44]:
state_abbreviations = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY'
}

ethnic['GeoFIPS'] = ethnic.apply(lambda row: f"{row['state']}{row['county']}{row['tract']}", axis=1).astype(np.int64)

ethnic.drop(['state', 'county', 'tract'], axis=1, inplace=True)

#pop['GeoName'] = pop['NAME'].apply(lambda x: f"{x.split(',')[1].strip().replace(' County', '')}, {state_abbreviations[x.split(',')[2].strip()]} (CT)")


def parse_geo_name(name):
    if ';' in name:
        parts = name.split(';')
    else:
        parts = name.split(',')

    if len(parts) >= 3:
        county = parts[1].strip().replace(' County', '')
        state_full = parts[2].strip()
        state_abbr = state_abbreviations.get(state_full, state_full)  
        return f"{county}, {state_abbr} (CT)"
    return "Unknown"


ethnic['GeoName'] = ethnic['NAME'].apply(parse_geo_name).astype(str)

assert ethnic[ethnic['GeoName'] == 'Unknown'].shape[0] == 0, 'There are Unknown GeoNames'

ethnic = ethnic.drop(['NAME'], axis=1)

print(ethnic['GeoName'].nunique())
ethnic.head()

3143


Unnamed: 0,total_pop,mexican,puerto_rican,cuban,other_hispanic_latino,white,black_african_american,american_indian_alaska_native,asian,native_hawaiian_other_pacific_islander,other_race,two_or_more_sum,GeoFIPS,GeoName
0,1865,46,0,0,35,1428,208,0,45,0,0,103,1001020100,"Autauga, AL (CT)"
1,1861,2,0,0,0,674,1042,0,0,0,8,135,1001020200,"Autauga, AL (CT)"
2,3492,44,0,0,0,2413,876,0,11,0,0,148,1001020300,"Autauga, AL (CT)"
3,3987,13,0,0,31,3500,297,7,23,0,0,116,1001020400,"Autauga, AL (CT)"
4,4121,9,98,0,45,3209,620,0,97,0,0,43,1001020501,"Autauga, AL (CT)"


In [45]:
rows1 = ethnic.shape[0]
ethnic = ethnic.dropna(how='any')
rows2 = ethnic.shape[0]

ethnic.sort_values(by=['GeoFIPS', 'GeoName'], inplace=True)
print(f"This many rows were removed: {rows1 - rows2}")


This many rows were removed: 0


In [None]:
# read the areas post2020


# and

# df['CBSA'] = df['CBSA'].astype(np.int64)
# df2 = df[df['CBSA'].isin(metro_areas['GeoFIPS'])]

# df2 = pd.merge(df2, metro_areas[['GeoFIPS', 'GeoName']], left_on='CBSA', right_on='GeoFIPS', how='inner')
# df2 = df2.drop_duplicates(subset=['CBSA'])

# df2.drop(columns='CBSA', inplace=True)
# df2.head()



In [46]:
cols_to_save = ethnic.shape[1] - 2
ethnic = ethnic[['GeoFIPS', 'GeoName'] + list(ethnic.columns[0:cols_to_save])]
ethnic.head()

Unnamed: 0,GeoFIPS,GeoName,total_pop,mexican,puerto_rican,cuban,other_hispanic_latino,white,black_african_american,american_indian_alaska_native,asian,native_hawaiian_other_pacific_islander,other_race,two_or_more_sum
0,1001020100,"Autauga, AL (CT)",1865,46,0,0,35,1428,208,0,45,0,0,103
1,1001020200,"Autauga, AL (CT)",1861,2,0,0,0,674,1042,0,0,0,8,135
2,1001020300,"Autauga, AL (CT)",3492,44,0,0,0,2413,876,0,11,0,0,148
3,1001020400,"Autauga, AL (CT)",3987,13,0,0,31,3500,297,7,23,0,0,116
4,1001020501,"Autauga, AL (CT)",4121,9,98,0,45,3209,620,0,97,0,0,43


In [48]:
# wrangling other races, verifying the sum to the total population
# normalizing the values, to create a composition of the population

ethnic.iloc[:, 2:] = ethnic.iloc[:, 2:].apply(
    pd.to_numeric, errors="coerce"
)
ethnic[ethnic.columns[2:]] = ethnic[
    ethnic.columns[2:]
].astype(float)

ethnic["other_race_races"] = (
    ethnic["other_race"] + ethnic["two_or_more_sum"]
)
ethnic = ethnic.drop(
    ["other_race", "two_or_more_sum"], axis=1
)

ethnic["totalALT"] = ethnic.iloc[:, 3:].sum(axis=1)
assert (ethnic["totalALT"] == ethnic["total_pop"]).all()
ethnic = ethnic.drop("totalALT", axis=1)


# assertion that by rows they sum up to 1

row_sums = ethnic.iloc[:, 2:].sum(axis=1)
ethnic.iloc[:, 3:] = ethnic.iloc[:, 3:].div(
    row_sums, axis=0
)

ethnic = ethnic.drop(["total_pop"], axis=1)
ethnic.head()



# columns_to_convert = pop_pre2020_filtered_wide.columns[2:]  
# pop_pre2020_filtered_wide[columns_to_convert] = pop_pre2020_filtered_wide[columns_to_convert].astype(float)

KeyError: 'other_race'

In [49]:
ethnic.dtypes

GeoFIPS                                     int64
GeoName                                    object
mexican                                   float64
puerto_rican                              float64
cuban                                     float64
other_hispanic_latino                     float64
white                                     float64
black_african_american                    float64
american_indian_alaska_native             float64
asian                                     float64
native_hawaiian_other_pacific_islander    float64
other_race_races                          float64
dtype: object

In [None]:
# save them