In [1]:
import numpy as np
from cities.utils.cleaning_utils import find_repo_root
import pandas as pd
import numpy as np
import requests

# Resolve the repository root once; later cells build their data paths
# (f"{root}/data/raw/...") from it.
# NOTE(review): presumably returns the top-level directory of this repo -- confirm
# against cities.utils.cleaning_utils.find_repo_root.
root = find_repo_root()

Note: this is not a time-series version; it pulls a single snapshot from the 2021 ACS 5-year data profile.

In [2]:
# Geography clause for the Census API: every metropolitan/micropolitan
# statistical area (the trailing ":*" is the API's "all areas" wildcard).
area = "metropolitan%20statistical%20area/micropolitan%20statistical%20area:*"

# DP03_0004E plus DP03_0033E..DP03_0045E from the ACS data profile; these codes
# are mapped to readable column names in a later cell.
variables = "DP03_0004E,DP03_0033E,DP03_0034E,DP03_0035E,DP03_0036E,DP03_0037E,DP03_0038E,DP03_0039E,DP03_0040E,DP03_0041E,DP03_0042E,DP03_0043E,DP03_0044E,DP03_0045E"


url = f"https://api.census.gov/data/2021/acs/acs5/profile?get={variables}&for={area}"

# requests has no default timeout; without one a stalled connection would hang
# the kernel indefinitely.
response = requests.get(url, timeout=60)

# Explicit check instead of `assert`: asserts are stripped under `python -O`
# and give no hint about what the server actually answered.
if response.status_code != 200:
    raise RuntimeError(
        f"Census API request failed with HTTP {response.status_code}: "
        f"{response.text[:200]}"
    )

data = response.json()

# The payload is a list of rows whose first row holds the column names.
df = pd.DataFrame(data[1:], columns=data[0])
df.columns

Index(['DP03_0004E', 'DP03_0033E', 'DP03_0034E', 'DP03_0035E', 'DP03_0036E',
       'DP03_0037E', 'DP03_0038E', 'DP03_0039E', 'DP03_0040E', 'DP03_0041E',
       'DP03_0042E', 'DP03_0043E', 'DP03_0044E', 'DP03_0045E',
       'metropolitan statistical area/micropolitan statistical area'],
      dtype='object')

In [33]:
# Readable names for the Census DP03 variable codes and for the long
# geography column returned by the API.
column_name_mapping = {
    "metropolitan statistical area/micropolitan statistical area": "CBSA",
    "DP03_0004E": "employed_sum",
    "DP03_0033E": "agri_forestry_mining",
    "DP03_0034E": "construction",
    "DP03_0035E": "manufacturing",
    "DP03_0036E": "wholesale_trade",
    "DP03_0037E": "retail_trade",
    "DP03_0038E": "transport_utilities",
    "DP03_0039E": "information",
    "DP03_0040E": "finance_real_estate",
    "DP03_0041E": "prof_sci_mgmt_admin",
    "DP03_0042E": "education_health",
    "DP03_0043E": "arts_entertainment",
    "DP03_0044E": "other_services",
    "DP03_0045E": "public_admin",
}

# rename() hands back a new frame, so the raw `df` stays untouched.
industry = df.rename(columns=column_name_mapping)

In [34]:
metro_areas = pd.read_csv(f"{root}/data/raw/metrolist.csv")

# The API returns the CBSA code as text; cast it so it can be compared with
# and joined on GeoFIPS.
industry["CBSA"] = industry["CBSA"].astype(np.int64)
in_metro_list = industry["CBSA"].isin(metro_areas["GeoFIPS"])

# Keep only listed areas, attach their names, and retain one row per CBSA.
industry_comp = (
    industry[in_metro_list]
    .merge(
        metro_areas[["GeoFIPS", "GeoName"]],
        left_on="CBSA",
        right_on="GeoFIPS",
        how="inner",
    )
    .drop_duplicates(subset=["CBSA"])
    .drop(columns="CBSA")
)

# Identifier columns first, every remaining column after them in its
# existing order.
id_columns = ["GeoFIPS", "GeoName"]
value_columns = [name for name in industry_comp.columns if name not in id_columns]
industry_comp = industry_comp[id_columns + value_columns]

industry_comp.head()

Unnamed: 0,GeoFIPS,GeoName,employed_sum,agri_forestry_mining,construction,manufacturing,wholesale_trade,retail_trade,transport_utilities,information,finance_real_estate,prof_sci_mgmt_admin,education_health,arts_entertainment,other_services,public_admin
0,10180,"Abilene, TX (MA)",78405,2354,5529,5381,1469,9623,4468,819,5548,5466,22423,6579,3843,4903
3,10420,"Akron, OH (MA)",353066,1847,21688,53220,10304,41835,18064,5288,22163,35616,83123,32710,16375,10833
5,10500,"Albany, GA (MA)",61978,1678,3498,5882,1719,7373,3593,747,2749,5162,16498,5084,3309,4686
9,10540,"Albany-Lebanon, OR (MA)",57062,2770,4329,7490,993,6423,3090,375,2351,4323,13753,4251,3081,3833
10,10580,"Albany-Schenectady-Troy, NY (MA)",455453,3144,25093,35986,8825,46349,19456,7942,32915,53571,121743,35743,20341,44345


In [35]:
# Turn the per-industry counts into shares that sum to 1 within each row.
#
# Columns are selected by name rather than by position: the positional
# iloc[:, 2:] / iloc[:, 3:] form silently left the first industry column out of
# the normalisation if this cell was re-run after "employed_sum" was dropped.
id_cols = ["GeoFIPS", "GeoName"]
industry_cols = [
    col for col in industry_comp.columns if col not in id_cols + ["employed_sum"]
]

# Build a fresh float frame instead of assigning through iloc, which can leave
# the columns as object dtype (NOTE(review): depends on pandas version).
industry_counts = industry_comp[industry_cols].astype(float)

# The denominator is the sum of the industry columns, not "employed_sum".
row_sums = industry_counts.sum(axis=1)
industry_shares = industry_counts.div(row_sums, axis=0)

# Reassemble identifiers + shares; "employed_sum" is dropped by omission.
industry_comp = pd.concat([industry_comp[id_cols], industry_shares], axis=1)

industry_comp.head()

Unnamed: 0,GeoFIPS,GeoName,agri_forestry_mining,construction,manufacturing,wholesale_trade,retail_trade,transport_utilities,information,finance_real_estate,prof_sci_mgmt_admin,education_health,arts_entertainment,other_services,public_admin
0,10180,"Abilene, TX (MA)",0.030024,0.070518,0.068631,0.018736,0.122735,0.056986,0.010446,0.070761,0.069715,0.285989,0.08391,0.049015,0.062534
3,10420,"Akron, OH (MA)",0.005231,0.061428,0.150737,0.029184,0.118491,0.051163,0.014977,0.062773,0.100876,0.235432,0.092646,0.046379,0.030683
5,10500,"Albany, GA (MA)",0.027074,0.056439,0.094905,0.027736,0.118962,0.057972,0.012053,0.044354,0.083288,0.266191,0.082029,0.05339,0.075607
9,10540,"Albany-Lebanon, OR (MA)",0.048544,0.075865,0.131261,0.017402,0.112562,0.054152,0.006572,0.041201,0.07576,0.241019,0.074498,0.053994,0.067173
10,10580,"Albany-Schenectady-Troy, NY (MA)",0.006903,0.055095,0.079011,0.019376,0.101765,0.042718,0.017438,0.072269,0.117621,0.267301,0.078478,0.044661,0.097365


In [36]:
# Write the industry shares next to the other raw inputs, without the
# DataFrame index.
industry_ma_path = f"{root}/data/raw/industry_ma.csv"
industry_comp.to_csv(industry_ma_path, index=False)