In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import plotly.express as px
import sys

# The project root is taken to be the parent of the current working directory.
# NOTE(review): this assumes the kernel is started from a sub-folder of the
# project (e.g. a notebooks directory) — confirm.
PROJECT_DIR = Path.cwd().parent
DATA_DIR = PROJECT_DIR.joinpath("data")

# Put the project root on the import path exactly once so that project-local
# packages can be imported from this notebook.
if sys.path.count(str(PROJECT_DIR)) == 0:
    sys.path.append(str(PROJECT_DIR))

from data.data import get_data_df, get_metadata_df

In [None]:
# Load the main data frame and its companion metadata frame through the
# project's loader functions (data first, then metadata).
df, metadata_df = get_data_df(), get_metadata_df()

In [None]:
# Show the loaded frame as this cell's output for a first look at its columns and values.
df

In [None]:
# Pick one (geography, industry) pair to inspect and plot its PA-LQ series over time.
#
# Boolean filtering keeps *all* categories of a categorical column, so unused
# categories are dropped before taking the first one. Without that step the
# "first NAICS code of this geography" would really be the first NAICS code of
# the whole frame, which may not occur in the selected geography at all and
# would leave `target_df` empty.
target_id = df.IBRC_Geo_ID.cat.remove_unused_categories().cat.categories[0]
target_naics = (
    df.loc[df.IBRC_Geo_ID == target_id, "NAICS Code"]
    .cat.remove_unused_categories()
    .cat.categories[0]
)
target_df = df.loc[(df.IBRC_Geo_ID == target_id) & (df["NAICS Code"] == target_naics), :]

# One line per PA-LQ code description, plotted against Year.
px.line(target_df, x="Year", y="PA-LQ_Data", color="PA-LQ_Code_Description")

In [None]:
# Sanity check: is the value stored under PA-LQ code "200" the geometric mean
# of the values stored under codes "100", "300", "400" and "500"? (Yes.)
# NOTE(review): presumably "200" is the composite (CLQ) measure and the other
# four codes are its components — confirm against the metadata.
from statistics import geometric_mean

for key, grp in target_df.groupby(by="Year", observed=True):
    lq_codes = grp["PA-LQ_Code"]
    # `.values.item()` requires exactly one matching row per code and year;
    # it raises otherwise.
    component_values = [
        grp.loc[lq_codes == code, "PA-LQ_Data"].values.item()
        for code in {"100", "300", "400", "500"}
    ]
    # Left column: geometric mean of the four components; right column: code "200".
    print(geometric_mean(component_values), end="\t")
    print(grp.loc[lq_codes == "200", "PA-LQ_Data"].values.item())

In [None]:
import requests

# County boundaries (GeoJSON) from the plotly datasets repository; the
# feature ids are matched against county ids later in the notebook.
COUNTIES_GEOJSON_URL = "https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json"

# Without a timeout `requests.get` can block forever on a stalled connection.
counties_response = requests.get(COUNTIES_GEOJSON_URL, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
counties_response.raise_for_status()
counties = counties_response.json()

In [None]:
# Which geo ids contain a space character? Count the rows for each such id,
# dropping categories that do not occur in the filtered selection.
(
    df.loc[df["IBRC_Geo_ID"].str.contains(" "), "IBRC_Geo_ID"]
    .cat.remove_unused_categories()
    .value_counts()
)

In [None]:
# Distribution of geo id lengths once surrounding whitespace is stripped
# (each distinct id is counted once).
(
    df["IBRC_Geo_ID"]
    .drop_duplicates()
    .str.strip()
    .str.len()
    .value_counts()
)

In [None]:
# Hypothesis: a geo id with trailing whitespace becomes a valid GeoJSON county
# id once the whitespace is stripped and a leading "0" is added.
# Print the first row whose geo id contains a space ...
print(df.loc[df["IBRC_Geo_ID"].str.contains(" ")].iloc[0])
# ... and look up the GeoJSON feature with id "05019" to compare by eye
# (raises StopIteration if no such feature exists).
next(feature for feature in counties["features"] if feature["id"] == "05019")

In [None]:
# Fix IBRC_Geo_ID: every category that ends with a space has that last
# character removed and a leading "0" added; all other categories are left
# untouched.
# `str.endswith` is used instead of indexing `[-1]` so that an empty-string
# category does not raise IndexError.
geoid_category_map = {
    old_cat: f"0{old_cat[:-1]}"
    for old_cat in df.IBRC_Geo_ID.cat.categories
    if old_cat.endswith(" ")
}
# Renaming categories relabels every affected row at once without touching the codes.
df["IBRC_Geo_ID"] = df.IBRC_Geo_ID.cat.rename_categories(geoid_category_map)

In [None]:
# Clean up NAICS descriptions by removing any leading punctuation and
# whitespace characters from each category label.
import string

leading_chars_to_strip = string.punctuation + string.whitespace
naics_category_map = {
    old_cat: old_cat.lstrip(leading_chars_to_strip)
    for old_cat in df["NAICS Description"].cat.categories
}
# NOTE(review): if two labels differ only in their leading characters, the
# cleaned labels would collide — confirm this cannot happen in the data.
df["NAICS Description"] = df["NAICS Description"].cat.rename_categories(naics_category_map)

In [None]:
# Turn Year into an *ordered* categorical whose categories are the existing
# ones in sorted order.
ordered_year_dtype = pd.CategoricalDtype(
    categories=sorted(df["Year"].cat.categories),
    ordered=True,
)
df["Year"] = df["Year"].astype(ordered_year_dtype)

In [None]:
# Animated county choropleth of the PA-LQ code "200" values, one frame per Year.
# Shared figure settings; `range_color=(0, 12)` could be added here to pin the
# colour scale across frames.
choropleth_options = dict(
    geojson=counties,
    locations="IBRC_Geo_ID",
    color="PA-LQ_Data",
    color_continuous_scale="Reds",
    mapbox_style="carto-positron",
    zoom=3,
    # NOTE(review): presumably the approximate centre of the contiguous US — confirm.
    center=dict(lat=37.0902, lon=-95.7129),
    opacity=0.5,
    animation_frame="Year",
)

# Rows are grouped by NAICS code, but only the first group is drawn: the loop
# stops after a single iteration.
for key, grp in df.loc[df["PA-LQ_Code"] == "200"].groupby(by=["NAICS Code"], observed=True):
    fig = px.choropleth_mapbox(grp, **choropleth_options)
    fig.update_layout(margin=dict(r=0, t=0, l=0, b=0))
    break

# Display the figure as the cell output.
fig

In [None]:
def _first_seen_lookup(frame, key_col, value_col):
    """Return a dict mapping each distinct `key_col` value to a `value_col` value.

    For every distinct key, the value comes from the first row in which that
    key appears in `frame`.
    """
    first_rows = frame.drop_duplicates(key_col)
    return first_rows.set_index(key_col)[value_col].to_dict()


# Code -> human-readable label lookups.
# NOTE(review): "Description" is presumably the county name — confirm.
geoid_to_county_name = _first_seen_lookup(df, "IBRC_Geo_ID", "Description")
naics_code_to_industry_name = _first_seen_lookup(df, "NAICS Code", "NAICS Description")
lq_code_to_lq_name = _first_seen_lookup(df, "PA-LQ_Code", "PA-LQ_Code_Description")