In [44]:
import pandas as pd
from pathlib import Path
import re
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import altair as alt
from vega_datasets import data


# Directory layout, expressed relative to the notebook's working directory.
BASE_DIR = Path("..")
ORIGINAL_DATA_DIR = BASE_DIR.joinpath("original_data")
CLEAN_DATA_DIR = BASE_DIR.joinpath("clean_data")

# Cleaned CSV inputs read by the loading cell below.
FILE_PATH_TERM = CLEAN_DATA_DIR.joinpath("nsf_terminations_airtable.csv")
FILE_PATH_UNI = CLEAN_DATA_DIR.joinpath("heis_usa.csv")
FILE_PATH_COMPLETE = CLEAN_DATA_DIR.joinpath("nsf_awards_us_2019_2024.csv")

In [45]:
# Read the three cleaned CSV inputs, in order, into their DataFrames:
#   terminations  <- FILE_PATH_TERM
#   universities  <- FILE_PATH_UNI
#   total_dataset <- FILE_PATH_COMPLETE
terminations, universities, total_dataset = (
    pd.read_csv(csv_path)
    for csv_path in (FILE_PATH_TERM, FILE_PATH_UNI, FILE_PATH_COMPLETE)
)

In [46]:
# Choropleth 1: absolute number of terminated NSF grants per US state.

# State boundaries (TopoJSON) used as the base geometry of the map.
us_map = alt.topo_feature(data.us_10m.url, feature="states")

# Reference table mapping each full state name to the `id` that the map
# features are joined on in `transform_lookup` below.
state_names = data.population_engineers_hurricanes()
state_abbrev = (
    state_names[["state", "id"]]
    .drop_duplicates()
    .rename(columns={"state": "org_state_full"})
)

# Count terminations per state. Whitespace is stripped BEFORE grouping so that
# variants such as "Texas" and "Texas " are counted together instead of
# producing two rows that later collide on the same map `id`.
terminated_by_state = (
    terminations
    .assign(org_state_full=lambda df: df["org_state_full"].str.strip())
    .groupby("org_state_full")
    .size()
    .reset_index(name="terminated_count")
)

# Add a zero-count row for every state of the reference table that has no
# terminations (previously hard-coded for Wyoming only). The outer merge keeps
# all existing rows and never duplicates a state that is already present.
terminated_by_state = (
    terminated_by_state
    .merge(state_abbrev[["org_state_full"]], on="org_state_full", how="outer")
    .fillna({"terminated_count": 0})
    .astype({"terminated_count": "int64"})
)

# Attach the map `id`; names absent from the reference table keep a missing id
# and therefore never match a map feature.
terminated_map = pd.merge(terminated_by_state, state_abbrev, on="org_state_full", how="left")

choropleth = (
    alt.Chart(us_map)
    .mark_geoshape()
    .transform_lookup(
        lookup="id",
        from_=alt.LookupData(terminated_map, "id", ["org_state_full", "terminated_count"])
    )
    .encode(
        color=alt.Color(
            "terminated_count:Q",
            title="Nombre de grants terminades",
            scale=alt.Scale(scheme="reds")
        ),
        tooltip=["org_state_full:N", "terminated_count:Q"]
    )
    .project("albersUsa")
    .properties(
        title="Grants terminades per estat (NSF)",
        width=700,
        height=400
    )
)

choropleth


In [47]:
import pandas as pd
import altair as alt
from vega_datasets import data

# ---------------------------------------------------------------------
# Second idea: ratio of terminated grants to number of universities
# ---------------------------------------------------------------------

# Reference table mapping each full state name to the `id` that the map
# features are joined on in `transform_lookup` below.
state_names = data.population_engineers_hurricanes()
state_fips = (
    state_names[["state", "id"]]
    .drop_duplicates()
    .rename(columns={"state": "org_state_full"})
)

# 1. Count terminations per state (full name). Whitespace is stripped BEFORE
#    grouping so that "Texas" and "Texas " are not counted as separate states.
terminated_by_state = (
    terminations
    .assign(org_state_full=lambda df: df["org_state_full"].str.strip())
    .groupby("org_state_full")
    .size()
    .reset_index(name="terminated_count")
)

# 2. Add a zero-count row for every reference state without terminations
#    (previously hard-coded for Wyoming only). The outer merge keeps all
#    existing rows and never duplicates a state that is already present.
terminated_by_state = (
    terminated_by_state
    .merge(state_fips[["org_state_full"]], on="org_state_full", how="outer")
    .fillna({"terminated_count": 0})
    .astype({"terminated_count": "int64"})
)

# 3. Count rows of `universities` per `region`, again normalising the name
#    before grouping.
universities_by_state = (
    universities
    .assign(region=lambda df: df["region"].str.strip())
    .groupby("region")
    .size()
    .reset_index(name="universities_count")
)

# 4. Combine both counts. The left merge keeps every state from step 2; a
#    state with no match in `universities` gets a missing count and hence a
#    missing ratio.
combined = pd.merge(
    terminated_by_state,
    universities_by_state,
    left_on="org_state_full",
    right_on="region",
    how="left"
).drop(columns="region")

combined["terminated_per_university"] = (
    combined["terminated_count"] / combined["universities_count"]
)

# 5. Attach the map `id` used by the lookup transform.
terminated_map = pd.merge(combined, state_fips, on="org_state_full", how="left")

# 6. Draw the map.
us_map = alt.topo_feature(data.us_10m.url, feature="states")

choropleth = (
    alt.Chart(us_map)
    .mark_geoshape(stroke="white", strokeWidth=0.5)
    .transform_lookup(
        lookup="id",
        from_=alt.LookupData(
            terminated_map,
            "id",
            ["org_state_full", "terminated_per_university"]
        )
    )
    .encode(
        color=alt.Color(
            "terminated_per_university:Q",
            title="Grants terminades per universitat",
            # Anchor the colour scale at 0 and end it at the largest observed ratio.
            scale=alt.Scale(scheme="reds", domain=[0, combined["terminated_per_university"].max()])
        ),
        tooltip=[
            alt.Tooltip("org_state_full:N", title="Estat"),
            alt.Tooltip("terminated_per_university:Q", title="Rati (Grants/Universitat)", format=".2f")
        ]
    )
    .project("albersUsa")
    .properties(
        title="Rati de grants terminades per universitat / centre de recerca (NSF)",
        width=700,
        height=400
    )
)

choropleth


In [48]:
# Choropleth 3: terminated grants as a share of all grants in `total_dataset`,
# per state.

# Reference table mapping each full state name to the `id` that the map
# features are joined on in `transform_lookup` below.
state_fips = (
    data.population_engineers_hurricanes()[["state", "id"]]
    .drop_duplicates()
    .rename(columns={"state": "org_state_full"})
)

# Count terminations per state. Whitespace is stripped before grouping, as in
# the other map cells, so that name variants differing only in padding are
# counted together and match the reference table.
terminated_by_state = (
    terminations
    .assign(org_state_full=lambda df: df["org_state_full"].str.strip())
    .groupby("org_state_full")
    .size()
    .reset_index(name="terminated_count")
)

# Add a zero-count row for every reference state without terminations
# (previously hard-coded for Wyoming only). The outer merge keeps all existing
# rows and never duplicates a state that is already present.
terminated_by_state = (
    terminated_by_state
    .merge(state_fips[["org_state_full"]], on="org_state_full", how="outer")
    .fillna({"terminated_count": 0})
    .astype({"terminated_count": "int64"})
)

# Number of rows of `total_dataset` per state, with the same name normalisation
# so both sides of the merge below use identical keys.
total_grants_by_state = (
    total_dataset
    .assign(org_state_full=lambda df: df["org_state_full"].str.strip())
    .groupby("org_state_full")
    .size()
    .reset_index(name="previous_count")
)

# Ratio of terminated grants to all grants; a state with no match in
# `total_dataset` gets a missing ratio.
combined = terminated_by_state.merge(total_grants_by_state, on="org_state_full", how="left")
combined["terminated_metric"] = combined["terminated_count"] / combined["previous_count"]

# Attach the map `id` used by the lookup transform.
terminated_map = combined.merge(state_fips, on="org_state_full", how="left")

choropleth = (
    alt.Chart(alt.topo_feature(data.us_10m.url, feature="states"))
    .mark_geoshape(stroke="white", strokeWidth=0.5)
    .transform_lookup(
        lookup="id",
        from_=alt.LookupData(terminated_map, "id", ["org_state_full", "terminated_metric"])
    )
    .encode(
        color=alt.Color(
            "terminated_metric:Q",
            title="Rati de grants terminades",
            # Anchor the colour scale at 0 and end it at the largest observed ratio.
            scale=alt.Scale(scheme="reds", domain=[0, combined["terminated_metric"].max()])
        ),
        tooltip=[
            alt.Tooltip("org_state_full:N", title="Estat"),
            alt.Tooltip("terminated_metric:Q", title="Rati de grants terminades", format=".2f")
        ]
    )
    .project("albersUsa")
    .properties(
        title="Rati de grants terminades per estat (NSF)",
        width=700,
        height=400
    )
)

choropleth
