# AUTHORITATIVE

# Load all NSF-Awards data

In [15]:
import requests
import zipfile  
import io
import os
import json
import pandas as pd

data_dir = '../data'

## Get the data
Download the award zip files for 2010 through 2025, extract all of the JSON files, load them into one large DataFrame, then subdivide that into DataFrames for awards, PIs, ...

In [None]:


# Download each year's award archive from NSF and unpack its JSON files.
# Both target directories are created up front so the writes below cannot fail
# just because a folder is missing.
zip_dir = "zipfiles"
# Extract under data_dir so the files land where the reading cell looks for
# them ('../data/awards_json/').
json_dir = os.path.join(data_dir, "awards_json")
os.makedirs(zip_dir, exist_ok=True)
os.makedirs(json_dir, exist_ok=True)

for year in range(2010, 2026):
    awards_url = f"https://www.nsf.gov/awardsearch/download?DownloadFileName={year}&All=true&isJson=true"
    zip_path = os.path.join(zip_dir, f"awards_{year}.zip")
    print(f"Downloading awards data for {year}...")

    # Download the zip file from the URL. The timeout stops a stalled
    # connection from hanging the notebook, and raise_for_status() keeps an
    # HTTP error page from being saved as if it were a zip archive.
    response = requests.get(awards_url, timeout=60)
    response.raise_for_status()
    with open(zip_path, "wb") as f:
        f.write(response.content)

    print(f"zip file for {year} downloaded")

    # Unzip the file into its own per-year folder
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(os.path.join(json_dir, f"awards_{year}"))

    print(f"zip file for {year} unzipped")




Downloading awards data for 2010...
zip file for 2010 downloaded
zip file for 2010 unzipped
Downloading awards data for 2011...
zip file for 2011 downloaded
zip file for 2011 unzipped
Downloading awards data for 2012...
zip file for 2012 downloaded
zip file for 2012 unzipped
Downloading awards data for 2013...
zip file for 2013 downloaded
zip file for 2013 unzipped
Downloading awards data for 2014...
zip file for 2014 downloaded
zip file for 2014 unzipped
Downloading awards data for 2015...
zip file for 2015 downloaded
zip file for 2015 unzipped
Downloading awards data for 2016...
zip file for 2016 downloaded
zip file for 2016 unzipped
Downloading awards data for 2017...
zip file for 2017 downloaded
zip file for 2017 unzipped
Downloading awards data for 2018...
zip file for 2018 downloaded
zip file for 2018 unzipped
Downloading awards data for 2019...
zip file for 2019 downloaded
zip file for 2019 unzipped
Downloading awards data for 2020...
zip file for 2020 downloaded
zip file for 20

In [None]:
# Read all JSON files from the unzipped folder into a single DataFrame using the pandas json_normalize function
def read_json_files(folder_path):
    """Load every ``.json`` file found under *folder_path* into one DataFrame.

    The folder is walked recursively. Each JSON document is flattened with
    ``pd.json_normalize`` (nested objects become dotted column names) and the
    per-file frames are concatenated with a fresh 0..n-1 index.

    Args:
        folder_path: Root directory to search for ``.json`` files.

    Returns:
        A DataFrame with the rows of all files, or an empty DataFrame when no
        ``.json`` file is found.
    """
    dataframes = []
    for root, dirs, files in os.walk(folder_path):
        # os.walk yields entries in arbitrary order. Sorting makes the row
        # order (and therefore the resulting index) reproducible across runs
        # and machines.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith(".json"):
                continue
            filename = os.path.join(root, file)
            # Read as UTF-8 explicitly instead of relying on the
            # platform-dependent default encoding.
            with open(filename, 'r', encoding='utf-8') as f:
                data = json.load(f)
            dataframes.append(pd.json_normalize(data))
    # pd.concat raises on an empty list, so return an empty frame instead.
    if not dataframes:
        return pd.DataFrame()
    # Concatenate all DataFrames into a single DataFrame, resetting the index before returning
    return pd.concat(dataframes, ignore_index=True)

# Build the master awards DataFrame from every extracted JSON file,
# printing a message before and after the load.
print("Reading JSON files into DataFrame...")
awards_df = read_json_files(folder_path="../data/awards_json/")
print("DataFrame created")

Reading JSON files into DataFrame...
DataFrame created


## Update datatypes in the dataframe

In [12]:
# Parse the award date columns into pandas datetimes.
date_columns = [
    'awd_eff_date',
    'awd_exp_date',
    'awd_min_amd_letter_date',
    'awd_max_amd_letter_date',
]
for date_col in date_columns:
    # Skip any column that is absent from this load.
    if date_col not in awards_df.columns:
        continue
    # errors='coerce' turns unparseable values into NaT instead of raising.
    awards_df[date_col] = pd.to_datetime(awards_df[date_col], errors='coerce')

### Save and restore the DataFrame

In [None]:
# Checkpoint awards_df to a pickle file so it can be reloaded later without re-reading the JSON files.
awards_df.to_pickle('../data/pkl_files/awards_data_2010_2025_big.pkl')

In [2]:
# Restore awards_df from the pickle checkpoint written by the previous cell.
awards_df = pd.read_pickle('../data/pkl_files/awards_data_2010_2025_big.pkl')

## Extract child tables
The pi, pgm_ele, pgm_ref, app_fund, and oblg_fy columns all contain nested data and will be broken out into separate DataFrames.

In [15]:
# pi table
# Build a lookup table of distinct PIs from the nested 'pi' column:
#   1. explode()           -> one row per list element (one PI entry per row)
#   2. apply(pd.Series)    -> each entry's keys become columns
#   3. drop_duplicates()   -> keep the first row seen for each 'nsf_id'
#   4. reset_index(drop=True) -> fresh 0..n-1 index; the old index is discarded
# Author's run: 330,573 rows x 10 columns before de-duplication,
# 131,363 rows x 10 columns after.
pi_df = (
    awards_df['pi']
    .explode()
    .apply(pd.Series)
    .drop_duplicates(subset=['nsf_id'])
    .reset_index(drop=True)
)


In [16]:
# pgm_ele table
# Explode the nested 'pgm_ele' column to one entry per row, expand each entry's
# keys into columns, then keep the first row for each 'pgm_ele_code' and
# renumber the index.
# Author's run: 243,844 rows before de-duplication, 1,190 rows x 2 columns after.
pgm_ele_df = (
    awards_df['pgm_ele']
    .explode()
    .apply(pd.Series)
    .drop_duplicates(subset=['pgm_ele_code'])
    .reset_index(drop=True)
)

In [17]:
# pgm_ref table
# Explode the nested 'pgm_ref' column to one entry per row, expand each entry's
# keys into columns, then keep the first row for each 'pgm_ref_code' and
# renumber the index.
# Author's run: 476,124 rows before de-duplication, 1,509 rows x 2 columns after.
pgm_ref_df = (
    awards_df['pgm_ref']
    .explode()
    .apply(pd.Series)
    .drop_duplicates(subset=['pgm_ref_code'])
    .reset_index(drop=True)
)

In [18]:
# app_fund table
# Explode the nested 'app_fund' column to one entry per row, expand each entry's
# keys into columns, then keep the first row for each 'fund_code' and renumber
# the index.
# Author's run: 321,721 rows x 6 columns before de-duplication, 213 rows x 6 columns after.
app_fund_df = (
    awards_df['app_fund']
    .explode()
    .apply(pd.Series)
    .drop_duplicates(subset=['fund_code'])
    .reset_index(drop=True)
)

In [19]:
# oblg_fy table
# Explode the nested 'oblg_fy' column to one entry per row and expand each
# entry's keys into columns. No de-duplication is done for this table.
# reset_index() keeps the original awards_df row label as a column named
# 'index', which is then renamed to 'awd_index'.
# Author's run: 292,009 rows.
oblg_fy_df = (
    awards_df['oblg_fy']
    .explode()
    .apply(pd.Series)
    .reset_index()
    .rename(columns={'index': 'awd_index'})
)

In [None]:
# Persist each child table to its own pickle file.
for frame, pickle_path in [
    (pi_df, '../data/pkl_files/pi_df.pkl'),
    (pgm_ele_df, '../data/pkl_files/pgm_ele_df.pkl'),
    (pgm_ref_df, '../data/pkl_files/pgm_ref_df.pkl'),
    (app_fund_df, '../data/pkl_files/app_fund_df.pkl'),
    (oblg_fy_df, '../data/pkl_files/oblg_fy_df.pkl'),
]:
    frame.to_pickle(pickle_path)

In [None]:
# Reload the child tables from their pickle files.
pi_df, pgm_ele_df, pgm_ref_df, app_fund_df, oblg_fy_df = (
    pd.read_pickle('../data/pkl_files/pi_df.pkl'),
    pd.read_pickle('../data/pkl_files/pgm_ele_df.pkl'),
    pd.read_pickle('../data/pkl_files/pgm_ref_df.pkl'),
    pd.read_pickle('../data/pkl_files/app_fund_df.pkl'),
    pd.read_pickle('../data/pkl_files/oblg_fy_df.pkl'),
)

## Create bridge tables to join parent awards table to child tables

In [22]:
# Create a bridge table to connect the pi_df and awards_df DataFrames.
#
# One output row is produced per (award, PI) pair, carrying the award's row
# label ('awd_index'), its 'awd_id', and the PI's 'nsf_id'.
#
# Only the three needed columns are iterated, via zip, instead of calling
# DataFrame.iterrows(), which builds a full Series for every award row and is
# far slower on a frame of this size.
awd_pi_list = [
    {'awd_index': index, 'awd_id': awd_id, 'nsf_id': pi['nsf_id']}
    for index, awd_id, pi_ids in zip(awards_df.index, awards_df['awd_id'], awards_df['pi'])
    # Awards whose 'pi' value is not a list (e.g. missing) contribute no rows.
    if isinstance(pi_ids, list)
    for pi in pi_ids
]

# Convert the list of dictionaries to a DataFrame. Naming the columns keeps
# the schema intact even when the list is empty, so later merges on 'awd_id'
# still work. (330,573 rows, 3 columns in the author's run.)
awd_pi_df = pd.DataFrame(awd_pi_list, columns=['awd_index', 'awd_id', 'nsf_id'])

In [23]:
# Create a bridge table to connect the pgm_ele_df and awards_df DataFrames.
#
# One output row is produced per (award, program element) pair, carrying the
# award's row label ('awd_index'), its 'awd_id', and the 'pgm_ele_code'.
#
# Only the three needed columns are iterated, via zip, instead of calling
# DataFrame.iterrows(), which builds a full Series for every award row and is
# far slower on a frame of this size.
awd_pgm_ele_list = [
    {'awd_index': index, 'awd_id': awd_id, 'pgm_ele_code': pgm['pgm_ele_code']}
    for index, awd_id, pgm_elems in zip(awards_df.index, awards_df['awd_id'], awards_df['pgm_ele'])
    # Awards whose 'pgm_ele' value is not a list (e.g. missing) contribute no rows.
    if isinstance(pgm_elems, list)
    for pgm in pgm_elems
]

# Convert the list of dictionaries to a DataFrame. Naming the columns keeps
# the schema intact even when the list is empty.
# (243,152 rows, 3 columns in the author's run.)
awd_pgm_ele_df = pd.DataFrame(awd_pgm_ele_list, columns=['awd_index', 'awd_id', 'pgm_ele_code'])

In [None]:
# Create a bridge table to connect the awards_df and pgm_ref_df DataFrames.
#
# One output row is produced per (award, program reference) pair, carrying the
# award's row label ('awd_index'), its 'awd_id', and the 'pgm_ref_code'.
#
# Only the three needed columns are iterated, via zip, instead of calling
# DataFrame.iterrows(), which builds a full Series for every award row and is
# far slower on a frame of this size.
awd_pgm_ref_list = [
    {'awd_index': index, 'awd_id': awd_id, 'pgm_ref_code': pgm['pgm_ref_code']}
    for index, awd_id, pgm_refs in zip(awards_df.index, awards_df['awd_id'], awards_df['pgm_ref'])
    # Awards whose 'pgm_ref' value is not a list (e.g. missing) contribute no rows.
    if isinstance(pgm_refs, list)
    for pgm in pgm_refs
]

# Convert the list of dictionaries to a DataFrame. Naming the columns keeps
# the schema intact even when the list is empty, so the later merge on
# 'awd_id' still works. (451,944 rows, 3 columns in the author's run.)
awd_pgm_ref_df = pd.DataFrame(awd_pgm_ref_list, columns=['awd_index', 'awd_id', 'pgm_ref_code'])

In [None]:
# Persist each award-to-child bridge table to its own pickle file.
for bridge_frame, pickle_path in [
    (awd_pi_df, '../data/pkl_files/awd_pi_df.pkl'),
    (awd_pgm_ele_df, '../data/pkl_files/awd_pgm_ele_df.pkl'),
    (awd_pgm_ref_df, '../data/pkl_files/awd_pgm_ref_df.pkl'),
]:
    bridge_frame.to_pickle(pickle_path)

In [3]:
# Reload the award-to-child bridge tables from their pickle files.
awd_pi_df, awd_pgm_ele_df, awd_pgm_ref_df = (
    pd.read_pickle('../data/pkl_files/awd_pi_df.pkl'),
    pd.read_pickle('../data/pkl_files/awd_pgm_ele_df.pkl'),
    pd.read_pickle('../data/pkl_files/awd_pgm_ref_df.pkl'),
)

## Remove unnecessary columns from awards data

In [27]:
# Display the column labels of awards_df to decide which ones to drop below.
awards_df.columns

Index(['awd_id', 'agcy_id', 'tran_type', 'awd_istr_txt', 'awd_titl_txt',
       'cfda_num', 'org_code', 'po_phone', 'po_email', 'po_sign_block_name',
       'awd_eff_date', 'awd_exp_date', 'tot_intn_awd_amt', 'awd_amount',
       'awd_min_amd_letter_date', 'awd_max_amd_letter_date',
       'awd_abstract_narration', 'awd_arra_amount', 'dir_abbr',
       'org_dir_long_name', 'div_abbr', 'org_div_long_name', 'awd_agcy_code',
       'fund_agcy_code', 'pi', 'pgm_ele', 'pgm_ref', 'app_fund', 'oblg_fy',
       'inst.inst_name', 'inst.inst_street_address',
       'inst.inst_street_address_2', 'inst.inst_city_name',
       'inst.inst_state_code', 'inst.inst_state_name', 'inst.inst_phone_num',
       'inst.inst_zip_code', 'inst.inst_country_name', 'inst.cong_dist_code',
       'inst.st_cong_dist_code', 'inst.org_lgl_bus_name',
       'inst.org_prnt_uei_num', 'inst.org_uei_num', 'perf_inst.perf_inst_name',
       'perf_inst.perf_str_addr', 'perf_inst.perf_city_name',
       'perf_inst.perf_st_cod

In [9]:
# before
# awards_df.info(memory_usage='deep') # memory usage: 2.1 GB

# Drop the abstract text, the nested columns, and the 'por' columns from
# awards_df. Only columns that are actually present are dropped, so the cell
# can be re-run (or run on a load that lacks some of them) without raising
# KeyError.
columns_to_drop = ['awd_abstract_narration', 'pi', 'pgm_ele', 'pgm_ref', 'app_fund', 'oblg_fy',
                   'por.por_cntn', 'por.por_txt_cntn', 'por']
awards_df.drop(columns=[col for col in columns_to_drop if col in awards_df.columns], inplace=True)

# after
# awards_df.info(memory_usage='deep') # memory usage: 559.8 MB

In [10]:
# Write the cleaned awards_df DataFrame to a pickle file.
# This is a different file from the earlier 'awards_data_2010_2025_big.pkl' checkpoint.
awards_df.to_pickle('../data/pkl_files/awards_df.pkl')

In [4]:
# Read the awards_df DataFrame back from the pickle file,
# replacing whatever awards_df currently refers to.
awards_df = pd.read_pickle('../data/pkl_files/awards_df.pkl')

# Make two partially summarized dataframes
By year and division, by year and directorate

In [15]:
# Summary by calendar year of awd_eff_date and by division (div_abbr):
# number of awards (count of awd_id) and total awd_amount per group.
# The year key inherits the name 'awd_eff_date' from its source column, so it
# is renamed to 'awd_year' once the group keys are turned back into columns.
awards_summary_divyr_df = (
    awards_df
    .groupby([awards_df['awd_eff_date'].dt.year, 'div_abbr'])
    .agg(
        awd_count=('awd_id', 'count'),
        sum_awd_amount=('awd_amount', 'sum'),
    )
    .reset_index()
    .rename(columns={'awd_eff_date': 'awd_year'})
)

In [16]:
# Summary by calendar year of awd_eff_date and by directorate (dir_abbr):
# number of awards (count of awd_id) and total awd_amount per group.
# The year key inherits the name 'awd_eff_date' from its source column, so it
# is renamed to 'awd_year' once the group keys are turned back into columns.
awards_summary_diryr_df = (
    awards_df
    .groupby([awards_df['awd_eff_date'].dt.year, 'dir_abbr'])
    .agg(
        awd_count=('awd_id', 'count'),
        sum_awd_amount=('awd_amount', 'sum'),
    )
    .reset_index()
    .rename(columns={'awd_eff_date': 'awd_year'})
)

In [17]:
# Persist both summary tables to pickle files (directorate first, then division).
for summary_frame, pickle_path in [
    (awards_summary_diryr_df, '../data/pkl_files/awards_summary_diryr_df.pkl'),
    (awards_summary_divyr_df, '../data/pkl_files/awards_summary_divyr_df.pkl'),
]:
    summary_frame.to_pickle(pickle_path)

In [18]:
# Reload both summary tables from their pickle files.
awards_summary_diryr_df, awards_summary_divyr_df = (
    pd.read_pickle('../data/pkl_files/awards_summary_diryr_df.pkl'),
    pd.read_pickle('../data/pkl_files/awards_summary_divyr_df.pkl'),
)

## Make an enriched version of awards_df with an awd_type column

In [11]:
# Left-join the award/program-reference bridge table onto awards_df by 'awd_id'.
# Every award row is kept; an award matching several bridge rows appears once
# per match.
awards_with_type_df = awards_df.merge(awd_pgm_ref_df, how='left', on='awd_id')

In [12]:
# Translate program reference codes into award-type labels.
rc_at_map = {
    '1045': 'CAREER',
    '9250': 'REU Site',
    '9229': 'RUI',
}

# Add the awd_type column (codes missing from the map yield NaN), then keep
# only the rows whose type is one of the three mapped labels.
awards_with_type_df = (
    awards_with_type_df
    .assign(awd_type=awards_with_type_df['pgm_ref_code'].map(rc_at_map))
    .query('awd_type == "CAREER" or awd_type == "REU Site" or awd_type == "RUI"')
)

In [None]:
# Write awards_with_type_df to a pickle file under data_dir.
awards_with_type_df.to_pickle(f'{data_dir}/pkl_files/awards_with_type_df.pkl')

In [16]:
# Read the awards_with_type_df DataFrame back from the pickle file under data_dir.
awards_with_type_df = pd.read_pickle(f'{data_dir}/pkl_files/awards_with_type_df.pkl')

## Remove everything we don't need from memory

In [32]:
# del awards_data
# Remove the intermediate bridge-table lists from the global namespace.
# Names that are not currently defined are skipped without error.
for _scratch_name in ('awd_pgm_ele_list', 'awd_pgm_ref_list', 'awd_pi_list'):
    globals().pop(_scratch_name, None)

In [33]:
# for root, dirs, files in os.walk("data/awards_json/"):
#     for file in files:
#         print(os.path.join(root, file))