# Generate a SQLite database from existing dataframes

In [1]:
import pandas as pd
import sqlite3

# File name of the SQLite database that the cells below create and query.
# It is passed as-is to sqlite3.connect(), so a relative name like this one
# is resolved against the current working directory.
db_name = 'nsf_awards.db'

In [2]:
# Read in pickle files
#
# Each dataframe is loaded from a pickle file in the current working
# directory and is later written to a SQLite table by the cell that follows.
#
# NOTE(review): unpickling can execute arbitrary code. Only run this cell on
# .pkl files produced by a trusted source (presumably an earlier step of this
# project -- confirm).

# Main tables
awards_df = pd.read_pickle('awards_df.pkl')
pi_df = pd.read_pickle('pi_df.pkl')
pgm_ele_df = pd.read_pickle('pgm_ele_df.pkl')
pgm_ref_df = pd.read_pickle('pgm_ref_df.pkl')
app_fund_df = pd.read_pickle('app_fund_df.pkl')
oblg_fy_df = pd.read_pickle('oblg_fy_df.pkl')

# Bridge tables
# (presumably many-to-many links between awards and the main tables above;
# awd_pi is joined to pi on nsf_id in the query further down)
awd_pi_df = pd.read_pickle('awd_pi_df.pkl')
awd_pgm_ele_df = pd.read_pickle('awd_pgm_ele_df.pkl')
awd_pgm_ref_df = pd.read_pickle('awd_pgm_ref_df.pkl')

In [3]:
# Write all of the dataframes to a SQLite database.
#
# Each dataframe becomes one table. An existing table with the same name is
# dropped and recreated (if_exists='replace'), and the dataframe index is not
# stored as a column (index=False).

# Destination table name -> dataframe, in the order the tables are written.
frames_by_table = {
    # Main tables
    'awards': awards_df,
    'pi': pi_df,
    'pgm_ele': pgm_ele_df,
    'pgm_ref': pgm_ref_df,
    'app_fund': app_fund_df,
    'oblg_fy': oblg_fy_df,
    # Bridge tables
    'awd_pi': awd_pi_df,
    'awd_pgm_ele': awd_pgm_ele_df,
    'awd_pgm_ref': awd_pgm_ref_df,
}

# A sqlite3 connection used as a context manager only commits (or rolls back
# on error) the transaction; it does NOT close the connection. Close it
# explicitly so the database file handle is released when the cell finishes.
conn = sqlite3.connect(db_name)
try:
    with conn:
        for table_name, frame in frames_by_table.items():
            frame.to_sql(table_name, conn, if_exists='replace', index=False)
finally:
    conn.close()

In [None]:
# Test the database: list the tables that were created and print the first
# five rows of the main tables as a quick sanity check.
#
# The connection is closed explicitly in `finally`; using a sqlite3
# connection as a context manager would only end the transaction and leave
# the connection open. These statements are read-only, so there is nothing
# to commit.
conn = sqlite3.connect(db_name)
try:
    # Check the tables
    tables = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table';", conn)
    print(tables)

    # Check the awards table
    awards_check = pd.read_sql_query("SELECT * FROM awards LIMIT 5;", conn)
    print(awards_check)

    # Check the pi table
    pi_check = pd.read_sql_query("SELECT * FROM pi LIMIT 5;", conn)
    print(pi_check)

    # Check the pgm_ele table
    pgm_ele_check = pd.read_sql_query("SELECT * FROM pgm_ele LIMIT 5;", conn)
    print(pgm_ele_check)

    # Check the pgm_ref table
    pgm_ref_check = pd.read_sql_query("SELECT * FROM pgm_ref LIMIT 5;", conn)
    print(pgm_ref_check)
finally:
    conn.close()



In [9]:
# Count the awards linked to each PI by joining the pi table to the awd_pi
# bridge table on nsf_id, most-awarded PIs first.
#
# NOTE(review): rows are grouped by pi_full_name, so different PIs who share
# a name are counted as one person. Group by nsf_id instead if per-person
# counts are intended -- confirm.
# NOTE(review): PIs with equal counts have no secondary sort key, so their
# relative order is not guaranteed to be stable between runs.
query = """
SELECT p.pi_full_name, count(*) as num_awards
FROM pi p
JOIN awd_pi ap ON p.nsf_id = ap.nsf_id
GROUP BY p.pi_full_name
ORDER BY num_awards DESC;
"""

# Retrieve the number of awards by PI.
# The connection is closed explicitly: a sqlite3 connection's context manager
# only ends the transaction and would leave the connection open.
conn = sqlite3.connect(db_name)
try:
    num_awds_by_pi_df = pd.read_sql_query(query, conn)
finally:
    conn.close()

# Print the number of awards by PI (top 10)
print(num_awds_by_pi_df.head(10))

            pi_full_name  num_awards
0           Nancy R Gray         178
1  J. Ardie Butch Dillen          76
2         Jerene Shaheed          60
3              Wei Zhang          57
4            Sajal K Das          50
5      Terry B Appelgate          48
6    Nicholas G Feamster          45
7       Nicholas R Bates          42
8     Katherine E Bailey          42
9         Roman Lubynsky          41
