# Summary Statistics 
2023-08-08 ZD  

This notebook will explore options to gather summary statistics and other reporting data calculated from data within processed grants data.  

In [1]:
import pandas as pd

# Method to import from parent directory
import os
import sys
root_dir = os.path.abspath(os.path.join(os.getcwd(), "../"))
sys.path.append(root_dir)

import config

In [2]:
# Define directory containing processed grant data
processed_dir = '../'+config.PROCESSED_DIR

In [3]:
# Make empty list to fill with dataframes
df_list = []

# Load each file as a dataframe
for filename in os.listdir(processed_dir):
    if filename.endswith(".tsv"):
        filepath = os.path.join(processed_dir, filename)
        df_temp = pd.read_csv(filepath, sep='\t')

        # Add program column with filename
        df_temp['program'] = filename

        # Add dataframe to list
        df_list.append(df_temp)

# Combine all dataframes
df = pd.concat(df_list, ignore_index=True)

In [4]:
# Check fiscal years of all grants
df['fiscal_year'].value_counts().reset_index().sort_values(by='index')

Unnamed: 0,index,fiscal_year
23,2000,1
22,2001,2
19,2002,3
18,2003,3
17,2004,3
12,2005,6
21,2006,2
15,2007,5
20,2008,2
14,2009,5


In [5]:
df[df['fiscal_year'] < 2000]

Unnamed: 0,project_num,core_project_num,appl_id,fiscal_year,project_title,abstract_text,pref_terms,org_name,org_city,org_state,...,principal_investigators,program_officers,award_amount,agency_ic_fundings,award_notice_date,project_start_date,project_end_date,full_foa,api_source_search,program


In [6]:
df.columns.tolist()

['project_num',
 'core_project_num',
 'appl_id',
 'fiscal_year',
 'project_title',
 'abstract_text',
 'pref_terms',
 'org_name',
 'org_city',
 'org_state',
 'org_country',
 'principal_investigators',
 'program_officers',
 'award_amount',
 'agency_ic_fundings',
 'award_notice_date',
 'project_start_date',
 'project_end_date',
 'full_foa',
 'api_source_search',
 'program']

In [7]:
df.groupby('program')['core_project_num'].nunique().reset_index()

Unnamed: 0,program,core_project_num
0,ADMIRALStudyAdmixtureanalysisofacutelymphoblas...,1
1,AcquiredResistancetoTherapyNetworkARTNet.tsv,6
2,AllofUs.tsv,1837
3,BarrettsEsophagusTranslationalResearchNetworkB...,4
4,BrainTumorSPOREGrant.tsv,1
5,CANCERIMMUNEMONITORINGANDANALYSISCENTERS.tsv,4
6,CancerPreventionInterceptionTargetedAgentDisco...,2
7,CellularCancerBiologyImagingResearch.tsv,4
8,FredHutchinsonCancerResearchCenterLungSPORE.tsv,1
9,FusionOncoproteinsinChildhoodCancersFusOnc2.tsv,9


In [8]:
df.groupby('program')['project_num'].nunique().reset_index()

Unnamed: 0,program,project_num
0,ADMIRALStudyAdmixtureanalysisofacutelymphoblas...,4
1,AcquiredResistancetoTherapyNetworkARTNet.tsv,6
2,AllofUs.tsv,3128
3,BarrettsEsophagusTranslationalResearchNetworkB...,40
4,BrainTumorSPOREGrant.tsv,26
5,CANCERIMMUNEMONITORINGANDANALYSISCENTERS.tsv,6
6,CancerPreventionInterceptionTargetedAgentDisco...,2
7,CellularCancerBiologyImagingResearch.tsv,8
8,FredHutchinsonCancerResearchCenterLungSPORE.tsv,6
9,FusionOncoproteinsinChildhoodCancersFusOnc2.tsv,16


In [9]:
import datetime

In [10]:
current_year = datetime.datetime.now().year

In [11]:
current_year

2023

In [12]:
fiscal_years = [str(year) for year in range(2000, current_year+1)]

In [13]:
fiscal_years

['2000',
 '2001',
 '2002',
 '2003',
 '2004',
 '2005',
 '2006',
 '2007',
 '2008',
 '2009',
 '2010',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016',
 '2017',
 '2018',
 '2019',
 '2020',
 '2021',
 '2022',
 '2023']

In [14]:
df.describe()

Unnamed: 0,appl_id,fiscal_year,award_amount,agency_ic_fundings
count,3655.0,3655.0,3655.0,3655.0
mean,10387730.0,2021.671956,618149.8,607511.4
std,469835.4,2.259909,1069417.0,1069447.0
min,6172074.0,2000.0,1.0,0.0
25%,10361890.0,2021.0,363713.0,361473.0
50%,10496940.0,2022.0,457724.0,455853.0
75%,10616960.0,2023.0,596098.0,593334.5
max,10881490.0,2023.0,13559980.0,13559980.0
