# Summary Statistics 
2023-08-08 ZD  

This notebook will explore options to gather summary statistics and other reporting data calculated from data within processed grants data.  

In [1]:
import pandas as pd

# Method to import from parent directory
import os
import sys
root_dir = os.path.abspath(os.path.join(os.getcwd(), "../"))
sys.path.append(root_dir)

import config

### Load data from all Key Program grants output CSVs into a single dataframe for stats

In [2]:
# Define directory containing processed grant data
processed_dir = '../'+config.PROCESSED_DIR

# Define directory to store reports. Create if doesn't already exist
reports_dir = '../'+config.REPORTS_DIR
if not os.path.exists(reports_dir):
    os.makedirs(reports_dir)

In [3]:
# Make empty list to fill with dataframes
df_list = []

# Load each file as a dataframe
for filename in os.listdir(processed_dir):
    if filename.endswith(".tsv"):
        filepath = os.path.join(processed_dir, filename)
        df_temp = pd.read_csv(filepath, sep='\t')

        # Add program column with filename and remove trailing ".tsv"
        df_temp['program'] = filename[:-4]

        # Add dataframe to list
        df_list.append(df_temp)

# Combine all dataframes
df = pd.concat(df_list, ignore_index=True)

In [4]:
# Store combined program df for ad-hoc analyses.
# Use CSV filetype to separate from Program-specific TSVs
combined_filename = reports_dir + '/' + 'allGrantsData.csv'

df.to_csv(combined_filename, index=False)

### Start exploring for patterns and stats to report out

In [5]:
# Look at a single row in detail
df.loc[0]

project_num                                                  1U54CA274220-01
core_project_num                                                 U54CA274220
appl_id                                                             10517140
fiscal_year                                                             2022
project_title              Acquired Resistance to Therapy and Iron (ARTI)...
abstract_text              Overall Summary\nApproximately 50% of cancer p...
pref_terms                 Achievement;Advisory Committees;Basic Science;...
org_name                                UNIVERSITY OF TX MD ANDERSON CAN CTR
org_city                                                             HOUSTON
org_state                                                                 TX
org_country                                                    UNITED STATES
principal_investigators                               Boyi Gan, Albert Koong
program_officers                                                  Mihoko Kai

In [6]:
df.dtypes

project_num                object
core_project_num           object
appl_id                     int64
fiscal_year                 int64
project_title              object
abstract_text              object
pref_terms                 object
org_name                   object
org_city                   object
org_state                  object
org_country                object
principal_investigators    object
program_officers           object
award_amount                int64
agency_ic_fundings          int64
award_notice_date          object
project_start_date         object
project_end_date           object
full_foa                   object
api_source_search          object
program                    object
dtype: object

In [7]:
# Check fiscal years of all grants
df['fiscal_year'].value_counts().reset_index().sort_values(by='index')

Unnamed: 0,index,fiscal_year
23,2000,1
22,2001,2
19,2002,3
18,2003,3
17,2004,3
12,2005,6
21,2006,2
15,2007,5
20,2008,2
14,2009,5


In [8]:
# Check columns
df.columns.tolist()

['project_num',
 'core_project_num',
 'appl_id',
 'fiscal_year',
 'project_title',
 'abstract_text',
 'pref_terms',
 'org_name',
 'org_city',
 'org_state',
 'org_country',
 'principal_investigators',
 'program_officers',
 'award_amount',
 'agency_ic_fundings',
 'award_notice_date',
 'project_start_date',
 'project_end_date',
 'full_foa',
 'api_source_search',
 'program']

In [9]:
# Get number of core projects for each program
df.groupby('program')['core_project_num'].nunique().reset_index()

Unnamed: 0,program,core_project_num
0,ADMIRALStudyAdmixtureanalysisofacutelymphoblas...,1
1,AcquiredResistancetoTherapyNetworkARTNet,6
2,AllofUs,1841
3,BarrettsEsophagusTranslationalResearchNetworkB...,4
4,BrainTumorSPOREGrant,1
5,CANCERIMMUNEMONITORINGANDANALYSISCENTERS,4
6,CancerPreventionInterceptionTargetedAgentDisco...,2
7,CellularCancerBiologyImagingResearch,4
8,FredHutchinsonCancerResearchCenterLungSPORE,1
9,FusionOncoproteinsinChildhoodCancersFusOnc2,9


In [10]:
# # Convert date columns to datetime objects
# df['project_start_date'] = pd.to_datetime(df['project_start_date'])
# df['project_end_date'] = pd.to_datetime(df['project_end_date'])

### Build a summary of all Programs and the number of projects for each

In [11]:
# Get the year from the date objects
# Lose some resolution but make the data easier to read for stats
df['project_start_date_year'] = df['project_start_date'].apply(lambda x: int(x[:4]))
df['project_end_date_year'] = df['project_end_date'].apply(lambda x: int(x[:4]))

In [12]:
# Copy fiscal year column for later min and max stats
df['fiscal_year_copy'] = df['fiscal_year']

In [13]:
# Define functions to apply to each column
agg_funcs = {
    'api_source_search': 'nunique',
    'core_project_num': 'nunique',
    'project_num': 'nunique',
    'project_start_date_year': 'min',
    'project_end_date_year': 'max',
    'fiscal_year': 'min',
    'fiscal_year_copy': 'max'
}

In [14]:
# Define column titles better suited for reporting
rename_dict = {
    "program": "Program",
    "core_project_num": "Core Project Count",
    "project_num": "Grant/Award Count",
    "api_source_search": "Provided NOFOs/Awards with Associated Grants",
    "project_start_date_year": "Earliest Project Start Date",
    "project_end_date_year": "Latest Project End Date",
    "fiscal_year": "Earliest Fiscal Year",
    "fiscal_year_copy": "Latest Fiscal Year"
}

In [15]:
# Group by 'program' and apply aggregation functions defined above
summary_stat_df = df.groupby('program').agg(agg_funcs).reset_index()

# Rename columns for better presentation as defined above
summary_stat_df.rename(columns=rename_dict, inplace=True)

# Store program summary 
program_summary_filename = reports_dir + '/' + 'programSummaryStats.csv'
summary_stat_df.to_csv(program_summary_filename, index=False)

In [16]:
summary_stat_df

Unnamed: 0,Program,Provided NOFOs/Awards with Associated Grants,Core Project Count,Grant/Award Count,Earliest Project Start Date,Latest Project End Date,Earliest Fiscal Year,Latest Fiscal Year
0,ADMIRALStudyAdmixtureanalysisofacutelymphoblas...,1,1,4,2020,2023,2020,2022
1,AcquiredResistancetoTherapyNetworkARTNet,2,6,6,2017,2027,2022,2022
2,AllofUs,6,1841,3170,1981,2029,2018,2023
3,BarrettsEsophagusTranslationalResearchNetworkB...,3,4,40,2011,2024,2011,2021
4,BrainTumorSPOREGrant,1,1,26,2002,2023,2002,2022
5,CANCERIMMUNEMONITORINGANDANALYSISCENTERS,2,4,6,2017,2028,2017,2023
6,CancerPreventionInterceptionTargetedAgentDisco...,1,2,2,2022,2027,2022,2022
7,CellularCancerBiologyImagingResearch,1,4,8,2021,2026,2021,2023
8,FredHutchinsonCancerResearchCenterLungSPORE,1,1,6,2019,2024,2019,2023
9,FusionOncoproteinsinChildhoodCancersFusOnc2,2,9,16,2018,2024,2018,2022


For the most part, Earliest Grant Date aligns with earliest Fiscal Year. Outliers are the very early start dates for All of Us, ALCHEMIST, and EDRN programs compared to the earlist fiscal year. This might indicate a project that received additional funds or supplements many years after the proejct began. Could this be data deposition or a similar modernization effort?

In [17]:
# Get a df of all grants with project start dates before year 2000
early_project_df = df[df['project_start_date_year'] < 2000]

In [18]:
# # Export to csv for quick ad-hoc analysis
# early_project_df.to_csv('earlyProjectReport.csv',index=False)

In [19]:
# Group to find patterns and common sources for the projects with very early start dates
early_project_df.groupby(['program', 'api_source_search', 'core_project_num', 'project_title'])['project_num'].nunique().reset_index()

Unnamed: 0,program,api_source_search,core_project_num,project_title,project_num
0,AllofUs,nofo_PA20-185,R01CA031845,Synthetic Studies Related to Cancer Research/T...,2
1,AllofUs,nofo_PA20-185,R01CA047296,A Pathway of Tumor Suppression,2
2,AllofUs,nofo_PA20-185,R01CA053840,Protein Tyrosine Dephosphorylation & Signal Tr...,3
3,AllofUs,nofo_PA20-185,R01CA067007,Mismatch Repair and Carcinogenesis,2
4,AllofUs,nofo_PA20-185,R01CA067985,DNA Damage Repair by MUTYH and MUTYH Variants ...,3
...,...,...,...,...,...
98,AllofUs,nofo_PA20-272,P30CA082103,Project HOPE: The Pediatric/AYA Omics Project,1
99,AllofUs,nofo_PA20-272,P30CA082709,The Big Ten Electronic Health Record Consortiu...,1
100,AllofUs,nofo_PA20-272,U24CA055727,CHILDHOOD CANCER SURVIVOR STUDY: Somatic and G...,1
101,TheAdjuvantLungCancerEnrichmentMarkerIdentific...,award_U10CA031946,U10CA031946,CANCER AND LEUKEMIA GROUP B,5


In [20]:
def find_rows_with_different_values(df, shared_column, compare_column):
    """Find rows that share a value in a specified column
    but have different values in another specified column.
    """

    grouped = df.groupby(shared_column)[compare_column].transform('nunique')
    selected_row_df = df[grouped > 1]

    return selected_row_df

In [21]:
# Check for Core Projects found in multiple programs and compare sources

shared_column = 'core_project_num'
compare_column = 'program'

# Get grant-level rows with same project but different program
df_shared = find_rows_with_different_values(df, shared_column, compare_column)

# Group with the provided search value and count unique grants 
df_shared_projects = (df_shared.groupby(
                        ['api_source_search', shared_column, compare_column])
                        .size().reset_index()
                        .rename(columns={0:'grant_count'}))

# Store shared programs
shared_projects_filename = reports_dir + '/' + 'sharedProjects.csv'
df_shared_projects.to_csv(shared_projects_filename, index=False)

In [22]:
# Check for different FOA values (from NIH Reporter) across non-matching provided NOFOs
shared_column = 'api_source_search'
compare_column = 'full_foa'

df_shared = find_rows_with_different_values(df, shared_column, compare_column)

df_shared.groupby([shared_column, compare_column]).size().reset_index().rename(columns={0:'grant_count'})

Unnamed: 0,api_source_search,full_foa,grant_count
0,award_P50CA097257,PAR-00-087,9
1,award_P50CA097257,PAR-05-156,7
2,award_P50CA097257,PAR-10-003,5
3,award_P50CA097257,PAR-14-353,5
4,award_P50CA165962,PA-18-906,1
5,award_P50CA165962,PA-21-071,1
6,award_P50CA165962,PAR-10-003,5
7,award_P50CA165962,PAR-18-313,6
8,award_P50CA196530,PAR-14-031,6
9,award_P50CA196530,PAR-18-313,3
