In [1]:
import pandas as pd
import seaborn as sn

In [3]:
# Load the per-funder project metadata tables.
# UKRI: default dtype inference is kept for this file.
ukri_metadata = pd.read_csv(
    "../clean-data/fine-scale/UK/UKRI/UKRI-project-metadata.csv"
)
# USA (NSF and NIH): the project id column consists only of digits, so pandas
# would infer a numeric dtype for it. Read these files without type inference
# so the ids stay as text, which avoids problems when concatenating with the
# UKRI data.
# NOTE(review): dtype="object" applies to every column of these two files, not
# just the id column — confirm that numeric fields such as FundingAmount are
# converted wherever they are used later.
nsf_metadata = pd.read_csv(
    "../clean-data/fine-scale/USA/NSF/NSF-project-metadata.csv",
    dtype="object",
)
nih_metadata = pd.read_csv(
    "../clean-data/fine-scale/USA/NIH/NIH-project-metadata.csv",
    dtype="object",
)


In [4]:
# Stack the three funder tables row-wise into one frame; ignore_index=True
# discards the per-file indices and renumbers the rows from 0.
all_metadata = pd.concat(
    objs=[ukri_metadata, nsf_metadata, nih_metadata],
    ignore_index=True,
)
# Display the total number of project rows.
all_metadata.shape[0]

3001919

In [5]:
# Preview the first five rows of the combined metadata table.
all_metadata.head(n=5)

Unnamed: 0,ProjectId,Country,CountryFundingBody,FundingBody,LeadInstitution,StartDate,EndDate,FundingAmount,FundingCurrency
0,566EB681-6816-452F-A230-11D70589A7F1,UK,UKRI,EPSRC,University of Glasgow,30/09/2021,30/03/2025,0.0,GBP
1,571DFC22-1759-4B91-8DBC-1238FD72E91C,UK,UKRI,EPSRC,University of Leeds,15/01/2007,13/07/2010,437892.0,GBP
2,57E6F125-572F-49E6-918E-1243BB40E3F3,UK,UKRI,STFC,University of Leeds,30/09/2012,29/09/2017,284532.0,GBP
3,440EE7E6-F4F2-409B-BFD4-121A40502D94,UK,UKRI,BBSRC,University of Warwick,29/09/2019,20/02/2022,0.0,GBP
4,448AEB1F-A096-44EC-8F2A-11ACCD2C6506,UK,UKRI,BBSRC,John Innes Centre,30/09/2013,30/03/2015,190730.0,GBP


In [6]:
# Number of project rows per value of the Country column.
all_metadata["Country"].value_counts()

USA               2822576
UK                 107831
CANADA               5901
UNITED KINGDOM       2409
SOUTH AFRICA         1679
                   ...   
BOLIVIA                 2
PAPUA N GUINEA          2
ESWATINI                1
GUINEA-BISSAU           1
ECUADOR                 1
Name: Country, Length: 131, dtype: int64

In [7]:
# Number of project rows per value of the CountryFundingBody column.
all_metadata["CountryFundingBody"].value_counts()

NIH     2621654
NSF      272434
UKRI     107831
Name: CountryFundingBody, dtype: int64

In [8]:
# Number of project rows per value of the FundingBody column, across all funders.
all_metadata["FundingBody"].value_counts()

NATIONAL CANCER INSTITUTE                                                    341513
NATIONAL CENTER FOR RESEARCH RESOURCES                                       254930
NATIONAL INSTITUTE OF GENERAL MEDICAL SCIENCES                               231829
NATIONAL HEART, LUNG, AND BLOOD INSTITUTE                                    226837
NATIONAL INSTITUTE OF ALLERGY AND INFECTIOUS DISEASES                        221970
                                                                              ...  
CLINICAL PHARMACOLOGY AND TOXICOLOGY BRANCH                                       4
OFFICE OF CHIEF PUBLIC HEALTH PRACTICE                                            4
TWO OR MORE SPONSORS                                                              2
COORDINATING CENTER FOR INFECTIOUS DISEASES                                       1
HEALTH RESOURCES AND SERVICES ADMINISTRATION/BUREAU OF HEALTH PROFESSIONS         1
Name: FundingBody, Length: 133, dtype: int64

In [9]:
# Row count for every (Country, CountryFundingBody, FundingBody) combination.
all_metadata.groupby(
    by=["Country", "CountryFundingBody", "FundingBody"],
).size()

Country   CountryFundingBody  FundingBody                                                                    
ALBANIA   NIH                 National Center for Immunization and Respiratory Diseases (NCIRD)                   3
ANGOLA    NIH                 COORDINATING OFFICE OF GLOBAL HEALTH                                                1
                              Center for Global Health                                                            2
                              NATIONAL CENTER FOR HIV, VIRAL HEPATITIS, STDS AND TB PREVENTION                    4
                              NATIONAL CENTER FOR IMMUNICATION AND RESPIRATORY DISEASES                           1
                                                                                                                 ..
ZIMBABWE  NIH                 NATIONAL CENTER FOR INFECTIOUS DISEASES (NCID)                                     11
                              NATIONAL HEART, LUNG, AND BLOOD INSTITUTE       

In [11]:
# Row count for every (CountryFundingBody, FundingBody) combination.
all_metadata.groupby(
    by=["CountryFundingBody", "FundingBody"],
).size()

CountryFundingBody  FundingBody                                        
NIH                 AGENCY FOR HEALTHCARE RESEARCH AND QUALITY             13338
                    AGENCY FOR TOXIC SUBSTANCES AND DISEASE REGISTRY        1077
                    Administration for Children and Families                 319
                    BIOLOGICS-IMMUNOLOGY AND INFECTIOUS DISEASES BRANCH        7
                    BUREAU OF HEALTH PLANNING AND RESOURCES DEVELOPMENT      197
                                                                           ...  
UKRI                EPSRC                                                  33924
                    Innovate UK                                            25425
                    MRC                                                    12508
                    NERC                                                   11011
                    STFC                                                    7190
Length: 133, dtype: int64

In [12]:
# Number of rows per FundingBody value within the UKRI table only.
ukri_metadata["FundingBody"].value_counts()

EPSRC          33924
Innovate UK    25425
BBSRC          17773
MRC            12508
NERC           11011
STFC            7190
Name: FundingBody, dtype: int64

In [13]:
# Number of rows per FundingBody value within the NSF table only.
nsf_metadata["FundingBody"].value_counts()

MPS    86877
GEO    57177
ENG    56193
BIO    54762
TIP    17425
Name: FundingBody, dtype: int64

In [14]:
# Number of rows per FundingBody value within the NIH table only.
nih_metadata["FundingBody"].value_counts()

NATIONAL CANCER INSTITUTE                                                    341513
NATIONAL CENTER FOR RESEARCH RESOURCES                                       254930
NATIONAL INSTITUTE OF GENERAL MEDICAL SCIENCES                               231829
NATIONAL HEART, LUNG, AND BLOOD INSTITUTE                                    226837
NATIONAL INSTITUTE OF ALLERGY AND INFECTIOUS DISEASES                        221970
                                                                              ...  
OFFICE OF CHIEF PUBLIC HEALTH PRACTICE                                            4
CLINICAL PHARMACOLOGY AND TOXICOLOGY BRANCH                                       4
TWO OR MORE SPONSORS                                                              2
HEALTH RESOURCES AND SERVICES ADMINISTRATION/BUREAU OF HEALTH PROFESSIONS         1
COORDINATING CENTER FOR INFECTIOUS DISEASES                                       1
Name: FundingBody, Length: 122, dtype: int64

In [15]:
# Read the tokenised titles/abstracts table. dtype="object" switches off dtype
# inference for every column, so ProjectId is not parsed as a number.
tokens = pd.read_csv(
    "../clean-data/fine-scale/UK-USA/titles-abstracts-tokenized.csv",
    dtype="object",
)

In [16]:
# Preview the first five rows of the tokenised text table.
tokens.head(n=5)

Unnamed: 0,ProjectId,TitleAbstract
0,6507783,"['oncology', 'award', 'internist', 'training',..."
1,8305231,"['role', 'glia', 'formation', 'synapsis', 'syn..."
2,8904702,"['umbrella', 'amphotericin', 'conjugate', 'umb..."
3,2024085,"['measuring', 'gain', 'goal', 'application', '..."
4,2205115,"['imaging', 'hardcopy']"
