In [8]:
'''
this code is for retrieve all the desired PubMed Format elements: PMID, PT, OT, MH from a list of PMIDs by 
using the webpage https://pubmed.ncbi.nlm.nih.gov/32790733/?format=pubmed 
The input list is a result from SCAIView output back end file
 '''

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [14]:
#1. obtain a unique list of PMIDs, because one PMID may link to multiple Clinical Trial IDs.
def PMID_input(file):
    """
    Get the input NCTID as a list
    :param file
    :return the list
    """
    PMID_list = []
    filepath = "input\\"+file
    df = pd.read_csv(filepath, sep = '\t', header = 0)
    print("The "+file+" contains "+str(df.shape[0])+" rows in total.")
    PMID_list = df['PMID'].dropna().tolist()
    print("There are "+str(len(PMID_list))+" rows with PMID.")
    PMID_list_uni= list(set(PMID_list))
    print("There are "+str(len(PMID_list_uni))+" unique PMIDs.")
    return PMID_list_uni

In [15]:
SCAIviewfile = input('Please give the file name contains PMID list:' )
PMID_list_uni = PMID_input(SCAIviewfile)

Please give the file name contains PMID list:results-ivermectin.txt
The results-ivermectin.txt contains 2058 rows in total.
There are 1972 rows with PMID.
There are 138 unique PMIDs.


In [16]:
#2. for each PMID in list, find the PMID, PT, OT and MH info from web and save to a dataframe

def page_content_body(URL):
    '''give an URL, will return the text of the body tag'''
    response = requests.get(URL)
    page = response.text
    soup = BeautifulSoup(page)
    content = soup.body.find('div', attrs={'class':'article-page'}).text
    return content

def create_new_row(content):
    '''give the content of page body. Creat a new row for the dataframe
       a new row includes PMID, PT, OT, MH, GR,SI, TI data extracted from the page'''
    content_list = content.splitlines()
    PT_list = []
    OT_list = []
    MH_list = []
    GR_list = []
    SI_list = []
    for line in content_list:
        if line.startswith('PMID'):
            PMID = line.replace(r'PMID','').replace('-','').lstrip()
        if line.startswith('TI'):
            TI = line.replace(r'TI  - ','')
        if line.startswith('PT'):
            PT = line.replace(r'PT','').replace('-','').lstrip()
            PT_list.append(PT)
        if line.startswith('OT'):
            OT = line.replace(r'OT','').replace('-','').lstrip()
            OT_list.append(OT)
        if line.startswith('MH'):
            MH = line.replace(r'MH','').replace('-','').lstrip()
            MH_list.append(MH)
        if line.startswith('GR'):
            GR = line.replace(r'GR','').replace('-','').lstrip()
            GR_list.append(GR)
        if line.startswith('SI'):
            SI = line.replace(r'SI','').replace('-','').lstrip()
            SI_list.append(SI)

    PT_list_str = ';'.join(PT_list)
    OT_list_str = ';'.join(OT_list)
    MH_list_str = ';'.join(MH_list)
    GR_list_str = ';'.join(GR_list)
    SI_list_str = ';'.join(SI_list)

    new_row = {'PMID':PMID, 'PT':PT_list_str, 'OT':OT_list_str, 'MH':MH_list_str, 'SI':SI_list_str, 'TI':TI,'GR':GR_list_str}
    return new_row
    
Base_URL = "https://pubmed.ncbi.nlm.nih.gov/"
PM_terms_df = pd.DataFrame(columns = ['PMID', 'PT', 'OT','MH','SI','TI','GR'])

for x in range(len(PMID_list_uni)):
    PMID = str(PMID_list_uni[x].replace('PMID:',''))
    pagelink = Base_URL+PMID+"/?format=pubmed"
    print("processing "+pagelink+"...")
    content = page_content_body(pagelink)
    new_row = create_new_row(content)
    print(new_row)
    PM_terms_df = PM_terms_df.append(new_row, ignore_index=True)
    n=x+1
    print(str(n)+" row added!")

processing https://pubmed.ncbi.nlm.nih.gov/32497632/?format=pubmed...
{'PMID': '32497632', 'PT': 'Journal Article;Review', 'OT': '', 'MH': 'Aminoquinolines/therapeutic use;AntiBacterial Agents/therapeutic use;Antiviral Agents/therapeutic use;Betacoronavirus/*isolation & purification;COVID19;COVID19 Vaccines;Coronavirus Infections/*drug therapy/prevention & control/virology;Drug Repositioning;Humans;Pandemics;Pneumonia, Viral/*drug therapy/virology;SARSCoV2;Viral Vaccines/administration & dosage;DA 2020/08/25 06:00', 'SI': '', 'TI': 'Potential therapeutic targets for combating SARS-CoV-2: Drug repurposing, clinical ', 'GR': ''}
1 row added!
processing https://pubmed.ncbi.nlm.nih.gov/32560227/?format=pubmed...
{'PMID': '32560227', 'PT': 'Journal Article;Review', 'OT': 'O  NNLM;COVID19;SARSCoV2;coronavirus;therapies;vaccines', 'MH': 'DA 2020/06/21 06:01', 'SI': '', 'TI': 'Vaccines and Therapies in Development for SARS-CoV-2 Infections.', 'GR': 'R01AI13976802/National Institute of Allergy 

{'PMID': '33132570', 'PT': 'Journal Article;Review', 'OT': 'O  NNLM;Coronavirus;Coronavirus disease2019;Heparin;Hydroxychloroquine;Remdesvir;SARSCoV2;Tocilizumab', 'MH': 'DA 2020/11/03 06:01', 'SI': '', 'TI': 'Current Approaches to COVID-19: Therapy and Prevention.', 'GR': ''}
16 row added!
processing https://pubmed.ncbi.nlm.nih.gov/33398233/?format=pubmed...
{'PMID': '33398233', 'PT': 'Journal Article;Review', 'OT': '', 'MH': 'DA 2021/01/06 06:01', 'SI': '', 'TI': 'Novel coronavirus disease (COVID-19) pandemic: A recent mini review.', 'GR': ''}
17 row added!
processing https://pubmed.ncbi.nlm.nih.gov/33482149/?format=pubmed...
{'PMID': '33482149', 'PT': 'Journal Article;Review', 'OT': 'O  NNLM;*COVID19;*Drug safety and drug efficacy;*Mutation;*RNA therapeutics;*SARSCoV2', 'MH': 'DA 2021/01/23 06:00', 'SI': '', 'TI': 'Biochemical features and mutations of key proteins in SARS-CoV-2 and their impacts ', 'GR': ''}
18 row added!
processing https://pubmed.ncbi.nlm.nih.gov/32768971/?format=

{'PMID': '32346490', 'PT': 'Journal Article', 'OT': 'O  NNLM;COVID19 outbreak;CoVMpro;CoVNsp12 polymerase;CoVNsp13 helicase;SARSCoV2', 'MH': 'DA 2020/04/30 06:01', 'SI': '', 'TI': 'Structural elucidation of SARS-CoV-2 vital proteins: Computational methods reveal ', 'GR': ''}
32 row added!
processing https://pubmed.ncbi.nlm.nih.gov/32992245/?format=pubmed...
{'PMID': '32992245', 'PT': "Journal Article;Research Support, NonU.S. Gov't", 'OT': 'O  NNLM;*Antiviral;*COVID19;*Coronavirus;*Repurposed drugs;*SARSCoV2', 'MH': 'Antiviral Agents/chemistry/*therapeutic use;COVID19/immunology/*therapy;COVID19 Vaccines/*immunology;*Drug Repositioning;Humans;SARSCoV2/*drug effects/immunology;DA 2020/12/22 06:00', 'SI': '', 'TI': 'COVID-19 therapy: What weapons do we bring into battle?', 'GR': ''}
33 row added!
processing https://pubmed.ncbi.nlm.nih.gov/32880078/?format=pubmed...
{'PMID': '32880078', 'PT': 'Journal Article;Review', 'OT': 'O  NNLM;COVID19;Computational;Coronavirus;Drug;SARSCoV2;Vaccine'

{'PMID': '32708302', 'PT': 'Journal Article;Review', 'OT': 'O  NNLM;NFκB pathway;anticancer;antiinflammatory;autoimmune disease;clinical trials;small molecules', 'MH': 'AntiInflammatory Agents/*pharmacology;Antineoplastic Agents/*pharmacology;Drug Discovery/*methods;Humans;Ikappa B Kinase/antagonists & inhibitors/*metabolism;NFkappa B/antagonists & inhibitors/*metabolism;Proteasome Inhibitors/pharmacology;Signal Transduction/*drug effects/genetics;Ubiquitination/drug effects;DA 2021/03/05 06:00', 'SI': '', 'TI': 'Small Molecule NF-κB Pathway Inhibitors in Clinic.', 'GR': 'NRFCRP17201702/National Research Foundation Singapore/'}
46 row added!
processing https://pubmed.ncbi.nlm.nih.gov/32391242/?format=pubmed...
{'PMID': '32391242', 'PT': 'Journal Article;Review', 'OT': 'O  NNLM;Biomarkers;COVID19;Coronavirus management algorithm;Precision medicine', 'MH': 'DA 2020/05/12 06:00', 'SI': '', 'TI': 'A Precision Medicine Approach to SARS-CoV-2 Pandemic Management.', 'GR': ''}
47 row added!
pr

{'PMID': '32734518', 'PT': 'Journal Article;Review', 'OT': 'O  NNLM;COVID19;Coronaviruses;Drug classification;Pandemic;Pharmaceutical agents;Possible treatments;SARSCoV2', 'MH': 'Animals;COVID19/*drug therapy/physiopathology/virology;Humans;Inflammation/drug therapy/physiopathology/virology;SARSCoV2/*drug effects/pathogenicity;DA 2020/12/15 06:00', 'SI': '', 'TI': 'Classification of the present pharmaceutical agents based on the possible effective ', 'GR': ''}
60 row added!
processing https://pubmed.ncbi.nlm.nih.gov/33493917/?format=pubmed...
{'PMID': '33493917', 'PT': 'Journal Article;Review', 'OT': 'O  NNLM;COVID19;Computational docking;Drug repurposing;SARSCoV2;Vaccine', 'MH': 'Antiviral Agents/*therapeutic use;COVID19/*drug therapy/*prevention & control;*COVID19 Vaccines;Clinical Trials as Topic;Drug Repositioning;Humans;Pandemics;DA 2021/03/03 06:00', 'SI': '', 'TI': 'The growing complexity of COVID-19 drug and vaccine candidates: challenges and ', 'GR': ''}
61 row added!
processi

{'PMID': '32718020', 'PT': 'Journal Article;Review', 'OT': 'O  NNLM;ACE2;COVID19;SARSCoV2;TMPRSS2;coronavirus;endocytosis;spike protein;viral entry;viral fusion', 'MH': 'Antibodies, Monoclonal/chemistry/pharmacology/therapeutic use;Antiviral Agents/chemistry/pharmacology/*therapeutic use;Betacoronavirus/drug effects/*physiology;COVID19;Clinical Trials as Topic;Coronavirus Infections/*drug therapy;Humans;Pandemics;Peptides/chemistry/pharmacology/therapeutic use;Pneumonia, Viral/*drug therapy;Polysaccharides/chemistry/pharmacology/therapeutic use;SARSCoV2;Small Molecule Libraries/chemistry/pharmacology/therapeutic use;Virus Attachment/drug effects;Virus Internalization/drug effects;DA 2020/08/13 06:00', 'SI': '', 'TI': 'Potential Anti-COVID-19 Therapeutics that Block the Early Stage of the Viral Life ', 'GR': 'SC3GM131986; P20GM103424/GM/NIGMS NIH HHS/United States'}
75 row added!
processing https://pubmed.ncbi.nlm.nih.gov/33071609/?format=pubmed...
{'PMID': '33071609', 'PT': 'Editorial'

{'PMID': '33072781', 'PT': 'Systematic Review', 'OT': 'O  NNLM;COVID19;SARSCoV2;antivirals;clinical trials;drug development;immunomodulators;research protocols', 'MH': 'DA 2020/10/20 06:01', 'SI': '', 'TI': 'The Pipeline of Therapeutics Testing During the Emergency Phase of the COVID-19 ', 'GR': ''}
87 row added!
processing https://pubmed.ncbi.nlm.nih.gov/32474009/?format=pubmed...
{'PMID': '32474009', 'PT': 'Journal Article;Review', 'OT': 'O  NNLM;COVID19;Prevention;SARSCoV2;Transmission;Treatment', 'MH': 'Adrenal Cortex Hormones/therapeutic use;Antiviral Agents/therapeutic use;Betacoronavirus/*drug effects/pathogenicity;COVID19/*epidemiology/prevention & control/*therapy/transmission;COVID19 Vaccines;Clinical Trials as Topic;Coronavirus Infections/*epidemiology/prevention & control/*therapy/transmission;Drug Repositioning;Humans;Immunization, Passive/methods;Immunologic Factors/therapeutic use;Molecular Targeted Therapy/methods;*Pandemics/prevention & control;Personal Protective Equi

{'PMID': '33234158', 'PT': 'Letter;Randomized Controlled Trial', 'OT': 'O  NNLM;COVID19;Hospitalization;Ivermectin;Protocol;Randomized controlled trial', 'MH': 'Adult;Antiparasitic Agents/administration & dosage/*therapeutic use;Argentina/epidemiology;COVID19/*drug therapy/epidemiology/virology;CaseControl Studies;DoubleBlind Method;Female;Hospitalization/statistics & numerical data;Humans;Ivermectin/administration & dosage/*therapeutic use;Male;Pandemics/prevention & control;Placebos/administration & dosage;Prospective Studies;SARSCoV2/*genetics;Time Factors;DA 2020/12/15 06:00', 'SI': 'ClinicalTrials.gov/NCT04529525', 'TI': 'Ivermectin to prevent hospitalizations in patients with COVID-19 (IVERCOR-COVID19): ', 'GR': ''}
101 row added!
processing https://pubmed.ncbi.nlm.nih.gov/33330858/?format=pubmed...
{'PMID': '33330858', 'PT': 'Preprint', 'OT': '', 'MH': 'DA 2020/12/18 06:01', 'SI': '', 'TI': 'Drug Repurposing for COVID-19 using Graph Neural Network with Genetic, Mechanistic, ', '

{'PMID': '33519133', 'PT': 'Journal Article', 'OT': 'O  NNLM;Covid 19;Siddha strategy;accelerated recovery;synergistic effect', 'MH': 'DA 2021/02/02 06:00', 'SI': '', 'TI': 'An Open Clinical Evaluation Of Selected Siddha Regimen In Expediting The Management ', 'GR': ''}
116 row added!
processing https://pubmed.ncbi.nlm.nih.gov/33227708/?format=pubmed...
{'PMID': '33227708', 'PT': 'Journal Article;Review', 'OT': 'O  NNLM;Azithromycin;COVID19;Chloroquine/hydroxychloroquine;Interferons;Treatment;Vaccine', 'MH': 'Antiviral Agents/*therapeutic use;Azithromycin/*therapeutic use;COVID19/*prevention & control;COVID19 Vaccines/*therapeutic use;Chloroquine/*therapeutic use;Humans;Hydroxychloroquine/therapeutic use;Interferons/*therapeutic use;Treatment Outcome;DA 2021/01/01 06:00', 'SI': '', 'TI': 'Prevention and treatment of COVID-19: Focus on interferons, ', 'GR': ''}
117 row added!
processing https://pubmed.ncbi.nlm.nih.gov/33014380/?format=pubmed...
{'PMID': '33014380', 'PT': 'Journal Articl

{'PMID': '32472459', 'PT': 'Journal Article;Review', 'OT': 'O  NNLM;Algorithm;COVID19;Management;SarsCoV2', 'MH': 'Algorithms;*Antiviral Agents/immunology/pharmacology;Betacoronavirus/isolation & purification;COVID19;COVID19 Testing;Clinical Laboratory Techniques/*methods;*Coronavirus Infections/diagnosis/drug therapy/epidemiology/immunology/therapy;Critical Pathways;*Cytokine Release Syndrome/diagnosis/drug therapy/etiology;Humans;*Pandemics;Patient Care Team/*organization & administration;*Pneumonia, Viral/epidemiology/immunology/therapy;SARSCoV2;DA 2020/06/25 06:00', 'SI': '', 'TI': 'Treatment algorithm for COVID-19: a multidisciplinary point of view.', 'GR': ''}
134 row added!
processing https://pubmed.ncbi.nlm.nih.gov/32987852/?format=pubmed...
{'PMID': '32987852', 'PT': 'Journal Article;Review', 'OT': 'O  NNLM;COVID19;SARSCoV2;immune response;inflammation', 'MH': 'DA 2020/09/30 06:01', 'SI': '', 'TI': 'SARS-CoV-2: From Structure to Pathology, Host Immune Response and Therapeutic 

In [17]:
display(PM_terms_df)

Unnamed: 0,PMID,PT,OT,MH,SI,TI,GR
0,32497632,Journal Article;Review,,Aminoquinolines/therapeutic use;AntiBacterial ...,,Potential therapeutic targets for combating SA...,
1,32560227,Journal Article;Review,O NNLM;COVID19;SARSCoV2;coronavirus;therapies...,DA 2020/06/21 06:01,,Vaccines and Therapies in Development for SARS...,R01AI13976802/National Institute of Allergy an...
2,32916249,Editorial,O NNLM;SARSCoV2;control;diagnostics;hygiene;t...,COVID19/diagnosis/drug therapy/*prevention & c...,,Integrated control of COVID-19 in resource-poo...,
3,32784499,Journal Article;Review,O NNLM;COVID19;SARSCoV2;antiviral;coronavirus...,DA 2020/08/14 06:01,,Emerging Therapeutic Modalities against COVID-19.,1R01CA24119401A1/NH/NIH HHS/United States;1R01...
4,32871201,"Journal Article;Research Support, N.I.H., Extr...",O NNLM;*Bioinspired materials;*Drug delivery;...,DA 2020/09/02 06:00,,Recent trends in protein and peptide-based bio...,R35 GM127042/GM/NIGMS NIH HHS/United States;R2...
...,...,...,...,...,...,...,...
133,32472459,Journal Article;Review,O NNLM;Algorithm;COVID19;Management;SarsCoV2,Algorithms;*Antiviral Agents/immunology/pharma...,,Treatment algorithm for COVID-19: a multidisci...,
134,32987852,Journal Article;Review,O NNLM;COVID19;SARSCoV2;immune response;infla...,DA 2020/09/30 06:01,,"SARS-CoV-2: From Structure to Pathology, Host ...",13SOL/Unitatea Executiva pentru Finantarea Inv...
135,32864299,Journal Article;Review,O NNLM;COVID19;Coronavirus;Hydroxychloroquine...,DA 2020/08/31 06:00,,Hydroxychloroquine in COVID-19: Potential Mech...,
136,32433345,Journal Article;Review,,"Active Transport, Cell Nucleus/drug effects;An...",,The pharmacological development of direct acti...,


In [18]:
PM_terms_df.to_csv('output/PubMed_extract_ivermectin.csv',sep='\t') 

In [19]:
filtered_PM_df= PM_terms_df[PM_terms_df['PT'].str.contains('Trial')]

In [20]:
filtered_PM_df.to_csv('output/PubMed_extract_filtered_ivermectin.csv',sep='\t')