In [1]:
import pandas as pd
import random
import numpy as np
import re
import json
import pymongo

In [2]:
with open('config.json') as config_file:
    config = json.load(config_file)
entrez_email = config.get('entrez_email')
entrez_key = config.get('entrez_key')

### Obtain LPSN list and map all synonyms 

Download LSPN excel file from https://www.bacterio.net/

1) List of bacteria with status correct name is created.(validly published and not validly published).

2) We take all synonyms and mispellings, match it with all other same nomenclature type. If one of them has correct name status, all synonyms are merged to correct name bacteria.

3)All synonyms that cannot be matched with correct name is added along with 1 step list of bacterias with correct name. ( no other correct names associated with name)
	Only one nomenclature type->status= synonym, no corect name
	2 or more nomenclature type-> all status type= synonyms, no correct name


If "status" column has "correct name", then it is correct name for bacteria. If the "status" column has "synonym", it is synonym to some other bacteria which has status "correct name". Hence it is added to synonyms list mapping to bacteria with correct name.

"mispellings" are added with synonyms if atleast one bacteria has status "correct name" corresponding to same nomenclature. If only "synonyms" are present without correct name for bacteria, all names are added to master list.

In [3]:
import pandas as pd


def synonyms(bacteria_name):
    found_in_lspn=False
    bacteria_name=bacteria_name.lower()
    g_name=bacteria_name.split(" ")[0].capitalize().strip()
    s_name=bacteria_name.split(" ")[1].strip().lower()
    try:
        df=data[data["genus_name"]==g_name]
        df=df[df["sp_epithet"]==s_name]
        #print(df)
        n_type=df['nomenclatural_type'][df['subsp_epithet'].isna()].iat[0]
        #print(n_type)
        found_in_lspn=True
    except:
        found_in_lspn=False
        
    correct_name=""
    synonyms=[]
    correct_name_found=False
    misspelling=[]
    if found_in_lspn==True:
        status=str(df['status'][df['subsp_epithet'].isna()].iat[0]).lower().split(";")[-1].strip()
        if status=="correct name":
            correct_name=bacteria_name
        else:
            #for i in n_type.split(";"):
            res= data[data['nomenclatural_type'].str.contains(n_type, case=True, flags=0, na=None, regex=True)]
            #print(res)
            for i in res.index:
                name= str(data.loc[i]["genus_name"]).lower()+" "+str(data.loc[i]["sp_epithet"]).lower()
                status=str(data.loc[i]["status"]).lower().split(";")[-1].strip()
                if status=="correct name" and correct_name_found==False:
                    correct_name=name
                    correct_name_found=True
                elif status=="synonym":
                    synonyms.append(name)
                elif status=="misspelling":
                    synonyms.append(name)
                    
    if correct_name=="":
        correct_name=bacteria_name        
        
    return found_in_lspn,correct_name, [i for i in list(set(synonyms)) if i!=correct_name]
        

In [20]:
import pymongo
myclient = pymongo.MongoClient("mongodb://localhost:27017/")
dblist = myclient.list_database_names()
mydb = myclient["pathogens_filtered"]

lspn = mydb["lspn_final_new"]
synonym=mydb["new_synonyms_new"]

def extract_lspn_and_synonyms(file_name):
    data=pd.read_csv(file_name)
    data=data.dropna(subset=["sp_epithet"])
    allspecies_de=[]
    for i in data.index:  
        allspecies_de.append(str(data.loc[i]['genus_name']).lower()+" "+str(data.loc[i]['sp_epithet']))

    for i in list(set(allspecies_de)):
        found_in_lspn, correct_name,synonym_names=synonyms(i)
        #print(i,correct_name,synonym_names)
        if found_in_lspn==True:
            count= lspn.count_documents({"bacteria name":correct_name})
            if count<1:
                x=lspn.insert_one({"bacteria name":correct_name}) 
        for j in synonym_names:
            count= synonym.count_documents({"bacteria synonym":j,"bacteria name": correct_name})
            if count<1:
                x=synonym.insert_one({"bacteria synonym":j,"bacteria name": correct_name})
        

In [21]:
extract_lspn_and_synonyms("lpsn_gss_2023-10-20.csv")

  res= data[data['nomenclatural_type'].str.contains(n_type, case=True, flags=0, na=None, regex=True)]


### Data Aquisition 
Download from pubmed

First Count number of papers availabe for bacteria using "species+genus name".

Then obtain all papers till end of count.

Use pubmed API key to download multiple queries simultaneosly upto 10 queries/sec . Else pubmed rejects more than 2 queries/sec.


In [29]:
#!pip install biopython
from Bio import Entrez 
#https://biopython-tutorial.readthedocs.io/en/latest/notebooks/09%20-%20Accessing%20NCBIs%20Entrez%20databases.html#
from Bio import Medline

def get_pubmed_NUM_RESULTS(query, mindate, maxdate,database='pubmed'):
    Entrez.email = entrez_email
    Entrez.api_key =entrez_key
    query_f= f"{query} AND {mindate}[PDAT] : {maxdate}[PDAT]"
    handle = Entrez.egquery(term=query_f)
    record = Entrez.read(handle)
    #print(record)
    for row in record["eGQueryResult"]: 
        if row["DbName"] == database: 
            #print("total records in pubmed with given keywords {}= {}".format(str(query), row["Count"]))
            return int(row["Count"])
            

    
def get_pubmed_structured_data(query,mindate, maxdate,size=10000, start_index=0):
    Entrez.email = entrez_email
    Entrez.api_key = entrez_key
    all_res = []
    count_of_papers=get_pubmed_NUM_RESULTS(query,mindate=str(mindate), maxdate=str(maxdate))
    #print(count_of_papers)
    #query="((human[Title/Abstract]) AND (bacteria[Title/Abstract]))"
    print(query)
    retmax=100
    for i in range(size//retmax):
        retstart = start_index + i*retmax
        if retstart>9999 or retstart>count_of_papers:
            return all_res
        print('...%d'%(start_index + (i+1)*retmax),end=' ')
        try:
            handle = Entrez.esearch(db="pubmed", term=query, retmax=retmax, retstart=retstart, mindate=str(mindate), maxdate=str(maxdate))
            record = Entrez.read(handle)
            idlist = record["IdList"]
            #print("count",record["Count"])
        
            #print(len(idlist))
            handle = Entrez.efetch(db="pubmed", id=idlist, rettype="medline", retmode="text")
            records = Medline.parse(handle)    
            res=[]
            for doc in records:
                if 'AB' in doc and int(doc['DP'].split()[0])>=int(mindate) and int(doc['DP'].split()[0])<=int(maxdate):
                    if 'MH' in doc:
                        res.append([doc['PMID'],doc['DP'],doc['TI'],doc['AB'],doc['MH']])
                    else:
                        res.append([doc['PMID'],doc['DP'],doc['TI'],doc['AB'],"no mesh terms "])        
            #res = [[doc['PMID'],doc['DP'],doc['TI'],doc['AB'],doc['MH']]  if 'AB' in doc and int(doc['DP'].split()[0])>=int(mindate) and int(doc['DP'].split()[0])<=int(maxdate)]
        except Exception as e:
            print('<Warning: error at:%d  [%s]>'%(i, e), end=' ')
        else:
            all_res.extend(res)
    print('Done!')
    return all_res


### Filtering pipeline

#### Stage 1- Filter abstracts with human association keywords
query="human humans male males female girl girls boy boys adolescent adolescents individuals individual females baby babies elderly man men women woman infant infants patient patients child children adult adults"

#### Stage 2- Filter abstracts with infection/causation association keywords
query="infected derived infection infections infectious infective life threat disease morbidity mortality sepsis septic fibrosis mycetoma septicaemia sinonasal rhinosinusitis septicaemia diarrhea bacteraemia bacteremia meningitis fever pus cystic purulent pyogenic abscess empyema pneumonia phlegmon erysipelas ulcer ecthyma diarrhoea dysentry systemic"
+
words ending with "itis" or "osis"

In [30]:
import re
def filter_dataset(abstract):
    #stage 1 to get abstracts relavent to humans
    stage_1=False
    query="human humans male males female girl girls boy boys adolescent adolescents individuals individual females baby babies elderly man men women woman infant infants patient patients child children adult adults"
    query=query.split(" ")
    string1= str(abstract)
    string1=re.sub('[^a-zA-Z ]', "",string1)
    for j in query:
        if j in " ".join(string1.lower().split(".")).split(" "):
            #print(j)
            stage_1=True
            break
    if stage_1==False:
        return stage_1
    
    #stage 2 to get abstracts relavent to infections from abstracts related to humans
    found=False
    query="infected derived infection infections infectious infective life threat disease morbidity mortality sepsis septic fibrosis mycetoma septicaemia sinonasal rhinosinusitis septicaemia diarrhea bacteraemia bacteremia meningitis fever pus cystic purulent pyogenic abscess empyema pneumonia phlegmon erysipelas ulcer ecthyma diarrhoea dysentry systemic"
    query=query.split(" ")
    for j in query:
        if j in " ".join(string1.lower().split(".")).split(" "):
            #print(j)
            found=True
            break
    if found==False and len(re.findall("[a-zA-Z]+itis+[ .]", string1))>0:
        found=True
    elif found==False and len(re.findall("[a-zA-Z]+osis+[ .]", string1))>0:
        found=True                  
    return found

In [31]:
def filter_results(results):
    selected_results=[]
    for result in results:
        pmid, date, title, abstract,mesh = result
        if filter_dataset(str(title)+str(abstract)) is True:
            selected_results.append(result)
    return selected_results

### Extract LSPN list of bacteria

Download with 4 differnt process for faster data acquisition

3 for LPSN list and 1 for synonyms of bacteria in LSPN list 

In [32]:
myclient = pymongo.MongoClient("mongodb://localhost:27017/")
dblist = myclient.list_database_names()
mydb = myclient["pathogens_filtered"]
mycol = mydb["lspn_final_new"]
syn=mydb["new_synonyms_new"]
list_of_bacteria_lspn=[]
list_of_bacteria_syn=[]
result=mycol.find({})
for i in result:
    list_of_bacteria_lspn.append(i['bacteria name'])

result=syn.find({})
for i in result:
    list_of_bacteria_syn.append(i['bacteria synonym'])
print(len(list_of_bacteria_lspn),len(list_of_bacteria_syn))

20345 4706


In [33]:
div=len(list_of_bacteria_lspn)//3
count=len(list_of_bacteria_lspn)
len(list_of_bacteria_lspn[:div])+len(list_of_bacteria_lspn[div:2*div])+len(list_of_bacteria_lspn[2*div:])

20345

In [34]:
#!pip install pymongo
def filter_write_database(pid,date, bacteria_name,title, abstract,mesh_terms):
    myclient = pymongo.MongoClient("mongodb://localhost:27017/")
    dblist = myclient.list_database_names()
    mydb = myclient["pathogens_db"]
    mycol = mydb["pathogens_coll"]
    myquery={"pid":pid,"bacteria name":bacteria_name }
    count = mycol.count_documents(myquery)

    # Check if the document exists
    if count < 1:
        x = mycol.insert_one({"pid":pid,"bacteria name":bacteria_name,"date":date,"title":title,"abstract":abstract,"Mesh terms":mesh_terms })


In [35]:
import pymongo
myclient = pymongo.MongoClient("mongodb://localhost:27017/")
dblist = myclient.list_database_names()
mydb = myclient["pathogens_filtered"]
lspn = mydb["lspn_final_new"]
synonym_db = mydb["new_synonyms_new"]

start_date="1900"
end_date="2024"
download_complete=[]

for ind in list_of_bacteria_lspn:
    f_res= filter_results(get_pubmed_structured_data(ind,str(start_date),str(end_date)))
    if len(f_res)>0:
        for i_results in f_res: 
            pid, date, title, abstract,mesh_terms=i_results
            filter_write_database(int(pid),date, ind,title, abstract,str(mesh_terms))
        download_complete.append(ind)
    break
    




acidovorax+cattleyae
...100 

In [60]:
myclient = pymongo.MongoClient("mongodb://localhost:27017/")
dblist = myclient.list_database_names()
mydb = myclient["pathogens_filtered"]
lspn = mydb["lspn_final_new"]
synonym_db = mydb["new_synonyms_new"]

start_date="2020"
end_date="2024"
download_complete=[]

for ind in list_of_bacteria_syn:
    f_res= filter_results(get_pubmed_structured_data(ind,str(start_date),str(end_date)))
    if len(f_res)>0:
        for i_results in f_res: 
            pid, date, title, abstract,mesh_terms=i_results
            try:
                syn_res=synonym_db.find({"bacteria synonym":ind})
                for syn in syn_res: 
                    filter_write_database(int(pid),date, syn['bacteria name'],title, abstract,str(mesh_terms))
                    download_complete.append(ind)
            except:
                print("error in mapping synonyms for", i['bacteria name'],ind["pid"])

    break


clostridium histolyticum
...100 ...200 ...300 