In [1]:
##This notebook was created to query pubmed for documents 
##related to the free energy principle and active inference
##Authored by Bleu Knight
##February 2022
import pandas as pd

In [2]:
#define terms of interest
queryTerms = ['free energy principle', 'active inference']
queryTerms

['free energy principle', 'active inference']

In [3]:
#this access the NCBI query API and gets the abstract for the PMIDs of interest
from Bio import Entrez

Entrez.email = "you@email.com"
#creates open dictionary to store PMID with query term
titleID = {}
def getTitleIDs(QTs):
    for Q in QTs:
        t = '"' + Q + '"[TIAB]' + "AND pubmed pmc open access[filter]"
        #queries title and abstract for term of interest and open access papers
        handle = Entrez.esearch(db = "pubmed", term = t, retmax = 100000)
        record = Entrez.read(handle)
        PMIDS = record["IdList"]
        titleID[Q] = PMIDS

getTitleIDs(queryTerms)
print(titleID)

{'free energy principle': ['36507308', '36420157', '36389542', '36389215', '36359666', '36313813', '36248528', '36246500', '36228614', '36119714', '36035589', '36016665', '36008463', '35967689', '35966370', '35923916', '35911596', '35814345', '35812784', '35756264', '35742142', '35712161', '35626572', '35597682', '35478746', '35462780', '35250743', '35250723', '35221541', '35210988', '35205595', '35153603', '35126062', '35069367', '35069133', '35052115', '34946007', '34945925', '34945871', '34901166', '34895862', '34867580', '34828219', '34819563', '34721166', '34682030', '34650816', '34573780', '34457352', '34441216', '34441172', '34354621', '34267621', '34210008', '34202965', '34199648', '33935674', '33921298', '33828471', '33810573', '33746730', '33733149', '33732119', '33673663', '33669529', '33627890', '33612868', '33597069', '33584461', '33551917', '33505336', '33489551', '33488462', '33471182', '33362668', '33343039', '33335499', '33304260', '33286659', '33271162', '33240146', '

In [4]:
#create list from dictionary

title_PMIDs = sum(titleID.values(), [])

ids = pd.DataFrame([*set(title_PMIDs)])
ids


Unnamed: 0,0
0,21960978
1,35615182
2,34202804
3,32818063
4,29887647
...,...
337,29747865
338,35462780
339,36304590
340,33265602


In [5]:
#this accesses the NCBI query API and obtains abstracts for PMIDs of interest
title_abstracts = {}

#creates list to store entries without abstract text
without_title = []

#enter your email address
Entrez.email = 'you@email.com'

handle = Entrez.efetch(db="pubmed", id=','.join(map(str, title_PMIDs)),
                       rettype="xml", retmode="text")
records = Entrez.read(handle)
for pubmed_article in records['PubmedArticle']:
    pmid = int(str(pubmed_article['MedlineCitation']['PMID']))
    article = pubmed_article['MedlineCitation']['Article']
    if 'Abstract' in article:
        abstract = article['Abstract']['AbstractText'][0]
        title_abstracts[pmid] = abstract
    else:
       without_title.append(pmid)

print(title_abstracts)



In [6]:
#convert list to data frame
abs = pd.DataFrame([title_abstracts]).T
abs

Unnamed: 0,0
36507308,Integrated world modeling theory (IWMT) is a s...
36420157,The constrained disorder principle defines the...
36389215,"Based on a material view and reductionism, sci..."
36359666,"Cognition, historically considered uniquely hu..."
36313813,Developmental selection of neurons and synapse...
...,...
23129312,The descending projections from motor cortex s...
23110076,This paper introduces a model of oculomotor co...
22654776,If perception corresponds to hypothesis testin...
22241972,The role of dopamine in behaviour and decision...


In [7]:
#this accesses the NCBI query API and obtains authors for PMIDs of interest
title_authors = {}
without_author = []

#enter your email address
Entrez.email = 'you@email.com'

handle = Entrez.efetch(db="pubmed", id=','.join(map(str, title_PMIDs)),
                       rettype="xml", retmode="text")
records = Entrez.read(handle)
for pubmed_article in records['PubmedArticle']:
    pmid = int(str(pubmed_article['MedlineCitation']['PMID']))
    article = pubmed_article['MedlineCitation']['Article']
    if 'AuthorList' in article:
        author = article['AuthorList']
        title_authors[pmid] = author
    else:
       without_author.append(pmid)

print(title_authors)

{36507308: ListElement([DictElement({'AffiliationInfo': [{'Identifier': [], 'Affiliation': 'Department of Psychiatry and Behavioral Sciences, Johns Hopkins University School of Medicine, Center for Psychedelic and Consciousness Research, Baltimore, MD, United States.'}, {'Identifier': [], 'Affiliation': 'Cognitive Science Program, Indiana University, Bloomington, IN, United States.'}, {'Identifier': [], 'Affiliation': 'Institute for Advanced Consciousness Studies (IACS), Santa Monica, CA, United States.'}], 'Identifier': [], 'LastName': 'Safron', 'ForeName': 'Adam', 'Initials': 'A'}, attributes={'ValidYN': 'Y'})], attributes={'CompleteYN': 'Y'}), 36420157: ListElement([DictElement({'AffiliationInfo': [{'Identifier': [], 'Affiliation': 'Faculty of Medicine, Hebrew University and Department of Medicine, Hadassah Medical Center, Jerusalem, Israel.'}], 'Identifier': [], 'LastName': 'Ilan', 'ForeName': 'Yaron', 'Initials': 'Y'}, attributes={'ValidYN': 'Y'})], attributes={'CompleteYN': 'Y'})

In [8]:
###This next set of scripts will unpack these nested dictionaries and lists to obtain author data
df = pd.DataFrame([title_authors])
#obtain list of pmids
listpmid = list(df.columns)
#separate info for each author
authors_unpacked = []
for a in listpmid:
    d = df[a] 
    for i in enumerate(d):
        authors_unpacked.append(i)        
unzipped_object = zip(*authors_unpacked)
unzipped_list = list(unzipped_object)
z = unzipped_list[1]

In [9]:
#find first author info
firstAuthors = ([x[0] for x in z])
df_Firsts = pd.DataFrame(firstAuthors)
df_Firsts['PMIDS']= listpmid
cols = ['PMIDS', 'AffiliationInfo']
flat_firsts = df_Firsts[cols]
Affils = [[d.get('Affiliation') for d in x] for x in flat_firsts['AffiliationInfo']]
flat_firsts['Affiliation'] = Affils
df_Firsts['Affiliation'] = flat_firsts.Affiliation
df_Firsts.drop(['AffiliationInfo'], axis = 1, inplace = True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flat_firsts['Affiliation'] = Affils


In [10]:
#convert author info to DataFrame with all authors, up to a total of 14 authors
z2 = pd.DataFrame(z)
z2['PMIDS'] = listpmid
#change DataFrame index to PMID
z2.set_index("PMIDS", inplace=True)

In [11]:
#untangle second author info
Authors1 = z2[1]
Authors1.dropna(inplace = True)
A1 = pd.DataFrame(Authors1)
A1.reset_index(inplace = True)
A1.columns = ['PMIDS', "Authors"]
ID1 = A1['PMIDS'] 
AA1 = A1["Authors"].tolist()
dfAA1 = pd.DataFrame(AA1)
dfAA1['PMIDS'] = ID1

#pop out second author affiliations
Affils1 = [[d.get('Affiliation') for d in x] for x in dfAA1['AffiliationInfo']]
dfAA1['Affiliation'] = Affils1
dfAA1.drop(['AffiliationInfo'], axis = 1, inplace = True)
dfAA1

Unnamed: 0,Identifier,LastName,ForeName,Initials,PMIDS,Affiliation
0,[],Cerritelli,Francesco,F,36389542,"[Clinical-Based Human Research Department, Fou..."
1,[],Chen,Linlin,L,36389215,"[Department of Vasculocardiology, Shenzhen Lon..."
2,[],Bourke,Paul David,PD,36313813,"[Faculty of Arts, Business, Law and Education,..."
3,[],De Foe,Alexander,A,36248528,[School of Educational Psychology and Counsell...
4,[],Çatal,Ozan,O,36246500,"[IDLab, Department of Information Technology, ..."
...,...,...,...,...,...,...
271,[],Shipp,Stewart,S,23129312,[]
272,[],Perrinet,Laurent U,LU,23110076,[]
273,[],Adams,Rick A,RA,22654776,[]
274,[],Shiner,Tamara,T,22241972,[]


In [12]:
#untangle third author info
Authors2 = z2[2]
Authors2.dropna(inplace = True)
A2 = pd.DataFrame(Authors2)
A2.reset_index(inplace = True)
A2.columns = ['PMIDS', "Authors"]
ID2 = A2['PMIDS'] 
AA2 = A2["Authors"].tolist()
dfAA2 = pd.DataFrame(AA2)
dfAA2['PMIDS'] = ID2
#pop out third author affiliations

Affils2 = [[d.get('Affiliation') for d in x] for x in dfAA2['AffiliationInfo']]
dfAA2['Affiliation'] = Affils2
dfAA2.drop(['AffiliationInfo'], axis = 1, inplace = True)

In [13]:
#untangle fourth author info
Authors3 = z2[3]
Authors3.dropna(inplace = True)
A3 = pd.DataFrame(Authors3)
A3.reset_index(inplace = True)
A3.columns = ['PMIDS', "Authors"]
ID3 = A3['PMIDS'] 
AA3 = A3["Authors"].tolist()
dfAA3 = pd.DataFrame(AA3)
dfAA3['PMIDS'] = ID3
Affils3 = [[d.get('Affiliation') for d in x] for x in dfAA3['AffiliationInfo']]
dfAA3['Affiliation'] = Affils3
dfAA3.drop(['AffiliationInfo'], axis = 1, inplace = True)

In [14]:
#untangle fifth author info
Authors4 = z2[4]
Authors4.dropna(inplace = True)
A4 = pd.DataFrame(Authors4)
A4.reset_index(inplace = True)
A4.columns = ['PMIDS', "Authors"]
ID4 = A4['PMIDS'] 
AA4 = A4["Authors"].tolist()
dfAA4 = pd.DataFrame(AA4)
dfAA4['PMIDS'] = ID4
Affils4 = [[d.get('Affiliation') for d in x] for x in dfAA4['AffiliationInfo']]
dfAA4['Affiliation'] = Affils4
dfAA4.drop(['AffiliationInfo'], axis = 1, inplace = True)

In [15]:
#untangle sixth author info
Authors5 = z2[5]
Authors5.dropna(inplace = True)
A5 = pd.DataFrame(Authors5)
A5.reset_index(inplace = True)
A5.columns = ['PMIDS', "Authors"]
ID5 = A5['PMIDS'] 
AA5 = A5["Authors"].tolist()
dfAA5 = pd.DataFrame(AA5)
dfAA5['PMIDS'] = ID5
Affils5 = [[d.get('Affiliation') for d in x] for x in dfAA5['AffiliationInfo']]
dfAA5['Affiliation'] = Affils5
dfAA5.drop(['AffiliationInfo'], axis = 1, inplace = True)

In [16]:
#untangle seventh author info
Authors6 = z2[6]
Authors6.dropna(inplace = True)
A6 = pd.DataFrame(Authors6)
A6.reset_index(inplace = True)
A6.columns = ['PMIDS', "Authors"]
ID6 = A6['PMIDS'] 
AA6 = A6["Authors"].tolist()
dfAA6 = pd.DataFrame(AA6)
dfAA6['PMIDS'] = ID6
Affils6 = [[d.get('Affiliation') for d in x] for x in dfAA6['AffiliationInfo']]
dfAA6['Affiliation'] = Affils6
dfAA6.drop(['AffiliationInfo'], axis = 1, inplace = True)

In [17]:
#untangle eighth author info
Authors7 = z2[7]
Authors7.dropna(inplace = True)
A7 = pd.DataFrame(Authors7)
A7.reset_index(inplace = True)
A7.columns = ['PMIDS', "Authors"]
ID7 = A7['PMIDS'] 
AA7 = A7["Authors"].tolist()
dfAA7 = pd.DataFrame(AA7)
dfAA7['PMIDS'] = ID7
Affils7 = [[d.get('Affiliation') for d in x] for x in dfAA7['AffiliationInfo']]
dfAA7['Affiliation'] = Affils7
dfAA7.drop(['AffiliationInfo'], axis = 1, inplace = True)

In [18]:
#untangle ninth author info
Authors8 = z2[8]
Authors8.dropna(inplace = True)
A8 = pd.DataFrame(Authors8)
A8.reset_index(inplace = True)
A8.columns = ['PMIDS', "Authors"]
ID8 = A8['PMIDS'] 
AA8 = A8["Authors"].tolist()
dfAA8 = pd.DataFrame(AA8)
dfAA8['PMIDS'] = ID8
Affils8 = [[d.get('Affiliation') for d in x] for x in dfAA8['AffiliationInfo']]
dfAA8['Affiliation'] = Affils8
dfAA8.drop(['AffiliationInfo'], axis = 1, inplace = True)

In [19]:
#untangle tenth author info
Authors9 = z2[9]
Authors9.dropna(inplace = True)
A9 = pd.DataFrame(Authors3)
A9.reset_index(inplace = True)
A9.columns = ['PMIDS', "Authors"]
ID9 = A9['PMIDS'] 
AA9 = A9["Authors"].tolist()
dfAA9 = pd.DataFrame(AA9)
dfAA9['PMIDS'] = ID9
Affils9 = [[d.get('Affiliation') for d in x] for x in dfAA9['AffiliationInfo']]
dfAA9['Affiliation'] = Affils9
dfAA9.drop(['AffiliationInfo'], axis = 1, inplace = True)

In [20]:
#untangle eleventh author info
Authors10 = z2[10]
Authors10.dropna(inplace = True)
A10 = pd.DataFrame(Authors10)
A10.reset_index(inplace = True)
A10.columns = ['PMIDS', "Authors"]
ID10 = A10['PMIDS'] 
AA10 = A10["Authors"].tolist()
dfAA10 = pd.DataFrame(AA10)
dfAA10['PMIDS'] = ID10
Affils10 = [[d.get('Affiliation') for d in x] for x in dfAA10['AffiliationInfo']]
dfAA10['Affiliation'] = Affils10
dfAA10.drop(['AffiliationInfo'], axis = 1, inplace = True)

In [21]:
#untangle twelfth author info
Authors11 = z2[11]
Authors11.dropna(inplace = True)
A11 = pd.DataFrame(Authors11)
A11.reset_index(inplace = True)
A11.columns = ['PMIDS', "Authors"]
ID11 = A11['PMIDS'] 
AA11 = A11["Authors"].tolist()
dfAA11 = pd.DataFrame(AA11)
dfAA11['PMIDS'] = ID11
Affils11 = [[d.get('Affiliation') for d in x] for x in dfAA11['AffiliationInfo']]
dfAA11['Affiliation'] = Affils11
dfAA11.drop(['AffiliationInfo'], axis = 1, inplace = True)

In [22]:
#untangle thirteenth author info
Authors12 = z2[12]
Authors12.dropna(inplace = True)
A12 = pd.DataFrame(Authors12)
A12.reset_index(inplace = True)
A12.columns = ['PMIDS', "Authors"]
ID12 = A12['PMIDS'] 
AA12 = A12["Authors"].tolist()
dfAA12 = pd.DataFrame(AA12)
dfAA12['PMIDS'] = ID12
Affils12 = [[d.get('Affiliation') for d in x] for x in dfAA12['AffiliationInfo']]
dfAA12['Affiliation'] = Affils12
dfAA12.drop(['AffiliationInfo'], axis = 1, inplace = True)

In [23]:
#merge all author info in with PMIDs
all = pd.concat([df_Firsts,dfAA1, dfAA2, dfAA3, dfAA4, dfAA5, dfAA6, dfAA7, dfAA8, dfAA9, dfAA10, dfAA11, dfAA12,], ignore_index=True)
all['Name']= all['ForeName'] + " " + all['LastName']
all

Unnamed: 0,Identifier,LastName,ForeName,Initials,PMIDS,Affiliation,Name
0,[],Safron,Adam,A,36507308,[Department of Psychiatry and Behavioral Scien...,Adam Safron
1,[],Ilan,Yaron,Y,36420157,"[Faculty of Medicine, Hebrew University and De...",Yaron Ilan
2,[],Duquette,Patrice,P,36389542,"[Private Practitioner, Birmingham, MI, United ...",Patrice Duquette
3,[],Chen,Jicheng,J,36389215,"[Department of Vasculocardiology, Shenzhen Lon...",Jicheng Chen
4,[0000-0001-9881-400X],Dodig-Crnkovic,Gordana,G,36359666,[Department of Computer Science and Engineerin...,Gordana Dodig-Crnkovic
...,...,...,...,...,...,...,...
1240,[],Galea,Joseph M,JM,22241972,[],Joseph M Galea
1241,[],Friston,Karl J,KJ,32083297,"[Wellcome Centre for Human Neuroimaging, Unive...",Karl J Friston
1242,[],Petzschner,Frederike H,FH,27895566,"[Translational Neuromodeling Unit, Institute f...",Frederike H Petzschner
1243,[],Howes,Oliver D,OD,32083297,"[Psychiatric Imaging Group, Robert Steiner MRI...",Oliver D Howes


In [24]:
#get PMIDs for open source papers from csv file
li = pd.read_csv("ids.csv")
ll = li['PMID']
ll

0      25128318
1      29572721
2      33471182
3      23744445
4      31258246
         ...   
253    34819563
254    34866668
255    34867580
256    30381799
257    34682030
Name: PMID, Length: 258, dtype: int64

In [25]:
#get list of all author names for each pmid
dicts = {}
keys = ll
count = []
for i in keys:
    vals = []
    t = list(all[all['PMIDS']==i]['Name'])
    vals.append(t)
    dicts[i] = vals
print(dicts)

{25128318: [['Laurent U Perrinet', 'Rick A Adams', 'Karl J Friston']], 29572721: [['Raphael Kaplan', 'Karl J Friston']], 33471182: [['Chang Sub Kim']], 23744445: [['Harriet Brown', 'Rick A Adams', 'Isabel Parees', 'Mark Edwards', 'Karl Friston', 'Mark Edwards']], 31258246: [['Daniel Williams']], 29887647: [['Micah Allen', 'Karl J Friston']], 33612868: [['Majid D Beni']], 26275633: [['Sasha Ondobaka', 'James Kilner', 'Karl Friston']], 31830495: [['Adam Linson', 'Thomas Parr', 'Karl J Friston']], 32732017: [['Karl J Friston', 'Noor Sajid', 'David Ricardo Quiroga-Martinez', 'Thomas Parr', 'Cathy J Price', 'Emma Holmes', 'Thomas Parr']], 25583383: [['Rick A Adams', 'Eduardo Aponte', 'Louise Marshall', 'Karl J Friston', 'Karl J Friston']], 31756340: [['Ensor Rafael Palacios', 'Adeel Razi', 'Thomas Parr', 'Michael Kirchhoff', 'Karl Friston', 'Michael Kirchhoff']], 25277283: [['E Quattrocki', 'Karl Friston']], 27375276: [['Karl Friston', 'Thomas FitzGerald', 'Francesco Rigoli', 'Philipp Schwa

In [26]:
#count number of authors for each PMID
names = pd.DataFrame(dicts).T
count = []
for x in names[0]:
    count.append(len(x))

In [27]:
#attach author count to names df
names['num'] = count
names.columns = ["authors", "num_authors"]

In [28]:
#get first author for each pmid
fas = df_Firsts
fas['Name']= fas['ForeName'] + " " + fas['LastName']
dicts2 = {}
keys = ll
count2 = []
for i in keys:
    vals = []
    t = fas[fas['PMIDS']==i]['Name'].values[0]
    vals.append(t)
    dicts2[i] = vals

In [29]:
#convert to dataframe for merge
firsts = pd.DataFrame(dicts2).T
firsts.columns = ["first_author"]

In [30]:
#merge moving into obft
df_open_fep_AI = abs.join(names)
df2_open_fep_AI = df_open_fep_AI.join(firsts)

In [31]:
#obtain titles for PMIDs
titles = {}

#creates list to store entries without abstract text
without_title = []
 #enter your email address
Entrez.email = 'you@email.com'

handle = Entrez.efetch(db="pubmed", id=','.join(map(str, title_PMIDs)),
                       rettype="xml", retmode="text")
records = Entrez.read(handle)
for pubmed_article in records['PubmedArticle']:
    pmid = int(str(pubmed_article['MedlineCitation']['PMID']))
    article = pubmed_article['MedlineCitation']['Article']
    if 'ArticleTitle' in article:
        title = article['ArticleTitle']
        titles[pmid] = title
    else:
       without_title.append(pmid)

In [32]:
#format to append obft
titles2 = pd.DataFrame([titles]).T
titles2.columns = (["title"])

In [33]:
open_fep_AI = df2_open_fep_AI.join(titles2)
open_fep_AI

Unnamed: 0,0,authors,num_authors,first_author,title
36507308,Integrated world modeling theory (IWMT) is a s...,,,,Integrated world modeling theory expanded: Imp...
36420157,The constrained disorder principle defines the...,,,,The constrained disorder principle defines liv...
36389215,"Based on a material view and reductionism, sci...",,,,The hard problem of consciousness-A perspectiv...
36359666,"Cognition, historically considered uniquely hu...",,,,Cognition as Morphological/Morphogenetic Embod...
36313813,Developmental selection of neurons and synapse...,,,,"Unification of free energy minimization, spati..."
...,...,...,...,...,...
23129312,The descending projections from motor cortex s...,"[Rick A Adams, Stewart Shipp, Karl J Friston]",3.0,Rick A Adams,Predictions not commands: active inference in ...
23110076,This paper introduces a model of oculomotor co...,"[Rick A Adams, Laurent U Perrinet, Karl Friston]",3.0,Rick A Adams,Smooth pursuit and visual occlusion: active in...
22654776,If perception corresponds to hypothesis testin...,"[Karl Friston, Rick A Adams, Laurent Perrinet,...",5.0,Karl Friston,Perceptions as hypotheses: saccades as experim...
22241972,The role of dopamine in behaviour and decision...,"[Karl J Friston, Tamara Shiner, Thomas FitzGer...",10.0,Karl J Friston,"Dopamine, affordance and active inference."


In [34]:
#read in dois for open source articles
doi = pd.read_csv("PMIDS2.csv")
doi.set_index(["PMID"], drop = True, inplace = True)

In [35]:
#append obft with DOIs
fep_AI = open_fep_AI.join(doi)
fep_AI.columns = ['abstract', 'authors', 'num_authors', 'first_author', 'title', 'DOI']


In [36]:
#export obft
fep_AI.to_csv("FEP_ActInf.csv")