# > Select papers published in the four specified journals

In [1]:
from tqdm import tqdm
import csv
import sys
csv.field_size_limit(sys.maxsize)
import pandas as pd

In [2]:
target=["Proceedings of the National Academy of Sciences","PLoS ONE","Nature","Science"]

In [3]:
data=[]
with open("Data/author_contribution_full.csv",mode="r",encoding="utf-8") as FI:
     
     reader=csv.reader(FI)
     cols=next(reader)
     
     for row in tqdm(reader):
          pid=row[cols.index("arxiv_id")]
          dic=eval(row[cols.index("author_contribution")])
          TS=len(dic)
          if TS<2:
               continue
          if sum(dic.values())==0:
               continue
          journal=row[cols.index("journal_info_from_open_alex")]
          if journal not in target:
               continue
          doi=row[cols.index("doi_from_open_alex")]
          if doi=="":
               continue
          PY=int(row[cols.index("pub_year_from_open_alex")])
          data.append([pid,TS,dic,journal,doi,PY])

2840569it [04:28, 10564.30it/s]


In [4]:
df=pd.DataFrame(data)
df.columns=["PaperID","TeamSize","AuthorContribution","Journal","DOI","PublicationYear"]

In [5]:
df.value_counts("Journal")

Journal
Proceedings of the National Academy of Sciences    947
Nature                                             873
PLoS ONE                                           481
Science                                            439
Name: count, dtype: int64

# > Obtaining self-reported contribution data from websites

In [6]:
import pandas as pd
import numpy as np

In [7]:
from selenium import webdriver
from bs4 import BeautifulSoup
import re

In [8]:
from tqdm import tqdm
import os

## Nature

In [38]:
df_Nature=df[df["Journal"]=="Nature"]
df_Nature=df_Nature.reset_index(drop=True)
df_Nature["ArticleType"]=np.nan
df_Nature["SelfReportedData"]=np.nan

In [39]:
n=len(df_Nature)
turn=1
while turn<=3:
    blanks=0
    for i in tqdm(range(n)):
        if type(df_Nature.loc[i,"SelfReportedData"])!=str:
            doi=df_Nature.loc[i,"DOI"]
            index=doi.split("/")[3]+"."+doi.split("/")[4].lstrip("nature")
            filename="Data/Source/Nature/"+str(index)+".html"
            flag=os.path.isfile(filename)
            if flag==False:
                link=df_Nature.loc[i,"DOI"]
                try:
                    # Getting the source code of the webpage
                    driver=webdriver.Chrome()
                    driver.get(link)
                    driver.implicitly_wait(10)
                    page=driver.page_source
                    fp=page.encode("utf-8")
                    driver.quit() 
                    # Saveing the source code
                    with open(filename,mode="w",encoding="utf-8") as f:
                        f.write(page)
                except:
                    pass        
            else:
                fp=open(filename,"r",encoding="utf-8")
            # Parsing the source code
            soup=BeautifulSoup(fp,"lxml")
            # Obtaining the article type
            try:
                src=soup.select("#content > main > article > div.c-article-header > header > ul.c-article-identifiers > li:nth-child(1)")
                t=src[0].get_text()
                df_Nature.loc[i,"ArticleType"]=t
            except:
                pass
            # Obtaining author list
            src=soup.find_all(attrs={"name":"dc.creator"})
            author_list=[]
            for item in src:
                author_list.append(item["content"])
                df_Nature.loc[i,"Authors_SR"]=str(author_list)
                df_Nature.loc[i,"TeamSize_SR"]=len(author_list)
            # Obtaining author contribution statement
            contributions_heading = soup.select_one("h3#contributions")
            if contributions_heading:
                src=contributions_heading.find_next_sibling('p')
                try:
                    AC=src.get_text()
                    if "Correspondence" not in AC:
                        df_Nature.loc[i,"SelfReportedData"]=AC
                except:
                    blanks+=1              
    turn+=1
    if blanks==0:
        break

  df_Nature.loc[i,"ArticleType"]=t
  df_Nature.loc[i,"SelfReportedData"]=AC
100%|██████████| 873/873 [00:41<00:00, 20.91it/s]


In [40]:
df_Nature=df_Nature.dropna()
df_Nature=df_Nature.reset_index(drop=True)

### - Self-reported data

In [41]:
df_Nature=df_Nature.rename(columns={"Authors_SR":"Authors_SR_raw"})

In [42]:
# Normalizing self_reported data
n=len(df_Nature)
keywords1=["co-authors","coauthors"]
keywords2=["All authors","all authors","Both authors","both authors"]
for i in range(n):
    AC_raw=df_Nature.loc[i,"SelfReportedData"]
    ACDict={}

    for kw in keywords2:
        AC_raw = AC_raw.replace(kw,"ALL")
    AC_raw=re.sub(r"&[a-z]*\;"," ",AC_raw)
    # Split the text into sentences
    sentence_pattern = r'(?<=[a-zA-z][.!?\;])\s+(?![a-z])'
    sentences = re.split(sentence_pattern, AC_raw)
    if len(sentences)==1 and ":" in sentences[0]: # not full sentences, but pattern like "Experiment: A.B., C.D. Writing: E.F."
        pattern = re.compile(r'[A-Za-z\s]+:\s*[A-Za-z.,\s-]+')
        sentences = pattern.findall(AC_raw)
    # sentences = segmenter.segment(AC_raw)

    # Regular expression to match author names and tasks
    author_pattern = r'\b(?:[A-ZvÖÄÜ][\.][\-]*)+[\s\,\:]|ALL'

    for sentence in sentences:
        # Find all author names in the sentence
        sentence+=" " # in case the name is at the end of the sentence
        authors = re.findall(author_pattern, sentence)
        for j in range(len(authors)):
            authors[j]=authors[j].replace("-","")
            authors[j]=authors[j].replace(",","")
            authors[j]=authors[j].replace(":","")
            authors[j]=authors[j].replace(" ","")
        
        # Find the task in the sentence
        task=sentence
        task=task.replace("and","")
        task=task.replace(",","")
        task=task.replace(";"," tab")
        task=re.sub(author_pattern, '', task)
        task=task.lower()

        # Remove any trailing or leading punctuation
        task = task.rstrip('.').strip()
        
        # Add the task and authors to the dictionary
        if task and authors:
            if task in ACDict:
                ACDict[task].extend(authors)
            else:
                ACDict[task] = authors
    
    df_Nature.loc[i,"SelfReportedData_Dict"]=str(ACDict)

In [43]:
# Identifying writing contributors
n=len(df_Nature)
keywords=["wrote","writing","manuscript","edit","revise","revision"]
for i in range(n):
    authors_full = []
    ACDict=eval(df_Nature.loc[i,"SelfReportedData_Dict"])
    for task in ACDict:
        flag=False
        for kw in keywords:
            if kw in task.lower():
                flag=True
                break
        if flag:
            authors=[a.upper() for a in ACDict[task]]
            authors_full += authors
    authors_full=list(set(authors_full))
    df_Nature.loc[i,"WritingPeople_SR"]=str(authors_full)

In [44]:
# Converting author names to abbreviations
n=len(df_Nature)
for i in range(n):
    authorList_raw=eval(df_Nature.loc[i,"Authors_SR_raw"])
    authorList=[]
    for author in authorList_raw:
        author.replace("-","") 
        names=author.split(",")
        try:
            lastName=names[0]
            firstName=names[1]
        except:
            continue
        abbr=""
        for name in firstName.split():
            abbr+=name[0].upper()+"."
        for name in lastName.split():
            abbr+=name[0].upper()+"."
        abbr=abbr.upper()
        authorList.append(abbr)
    if len(authorList)!=len(authorList_raw):
        continue
    df_Nature.loc[i,"Authors_SR"]=str(authorList)

In [45]:
# Matching authors' names and their ranks
n=len(df_Nature)
for i in range(n):
    try:
        authorList=eval(df_Nature.loc[i,"Authors_SR"])
    except:
        continue
    wroteList_raw=eval(df_Nature.loc[i,"WritingPeople_SR"])
    # Dealing with the cases that all authors contributed
    keywords=["ALL","BOTH","AUTHORS"]
    flag=False
    for kw in keywords:
        if kw in wroteList_raw:
            flag=True
            break
    if flag:
        wroteList=[r+1 for r in range(len(authorList))]
        df_Nature.loc[i,"WritingPeople_SR_rank"]=str(wroteList)
        df_Nature.loc[i,"WritingNumber_SR"]=len(wroteList)
        continue
    # Dealing with the cases that only part of the authors contributed
    # i) default style; ii) family name first
    wroteList=[]
    try:
        check=authorList.copy()
        for writer in wroteList_raw:
            if writer in authorList:
                rank=authorList.index(writer)+1
                wroteList.append(rank)
                check.remove(writer)
            break
        wroteList.sort()
    except:
        pass
    if len(wroteList)==len(wroteList_raw):
        df_Nature.loc[i,"WritingPeople_SR_rank"]=str(wroteList)
        df_Nature.loc[i,"WritingNumber_SR"]=len(wroteList)

### - Observational data

In [46]:
df_Nature=df_Nature.rename(columns={"TeamSize":"TeamSize_Obs"})

In [47]:
n=len(df_Nature)
for i in range(n):
    l_rank=[]
    AC=df_Nature.loc[i,"AuthorContribution"]
    rank=1
    for author in AC:
        if AC[author]!=0:
            l_rank.append(rank)
        rank+=1
    df_Nature.loc[i,"WritingPeople_Obs_rank"]=str(l_rank)
    df_Nature.loc[i,"WritingNumber_Obs"]=len(l_rank)

### - Summary

In [48]:
df_Nature=df_Nature.dropna()
df_Nature=df_Nature.reset_index(drop=True)

In [49]:
df_Nature=df_Nature[df_Nature["TeamSize_Obs"]==df_Nature["TeamSize_SR"]]
df_Nature=df_Nature[df_Nature["PublicationYear"]<=2023]
df_Nature=df_Nature.reset_index(drop=True)

In [50]:
print(len(df_Nature))
print(np.min(df_Nature["PublicationYear"]),np.max(df_Nature["PublicationYear"]))

204
2010 2023


## Science

In [51]:
df_Science=df[df["Journal"]=="Science"]
df_Science=df_Science.reset_index(drop=True)
df_Science["ArticleType"]=np.nan
df_Science["SelfReportedData"]=np.nan

In [52]:
n=len(df_Science)
turn=1
while turn<=3:
    blanks=0
    for i in tqdm(range(n)):
        if type(df_Science.loc[i,"SelfReportedData"])!=str:
            doi=df_Science.loc[i,"DOI"]
            index=doi.split("/")[3]+doi.split("/")[4].lstrip("science")
            filename="Data/Source/Science/"+str(index)+".html"
            flag=os.path.isfile(filename)
            if flag==False:
                link=df_Science.loc[i,"DOI"]
                try:
                    # Getting the source code of the webpage
                    driver=webdriver.Chrome()
                    driver.get(link)
                    driver.implicitly_wait(10)
                    page=driver.page_source
                    fp=page.encode("utf-8")
                    driver.quit() 
                    # Saveing the source code
                    with open(filename,mode="w",encoding="utf-8") as f:
                        f.write(page)
                except:
                    pass        
            else:
                fp=open(filename,"r",encoding="utf-8")
            # Parsing the source code
            soup=BeautifulSoup(fp,"lxml")
            # Obtaining the article type
            try:
                t=soup.find(attrs={"class":"meta-panel__type"}).get_text()
                df_Science.loc[i,"ArticleType"]=t
            except:
                pass
            # Obtaining author list
            src=soup.find_all(attrs={"property":"author","role":"listitem"})
            author_list=[]
            for item in src:
                FName=item.find(attrs={"property":"familyName"})
                LName=item.find(attrs={"property":"givenName"})
                try:
                    FName=FName.get_text()
                    LName=LName.get_text()
                    author_list.append(LName+", "+FName)
                except:
                    pass
            df_Science.loc[i,"Authors_SR"]=str(author_list)
            df_Science.loc[i,"TeamSize_SR"]=len(author_list)
            src=soup.select("#tab-information > section.core-acknowledgments > div")
            for item in src:
                try:
                    # AC=item.get_text()
                    AC=str(item)
                    if "Authors contributions:" in AC or "Author contributions:" in AC:
                        AC=re.split(r'Authors contributions\:|Author contributions\:', AC)
                        AC=AC[1]
                        AC=AC.replace("</div>","")
                        AC=AC.replace("</b>","")
                        AC=AC.split("<b>")[0]
                        df_Science.loc[i,"SelfReportedData"]=AC
                except:
                    blanks+=1           
    turn+=1
    if blanks==0:
        break

  df_Science.loc[i,"ArticleType"]=t
  df_Science.loc[i,"SelfReportedData"]=AC
100%|██████████| 439/439 [00:37<00:00, 11.66it/s]


In [53]:
df_Science=df_Science.dropna()
df_Science=df_Science.reset_index(drop=True)

### - Self-reported data

In [54]:
df_Science=df_Science.rename(columns={"Authors_SR":"Authors_SR_raw"})

In [55]:
# Normalizing self_reported data
n=len(df_Science)
keywords1=["co-authors","coauthors"]
keywords2=["All authors","all authors","Both authors","both authors"]
for i in range(n):
    AC_raw=df_Science.loc[i,"SelfReportedData"]
    ACDict={}

    for kw in keywords2:
        AC_raw = AC_raw.replace(kw,"ALL")
    AC_raw=re.sub(r"&[a-z]*\;"," ",AC_raw)
    # Split the text into sentences
    sentence_pattern = r'(?<=[a-zA-z][.!?\;])\s+(?![a-z])'
    sentences = re.split(sentence_pattern, AC_raw)
    if len(sentences)==1 and ":" in sentences[0]: # not full sentences, but pattern like "Experiment: A.B., C.D. Writing: E.F."
        pattern = re.compile(r'[A-Za-z\s]+:\s*[A-Za-z.,\s-]+')
        sentences = pattern.findall(AC_raw)

    # Regular expression to match author names and tasks
    author_pattern = r'\b(?:[A-ZvÖÄÜ][\.][\-]*)+[\s\,\:]|ALL'

    for sentence in sentences:
        # Find all author names in the sentence
        sentence+=" " # in case the name is at the end of the sentence
        authors = re.findall(author_pattern, sentence)
        for j in range(len(authors)):
            authors[j]=authors[j].replace("-","")
            authors[j]=authors[j].replace(",","")
            authors[j]=authors[j].replace(":","")
            authors[j]=authors[j].replace(" ","")
        
        # Find the task in the sentence

        # task = re.sub(author_pattern, '', sentence).strip()
        task=sentence
        task=task.replace("and","")
        task=task.replace(",","")
        task=task.replace(";"," tab")
        task=re.sub(author_pattern, '', task)
        task=task.lower()

        # Remove any trailing or leading punctuation
        task = task.rstrip('.').strip()
        
        # Add the task and authors to the dictionary

        if task and authors:
            if task in ACDict:
                ACDict[task].extend(authors)
            else:
                ACDict[task] = authors
    
    df_Science.loc[i,"SelfReportedData_Dict"]=str(ACDict)

In [56]:
# Identifying writing contributors
n=len(df_Science)
keywords=["wrote","writing","manuscript","edit","revise","revision"]
for i in range(n):
    authors_full = []
    ACDict=eval(df_Science.loc[i,"SelfReportedData_Dict"])
    for task in ACDict:
        flag=False
        for kw in keywords:
            if kw in task.lower():
                flag=True
                break
        if flag:
            authors=[a.upper() for a in ACDict[task]]
            authors_full += authors
            # df_Science.loc[i,"WritingPeople_SR"]=str(authors)
            # break
    authors_full=list(set(authors_full))
    df_Science.loc[i,"WritingPeople_SR"]=str(authors_full)

In [57]:
# Converting author names to abbreviations
n=len(df_Science)
for i in range(n):
    authorList_raw=eval(df_Science.loc[i,"Authors_SR_raw"])
    authorList=[]
    for author in authorList_raw:
        author.replace("-","") 
        names=author.split(",")
        lastName=names[0]
        firstName=names[1]
        abbr=""
        for name in firstName.split():
            abbr+=name[0].upper()+"."
        for name in lastName.split():
            abbr+=name[0].upper()+"."
        abbr=abbr.upper()
        # abbr=abbr.replace("Å","A")
        # abbr=abbr.replace("Ć","C")
        authorList.append(abbr)
    df_Science.loc[i,"Authors_SR"]=str(authorList)

In [58]:
# Matching authors' names and their ranks
n=len(df_Science)
for i in range(n):
    authorList=eval(df_Science.loc[i,"Authors_SR"])
    wroteList_raw=eval(df_Science.loc[i,"WritingPeople_SR"])
    # Dealing with the cases that all authors contributed
    keywords=["ALL","BOTH","AUTHORS"]
    flag=False
    for kw in keywords:
        if kw in wroteList_raw:
            flag=True
            break
    if flag:
        wroteList=[r+1 for r in range(len(authorList))]
        df_Science.loc[i,"WritingPeople_SR_rank"]=str(wroteList)
        df_Science.loc[i,"WritingNumber_SR"]=len(wroteList)
        continue
    # Dealing with the cases that only part of the authors contributed
    wroteList=[]
    try:
        check=authorList.copy()
        for writer in wroteList_raw:
            if writer in authorList:
                rank=authorList.index(writer)+1
                wroteList.append(rank)
                check.remove(writer)
        wroteList.sort()
    except:
        pass
    if len(wroteList)==len(wroteList_raw):
        df_Science.loc[i,"WritingPeople_SR_rank"]=str(wroteList)
        df_Science.loc[i,"WritingNumber_SR"]=len(wroteList)

### - Observational data

In [59]:
df_Science=df_Science.rename(columns={"TeamSize":"TeamSize_Obs"})

In [60]:
n=len(df_Science)
for i in range(n):
    l_rank=[]
    AC=df_Science.loc[i,"AuthorContribution"]
    rank=1
    for author in AC:
        if AC[author]!=0:
            l_rank.append(rank)
        rank+=1
    df_Science.loc[i,"WritingPeople_Obs_rank"]=str(l_rank)
    df_Science.loc[i,"WritingNumber_Obs"]=len(l_rank)

### - Summary

In [61]:
df_Science=df_Science.dropna()
df_Science=df_Science.reset_index(drop=True)

In [62]:
df_Science=df_Science[df_Science["TeamSize_Obs"]==df_Science["TeamSize_SR"]]
df_Science=df_Science[df_Science["PublicationYear"]<=2023]
df_Science=df_Science.reset_index(drop=True)

In [63]:
print(len(df_Science))
print(np.min(df_Science["PublicationYear"]),np.max(df_Science["PublicationYear"]))

48
2018 2023


## PNAS

In [9]:
df_PNAS=df[df["Journal"]=="Proceedings of the National Academy of Sciences"]
df_PNAS=df_PNAS.reset_index(drop=True)
df_PNAS["SelfReportedData"]=np.nan

In [10]:
n=len(df_PNAS)
turn=1
while turn<=3:
    blanks=0
    for i in tqdm(range(n)):
        if type(df_PNAS.loc[i,"SelfReportedData"])!=str:
            doi=df_PNAS.loc[i,"DOI"]
            index=doi.split("pnas.")[-1]
            filename="Data/Source/PNAS/"+str(index)+".html"
            flag=os.path.isfile(filename)
            if flag==False:
                link=df_PNAS.loc[i,"DOI"]
                try:
                    # Getting the source code of the webpage
                    driver=webdriver.Chrome()
                    driver.get(link)
                    driver.implicitly_wait(10)
                    page=driver.page_source
                    fp=page.encode("utf-8")
                    driver.quit() 
                    # Saveing the source code
                    with open(filename,mode="w",encoding="utf-8") as f:
                        f.write(page)
                except:
                    pass        
            else:
                fp=open(filename,"r",encoding="utf-8")
            # Parsing the source code
            soup=BeautifulSoup(fp,"lxml")
            # Obtaining author list
            src=soup.find_all(attrs={"name":"citation_author"})
            author_list=[]
            for item in src:
                author_list.append(item["content"])
                df_PNAS.loc[i,"Authors_SR"]=str(author_list)
                df_PNAS.loc[i,"TeamSize_SR"]=len(author_list)
            try:
                AC=np.nan
                # Check if author contributions are recorded in "Authors Info & Affiliations" section
                src=soup.select("#tab-contributors > section.core-authors-notes > div")
                for item in src: 
                    text=item.get_text()
                    if "wrote the paper" in text:
                        AC=text
                # Check if author contributions are recorded as footnotes
                if type(AC)!=str:
                    src=soup.find_all("div",role="paragraph")
                    for item in src:
                        text=item.get_text()
                        if "wrote the paper" in text:
                            AC=text 
                if type(AC)!=str:
                    src=soup.find_element_by_xpath("/html/body/div[1]/div/div/main/div[1]/div/article/div[3]/div[1]/section[1]/section[4]/section/section[1]/div")
                    for item in src:
                        text=item.get_text()
                        if "wrote the paper" in text:
                            AC=text 
                AC=AC.replace("Author contributions: ","")
                df_PNAS.loc[i,"SelfReportedData"]=AC
            except:
                blanks+=1           
    turn+=1
    if blanks==0:
        break

  df_PNAS.loc[i,"SelfReportedData"]=AC
100%|██████████| 947/947 [03:11<00:00,  4.95it/s]
100%|██████████| 947/947 [01:00<00:00, 15.59it/s] 
100%|██████████| 947/947 [00:49<00:00, 19.16it/s] 


In [11]:
df_PNAS=df_PNAS.dropna()
df_PNAS=df_PNAS.reset_index(drop=True)

### - Self-reported data

In [12]:
df_PNAS=df_PNAS.rename(columns={"Authors_SR":"Authors_SR_raw"})

In [13]:
# Normalizing self_reported data
n=len(df_PNAS)

for i in range(n):
    AC_raw=df_PNAS.loc[i,"SelfReportedData"]
    ACDict={}
    AC_raw=re.sub(r"&[a-z]*\;"," ",AC_raw)

    sentences = AC_raw.split(";")

    # Regular expression to match author names and tasks
    author_pattern = r'\b(?:[A-ZvÖÄÜ][\.][\-]*)+[\s\,\:]'

    for sentence in sentences:
        # Find all author names in the sentence
        sentence+=" " # in case the name is at the end of the sentence
        authors = re.findall(author_pattern, sentence)
        for j in range(len(authors)):
            authors[j]=authors[j].replace("-","")
            authors[j]=authors[j].replace(",","")
            authors[j]=authors[j].replace(":","")
            authors[j]=authors[j].replace(" ","")
        
        # Find the task in the sentence

        # task = re.sub(author_pattern, '', sentence).strip()
        task=sentence
        task=task.replace("and","")
        task=task.replace(",","")
        task=task.replace(";"," tab")
        task=re.sub(author_pattern, '', task)
        task=task.lower()

        # Remove any trailing or leading punctuation
        task = task.rstrip('.').strip()
        
        # Add the task and authors to the dictionary

        if task and authors:
            if task in ACDict:
                ACDict[task].extend(authors)
            else:
                ACDict[task] = authors
    
    df_PNAS.loc[i,"SelfReportedData_Dict"]=str(ACDict)

In [14]:
# Identifying writing contributors
n=len(df_PNAS)
keywords=["wrote","writing","manuscript","editted","editting","revise","revision"]
for i in range(n):
    authors_full = []
    ACDict=eval(df_PNAS.loc[i,"SelfReportedData_Dict"])
    for task in ACDict:
        flag=False
        for kw in keywords:
            if kw in task.lower():
                flag=True
                break
        if flag:
            authors=[a.upper() for a in ACDict[task]]
            authors_full += authors
            # df_PNAS.loc[i,"WritingPeople_SR"]=str(authors)
            # break
    authors_full=list(set(authors_full))
    df_PNAS.loc[i,"WritingPeople_SR"]=str(authors_full)

In [15]:
# Converting author names to abbreviations
n=len(df_PNAS)
for i in range(n):
    authorList_raw=eval(df_PNAS.loc[i,"Authors_SR_raw"])
    authorList=[]
    for author in authorList_raw:
        author.replace("-","") 
        names=author.split(",")
        lastName=names[0]
        firstName=names[1]
        abbr=""
        for name in firstName.split():
            abbr+=name[0].upper()+"."
        for name in lastName.split():
            abbr+=name[0].upper()+"."
        abbr=abbr.upper()
        authorList.append(abbr)
    df_PNAS.loc[i,"Authors_SR"]=str(authorList)

In [16]:
# Matching authors' names and their ranks
n=len(df_PNAS)
for i in range(n):
    authorList=eval(df_PNAS.loc[i,"Authors_SR"])
    wroteList_raw=eval(df_PNAS.loc[i,"WritingPeople_SR"])
    # Dealing with the cases that all authors contributed
    keywords=["ALL","BOTH","AUTHORS"]
    flag=False
    for kw in keywords:
        if kw in wroteList_raw:
            flag=True
            break
    if flag:
        wroteList=[r+1 for r in range(len(authorList))]
        df_PNAS.loc[i,"WritingPeople_SR_rank"]=str(wroteList)
        df_PNAS.loc[i,"WritingNumber_SR"]=len(wroteList)
        continue
    # Dealing with the cases that only part of the authors contributed
    wroteList=[]
    try:
        check=authorList.copy()
        for writer in wroteList_raw:
            if writer in authorList:
                rank=authorList.index(writer)+1
                wroteList.append(rank)
                check.remove(writer)
        wroteList.sort()
    except:
        pass
    if len(wroteList)==len(wroteList_raw):
        df_PNAS.loc[i,"WritingPeople_SR_rank"]=str(wroteList)
        df_PNAS.loc[i,"WritingNumber_SR"]=len(wroteList)

### - Observational data

In [17]:
df_PNAS=df_PNAS.rename(columns={"TeamSize":"TeamSize_Obs"})

In [19]:
n=len(df_PNAS)
for i in range(n):
    l_rank=[]
    AC=df_PNAS.loc[i,"AuthorContribution"]
    rank=1
    for author in AC:
        if AC[author]!=0:
            l_rank.append(rank)
        rank+=1
    df_PNAS.loc[i,"WritingPeople_Obs_rank"]=str(l_rank)
    df_PNAS.loc[i,"WritingNumber_Obs"]=len(l_rank)

### - Summary

In [20]:
df_PNAS=df_PNAS.dropna()
df_PNAS=df_PNAS.reset_index(drop=True)

In [21]:
df_PNAS=df_PNAS[df_PNAS["TeamSize_Obs"]==df_PNAS["TeamSize_SR"]]
df_PNAS=df_PNAS[df_PNAS["PublicationYear"]<=2023]
df_PNAS=df_PNAS.reset_index(drop=True)

In [22]:
print(len(df_PNAS))
print(np.min(df_PNAS["PublicationYear"]),np.max(df_PNAS["PublicationYear"]))

599
2006 2023


## Plos One

In [23]:
df_POne=df[df["Journal"]=="PLoS ONE"]
df_POne=df_POne.reset_index(drop=True)
df_POne["SelfReportedData"]=np.nan

In [24]:
n=len(df_POne)
turn=1
while turn<=3:
    blanks=0
    for i in tqdm(range(n)):
        if type(df_POne.loc[i,"SelfReportedData"])!=str:
            doi=df_POne.loc[i,"DOI"]
            index=doi.split("/")[3]+"."+doi.split("/")[4].lstrip("journal.pone")
            filename="Data/Source/Plos One/"+str(index)+".html"
            flag=os.path.isfile(filename)
            if flag==False:
                link=df_POne.loc[i,"DOI"]
                try:
                    # Getting the source code of the webpage
                    driver=webdriver.Chrome()
                    driver.get(link)
                    driver.implicitly_wait(10)
                    page=driver.page_source
                    fp=page.encode("utf-8")
                    driver.quit() 
                    # Saveing the source code
                    with open(filename,mode="w",encoding="utf-8") as f:
                        f.write(page)
                except:
                    pass        
            else:
                fp=open(filename,"r",encoding="utf-8")
            # Parsing the source code
            soup=BeautifulSoup(fp,"lxml")
            # Obtaining author list
            src=soup.find_all(attrs={"class":"author-name"})
            author_list=[]
            for item in src:
                author=item.get_text().replace("\n","")
                author=author.rstrip(",")
                author=author.strip(" ")
                author_list.append(author)
            df_POne.loc[i,"Authors_SR"]=str(author_list)
            df_POne.loc[i,"TeamSize_SR"]=len(author_list)
            # Obtaining author contribution statement
            try:       
                src=soup.find_all(attrs={"class":"roles"})
                TS=len(src)
                AC={}
                for rank in range(TS):
                    item=src[rank]
                    AC[author_list[rank]]=item.get_text().replace("\n","")
                if len(AC)!=0:
                    df_POne.loc[i,"SelfReportedData"]=str(AC)
                else:
                    try:
                        AC=soup.select("#artText > div.contributions.toc-section > p")[0].get_text()
                        if AC!="\n":
                            df_POne.loc[i,"SelfReportedData"]=AC
                        else:
                            AC=""
                            try:
                                src=soup.select("#artText > div.contributions.toc-section > ol")
                                for item in src:
                                    AC+=item.get_text().replace("\n","")
                                df_POne.loc[i,"SelfReportedData"]=AC
                            except:
                                pass
                    except:
                        pass
            except:
                blanks+=1           
    turn+=1
    if blanks==0:
        break

  df_POne.loc[i,"SelfReportedData"]=AC
100%|██████████| 481/481 [00:26<00:00, 17.96it/s]


In [25]:
df_POne=df_POne.dropna()
df_POne=df_POne.reset_index(drop=True)

### - Self-reported data

In [26]:
df_POne=df_POne.rename(columns={"Authors_SR":"Authors_SR_raw"})

In [27]:
# Normalizing self_reported data
df_POne["SelfReportedData_Dict"]=np.nan

n=len(df_POne)
for i in range(n):
    ACDict={}
    AC=df_POne.loc[i,"SelfReportedData"]
    if "{" in AC:
        AC=eval(AC)
        contributors=list(AC.keys())
        for contributor in contributors:
            task = AC[contributor]
            if " " in contributor:
                contributor=[c_[0] for c_ in contributor.split()]
                contributor=".".join(list(contributor))+"."
            if task not in ACDict:
                ACDict[task] = [contributor]
            else:
                ACDict[task].append(contributor)
    else:
        AC_list=AC.split(".")
        for item in AC_list:
            if ":" not in item:
                continue
            item=item.split(":")
            task=item[0].lower()
            contributors=[]
            contributors_raw=item[1].split()
            for contributor in contributors_raw:
                contributor=".".join(list(contributor))+"."
                contributors.append(contributor)
            ACDict[task]=contributors
    df_POne.loc[i,"SelfReportedData_Dict"]=str(ACDict)

  df_POne.loc[i,"SelfReportedData_Dict"]=str(ACDict)


In [28]:
# Identifying writing contributors
n=len(df_POne)
keywords=["wrote","writing","manuscript","edit","revise","revision"]
for i in range(n):
    authors_full = []
    ACDict=eval(df_POne.loc[i,"SelfReportedData_Dict"])
    for task in ACDict:
        flag=False
        for kw in keywords:
            if kw in task.lower():
                flag=True
                break
        if flag:
            authors=[a.upper() for a in ACDict[task]]
            authors_full += authors
            # df_POne.loc[i,"WritingPeople_SR"]=str(authors)
            # break
    authors_full=list(set(authors_full))
    df_POne.loc[i,"WritingPeople_SR"]=str(authors_full)

In [29]:
# Converting author names to abbreviations
n=len(df_POne)
for i in range(n):
    authorList_raw=eval(df_POne.loc[i,"Authors_SR_raw"])
    authorList=[]
    for author in authorList_raw:
        names=author.split(" ")
        abbr=""
        for name in names:
            abbr+=name[0].upper()+"."
        authorList.append(abbr)
    df_POne.loc[i,"Authors_SR"]=str(authorList)

In [30]:
# Matching authors' names and their ranks
n=len(df_POne)
for i in range(n):
    authorList=eval(df_POne.loc[i,"Authors_SR"])
    wroteList_raw=eval(df_POne.loc[i,"WritingPeople_SR"])
    # Dealing with the cases that all authors contributed
    keywords=["ALL","BOTH","AUTHORS"]
    flag=False
    for kw in keywords:
        if kw in wroteList_raw:
            flag=True
            break
    if flag:
        wroteList=[r+1 for r in range(len(authorList))]
        df_POne.loc[i,"WritingPeople_SR_rank"]=str(wroteList)
        df_POne.loc[i,"WritingNumber_SR"]=len(wroteList)
        continue
    # Dealing with the cases that only part of the authors contributed
    wroteList=[]
    try:
        check=authorList.copy()
        for writer in wroteList_raw:
            if writer in authorList:
                rank=authorList.index(writer)+1
                wroteList.append(rank)
                check.remove(writer)
        wroteList.sort()
    except:
        pass
    if len(wroteList)==len(wroteList_raw):
        df_POne.loc[i,"WritingPeople_SR_rank"]=str(wroteList)
        df_POne.loc[i,"WritingNumber_SR"]=len(wroteList)

### - Observational data

In [31]:
df_POne=df_POne.rename(columns={"TeamSize":"TeamSize_Obs"})

In [33]:
n=len(df_POne)
for i in range(n):
    l_rank=[]
    AC=df_POne.loc[i,"AuthorContribution"]
    rank=1
    for author in AC:
        if AC[author]!=0:
            l_rank.append(rank)
        rank+=1
    df_POne.loc[i,"WritingPeople_Obs_rank"]=str(l_rank)
    df_POne.loc[i,"WritingNumber_Obs"]=len(l_rank)

### - Summary

In [34]:
df_POne=df_POne.dropna()
df_POne=df_POne.reset_index(drop=True)

In [35]:
df_POne=df_POne[df_POne["TeamSize_Obs"]==df_POne["TeamSize_SR"]]
df_POne=df_POne[df_POne["PublicationYear"]<=2023]
df_POne=df_POne.reset_index(drop=True)

In [36]:
print(len(df_POne))
print(np.min(df_POne["PublicationYear"]),np.max(df_POne["PublicationYear"]))

362
2007 2023


# > Calculate Precision and Recall

## Nature

In [64]:
n=len(df_Nature)
for i in range(n):
    actual=eval(df_Nature.loc[i,"WritingPeople_SR_rank"])
    predict=eval(df_Nature.loc[i,"WritingPeople_Obs_rank"])
    TP=0
    FP=0
    FN=0
    for author in actual:
        if author in predict:
            TP+=1
        else:
            FN+=1
    for author in predict:
        if author not in actual:
            FP+=1
    try:
        precision=TP/(TP+FP)
        df_Nature.loc[i,"Precision"]=precision
    except:
        pass
    try:
        recall=TP/(TP+FN)
        df_Nature.loc[i,"Recall"]=recall
    except:
        pass

In [65]:
print("Precision: "+str(np.mean(df_Nature["Precision"]))+" ("+str(np.std(df_Nature["Precision"]))+")")
print("Recall: "+str(np.mean(df_Nature["Recall"]))+" ("+str(np.std(df_Nature["Recall"]))+")")

Precision: 0.8153594771241831 (0.3794000127660417)
Recall: 0.6402894862055889 (0.2754430348693917)


## Science

In [66]:
n=len(df_Science)
for i in range(n):
    actual=eval(df_Science.loc[i,"WritingPeople_SR_rank"])
    predict=eval(df_Science.loc[i,"WritingPeople_Obs_rank"])
    TP=0
    FP=0
    FN=0
    for author in actual:
        if author in predict:
            TP+=1
        else:
            FN+=1
    for author in predict:
        if author not in actual:
            FP+=1
    try:
        precision=TP/(TP+FP)
        df_Science.loc[i,"Precision"]=precision
    except:
        pass
    try:
        recall=TP/(TP+FN)
        df_Science.loc[i,"Recall"]=recall
    except:
        pass

In [67]:
print("Precision: "+str(np.mean(df_Science["Precision"]))+" ("+str(np.std(df_Science["Precision"]))+")")
print("Recall: "+str(np.mean(df_Science["Recall"]))+" ("+str(np.std(df_Science["Recall"]))+")")

Precision: 0.8958333333333334 (0.3054766312211496)
Recall: 0.5925824345652987 (0.29055299912112964)


## PNAS

In [68]:
n=len(df_PNAS)
for i in range(n):
    actual=set(eval(df_PNAS.loc[i,"WritingPeople_SR_rank"]))
    predict=set(eval(df_PNAS.loc[i,"WritingPeople_Obs_rank"]))
    TP=0
    FP=0
    FN=0
    for author in actual:
        if author in predict:
            TP+=1
        else:
            FN+=1
    for author in predict:
        if author not in actual:
            FP+=1
    try:
        precision=TP/(TP+FP)
        df_PNAS.loc[i,"Precision"]=precision
    except:
        pass
    try:
        recall=TP/(TP+FN)
        df_PNAS.loc[i,"Recall"]=recall
    except:
        pass

In [69]:
print("Precision: "+str(np.mean(df_PNAS["Precision"]))+" ("+str(np.std(df_PNAS["Precision"]))+")")
print("Recall: "+str(np.mean(df_PNAS["Recall"]))+" ("+str(np.std(df_PNAS["Recall"]))+")")

Precision: 0.8973912147952215 (0.2472327131702179)
Recall: 0.7310290710800915 (0.286984675757484)


## Plos One

In [70]:
n=len(df_POne)
for i in range(n):
    actual=eval(df_POne.loc[i,"WritingPeople_SR_rank"])
    predict=eval(df_POne.loc[i,"WritingPeople_Obs_rank"])
    TP=0
    FP=0
    FN=0
    for author in actual:
        if author in predict:
            TP+=1
        else:
            FN+=1
    for author in predict:
        if author not in actual:
            FP+=1
    try:
        precision=TP/(TP+FP)
        df_POne.loc[i,"Precision"]=precision
    except:
        pass
    try:
        recall=TP/(TP+FN)
        df_POne.loc[i,"Recall"]=recall
    except:
        pass

In [71]:
print("Precision: "+str(np.mean(df_POne["Precision"]))+" ("+str(np.std(df_POne["Precision"]))+")")
print("Recall: "+str(np.mean(df_POne["Recall"]))+" ("+str(np.std(df_POne["Recall"]))+")")

Precision: 0.92255985267035 (0.22684808227890876)
Recall: 0.6540618869270555 (0.29705449411037993)
