Find the author collaborations between the top 10 authors overall and between the top 30 authors since 2020.

Imports

In [4]:
import pandas as pd
import plotly.graph_objects as go
import IPython.display as ipd

Load the datasets

In [5]:
#Read the datasets into pandas dataframes

articles = pd.read_csv('../data/articles.leptospirosis.csv')
authors = pd.read_csv('../data/authors.leptospirosis.csv')
paper_counts = pd.read_csv('../data/paper_counts.csv')

View the datasets

In [6]:
# View the articles dataset
ipd.display("Leptospirosis Articles: ", articles.head())

# View the authors dataset
ipd.display("Leptospirosis Authors: ", authors.head())

# View the paper counts dataset
ipd.display("Leptospirosis Paper Counts: ", paper_counts.head())

'Leptospirosis Articles: '

Unnamed: 0,PMID,Title,Abstract,ISSN,Journal,Location,Year,FirstAuthorForename,FirstAuthorLastname,FirstAuthorInitials,FirstAuthorAffiliation
0,10548299,Risk factors for death and changing patterns i...,The risk factors for death and changes in clin...,0002-9637,The American journal of tropical medicine and ...,(61) 630-4,1999,E,Daher,E,"Departamento de Medicina Clínica, Faculdade de..."
1,10569777,Leptospiral outer membrane proteins OmpL1 and ...,New vaccine strategies are needed for preventi...,0019-9567,Infection and immunity,(67) 6572-82,1999,D A,Haake,DA,"Division of Infectious Diseases, Veterans Affa..."
2,10585813,Acute lung injury in leptospirosis: clinical a...,Forty-two consecutive patients with leptospiro...,1058-4838,Clinical infectious diseases : an official pub...,(29) 1561-3,1999,P C,Marotto,PC,"Intensive Care Unit, Instituto de Infectologia..."
3,10586903,Assessment of the efficacy of an IgM-elisa and...,In a prospective study in Barbados between 197...,0002-9637,The American journal of tropical medicine and ...,(61) 731-4,1999,P,Cumberland,P,"Infectious Disease Epidemiology Unit, London S..."
4,10596270,[An old disease with a new face: canine leptos...,The clinical features of the disease are prese...,0036-7281,Schweizer Archiv fur Tierheilkunde,(141) 499-507,1999,A,Steger-Lieb,A,"Klinik für kleine Haustiere, Universität Bern."


'Leptospirosis Authors: '

Unnamed: 0,PMID,AuthorN,AuthorForename,AuthorLastname,AuthorInitials,AuthorAffiliation
0,10548299,1,E,Daher,E,"Departamento de Medicina Clínica, Faculdade de..."
1,10548299,2,D M,Zanetta,DM,
2,10548299,3,M B,Cavalcante,MB,
3,10548299,4,R C,Abdulkader,RC,
4,10569777,1,D A,Haake,DA,"Division of Infectious Diseases, Veterans Affa..."


'Leptospirosis Paper Counts: '

Unnamed: 0,Year,Count
0,1799,1
1,1801,1
2,1802,1
3,1805,1
4,1866,1


Merge the authors database with articles database on PMID including only the Year of publication from articles.

In [7]:
articles_authors = pd.merge(authors, articles[['PMID', 'Year']], on='PMID', how='outer')
ipd.display("Merged dataset: ", articles_authors.head())


'Merged dataset: '

Unnamed: 0,PMID,AuthorN,AuthorForename,AuthorLastname,AuthorInitials,AuthorAffiliation,Year
0,10548299,1.0,E,Daher,E,"Departamento de Medicina Clínica, Faculdade de...",1999
1,10548299,2.0,D M,Zanetta,DM,,1999
2,10548299,3.0,M B,Cavalcante,MB,,1999
3,10548299,4.0,R C,Abdulkader,RC,,1999
4,10569777,1.0,D A,Haake,DA,"Division of Infectious Diseases, Veterans Affa...",1999


Data Cleaning

Find the null counts on the merged dataset

In [8]:
articles_authors.isnull().sum()

Unnamed: 0,0
PMID,0
AuthorN,2
AuthorForename,12
AuthorLastname,2
AuthorInitials,36
AuthorAffiliation,8008
Year,0


Drop rows with AuthonN, Forename, Lastname and Initials as null

In [9]:
articles_authors = articles_authors[~articles_authors.iloc[:, 1:5].isna().all(axis=1)]
ipd.display("New null counts: ", articles_authors.isnull().sum())

'New null counts: '

Unnamed: 0,0
PMID,0
AuthorN,0
AuthorForename,10
AuthorLastname,0
AuthorInitials,34
AuthorAffiliation,8006
Year,0


Create AuthorName column combining AuthorInitials and AuthorLastname

In [10]:
# Ensure all columns used for author names are strings, and handle any NaN values by replacing them with empty strings
articles_authors.loc[:, 'AuthorLastname'] = articles_authors['AuthorLastname'].fillna('').astype(str)
articles_authors.loc[:, 'AuthorInitials'] = articles_authors['AuthorInitials'].fillna('').astype(str)

# Concatenate the initials, and lastname to form the author's name
articles_authors.loc[:, 'AuthorName'] = articles_authors['AuthorInitials'] + ' ' + articles_authors['AuthorLastname']

# Display the dataframe
ipd.display("Articles and authors merged dataset with AuthorName column: ", articles_authors.head())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  articles_authors.loc[:, 'AuthorName'] = articles_authors['AuthorInitials'] + ' ' + articles_authors['AuthorLastname']


'Articles and authors merged dataset with AuthorName column: '

Unnamed: 0,PMID,AuthorN,AuthorForename,AuthorLastname,AuthorInitials,AuthorAffiliation,Year,AuthorName
0,10548299,1.0,E,Daher,E,"Departamento de Medicina Clínica, Faculdade de...",1999,E Daher
1,10548299,2.0,D M,Zanetta,DM,,1999,DM Zanetta
2,10548299,3.0,M B,Cavalcante,MB,,1999,MB Cavalcante
3,10548299,4.0,R C,Abdulkader,RC,,1999,RC Abdulkader
4,10569777,1.0,D A,Haake,DA,"Division of Infectious Diseases, Veterans Affa...",1999,DA Haake


Find the collaborations between

1.   the top 10 authors of all time
2.   the top 30 authors from 2020 on publications from 2020



Create a function to find the author collaborations

In [11]:
def count_author_collaborations(df):
    # Create a dictionary to count collaborations between authors
    author_collaboration_counts = {}

    # Loop over each paper (PMID) and find the co-authors
    for pmid, group in df.groupby('PMID'):  # Group by PMID
        authors_names = group['AuthorName'].tolist() # List of authors for each PMID
        for i in range(len(authors_names)):
            for j in range(i + 1, len(authors_names)): # Start from i+1 to avoid duplicates and counting self collabs
                pair = tuple(sorted([authors_names[i], authors_names[j]])) # Sort the tuples to avoid duplicates
                if pair not in author_collaboration_counts:
                    author_collaboration_counts[pair] = 0 # Initialize the count to zero if pair is not in the dict
                author_collaboration_counts[pair] += 1 # Increment the count
    return author_collaboration_counts


Create function to get author labels and Sankey Links

In [12]:
def create_author_labels_and_links(collaboration_counts):
    author_labels = list(set(author for pair in collaboration_counts.keys() for author in pair)) # list of authors from the collaboration counts dict
    author_labels.sort()  # Sort to maintain consistent positioning
    author_index = {author: idx for idx, author in enumerate(author_labels)}  # Create a dictionary to map authors to indices

    links = []
    for pair, count in collaboration_counts.items(): # Loop through each pair of authors and their corresponding counts
        links.append({
            'source': author_index[pair[0]], # Create and add link between the authors to the list
            'target': author_index[pair[1]],
            'value': count
        })

    return author_labels, links

Create a function to plot the sankey diagram using plotly

In [13]:
def create_sankey_diagram(author_labels, links, title):
    sankey_fig = go.Figure(go.Sankey(  # Create a sankey figure
        node=dict( # Node configuration
            pad=20, # Spacing between the nodes
            thickness=20, # Thickness of each node
            line=dict(color="black", width=0.5), # Border definition for each of the nodes
            label=author_labels,
            hovertemplate="<b>%{label}</b><br>Number of Collaborations : %{value:.0f} <extra></extra>" # Defines the display when hovering over a node
        ),
        link=dict(
            source=[link['source'] for link in links], # Source index
            target=[link['target'] for link in links], # Target index
            value=[link['value'] for link in links] # Collaboration count
        )
    ))

    sankey_fig.update_layout(title=title) # Update the layout by setting the title
    sankey_fig.show() # Display the diagram
    return sankey_fig

1. Find the top 10 authors

In [14]:
# Count the number of papers each author has worked on and get only the top 10
top10_author_counts = articles_authors['AuthorName'].value_counts().head(10)

ipd.display("Top 10 authors count: ", top10_author_counts)

# Filter the dataset to only include the top 10 authors
df_top10_authors = articles_authors[articles_authors['AuthorName'].isin(top10_author_counts.index)]

ipd.display("Top 10 authors dataframe: ", df_top10_authors.head())

'Top 10 authors count: '

Unnamed: 0_level_0,count
AuthorName,Unnamed: 1_level_1
AI Ko,86
M Picardeau,78
W Lilenbaum,77
OA Dellagostin,60
MG Reis,57
RA Hartskeerl,56
N Koizumi,56
SA Vasconcellos,56
C Goarant,41
F Costa,41


'Top 10 authors dataframe: '

Unnamed: 0,PMID,AuthorN,AuthorForename,AuthorLastname,AuthorInitials,AuthorAffiliation,Year,AuthorName
83,10699040,8.0,R A,Hartskeerl,RA,,2000,RA Hartskeerl
140,10747272,2.0,R A,Hartskeerl,RA,,2000,RA Hartskeerl
171,10781725,6.0,S A,Vasconcellos,SA,,2000,SA Vasconcellos
375,11251907,10.0,R A,Hartskeerl,RA,,2001,RA Hartskeerl
415,11298286,1.0,M,Picardeau,M,Unité de Bactériologie Moléculaire et Médicale...,2001,M Picardeau


Find and plot the collaborations of the top 10 authors of all time using a sankey diagram

In [15]:
# Get the collaboration counts between the top 10 authors
top10_collaboration_counts = count_author_collaborations(df_top10_authors)

# Create the author labels and sankey links
author_labels_top10, links_top10 = create_author_labels_and_links(top10_collaboration_counts)

# Plot the sankey diagram
sankey_fig_top10 = create_sankey_diagram(author_labels_top10, links_top10, "Collaborations Between Top 10 Authors")

# Save the diagram as a html file
sankey_fig_top10.write_html('Top10AuthorsCollabs.html')

2. Find top 30 authors from 2020

In [16]:
# Filter data to include only papers from 2020 and onwards
authors_articles_from_2020 = articles_authors[articles_authors['Year'] >= 2020]

# Count the number of papers each author has worked on (Top 30 authors from 2020 onwards)
top30_author_counts = authors_articles_from_2020['AuthorName'].value_counts().head(30)
ipd.display("Top 30 authors since 2020 count: ", top30_author_counts)

# Filter the dataset to only include the top 30 authors from 2020 onwards
df_top30_authors = authors_articles_from_2020[authors_articles_from_2020['AuthorName'].isin(top30_author_counts.index)]
ipd.display("Top 30 authors from 2020 dataframe: ", df_top30_authors.head())

'Top 30 authors since 2020 count: '

Unnamed: 0_level_0,count
AuthorName,Unnamed: 1_level_1
W Lilenbaum,24
F Costa,19
M Picardeau,18
JE Nally,18
MIN Di Azevedo,16
N Koizumi,16
MB Heinemann,15
AI Ko,15
N Srisawat,13
MG Reis,12


'Top 30 authors from 2020 dataframe: '

Unnamed: 0,PMID,AuthorN,AuthorForename,AuthorLastname,AuthorInitials,AuthorAffiliation,Year,AuthorName
14774,30713004,6.0,M B,Heinemann,MB,"Laboratório de Zoonoses Bacterianas do VPS, Fa...",2020,MB Heinemann
14776,30713004,8.0,A L T O,Nascimento,ALTO,Laboratorio Especial de Desenvolvimento de Vac...,2020,ALTO Nascimento
14861,30875033,5.0,Z,Sekawi,Z,I Department of Medical Microbiology & Parasit...,2020,Z Sekawi
15033,31029530,5.0,Zamberi,Sekawi,Z,Department of Medical Microbiology and Parasit...,2020,Z Sekawi
15443,31455598,7.0,Zamberi,Sekawi,Z,Department of Medical Microbiology and Parasit...,2020,Z Sekawi


Find and plot the collaborations of the top 30 authors since 2020 using a sankey diagram

In [17]:
# Get the collaboration counts between the top 30 authors from 2020
top30_collaboration_counts = count_author_collaborations(df_top30_authors)

# Create the author labels and sankey links
author_labels_top30, links_top30 = create_author_labels_and_links(top30_collaboration_counts)

# Plot the sankey diagram
sankey_fig_top30 = create_sankey_diagram(author_labels_top30, links_top30, "Collaboration Between Top 30 Authors from 2020 on publications from 2020")

# Save the diagram as a html file
sankey_fig_top30.write_html('Top30Collabs.html')