In [1]:
import pandas as pd
import json
import re
import networkx as nx
import os
import numpy as np
import igraph as ig
import leidenalg
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt

In [2]:
def calculate_average_topic_distribution(author_papers):
    if not author_papers:
        return np.zeros(len(author_papers[0]))
    return np.mean(author_papers, axis=0)

In [3]:
def parse_string_to_list(s, num_topics=14):
    try:
#         print(s)
        return np.fromstring(s.strip('[]'), sep=' ')
    except:
        return np.zeros(num_topics)

In [4]:
file_path = 'Full_Author_Topic_w_2002.csv'
output_folder = 'coauthorship_networks'

Cumulative Networks and Leiden Clustering

In [5]:
def create_cumulative_coauthorship_networks(file_path, output_folder):
    # Load the CSV file
    df = pd.read_csv(file_path)
    
#     data['Full Authors'] = data['Full Authors']
#     data['distr'] = data['distr'].apply(lambda x: parse_string_to_list(x))
    
#     print(data)
    # Initialize a dictionary to hold cumulative graphs for each year
    cumulative_graphs = {}
    author_topic_distributions = {}

    # Create the output folder if it does not exist
    os.makedirs(output_folder, exist_ok=True)

    # Create coauthorship networks
    for year in sorted(df['Year'].unique()):
        print(year)
#         print(df)
#         print(df.columns)
#         print(df['Year'])
        yearly_data = df[df['Year'] <= year]
        
        G = nx.Graph()
        
        for _, row in yearly_data.iterrows():
            authors_dict = eval(row['Full Authors'])
            topic_distribution = parse_string_to_list(row['distr'])
#             print(topic_distribution)
            for author_id, author_name in authors_dict.items():
                if author_id not in author_topic_distributions:
                    author_topic_distributions[author_id] = []
                author_topic_distributions[author_id].append(topic_distribution)
                if author_id not in G:
                    G.add_node(author_id, name=author_name)

            author_ids = list(authors_dict.keys())
            for i, author_id in enumerate(author_ids):
                for coauthor_id in author_ids[i+1:]:
                    if G.has_edge(author_id, coauthor_id):
                        G[author_id][coauthor_id]['weight'] += 1
                    else:
                        G.add_edge(author_id, coauthor_id, weight=1)

        # Calculate average topic distributions for authors
        avg_topic_distributions = {author_id: calculate_average_topic_distribution(papers)
                                   for author_id, papers in author_topic_distributions.items()}

        # Store average topic distributions as node attributes
        for author_id, avg_distribution in avg_topic_distributions.items():
            if author_id in G:
                G.nodes[author_id]['avg_topic_distribution'] = avg_distribution.tolist()

        # Calculate author alignment (topic similarity) for each edge
        for u, v, data in G.edges(data=True):
            if u in avg_topic_distributions and v in avg_topic_distributions:
                similarity = cosine_similarity([avg_topic_distributions[u]], [avg_topic_distributions[v]])[0][0]
                G[u][v]['alignment'] = similarity

        cumulative_graphs[year] = G
        
    return cumulative_graphs
    
#     print(f"Average topic distributions for year {year}: {avg_topic_distributions}")

In [6]:
cumul_graphs = create_cumulative_coauthorship_networks(file_path, output_folder)

1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024


In [7]:
for year, graph in cumul_graphs.items():
    nx.write_gml(graph, os.path.join(output_folder, f"cumulative_coauthorship_network_{year}.gml"))

In [8]:
for year, graph in cumul_graphs.items():
        ig_graph = ig.Graph.from_networkx(graph)

#         ig_graph = ig.Graph.TupleList(graph.edges(), directed=False)
#         weights = [graph[u][v]['weight'] for u, v in graph.edges()]
#         ig_graph.es['weight'] = weights
        
        # Run Leiden clustering
        partition = leidenalg.find_partition(ig_graph, leidenalg.ModularityVertexPartition)

        # Save clustering results
        clusters = {vertex['name']: cluster for vertex, cluster in zip(ig_graph.vs, partition.membership)}
        with open(os.path.join(output_folder, f"leiden_clustering_cumulative_{year}.json"), 'w') as f:
            json.dump(clusters, f)
        
        print(year)

1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024


Cross-Sectional Networks

In [9]:
def create_cs_coauthorship_networks(file_path, output_folder):
    # Load the CSV file
    df = pd.read_csv(file_path)
    
#     data['Full Authors'] = data['Full Authors']
#     data['distr'] = data['distr'].apply(lambda x: parse_string_to_list(x))
    
#     print(data)
    # Initialize a dictionary to hold cumulative graphs for each year

    cross_sectional_graphs = {}

    # Create the output folder if it does not exist
    os.makedirs(output_folder, exist_ok=True)

    # Create coauthorship networks
    for year in sorted(df['Year'].unique()):
        print(year)
        yearly_data = df[df['Year'] == year]
        
        author_topic_distributions = {}
        G = nx.Graph()
        
        for _, row in yearly_data.iterrows():
            authors_dict = eval(row['Full Authors'])
            topic_distribution = parse_string_to_list(row['distr'])
            for author_id, author_name in authors_dict.items():
                if author_id not in author_topic_distributions:
                    author_topic_distributions[author_id] = []
                author_topic_distributions[author_id].append(topic_distribution)
                if author_id not in G:
                    G.add_node(author_id, name=author_name)

            author_ids = list(authors_dict.keys())
            for i, author_id in enumerate(author_ids):
                for coauthor_id in author_ids[i+1:]:
                    G.add_edge(author_id, coauthor_id)

        # Calculate average topic distributions for authors
        avg_topic_distributions = {author_id: calculate_average_topic_distribution(papers)
                                   for author_id, papers in author_topic_distributions.items()}

        # Store average topic distributions as node attributes
        for author_id, avg_distribution in avg_topic_distributions.items():
            if author_id in G:
                G.nodes[author_id]['avg_topic_distribution'] = avg_distribution.tolist()

        # Calculate author alignment (topic similarity) for each edge
        for u, v, data in G.edges(data=True):
            if u in avg_topic_distributions and v in avg_topic_distributions:
                similarity = cosine_similarity([avg_topic_distributions[u]], [avg_topic_distributions[v]])[0][0]
                G[u][v]['alignment'] = similarity

        cross_sectional_graphs[year] = G
        
    return cross_sectional_graphs
    
#     print(f"Average topic distributions for year {year}: {avg_topic_distributions}")

In [10]:
cs_graphs = create_cs_coauthorship_networks(file_path, output_folder)

1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024


In [11]:
print(cs_graphs)

{1996: <networkx.classes.graph.Graph object at 0x154f4a47a410>, 1997: <networkx.classes.graph.Graph object at 0x154f4a489410>, 1998: <networkx.classes.graph.Graph object at 0x154f512e1e10>, 1999: <networkx.classes.graph.Graph object at 0x154f4b440a10>, 2000: <networkx.classes.graph.Graph object at 0x154f512e1d10>, 2001: <networkx.classes.graph.Graph object at 0x154f4bbb4610>, 2002: <networkx.classes.graph.Graph object at 0x154f4bc07610>, 2003: <networkx.classes.graph.Graph object at 0x154f4bbb4410>, 2004: <networkx.classes.graph.Graph object at 0x154f49fc60d0>, 2005: <networkx.classes.graph.Graph object at 0x154f49f80b50>, 2006: <networkx.classes.graph.Graph object at 0x154f49edb1d0>, 2007: <networkx.classes.graph.Graph object at 0x154f49f80c90>, 2008: <networkx.classes.graph.Graph object at 0x154f4bbb4910>, 2009: <networkx.classes.graph.Graph object at 0x154f49e39a10>, 2010: <networkx.classes.graph.Graph object at 0x154f49d9f3d0>, 2011: <networkx.classes.graph.Graph object at 0x154f49

In [12]:
for year, graph in cs_graphs.items():
    nx.write_gml(graph, os.path.join(output_folder, f"cross_sectional_coauthorship_network_{year}.gml"))

In [13]:
for year, graph in cs_graphs.items():
        ig_graph = ig.Graph.from_networkx(graph)

#         ig_graph = ig.Graph.TupleList(graph.edges(), directed=False)
#         weights = [graph[u][v]['weight'] for u, v in graph.edges()]
#         ig_graph.es['weight'] = weights
        
        # Run Leiden clustering
        partition = leidenalg.find_partition(ig_graph, leidenalg.ModularityVertexPartition)

        # Save clustering results
        clusters = {vertex['name']: cluster for vertex, cluster in zip(ig_graph.vs, partition.membership)}
        with open(os.path.join(output_folder, f"leiden_clustering_cross_sect_{year}.json"), 'w') as f:
            json.dump(clusters, f)
        
        print(year)

1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
