In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import itertools
from collections import Counter
import matplotlib.pyplot as plt

In [4]:
# Path to the data
path = "C://Users//Dimitri//Desktop//ENSAE3A//NetworkData//Tables//"

In [5]:
# Load the data
attrs_nos = pd.read_csv(path + "attrs_nos.csv", encoding = "ISO-8859-1")

In [6]:
# Print the data
attrs_nos

Unnamed: 0,url,title,authors,date,jel_code,keywords,editor,journal,article_id,authors_list,authors_nos
0,https://ideas.repec.org/a/oup/qjecon/v1y1886i1...,The Reaction in Political Economy,Charles F. Dunbar,1886-02-02,,,oup,qjecon,v1y1886i1p1-27..html,['Charles F. Dunbar'],[6160]
1,https://ideas.repec.org/a/oup/qjecon/v1y1886i1...,Private Monopolies and Public Rights,Arthur T. Hadley,1886-02-02,,,oup,qjecon,v1y1886i1p28-44..html,['Arthur T. Hadley'],[3334]
2,https://ideas.repec.org/a/oup/qjecon/v1y1886i1...,Silver Before Congress in 1886,S. Dana Horton,1886-02-02,,,oup,qjecon,v1y1886i1p45-75..html,['S. Dana Horton'],[36527]
3,https://ideas.repec.org/a/oup/qjecon/v1y1886i1...,"The Arithmetic, Geometric, and Harmonic Means",F. Coggeshall,1886-02-02,,,oup,qjecon,v1y1886i1p83-86..html,['F. Coggeshall'],[11962]
4,https://ideas.repec.org/a/oup/qjecon/v1y1886i1...,Legislation for Labor Arbitration,H. M. Williams,1886-02-02,,,oup,qjecon,v1y1886i1p86-91..html,['H. M. Williams'],[15170]
5,https://ideas.repec.org/a/oup/qjecon/v1y1886i1...,Correspondence,Arthur Mangin,1886-02-02,,,oup,qjecon,v1y1886i1p91-102..html,['Arthur Mangin'],[3314]
6,https://ideas.repec.org/a/oup/qjecon/v1y1887i2...,An Historical Sketch of the Knights of Labor,Carroll D. Wright,1887-02-02,,,oup,qjecon,v1y1887i2p137-168..html,['Carroll D. Wright'],[5822]
7,https://ideas.repec.org/a/oup/qjecon/v1y1887i2...,The Disposition of Our Public Lands,Albert Bushnell Hart,1887-02-02,,,oup,qjecon,v1y1887i2p169-183..html,['Albert Bushnell Hart'],[1003]
8,https://ideas.repec.org/a/oup/qjecon/v1y1887i2...,The South-Western Strike of 1886,F. W. Taussig,1887-02-02,,,oup,qjecon,v1y1887i2p184-222..html,['F. W. Taussig'],[31109]
9,https://ideas.repec.org/a/oup/qjecon/v1y1887i2...,Marshall's Theory of Value and Distribution,J. Laurence Laughlin,1887-02-02,,,oup,qjecon,v1y1887i2p227-232..html,['J. Laurence Laughlin'],[17717]


In [7]:
def str_to_list(x):
    """
    Interpret strings of the form "['int1', 'int2']" as the list [int1, int2]
    
    Params:
        x (str) : the string to interpret
    Returns:
        list : the output list
    """
    x = x.replace("[", "")
    x = x.replace("]", "")
    splitted = x.split(", ")
    no_list = [int(i) for i in splitted]
    return no_list

In [8]:
# Interpret "author_nos" column as a list of numbers of authors
attrs_nos["authors_nos"] = attrs_nos["authors_nos"].apply(str_to_list)

In [9]:
def get_edges_list(auths_nums):
    """
    Get list of edges between authors from a series of list of authors 
    (each entry of the series are the authors of a given article).
    
    Params:
        auths_nums (pandas.core.series.Series) : the series of list of authors
        
    Returns:
        list : a list of tuples with possibly redundant tuples
    """
    edges_list = []
    auths_nums_reduced = auths_nums[auths_nums.apply(lambda x: len(x)) > 1]
    for auth_list in auths_nums_reduced:
        combinations = list(itertools.combinations(auth_list, 2))
        edges_list += combinations
    return edges_list         

In [12]:
# Get list of edges
auths_nos = attrs_nos["authors_nos"].copy()
edges_list = get_edges_list(auths_nos)
# Print it
print(edges_list)

[(1499, 17717), (41344, 36583), (10735, 4646), (10803, 31109), (18361, 42689), (13297, 42309), (17631, 31109), (18361, 42689), (162, 31109), (162, 31109), (23621, 31109), (43065, 42145), (43065, 42055), (43065, 17596), (42145, 42055), (42145, 17596), (42055, 17596), (16068, 35902), (31109, 39650), (23176, 16343), (32779, 23940), (11896, 42074), (31109, 15103), (523, 31109), (12760, 39624), (5660, 12272), (17693, 20443), (15877, 20774), (33761, 3320), (162, 29101), (162, 29624), (29101, 29624), (11952, 15715), (13030, 10516), (13030, 198), (10516, 198), (40061, 42519), (20443, 42051), (25091, 20774), (15864, 11112), (26810, 31693), (42539, 20443), (26810, 31693), (35075, 4077), (17564, 42051), (17564, 30544), (42051, 30544), (11146, 36210), (15651, 34887), (20443, 42539), (14609, 31693), (22079, 24574), (17287, 215), (17287, 10443), (215, 10443), (6115, 11780), (22079, 24574), (14609, 31693), (6087, 42123), (39661, 30555), (42123, 6087), (30484, 27963), (10358, 13479), (11892, 34318), (

In [16]:
def sort_edges(edges_list):
    """
    For all tuples in edges_list, put the lowest number in the first place and then sort the whole list.
    
    Params:
        edges_list (list) : list of tuples representing the authors pairs
    
    Returns:
        list : a list of tuples after the two sorting steps described above
    
    """
    sorted_edges_list = []
    for edge in edges_list:
        edge_l = list(edge)
        edge_l.sort()
        sorted_edges_list.append(tuple(edge_l))
    sorted_edges_list.sort()
    return sorted_edges_list

In [17]:
# Sort list of edges
s_edges_list = sort_edges(edges_list)
# Print the result
print(s_edges_list)

[(0, 4484), (0, 21754), (0, 25342), (0, 27992), (0, 32889), (0, 32889), (0, 32889), (0, 33905), (0, 35088), (0, 35088), (1, 16507), (1, 28403), (2, 16936), (2, 17012), (2, 33047), (2, 33049), (4, 33192), (5, 16705), (5, 23907), (7, 23913), (7, 42005), (9, 39074), (10, 21450), (11, 17462), (12, 17290), (12, 17503), (13, 100), (13, 17356), (13, 29598), (14, 39549), (15, 16), (15, 167), (16, 167), (19, 13151), (20, 22408), (20, 23044), (20, 33176), (21, 2430), (21, 2430), (21, 25266), (21, 27760), (22, 5024), (22, 27162), (23, 15026), (25, 25643), (25, 36465), (27, 39656), (27, 39951), (27, 41970), (28, 36461), (29, 110), (31, 36057), (33, 1606), (34, 7898), (36, 36), (38, 36461), (41, 17291), (41, 20725), (41, 33212), (41, 33212), (43, 18224), (46, 7718), (46, 33269), (47, 30931), (48, 7710), (49, 7710), (50, 2321), (53, 38978), (54, 41985), (55, 36916), (56, 5911), (57, 33226), (58, 33244), (59, 17355), (62, 16094), (64, 1765), (64, 3822), (64, 43468), (66, 7716), (66, 34872), (67, 3053

In [18]:
def weighted_edges_list(sorted_edges_list):
    """
    Convert list of authors pairs to a dict of dict to pass to the networkx.Graph constructor.
    Count the duplicates and store them in the "weights" attributes of the dictionnary so that an edge that appears k times
    in sorted_edges_list will get a weight of k in the graph.
    
    Params:
        list : sorted list of tuples
    
    Returns:
        dict : a dict of dict, for a given tuple (author1, author2), entry of the form :
        {author1 : {author2: {'weight': n_collaborations(author1, author2)}}}
    """
    counter_dict = dict(Counter(sorted_edges_list))
    nx_dict = dict()
    for key in counter_dict.keys():
        nx_dict[key[0]] = {key[1]: {'weight': counter_dict[key]}}
    return nx_dict

In [19]:
nx_dict = weighted_edges_list(s_edges_list)
print(nx_dict)

{0: {27992: {'weight': 1}}, 1: {16507: {'weight': 1}}, 2: {17012: {'weight': 1}}, 4: {33192: {'weight': 1}}, 5: {23907: {'weight': 1}}, 7: {23913: {'weight': 1}}, 9: {39074: {'weight': 1}}, 10: {21450: {'weight': 1}}, 11: {17462: {'weight': 1}}, 12: {17290: {'weight': 1}}, 13: {17356: {'weight': 1}}, 14: {39549: {'weight': 1}}, 15: {167: {'weight': 1}}, 16: {167: {'weight': 1}}, 19: {13151: {'weight': 1}}, 20: {23044: {'weight': 1}}, 21: {25266: {'weight': 1}}, 22: {27162: {'weight': 1}}, 23: {15026: {'weight': 1}}, 25: {36465: {'weight': 1}}, 27: {39656: {'weight': 1}}, 28: {36461: {'weight': 1}}, 29: {110: {'weight': 1}}, 31: {36057: {'weight': 1}}, 33: {1606: {'weight': 1}}, 34: {7898: {'weight': 1}}, 36: {36: {'weight': 1}}, 38: {36461: {'weight': 1}}, 41: {33212: {'weight': 2}}, 43: {18224: {'weight': 1}}, 46: {33269: {'weight': 1}}, 47: {30931: {'weight': 1}}, 48: {7710: {'weight': 1}}, 49: {7710: {'weight': 1}}, 50: {2321: {'weight': 1}}, 53: {38978: {'weight': 1}}, 54: {41985: 

In [20]:
def get_nodes_list(auths_nums):
    """
    Get the different authors from the series of list of authors. Useful to add non connected authors to the graph afterwards.
    
    Params:
        auths_nums (pandas.core.series.Series) : the series of list of authors.
    
    Returns:
        list : the list of authors.
    """
    concat = []
    for auth_list in auths_nums:
        concat +=  auth_list
    return list(set(concat))

In [21]:
# Get nodes list
nodes_list = get_nodes_list(auths_nos)

In [22]:
# Create graphs only from edges
authors_graph = nx.Graph(nx_dict)
# Add the nodes that have no edges
authors_graph.add_nodes_from(nodes_list)

In [23]:
# Extract the adjacency matrix as a scipy sparse matrix (won't fit into the RAM as numpy matrix)
adjacency_matrix = nx.to_scipy_sparse_matrix(authors_graph)
# Print it
print(adjacency_matrix)