In [143]:
import pandas as pd
import networkx as nx
import re

In [128]:
# Read the three input files: training pairs (X1, X2, Y), testing pairs (X1, X2)
# and the per-node information table, which is indexed by its ID column.
df_train = pd.read_csv(
    'training_set.txt',
    sep=' ',
    header=None,
    names=['X1', 'X2', 'Y'],
)
df_test = pd.read_csv(
    'testing_set.txt',
    sep=' ',
    header=None,
    names=['X1', 'X2'],
)
df_info = pd.read_csv(
    'node_information.csv',
    header=None,
    names=['ID', 'Year', 'Title', 'Author', 'Journal', 'Abstract'],
    index_col='ID',
)

In [129]:
# Fill in NaN values: every missing entry becomes an empty string
df_info = df_info.fillna(value='')

In [130]:
# Clean author names
def clean_author(li, min_length=4):
    """Normalise raw author strings to the form ``'<initial>. <last name>'``.

    Each entry is lower-cased, cut at the first ``'('``, stripped of every
    punctuation character except dots, and trimmed. The initial is the first
    character that is neither a dot nor whitespace; the last name is the last
    space-separated token after the last dot.

    Parameters
    ----------
    li : iterable of str
        Raw author strings (one per author).
    min_length : int, default 4
        A normalised name is kept only if its length is strictly greater
        than this value.

    Returns
    -------
    list of str
        The normalised names that passed the length filter, in input order.
    """
    res = []

    for text in li:
        # Lower-case, drop anything from the first '(' on, remove all
        # punctuation but dots, then trim surrounding whitespace
        text = text.lower()
        text = text.split('(')[0]
        text = re.sub(r'[^\w\s.]', '', text)
        text = text.strip()

        # First character that is neither a dot nor whitespace. For text that
        # does not start with a dot this is simply text[0]; for text such as
        # '.j smith' or '...' it avoids indexing into an empty string.
        first = re.search(r'[^\s.]', text)
        if first is None:
            # Empty or dots-only entry: nothing usable as a name
            continue
        initial = first.group()

        # Assume last names always appear after the last dot
        last_name = text.split('.')[-1].split(' ')[-1]
        name = initial + '. ' + last_name

        # Keep only names that are not too short
        if len(name) > min_length:
            res.append(name)

    return res

In [131]:
# Map one paper pair to the corresponding author pairs
def node2aut(x):
    """Expand a pair of node IDs into all (author, author) combinations.

    Parameters
    ----------
    x : pandas.Series or sequence
        Its first two positions hold the node IDs X1 and X2, which are looked
        up in the index of the module-level ``df_info`` frame.

    Returns
    -------
    list of tuple or None
        Every ``(a1, a2)`` with ``a1`` an author of X1 and ``a2`` an author of
        X2, or ``None`` when either node has an empty author list.
    """
    # A DataFrame row is a label-indexed Series, and integer keys in
    # Series.__getitem__ are deprecated as positions; go through .iloc when
    # it exists and fall back to plain indexing for tuples/lists.
    positional = getattr(x, 'iloc', x)
    first_id, second_id = positional[0], positional[1]

    # Get author lists for X1 and X2
    aut1 = df_info.loc[first_id, 'Author']
    aut2 = df_info.loc[second_id, 'Author']

    # Return None if X1 or X2 doesn't have any author
    if len(aut1) == 0 or len(aut2) == 0:
        return None

    # Return the author-level pairs
    return [(a1, a2) for a1 in aut1 for a2 in aut2]

In [132]:
# Split each raw author string on commas, then normalise every name.
# NOTE: this overwrites the 'Author' column with lists of cleaned names.
aut = df_info['Author'].str.split(pat=',')
df_info['Author'] = aut.map(clean_author)

In [133]:
# Keep only the training pairs whose label Y equals 1
edges = df_train.loc[df_train['Y'] == 1]

In [134]:
# Build the weighted author edge list: expand every row of `edges` into author
# pairs, flatten them, and count how often each (Source, Destination) occurs.
aut_edges = (
    pd.DataFrame(
        [
            pair
            for row_pairs in edges.apply(node2aut, axis=1).dropna().to_list()
            for pair in row_pairs
        ],
        columns=['Source', 'Destination'],
    )
    .groupby(['Source', 'Destination'])
    .size()
    .reset_index(name='Weight')
)

In [135]:
# Save to the disk (left commented out; 'author.csv' is read back below, so uncomment this line to regenerate it)
# aut_edges.to_csv('author.csv', index=None)

# Author network feature extraction

In [188]:
# Load the author edge list from 'author.csv'.
# NOTE: this rebinds `edges`, which earlier held the Y == 1 training pairs.
edges = pd.read_csv(filepath_or_buffer='author.csv')

In [199]:
# Construct a directed author graph; edge_attr=True attaches every remaining
# column of `edges` to each edge as an attribute.
G = nx.from_pandas_edgelist(
    edges,
    source='Source',
    target='Destination',
    edge_attr=True,
    create_using=nx.DiGraph(),
)

In [200]:
# Display the author edge list (Source, Destination, Weight)
edges

Unnamed: 0,Source,Destination,Weight
0,1. 10,f. wilczek,1
1,1. 10,j. preskill,1
2,1. 10,s. coleman,1
3,1. pages,l. dickey,1
4,1. pages,v. kaplunovsky,1
...,...,...,...
443012,z. zongan,t. pantev,1
443013,z. zongan,u. lecce,1
443014,z. zongan,v. sadov,2
443015,z. zongan,z. kakushadze,1


In [201]:
# Compute three centrality measures per graph node and gather them in a single
# frame whose index (named 'index') holds the node labels.
att = pd.DataFrame({
    'in_degree_centrality': nx.in_degree_centrality(G),
    'out_degree_centrality': nx.out_degree_centrality(G),
    'eigenvector_centrality': nx.eigenvector_centrality(G, weight='Weight'),
}).rename_axis('index')

In [202]:
# Order the rows from highest to lowest eigenvector centrality
att = att.sort_values(by='eigenvector_centrality', ascending=False)

In [203]:
# Show the five rows with the highest eigenvector centrality
att.head(5)

Unnamed: 0_level_0,in_degree_centrality,out_degree_centrality,eigenvector_centrality
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
c. vafa,0.191463,0.063167,0.356714
h. lu,0.104133,0.062799,0.319881
c. pope,0.105605,0.061818,0.315223
e. witten,0.331166,0.063412,0.265511
m. cvetic,0.111247,0.075432,0.232406


In [204]:
# A helper function to return the maximum centrality among co-authors
def max_centrality(x):
    """Return the largest eigenvector centrality among the given authors.

    Parameters
    ----------
    x : list of str (a single str is treated as one author)
        Author names to look up in the index of the module-level ``att`` frame.

    Returns
    -------
    float or int
        The maximum ``eigenvector_centrality`` over the authors present in
        ``att``; ``-1`` when none of them is present, when the list is empty,
        or when ``x`` is not iterable (e.g. a missing value).
    """
    if isinstance(x, str):
        authors = [x]
    else:
        try:
            authors = list(x)
        except TypeError:
            # Not a collection of names, so there is nothing to look up
            return -1

    # reindex() yields NaN for names absent from the graph instead of raising
    # KeyError, so one unknown co-author no longer discards the known ones.
    scores = att['eigenvector_centrality'].reindex(authors)

    # max() skips NaN; it is NaN itself only when no author had a score
    best = scores.max()
    return -1 if pd.isna(best) else best

In [205]:
# Per-paper feature: the highest centrality among the paper's authors
df_info['Author_cen'] = df_info['Author'].apply(max_centrality)

In [206]:
# Write the node information, including the new 'Author_cen' column, to disk
df_info.to_csv(path_or_buf='df_info_author_centrality.csv')