In [1]:
import igraph as ig
import networkx as nx
import pandas as pd
import numpy as np
import re
import swifter
from sklearn.metrics.pairwise import cosine_similarity

# Helper Functions

In [2]:
# A helper function to assign communities
def assign_communities(community, df, name, graph=None):
    """Label each row of ``df`` with the index of the community its node is in.

    Parameters
    ----------
    community : iterable of iterables of int
        Communities given as collections of vertex indices of ``graph``
        (e.g. the result of ``G.community_infomap()``).
    df : pandas.DataFrame
        Frame with an ``'ID'`` column holding integer node ids.
    name : str
        Name of the community column to create.
    graph : optional
        Object whose ``vs['id']`` yields the node id of every vertex, in
        vertex-index order. Defaults to the notebook-level graph ``G``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with an extra int64 column ``name``. Rows whose ID
        does not belong to any community keep the fallback label 0, which is
        therefore indistinguishable from membership in the first community.
    """
    if graph is None:
        graph = G

    # Fetch the id attribute once instead of once per community
    vertex_ids = graph.vs['id']

    # Build a single node-id -> community-index lookup. The ids are converted
    # to integers so they compare equal to the values of df['ID']. If a vertex
    # appears in several communities, the last one wins.
    node_to_community = {}
    for label, members in enumerate(community):
        for vertex in members:
            node_to_community[int(vertex_ids[vertex])] = label

    # Create a new df so the caller's frame is left untouched
    df = df.copy()

    # One vectorised lookup replaces a full isin() scan per community
    df[name] = df['ID'].map(node_to_community).fillna(0).astype('int64')

    return df

# A helper function to calculate the shortest distance between 2 nodes
def helper_shortest_path(x, graph=None, id_map=None):
    """Return the shortest-path length between the two nodes of a pair.

    Parameters
    ----------
    x : pandas.Series or sequence
        The first two positions hold the source and destination node ids
        (e.g. a row of a frame whose first two columns are X1 and X2).
    graph : optional
        Object exposing ``shortest_paths(source, target)``; defaults to the
        notebook-level graph ``G``.
    id_map : dict, optional
        Mapping from node id to vertex index; defaults to the notebook-level
        ``old2new``.

    Returns
    -------
    number
        The path length, or 999 when either node is missing from ``id_map``
        or the reported distance exceeds 999 (e.g. infinity).
    """
    unreachable = 999

    # Resolve the notebook-level defaults outside any try block, so that a
    # missing graph or mapping raises instead of yielding 999 for every row
    if graph is None:
        graph = G
    if id_map is None:
        id_map = old2new

    # Positional access that works for pandas rows (whose index holds column
    # labels, not positions) as well as for plain tuples/lists
    pair = x.iloc if hasattr(x, 'iloc') else x

    try:
        source = id_map[pair[0]]
        destination = id_map[pair[1]]
    # When one of the vertices doesn't even exist
    except KeyError:
        return unreachable

    # NOTE(review): newer python-igraph releases may expose this method as
    # distances() - confirm against the installed version
    d = graph.shortest_paths(source, destination)[0][0]

    # Remove the existing edges between A and B to avoid data leakage (Too computationally expensive)
    # if d == 1:
    #    G_copy = G.copy()
    #    G_copy.delete_edges([(source, destination)])
    #    d = G_copy.shortest_paths(source, destination)[0][0]

    # Convert infinity to 999
    if d > unreachable:
        d = unreachable

    return d

# A helper function to clean author names
def clean_author(li):
    """Normalise raw author strings to the form ``'<initial>. <last name>'``.

    Parameters
    ----------
    li : iterable of str
        Raw author names, e.g. the pieces of a comma-separated author field.

    Returns
    -------
    list of str
        One normalised name per usable input, in input order. Inputs that
        are empty after cleaning, or whose normalised form has four
        characters or fewer, are dropped.
    """
    res = []

    for raw in li:

        # Convert to lower case and remove the part after '('
        name = raw.lower().split('(')[0]

        # Remove all the punctuation but dots
        name = re.sub(r'[^\w\s.]', '', name)

        # Remove spaces at the beginning and the end
        name = name.strip()

        # Also drop leading dots (and any whitespace following them): a name
        # such as '.j smith' would otherwise have an empty piece before its
        # first dot and raise IndexError when the initial is extracted
        name = re.sub(r'^[\s.]+', '', name)

        if not name:
            continue

        # The initial is the first character of the name; the text before
        # the first dot, if there is one, starts with that same character
        initial = name[0]

        # Assume last names always appear after the last dot
        last_name = name.split('.')[-1].split(' ')[-1]
        cleaned = initial + '. ' + last_name

        # Keep only names that are not too short
        if len(cleaned) > 4:
            res.append(cleaned)

    return res

# A helper function to count the elements shared by two collections
def intersection(x):
    """Return how many distinct elements the first two entries of ``x`` share.

    ``x`` is accessed positionally through ``.iloc`` (e.g. a row holding two
    author lists); duplicates inside either collection are counted once.
    """
    first, second = x.iloc[0], x.iloc[1]
    shared = set(first) & set(second)
    return len(shared)

# A helper function to assign similarities between nodes
def similarity(x, M, id_map=None):
    """Look up the precomputed similarity of a node pair in the matrix ``M``.

    Parameters
    ----------
    x : pandas.Series or sequence
        The first two positions hold the ids of the two nodes.
    M : sequence of sequences
        Pairwise scores, indexed as ``M[i][j]`` by vertex index.
    id_map : dict, optional
        Mapping from node id to vertex index; defaults to the notebook-level
        ``old2new``.

    Returns
    -------
    number
        ``M[i][j]`` for the pair, or 0 when either node id is missing from
        ``id_map`` or its index falls outside ``M``.
    """
    # Resolve the default outside the try block, so that a missing mapping
    # raises instead of silently producing 0 for every row
    if id_map is None:
        id_map = old2new

    # Positional access that works for pandas rows (whose index holds column
    # labels, not positions) as well as for plain tuples/lists
    pair = x.iloc if hasattr(x, 'iloc') else x

    try:
        return M[id_map[pair[0]]][id_map[pair[1]]]
    except (KeyError, IndexError):
        # Unknown node, or a node not covered by the matrix
        return 0

# 1. Data Preparation

In [3]:
# Read all the input files.
# The pair files are space-separated and the node information is
# comma-separated; none of the three has a header row, so the column names
# are supplied explicitly.
df_train = pd.read_csv(
    'training_set.txt',
    sep=' ',
    header=None,
    names=['X1', 'X2', 'Y'],
)
df_test = pd.read_csv(
    'testing_set.txt',
    sep=' ',
    header=None,
    names=['X1', 'X2'],
)
df_info = pd.read_csv(
    'node_information.csv',
    header=None,
    names=['ID', 'Year', 'Title', 'Author', 'Journal', 'Abstract'],
)

# This file is read with its own header row
df_doc2vec = pd.read_csv('doc2vec.csv')

In [4]:
# Build a graph for feature extraction - DONE
# edges = df_train[df_train['Y']==1]
# G = nx.from_pandas_edgelist(edges, 'X1', 'X2', True, create_using=nx.DiGraph())
# nx.write_graphml(G,'graph.graphml')

# Read the saved graph
G = ig.read('graph.graphml', format="graphml")

# Construct index mappings: original node id (as int) -> igraph vertex index.
# The 'id' attribute list and the index list are each fetched once and zipped,
# rather than being re-evaluated for every vertex inside the comprehension.
n_nodes = len(G.vs)
old2new = {int(node_id): index for node_id, index in zip(G.vs['id'], G.vs.indices)}

# 2. Node Features

## 2.1 Centrality

In [519]:
# Get graph attributes - DONE

# att = {'in_degree_centrality':G.degree(mode='in'),
#         'out_degree_centrality':G.degree(mode='out'),
#         'eigenvector_centrality':G.eigenvector_centrality(),
#         'in_closeness_centrality': G.closeness(mode='in'),
#         'out_closeness_centrality': G.closeness(mode='out'),        
#         'betweenness_centrality': G.betweenness(),
#         'pagerank': G.pagerank()}

# att = pd.DataFrame(att)
# att['index'] = G.vs['id']
# att['index'] = pd.to_numeric(att['index'])
# df_info = df_info.merge(att, left_on='ID', right_on='index', how='left')

  
  import sys


## 2.2 Doc2Vec

In [520]:
# Get embeddings for titles and abstracts - DONE

# df_info = df_info.merge(df_doc2vec, left_on='ID', right_on='ID', how='left')

## 2.3 Community Detection 

In [521]:
# Infomap algorithm - DONE

# c_infomap = G.community_infomap()
# df_info = assign_communities(c_infomap, df_info, 'infomap')

## 2.4 Save to the drive

In [522]:
# df_info.to_csv('df_info_temp.csv', index=None)

# 3. Edge Features

## 3.1 Shortest Path

In [9]:
# Calculate the shortest path between each node - Done
# df_train['shortest_path'] = df_train.swifter.apply(helper_shortest_path, axis=1)
# df_test['shortest_path'] = df_test.swifter.apply(helper_shortest_path, axis=1)

## 3.2 Doc Similarity

In [157]:
# Similarity between A and B - Done
# def sim_A_B(df, df_doc2vec):    
#     Vec = df_doc2vec.set_index('ID')
#     Vec_X1 = Vec.loc[df['X1']].values
#     Vec_X2 = Vec.loc[df['X2']].values
    
#     inner_product = np.sum(Vec_X1 * Vec_X2, axis=1)
#     leng_X1 = np.sum(Vec_X1 ** 2, axis=1) ** 0.5
#     leng_X2 = np.sum(Vec_X2 ** 2, axis=1) ** 0.5
#     sim = inner_product/(leng_X1 * leng_X2)
    
#     res = df.copy()
#     res['similarity_A_B'] = sim
    
#     return res

In [158]:
# df_train = sim_A_B(df_train, df_doc2vec)
# df_test = sim_A_B(df_test, df_doc2vec)

In [159]:
# Similarity between B and those cited by A
# def sim_Aout_B(df, df_doc2vec):
#     Vec = df_doc2vec.set_index('ID')
    
#     # Calculate the mean vectors of those cited by A
#     edges = df_train[df_train['Y'] == 1]
#     Vec_Aout = edges.groupby('X1')['X2'].unique().reset_index().set_index('X1')
#     temp = Vec_Aout['X2'].map(lambda x:Vec.loc[x].mean().values).to_list()
#     Vec_X1 = pd.DataFrame(temp, index=Vec_Aout.index, columns=Vec.columns)
    
#     # Calculate the similarity between B and those cited by A
#     Vec_X1 = Vec_X1.loc[df['X1']].values
#     Vec_X2 = Vec.loc[df['X2']].values
    
#     inner_product = np.sum(Vec_X1 * Vec_X2, axis=1)
#     leng_X1 = np.sum(Vec_X1 ** 2, axis=1) ** 0.5
#     leng_X2 = np.sum(Vec_X2 ** 2, axis=1) ** 0.5
#     sim = inner_product/(leng_X1 * leng_X2)
    
#     res = df.copy()
#     res['similarity_Aout_B'] = sim
    
#     return res

In [160]:
# df_train = sim_Aout_B(df_train, df_doc2vec)
# df_test = sim_Aout_B(df_test, df_doc2vec)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  if sys.path[0] == '':


In [162]:
# Similarity between A and those citing B
# def sim_A_Bin(df, df_doc2vec):
#     Vec = df_doc2vec.set_index('ID')
    
#     # Calculate the mean vectors of those citing B
#     edges = df_train[df_train['Y'] == 1]
#     Vec_Bin = edges.groupby('X2')['X1'].unique().reset_index().set_index('X2')
#     temp = Vec_Bin['X1'].map(lambda x:Vec.loc[x].mean().values).to_list()
#     Vec_X2 = pd.DataFrame(temp, index=Vec_Bin.index, columns=Vec.columns)
    
#     # Calculate the similarity between A and those citing B
#     Vec_X1 = Vec.loc[df['X1']].values
#     Vec_X2 = Vec_X2.loc[df['X2']].values
    
#     inner_product = np.sum(Vec_X1 * Vec_X2, axis=1)
#     leng_X1 = np.sum(Vec_X1 ** 2, axis=1) ** 0.5
#     leng_X2 = np.sum(Vec_X2 ** 2, axis=1) ** 0.5
#     sim = inner_product/(leng_X1 * leng_X2)
    
#     res = df.copy()
#     res['similarity_A_Bin'] = sim
    
#     return res

In [163]:
# df_train = sim_A_Bin(df_train, df_doc2vec)
# df_test = sim_A_Bin(df_test, df_doc2vec)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  del sys.path[0]


In [None]:
# Similarity between those cited by A and those citing B

## 3.3 Author reference frequency

In [49]:
# How many common authors between A and B, as well as how many times A has cited B's authors before
# def author_frequency(df):
#     df = df.copy()
    
#     # Create an edge list
#     edges = df_train[df_train['Y'] == 1].reset_index(drop=True)

#     # Clean author names
#     df_info2 = df_info.copy() 
#     aut = df_info2['Author'].fillna('').str.split(',')
#     df_info2['Author'] = aut.map(clean_author)
#     df_info2 = df_info2.set_index('ID')

#     # Find out author lists for each edge
#     edges['aut1'] = df_info2.loc[edges['X1']]['Author'].reset_index(drop=True)
#     edges['aut2'] = df_info2.loc[edges['X2']]['Author'].reset_index(drop=True)
#     temp = edges.groupby('X1')['aut2'].apply(lambda x:[j for i in x.to_list() for j in i])

#     df['aut1'] = df_info2.loc[df['X1']]['Author'].reset_index(drop=True)
#     df['aut2'] = df_info2.loc[df['X2']]['Author'].reset_index(drop=True)
#     df['a_out_aut'] = temp.loc[df['X1']].reset_index(drop=True)

#     # Find out the number of common authors
#     df['aut_common'] = df[['aut1', 'aut2']].swifter.apply(intersection, axis=1)

#     # Find out the number of times A has cited the work of the authors of B
#     df['a_out_aut'][df['a_out_aut'].isna()] = ''
#     df['n_previously_cited'] = df[['a_out_aut', 'aut2']].swifter.apply(lambda x:sum(el in x[1] for el in x[0]), axis=1)
    
#     # Drop useless columns
#     df = df.drop(['aut1','aut2','a_out_aut'], axis=1)
#     return df

In [460]:
# df_test = author_frequency(df_test)
# df_train = author_frequency(df_train)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike


HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=32648.0, style=ProgressStyle(descripti…




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=32648.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=615512.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=615512.0, style=ProgressStyle(descript…




## 3.4 Reverse Citation (if B has cited A)


In [48]:
# Return 1 if B has cited A - Done
# edges = df_train[df_train['Y'] == 1]
# func = lambda x:((edges['X2'] == x[0]) & (edges['X1'] == x[1])).sum()

# df_train['reversed'] = df_train[['X1', 'X2']].swifter.apply(func, axis=1)
# df_test['reversed'] = df_test[['X1', 'X2']].swifter.apply(func, axis=1)

HBox(children=(FloatProgress(value=0.0, description='Dask Apply', max=32.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Dask Apply', max=32.0, style=ProgressStyle(description_wi…




## 3.5 Adamic/Adar Index

In [12]:
# Get pairwise similarities - Done
# M = G.similarity_inverse_log_weighted(mode='out')

In [26]:
# df_train['adar'] = df_train[['X1', 'X2']].apply(similarity, axis=1, M=M)
# df_test['adar'] = df_test[['X1', 'X2']].apply(similarity, axis=1, M=M)

In [7]:
# Get pairwise inverse-log-weighted (Adamic/Adar) similarities with weak
# connections, i.e. computed with mode='all' - done.
# The result is later indexed as M[i][j] by vertex index in `similarity`.
# NOTE(review): mode='all' presumably ignores edge direction - confirm in the
# igraph documentation.
M = G.similarity_inverse_log_weighted(mode='all')

In [9]:
# Add the score stored in M for every (X1, X2) pair as the 'adar_weak' column
for pairs in (df_train, df_test):
    pairs['adar_weak'] = pairs[['X1', 'X2']].apply(similarity, axis=1, M=M)

## 3.6 Jaccard Similarity

In [8]:
# Get pairwise similarities - Done
# M = G.similarity_jaccard(mode='out')

In [9]:
# df_train['jaccard'] = df_train[['X1', 'X2']].apply(similarity, axis=1, M=M)
# df_test['jaccard'] = df_test[['X1', 'X2']].apply(similarity, axis=1, M=M)

In [7]:
# Get pairwise similarities with weak connections - Done
# M = G.similarity_jaccard(mode='all')

In [8]:
# df_train['jaccard_weak'] = df_train[['X1', 'X2']].apply(similarity, axis=1, M=M)
# df_test['jaccard_weak'] = df_test[['X1', 'X2']].apply(similarity, axis=1, M=M)

## 3.7 Save to the drive

In [11]:
# Cache the edge-feature frames on disk, without the row index.
# NOTE(review): every other edge-feature cell in this notebook is commented
# out, so on a fresh top-to-bottom run these frames would hold only the raw
# columns plus 'adar_weak', and this save would overwrite previously cached
# feature columns - confirm before re-running.
for frame, path in ((df_train, 'df_train_temp.csv'), (df_test, 'df_test_temp.csv')):
    frame.to_csv(path, index=None)

# 5. Combine all the features above

## 5.1 Read saved features

In [12]:
# Reload the cached node-feature and edge-feature frames from disk
df_info, df_train, df_test = (
    pd.read_csv(path)
    for path in ('df_info_temp.csv', 'df_train_temp.csv', 'df_test_temp.csv')
)

## 5.2 Assign node features to the dfs

In [14]:
# Drop the columns of df_info that are not used as node features
df_features = df_info.drop(columns=['Title', 'Author', 'Journal', 'Abstract', 'index'])


def _attach_node_features(pairs, features):
    """Left-join the node features onto both endpoints of every pair.

    The first join matches on X1 and the second on X2; columns that clash in
    the second join receive the '_X1' / '_X2' suffixes.
    """
    with_first = pairs.merge(features, left_on='X1', right_on='ID', how='left')
    return with_first.merge(
        features,
        left_on='X2',
        right_on='ID',
        how='left',
        suffixes=('_X1', '_X2'),
    )


df_train = _attach_node_features(df_train, df_features)
df_test = _attach_node_features(df_test, df_features)

## 5.3 Drop Doc2Vec columns

In [15]:
# Columns whose name starts with 'v_' are removed from both frames; the list
# is derived from the training frame and reused for the test frame
cols_del = [c for c in df_train.columns if c.startswith('v_')]
df_train = df_train.drop(columns=cols_del)
df_test = df_test.drop(columns=cols_del)

## 5.4 Drop redundant indices

In [16]:
# Remove the pair-id columns from both frames
df_train = df_train.drop(columns=['X1', 'X2'])
df_test = df_test.drop(columns=['X1', 'X2'])

## 5.5 Drop useless features

In [17]:
# Remove the 'shortest_path' feature from both frames
df_train = df_train.drop(columns=['shortest_path'])
df_test = df_test.drop(columns=['shortest_path'])

## 5.6 Split data

In [18]:
# Target: the 'Y' column, kept as a one-column DataFrame
y = df_train[['Y']]

# Features: every remaining training column; the test frame is used as is
X = df_train.drop(columns='Y')
X_test = df_test

In [19]:
# Report the (rows, columns) shape of both feature matrices
for label, frame in (('Shape of the training data', X), ('Shape of the test data', X_test)):
    print(label, frame.shape)

Shape of the training data (615512, 30)
Shape of the test data (32648, 30)


In [20]:
# Display the final feature columns as a sanity check before saving
X.columns

Index(['similarity_A_B', 'similarity_Aout_B', 'similarity_A_Bin', 'aut_common',
       'n_previously_cited', 'reversed', 'adar', 'jaccard', 'jaccard_weak',
       'adar_weak', 'ID_X1', 'Year_X1', 'betweenness_centrality_X1',
       'eigenvector_centrality_X1', 'in_closeness_centrality_X1',
       'in_degree_centrality_X1', 'out_closeness_centrality_X1',
       'out_degree_centrality_X1', 'pagerank_X1', 'infomap_X1', 'ID_X2',
       'Year_X2', 'betweenness_centrality_X2', 'eigenvector_centrality_X2',
       'in_closeness_centrality_X2', 'in_degree_centrality_X2',
       'out_closeness_centrality_X2', 'out_degree_centrality_X2',
       'pagerank_X2', 'infomap_X2'],
      dtype='object')

## 5.7 Save to the disk

In [21]:
# Write the feature matrices and the target to disk, without the row index
for frame, path in ((X, 'X.csv'), (X_test, 'X_test.csv'), (y, 'y.csv')):
    frame.to_csv(path, index=None)