In [1]:
import pandas as pd
import numpy as np

In [2]:
import networkx as nx

In [3]:
import random

In [4]:
from gensim.models.keyedvectors import KeyedVectors



In [5]:
import pickle

In [6]:
from tqdm import tqdm_notebook

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [8]:
def classify_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
                     
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    
    precision_train = precision_score(y_train, y_pred_train)
    precision_test = precision_score(y_test, y_pred_test)
    recall_train = recall_score(y_train, y_pred_train)
    recall_test = recall_score(y_test, y_pred_test)
    accuracy_train = accuracy_score(y_train, y_pred_train)
    accuracy_test = accuracy_score(y_test, y_pred_test)
    f1_macro_train = f1_score(y_train, y_pred_train, average='macro')
    f1_macro_test = f1_score(y_test, y_pred_test, average='macro')
    f1_micro_train = f1_score(y_train, y_pred_train, average='micro')
    f1_micro_test = f1_score(y_test, y_pred_test, average='micro')
    logloss_train = log_loss(y_train, y_pred_train)
    logloss_test = log_loss(y_test, y_pred_test)
    roc_auc_train = roc_auc_score(y_train, y_pred_train)
    roc_auc_test = roc_auc_score(y_test, y_pred_test)

    return model, y_pred_train, y_pred_test, \
        precision_train, precision_test, recall_train, recall_test, accuracy_train, accuracy_test, \
        f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, \
        roc_auc_train, roc_auc_test, logloss_train, logloss_test 

In [9]:
aan_links_connected = pd.read_csv('aan_links_connected.csv')

In [10]:
aan_links_connected.head()

Unnamed: 0,citing,cited,year_citing,year_cited,out_cites_count,in_cites_count,cite_rank,node_citing,node_cited
0,C08-3004,A00-1002,2008,2000,1,10,1.0,560,0
1,D09-1141,A00-1002,2009,2000,14,10,1.0,1682,0
2,D12-1027,A00-1002,2012,2000,14,10,1.0,1995,0
3,E06-1047,A00-1002,2006,2000,4,10,1.0,2562,0
4,H05-1110,A00-1002,2005,2000,2,10,1.0,3150,0


In [11]:
aan_test = pd.read_csv('test_2013_05_no_isolated.csv')
aan_train = pd.read_csv('train_2013_05_no_isolated.csv')

In [12]:
aan_train.head()

Unnamed: 0,citing,cited,year_citing,year_cited,out_cites_count,in_cites_count,cite_rank,node_citing,node_cited
0,C08-3004,A00-1002,2008,2000,1,10,1.0,560,0
1,D09-1141,A00-1002,2009,2000,14,10,1.0,1682,0
2,D12-1027,A00-1002,2012,2000,14,10,1.0,1995,0
3,E06-1047,A00-1002,2006,2000,4,10,1.0,2562,0
4,H05-1110,A00-1002,2005,2000,2,10,1.0,3150,0


In [13]:
G = nx.DiGraph()

for i in tqdm_notebook(aan_links_connected.index):
    G.add_edge(aan_links_connected.iloc[i]['citing'], aan_links_connected.iloc[i]['cited'])
    
print(nx.info(G))

HBox(children=(IntProgress(value=0, max=74860), HTML(value='')))


Name: 
Type: DiGraph
Number of nodes: 13506
Number of edges: 74860
Average in degree:   5.5427
Average out degree:   5.5427


In [14]:
G_train = nx.DiGraph()

for i in tqdm_notebook(aan_train.index):
    G_train.add_edge(aan_train.iloc[i]['citing'], aan_train.iloc[i]['cited'])
    
print(nx.info(G_train))

HBox(children=(IntProgress(value=0, max=64024), HTML(value='')))


Name: 
Type: DiGraph
Number of nodes: 13493
Number of edges: 64024
Average in degree:   4.7450
Average out degree:   4.7450


In [15]:
def generate_negative_edges(graph, count_gen_edges, part_neg_directed):
    negative_edges = set()
    nodes = list(graph.nodes())
    edges = list(graph.edges())

    count_neg_directed = int(part_neg_directed*count_gen_edges)
    for a, b in edges:
        if len(negative_edges) >= count_neg_directed:
            break
        if not graph.has_edge(b, a):
            negative_edges.add((b, a))       
    
    while len(negative_edges) < count_gen_edges:
        i = random.randint(0, len(nodes) - 1)
        j = random.randint(0, len(nodes) - 1)
        if (i != j) and not graph.has_edge(nodes[i], nodes[j]):
            negative_edges.add((nodes[i], nodes[j]))
    return list(negative_edges)

In [16]:
def generate_negative_edges_test(graph, test_nodes, count_gen_edges):
    negative_edges = set()
    nodes = list(graph.nodes())
    while len(negative_edges) < count_gen_edges:
        i = random.randint(0, len(test_nodes) - 1)
        j = random.randint(0, len(nodes) - 1)
        if i == j:
            continue
        if graph.has_edge(test_nodes[i], nodes[j]):
            continue
        negative_edges.add((test_nodes[i], nodes[j]))
    return list(negative_edges)

In [17]:
def train_test_preprocess(positive_df, neg_list):
    pairs = list(zip(list(positive_df['citing']), list(positive_df['cited']), [1]*len(positive_df)))
    neg_pairs = list(zip(list(zip(*neg_list))[0],list(zip(*neg_list))[1], [0]*len(neg_list)))
    pairs += neg_pairs
    random.shuffle(pairs)
    return pairs

In [18]:
def product(u,v):
    return u*v
def mean(u,v):
    return (u+v)/2
def l1(u,v):
    return np.abs(u-v)
def l2(u,v):
    return (u-v)**2

In [22]:
def concat(u,v):
    return np.concatenate([u, v])

In [23]:
aan_train_neg = generate_negative_edges(G_train, len(G_train.edges()), 0)
aan_test_neg = generate_negative_edges_test(G, list(aan_test['citing']), len(aan_test))

In [24]:
train_pairs = train_test_preprocess(aan_train, aan_train_neg)
test_pairs = train_test_preprocess(aan_test, aan_test_neg)

In [25]:
with open('./asymproj_edge_dnn-master/datasets/aan_graph/index.pkl', 'rb') as f:
    index_graph = pickle.load(f)

In [26]:
ft_mean_emb = np.load('X_mean_text_emb_13506.npy')

In [28]:
embed_dim = 600

k_train = 0
k_test = 0

X_train = np.zeros((len(train_pairs), embed_dim))
for i in range(len(train_pairs)):
    try:
        X_train[i] = concat(ft_mean_emb[index_graph['index'][train_pairs[i][0]]],
                                    ft_mean_emb[index_graph['index'][train_pairs[i][1]]])
    except KeyError:
        k_train += 1
        pass
y_train = np.array(list(zip(*(train_pairs)))[-1])

X_test = np.zeros((len(test_pairs), embed_dim))
for i in range(len(test_pairs)):
    try:
        X_test[i] = concat(ft_mean_emb[index_graph['index'][test_pairs[i][0]]],
                                    ft_mean_emb[index_graph['index'][test_pairs[i][1]]])    
    except:
        k_test += 1
        pass
y_test = np.array(list(zip(*(test_pairs)))[-1])

In [29]:
X_train.shape

(128048, 600)

In [41]:
rand_forest = RandomForestClassifier(n_estimators = 100)

rand_forest, y_pred_train, y_pred_test, precision_train, precision_test, recall_train, recall_test, accuracy_train, accuracy_test, f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, roc_auc_train, roc_auc_test, logloss_train, logloss_test = \
classify_model(rand_forest, X_train, y_train, X_test, y_test)

In [42]:
print('Precision: ' + str(precision_test))
print('Recall: ' + str(recall_test))
print('Accuracy: ' + str(accuracy_test))
print('F1-macro: ' + str(f1_macro_test))
print('F1-micro: ' + str(f1_micro_test))
print('Logloss: ' + str(logloss_test))
print('ROC-AUC: ' + str(roc_auc_test))

Precision: 0.7960253596683736
Recall: 0.6025286083425618
Accuracy: 0.7240679217423404
F1-macro: 0.7199307941302755
F1-micro: 0.7240679217423405
Logloss: 9.530418077153314
ROC-AUC: 0.7240679217423404


In [43]:
print(classification_report(y_test, y_pred_test))
print(confusion_matrix(y_test, y_pred_test))

             precision    recall  f1-score   support

          0       0.68      0.85      0.75     10836
          1       0.80      0.60      0.69     10836

avg / total       0.74      0.72      0.72     21672

[[9163 1673]
 [4307 6529]]


In [33]:
ft_weight_emb = np.load('X_tfidf_text_emb_13506.npy')

In [34]:
embed_dim = 600

k_train = 0
k_test = 0

X_train = np.zeros((len(train_pairs), embed_dim))
for i in range(len(train_pairs)):
    try:
        X_train[i] = concat(ft_weight_emb[index_graph['index'][train_pairs[i][0]]],
                                    ft_weight_emb[index_graph['index'][train_pairs[i][1]]])
    except KeyError:
        k_train += 1
        pass
y_train = np.array(list(zip(*(train_pairs)))[-1])

X_test = np.zeros((len(test_pairs), embed_dim))
for i in range(len(test_pairs)):
    try:
        X_test[i] = concat(ft_weight_emb[index_graph['index'][test_pairs[i][0]]],
                                    ft_weight_emb[index_graph['index'][test_pairs[i][1]]])    
    except:
        k_test += 1
        pass
y_test = np.array(list(zip(*(test_pairs)))[-1])

In [38]:
rand_forest = RandomForestClassifier(n_estimators=100)

rand_forest, y_pred_train, y_pred_test, precision_train, precision_test, recall_train, recall_test, accuracy_train, accuracy_test, f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, roc_auc_train, roc_auc_test, logloss_train, logloss_test = \
classify_model(rand_forest, X_train, y_train, X_test, y_test)

In [39]:
print('Precision: ' + str(precision_test))
print('Recall: ' + str(recall_test))
print('Accuracy: ' + str(accuracy_test))
print('F1-macro: ' + str(f1_macro_test))
print('F1-micro: ' + str(f1_micro_test))
print('Logloss: ' + str(logloss_test))
print('ROC-AUC: ' + str(roc_auc_test))

Precision: 0.7917684416218856
Recall: 0.5982834994462901
Accuracy: 0.7204688076781101
F1-macro: 0.7162323699150104
F1-micro: 0.7204688076781101
Logloss: 9.654728253690818
ROC-AUC: 0.72046880767811


In [40]:
print(classification_report(y_test, y_pred_test))
print(confusion_matrix(y_test, y_pred_test))

             precision    recall  f1-score   support

          0       0.68      0.84      0.75     10836
          1       0.79      0.60      0.68     10836

avg / total       0.73      0.72      0.72     21672

[[9131 1705]
 [4353 6483]]


In [44]:
from scipy import sparse

tfidf_matrix = sparse.load_npz("X_tfidf_13506.npz")

In [45]:
from sklearn.decomposition import TruncatedSVD

In [46]:
svd = TruncatedSVD(300)
svd.fit(tfidf_matrix)

TruncatedSVD(algorithm='randomized', n_components=300, n_iter=5,
       random_state=None, tol=0.0)

In [47]:
tfidf_svd = svd.transform(tfidf_matrix)

In [49]:
tfidf_svd.shape

(13506, 300)

In [50]:
embed_dim = 600

k_train = 0
k_test = 0

X_train = np.zeros((len(train_pairs), embed_dim))
for i in range(len(train_pairs)):
    try:
        X_train[i] = concat(tfidf_svd[index_graph['index'][train_pairs[i][0]]],
                                    tfidf_svd[index_graph['index'][train_pairs[i][1]]])
    except KeyError:
        k_train += 1
        pass
y_train = np.array(list(zip(*(train_pairs)))[-1])

X_test = np.zeros((len(test_pairs), embed_dim))
for i in range(len(test_pairs)):
    try:
        X_test[i] = concat(tfidf_svd[index_graph['index'][test_pairs[i][0]]],
                                    tfidf_svd[index_graph['index'][test_pairs[i][1]]])    
    except:
        k_test += 1
        pass
y_test = np.array(list(zip(*(test_pairs)))[-1])

In [51]:
rand_forest = RandomForestClassifier(n_estimators=100)

rand_forest, y_pred_train, y_pred_test, precision_train, precision_test, recall_train, recall_test, accuracy_train, accuracy_test, f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, roc_auc_train, roc_auc_test, logloss_train, logloss_test = \
classify_model(rand_forest, X_train, y_train, X_test, y_test)

In [52]:
print('Precision: ' + str(precision_test))
print('Recall: ' + str(recall_test))
print('Accuracy: ' + str(accuracy_test))
print('F1-macro: ' + str(f1_macro_test))
print('F1-micro: ' + str(f1_micro_test))
print('Logloss: ' + str(logloss_test))
print('ROC-AUC: ' + str(roc_auc_test))

Precision: 0.8152764067127345
Recall: 0.6097268364710225
Accuracy: 0.7357881136950905
F1-macro: 0.7315216038639952
F1-micro: 0.7357881136950905
Logloss: 9.125610494398835
ROC-AUC: 0.7357881136950905


In [53]:
print(classification_report(y_test, y_pred_test))
print(confusion_matrix(y_test, y_pred_test))

             precision    recall  f1-score   support

          0       0.69      0.86      0.77     10836
          1       0.82      0.61      0.70     10836

avg / total       0.75      0.74      0.73     21672

[[9339 1497]
 [4229 6607]]
