In [23]:
import pandas as pd
import networkx as nx
import numpy as np
import random

In [2]:
from gensim.models.keyedvectors import KeyedVectors



In [3]:
import pickle

In [4]:
from tqdm import tqdm_notebook

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [6]:
def classify_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
                     
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    
    precision_train = precision_score(y_train, y_pred_train)
    precision_test = precision_score(y_test, y_pred_test)
    recall_train = recall_score(y_train, y_pred_train)
    recall_test = recall_score(y_test, y_pred_test)
    accuracy_train = accuracy_score(y_train, y_pred_train)
    accuracy_test = accuracy_score(y_test, y_pred_test)
    f1_macro_train = f1_score(y_train, y_pred_train, average='macro')
    f1_macro_test = f1_score(y_test, y_pred_test, average='macro')
    f1_micro_train = f1_score(y_train, y_pred_train, average='micro')
    f1_micro_test = f1_score(y_test, y_pred_test, average='micro')
    logloss_train = log_loss(y_train, y_pred_train)
    logloss_test = log_loss(y_test, y_pred_test)
    roc_auc_train = roc_auc_score(y_train, y_pred_train)
    roc_auc_test = roc_auc_score(y_test, y_pred_test)

    return model, y_pred_train, y_pred_test, \
        precision_train, precision_test, recall_train, recall_test, accuracy_train, accuracy_test, \
        f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, \
        roc_auc_train, roc_auc_test, logloss_train, logloss_test 

In [7]:
aan_links_connected = pd.read_csv('aan_links_connected.csv')

In [8]:
aan_links_connected.shape

(74860, 9)

In [9]:
aan_links_connected.head()

Unnamed: 0,citing,cited,year_citing,year_cited,out_cites_count,in_cites_count,cite_rank,node_citing,node_cited
0,C08-3004,A00-1002,2008,2000,1,10,1.0,560,0
1,D09-1141,A00-1002,2009,2000,14,10,1.0,1682,0
2,D12-1027,A00-1002,2012,2000,14,10,1.0,1995,0
3,E06-1047,A00-1002,2006,2000,4,10,1.0,2562,0
4,H05-1110,A00-1002,2005,2000,2,10,1.0,3150,0


In [12]:
list(enumerate(G.nodes()))

[(0, 'W08-0608'),
 (1, 'P11-1017'),
 (2, 'N10-1093'),
 (3, 'E12-1075'),
 (4, 'P04-1053'),
 (5, 'W04-3221'),
 (6, 'N10-1064'),
 (7, 'P14-2017'),
 (8, 'P07-1047'),
 (9, 'C02-1056'),
 (10, 'P12-1080'),
 (11, 'C10-1116'),
 (12, 'W12-2514'),
 (13, 'W10-2302'),
 (14, 'W97-0301'),
 (15, 'P06-1099'),
 (16, 'W06-3811'),
 (17, 'P04-1026'),
 (18, 'C10-2049'),
 (19, 'H05-1097'),
 (20, 'P06-1128'),
 (21, 'D11-1124'),
 (22, 'W13-1103'),
 (23, 'D14-1127'),
 (24, 'N03-4009'),
 (25, 'W12-0606'),
 (26, 'P13-1016'),
 (27, 'N10-1067'),
 (28, 'P08-1027'),
 (29, 'W12-3206'),
 (30, 'W08-0201'),
 (31, 'W14-2703'),
 (32, 'P09-2060'),
 (33, 'W10-3405'),
 (34, 'P13-1105'),
 (35, 'P14-1008'),
 (36, 'C10-1104'),
 (37, 'C02-1158'),
 (38, 'S10-1077'),
 (39, 'W06-0602'),
 (40, 'W11-2041'),
 (41, 'W02-1033'),
 (42, 'P13-2074'),
 (43, 'E09-1059'),
 (44, 'P14-2132'),
 (45, 'P98-2143'),
 (46, 'P09-2083'),
 (47, 'W97-0811'),
 (48, 'P11-3014'),
 (49, 'P04-3001'),
 (50, 'W10-4010'),
 (51, 'W06-2208'),
 (52, 'W09-2805'),
 (5

In [82]:
nodes = pd.DataFrame(data = G.nodes())

In [86]:
nodes.reset_index(inplace=True)

In [92]:
nodes[['index']].to_csv('aan_papers_entities.dict', sep = '\t', header=None)

In [11]:
G = nx.DiGraph()

for i in tqdm_notebook(aan_links_connected.index):
    G.add_edge(aan_links_connected.iloc[i]['citing'], aan_links_connected.iloc[i]['cited'])
    
print(nx.info(G))

HBox(children=(IntProgress(value=0, max=74860), HTML(value='')))


Name: 
Type: DiGraph
Number of nodes: 13506
Number of edges: 74860
Average in degree:   5.5427
Average out degree:   5.5427


In [25]:
print(nx.info(G_train))

Name: 
Type: DiGraph
Number of nodes: 13493
Number of edges: 64024
Average in degree:   4.7450
Average out degree:   4.7450


In [10]:
G.edges()[:10]

[('W07-1032', 'P04-2012'),
 ('W07-1032', 'W04-2603'),
 ('W07-1032', 'W05-0617'),
 ('W07-1032', 'W04-0109'),
 ('N03-2011', 'P99-1047'),
 ('P13-4002', 'D10-1124'),
 ('P13-4002', 'D12-1137'),
 ('P13-4002', 'P11-1096'),
 ('P13-4002', 'P12-3005'),
 ('P95-1041', 'P92-1032')]

In [13]:
def generate_negative_edges(graph, count_gen_edges, part_neg_directed):
    negative_edges = set()
    nodes = list(graph.nodes())
    edges = list(graph.edges())

    count_neg_directed = int(part_neg_directed*count_gen_edges)
    for a, b in edges:
        if len(negative_edges) >= count_neg_directed:
            break
        if not graph.has_edge(b, a):
            negative_edges.add((b, a))       
    
    while len(negative_edges) < count_gen_edges:
        i = random.randint(0, len(nodes) - 1)
        j = random.randint(0, len(nodes) - 1)
        if (i != j) and not graph.has_edge(nodes[i], nodes[j]):
            negative_edges.add((nodes[i], nodes[j]))
    return list(negative_edges)

In [14]:
def generate_negative_edges_test(graph, test_nodes, count_gen_edges):
    negative_edges = set()
    nodes = list(graph.nodes())
    while len(negative_edges) < count_gen_edges:
        i = random.randint(0, len(test_nodes) - 1)
        j = random.randint(0, len(nodes) - 1)
        if i == j:
            continue
        if graph.has_edge(test_nodes[i], nodes[j]):
            continue
        negative_edges.add((test_nodes[i], nodes[j]))
    return list(negative_edges)

In [15]:
def link_to_time(links_df, meta_df):
    years = meta_df[['id','year']]
    years.columns = ['citing', 'year_citing']
    links_df = links_df.merge(years, how = 'left', on = 'citing')
    years.columns = ['cited', 'year_cited']
    links_df = links_df.merge(years, how='left', on = 'cited')
    links_df['out_cites_count'] = links_df.groupby('citing')['cited'].transform(lambda x: x.count())
    links_df['in_cites_count'] = links_df.groupby('cited')['citing'].transform(lambda x: x.count())
    links_df['cite_rank'] = links_df.groupby('citing')['cited'].transform(lambda x: x.rank())
    return links_df

In [16]:
def train_test_split_by_year(links_years_df, year, part=None):
    """
    links_years_df - cite edges dataframe with year of citing paper
    year - first year of test period
    part - part of test period edges to include into train"""
    if part:
        train = links_years_df[(links_years_df.year_citing < year)|\
                               ((links_years_df.year_citing >= year)&\
                                ((links_years_df.cite_rank < part*links_years_df.out_cites_count + 1)|\
                                (links_years_df.in_cites_count == 1)))]
        test = links_years_df[((links_years_df.year_citing >= year)&\
                                (links_years_df.cite_rank >= part*links_years_df.out_cites_count + 1)&\
                              (links_years_df.in_cites_count != 1))]
    else:
        train = links_years_df[(links_years_df.year_citing < year)]
        test = links_years_df[(links_years_df.year_citing >= year)]
    return train.reset_index(drop=True), test.reset_index(drop=True)
    

In [17]:
def product(u,v):
    return u*v
def mean(u,v):
    return (u+v)/2
def l1(u,v):
    return np.abs(u-v)
def l2(u,v):
    return (u-v)**2

In [18]:
aan_links_connected.head()

Unnamed: 0,citing,cited,year_citing,year_cited,out_cites_count,in_cites_count,cite_rank,node_citing,node_cited
0,C08-3004,A00-1002,2008,2000,1,10,1.0,560,0
1,D09-1141,A00-1002,2009,2000,14,10,1.0,1682,0
2,D12-1027,A00-1002,2012,2000,14,10,1.0,1995,0
3,E06-1047,A00-1002,2006,2000,4,10,1.0,2562,0
4,H05-1110,A00-1002,2005,2000,2,10,1.0,3150,0


### 0.5 links for test nodes ( > 2013) used in training

In [19]:
aan_train, aan_test = train_test_split_by_year(aan_links_connected, 2013, 0.5)

In [23]:
aan_test.to_csv('test_2013_05.csv', index=False)
aan_train.to_csv('train_2013_05.csv', index=False)

In [93]:
aan_test = pd.read_csv('test_2013_05.csv')
aan_train = pd.read_csv('train_2013_05.csv')

In [102]:
aan_train['relation'] = 'cites'
aan_test['relation'] = 'cites'

In [103]:
aan_train[['node_citing','relation','node_cited']].to_csv('aan_relations_train_05.txt', sep = '\t', index = False, header = None)

In [106]:
aan_test[aan_test['year_citing'] == 2013][['node_citing','relation','node_cited']].to_csv('aan_relations_val_05.txt', sep = '\t', index = False, header = None)

In [107]:
aan_test[aan_test['year_citing'] > 2013][['node_citing','relation','node_cited']].to_csv('aan_relations_test_05.txt', sep = '\t', index = False, header = None)

In [119]:
relations_train = aan_train[['node_citing','relation','node_cited']]
relations_test = aan_test[aan_test['year_citing'] > 2013][['node_citing','relation','node_cited']]
relations_val = aan_test[aan_test['year_citing'] == 2013][['node_citing','relation','node_cited']]

In [111]:
relations_train['relation'] = 'is_cited'
relations_test['relation'] = 'is_cited'
relations_val['relation'] = 'is_cited'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [113]:
relations_train.columns

Index(['node_citing', 'relation', 'node_cited'], dtype='object')

In [114]:
relations_train = relations_train[['node_cited','relation','node_citing']]
relations_test = relations_test[['node_cited','relation','node_citing']]
relations_val = relations_val[['node_cited','relation','node_citing']]

relations_train.columns = ['node_citing', 'relation', 'node_cited']
relations_test.columns = ['node_citing', 'relation', 'node_cited']
relations_val.columns = ['node_citing', 'relation', 'node_cited']

In [118]:
relations_train_inv = relations_train.copy()
relations_test_inv = relations_test.copy()
relations_val_inv = relations_val.copy()

In [None]:
relations_train = relations_train

In [122]:
relations_train.shape

(63410, 3)

In [126]:
relations_train = relations_train.append(relations_train_inv)
relations_test = relations_test.append(relations_test_inv)
relations_val = relations_val.append(relations_val_inv)

In [127]:
relations_train.to_csv('aan_relations_train_05.txt', sep = '\t', index = False, header = None)
relations_test.to_csv('aan_relations_val_05.txt', sep = '\t', index = False, header = None)
relations_val.to_csv('aan_relations_test_05.txt', sep = '\t', index = False, header = None)

In [120]:
relations_val.head()

Unnamed: 0,node_citing,relation,node_cited
0,12499,cites,240
1,12486,cites,632
2,6923,cites,798
6,12693,cites,1522
8,6753,cites,1573


In [108]:
len(aan_test)/(len(aan_links_connected))

0.15295217739780925

In [20]:
G_train = nx.DiGraph()

for i in tqdm_notebook(aan_train.index):
    G_train.add_edge(aan_train.iloc[i]['citing'], aan_train.iloc[i]['cited'])
    
# print(nx.info(G))

HBox(children=(IntProgress(value=0, max=64024), HTML(value='')))




In [21]:
print(nx.info(G_train))

Name: 
Type: DiGraph
Number of nodes: 13493
Number of edges: 64024
Average in degree:   4.7450
Average out degree:   4.7450


In [48]:
aan_train[['node_citing','node_cited']].to_csv('aan_train.edgelist', sep=' ', header=None, index=False)

In [22]:
aan_train_neg = generate_negative_edges(G_train, len(G_train.edges()), 0.5)
aan_test_neg = generate_negative_edges_test(G, list(aan_test['citing']), len(aan_test))

In [31]:
def train_test_preprocess(positive_df, neg_list):
    pairs = list(zip(list(positive_df['citing']), list(positive_df['cited']), [1]*len(positive_df)))
    neg_pairs = list(zip(list(zip(*neg_list))[0],list(zip(*neg_list))[1], [0]*len(neg_list)))
    pairs += neg_pairs
    random.shuffle(pairs)
    return pairs

In [32]:
train_pairs = train_test_preprocess(aan_train, aan_train_neg)
test_pairs = train_test_preprocess(aan_test, aan_test_neg)

In [78]:
test_x_y = pd.DataFrame(data = test_pairs, columns = ['citing', 'cited', 'class'])

In [80]:
test_x_y.to_csv('test_x_y_05.csv', index=False)

In [21]:
hope_embed = np.load('hope_embeddings_aan_200_hidden_test.npy')

In [31]:
with open('./asymproj_edge_dnn-master/datasets/aan_graph/index.pkl', 'rb') as f:
    index_graph = pickle.load(f)

In [36]:
with open('hope_train_05_index.pkl', 'rb') as f:
    index_graph_train_05 = pickle.load(f)

#### Mean edge aggregating

In [42]:
index_graph_train_05[index_graph['index'][train_pairs[i][0]]]

5061

In [44]:
embed_dim = 100

k_train = 0
k_test = 0

X_train = np.zeros((len(train_pairs), embed_dim))
for i in range(len(train_pairs)):
    try:
        X_train[i] = mean(hope_embed[index_graph_train_05[index_graph['index'][train_pairs[i][0]]], :embed_dim],
                                    hope_embed[index_graph_train_05[index_graph['index'][train_pairs[i][1]]], embed_dim:])
    except KeyError:
        k_train += 1
        pass
y_train = np.array(list(zip(*(train_pairs)))[-1])

X_test = np.zeros((len(test_pairs), embed_dim))
for i in range(len(test_pairs)):
    try:
        X_test[i] = mean(hope_embed[index_graph_train_05[index_graph['index'][test_pairs[i][0]]], :embed_dim],
                                    hope_embed[index_graph_train_05[index_graph['index'][test_pairs[i][1]]], embed_dim:])    
    except:
        k_test += 1
        pass
y_test = np.array(list(zip(*(test_pairs)))[-1])

In [42]:
print(classification_report(y_test, y_pred_test))
print(confusion_matrix(y_test, y_pred_test))

             precision    recall  f1-score   support

          0       0.61      0.78      0.69     11450
          1       0.70      0.51      0.59     11450

avg / total       0.66      0.64      0.64     22900

[[8927 2523]
 [5637 5813]]


In [None]:
### RandForest

In [45]:
rand_forest = RandomForestClassifier()

rand_forest, y_pred_train, y_pred_test, precision_train, precision_test, recall_train, recall_test, accuracy_train, accuracy_test, f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, roc_auc_train, roc_auc_test, logloss_train, logloss_test = \
classify_model(rand_forest, X_train, y_train, X_test, y_test)

In [46]:
print('Precision: ' + str(precision_test))
print('Recall: ' + str(recall_test))
print('Accuracy: ' + str(accuracy_test))
print('F1-macro: ' + str(f1_macro_test))
print('F1-micro: ' + str(f1_micro_test))
print('Logloss: ' + str(logloss_test))
print('ROC-AUC: ' + str(roc_auc_test))

Precision: 0.7361563517915309
Recall: 0.5131877729257642
Accuracy: 0.6646288209606986
F1-macro: 0.6567567459540264
F1-micro: 0.6646288209606986
Logloss: 11.583383697166028
ROC-AUC: 0.6646288209606986


In [47]:
print(classification_report(y_test, y_pred_test))
print(confusion_matrix(y_test, y_pred_test))

             precision    recall  f1-score   support

          0       0.63      0.82      0.71     11450
          1       0.74      0.51      0.60     11450

avg / total       0.68      0.66      0.66     22900

[[9344 2106]
 [5574 5876]]


#### Product aggregating

In [50]:
embed_dim = 100

k_train = 0
k_test = 0

X_train = np.zeros((len(train_pairs), embed_dim))
for i in range(len(train_pairs)):
    try:
        X_train[i] = product(hope_embed[index_graph_train_05[index_graph['index'][train_pairs[i][0]]], :embed_dim],
                                    hope_embed[index_graph_train_05[index_graph['index'][train_pairs[i][1]]], embed_dim:])
    except KeyError:
        k_train += 1
        pass
y_train = np.array(list(zip(*(train_pairs)))[-1])

X_test = np.zeros((len(test_pairs), embed_dim))
for i in range(len(test_pairs)):
    try:
        X_test[i] = product(hope_embed[index_graph_train_05[index_graph['index'][test_pairs[i][0]]], :embed_dim],
                                    hope_embed[index_graph_train_05[index_graph['index'][test_pairs[i][1]]], embed_dim:])    
    except:
        k_test += 1
        pass
y_test = np.array(list(zip(*(test_pairs)))[-1])

In [51]:
rand_forest = RandomForestClassifier()

rand_forest, y_pred_train, y_pred_test, precision_train, precision_test, recall_train, recall_test, accuracy_train, accuracy_test, f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, roc_auc_train, roc_auc_test, logloss_train, logloss_test = \
classify_model(rand_forest, X_train, y_train, X_test, y_test)

In [52]:
print('Precision: ' + str(precision_test))
print('Recall: ' + str(recall_test))
print('Accuracy: ' + str(accuracy_test))
print('F1-macro: ' + str(f1_macro_test))
print('F1-micro: ' + str(f1_micro_test))
print('Logloss: ' + str(logloss_test))
print('ROC-AUC: ' + str(roc_auc_test))

Precision: 0.9089470908771313
Recall: 0.4516157205240175
Accuracy: 0.7031877729257642
F1-macro: 0.683133786338915
F1-micro: 0.7031877729257642
Logloss: 10.25154922915619
ROC-AUC: 0.7031877729257642


In [53]:
print(classification_report(y_test, y_pred_test))
print(confusion_matrix(y_test, y_pred_test))

             precision    recall  f1-score   support

          0       0.64      0.95      0.76     11450
          1       0.91      0.45      0.60     11450

avg / total       0.77      0.70      0.68     22900

[[10932   518]
 [ 6279  5171]]


#### L1 

In [54]:
embed_dim = 100

k_train = 0
k_test = 0

X_train = np.zeros((len(train_pairs), embed_dim))
for i in range(len(train_pairs)):
    try:
        X_train[i] = l1(hope_embed[index_graph_train_05[index_graph['index'][train_pairs[i][0]]], :embed_dim],
                                    hope_embed[index_graph_train_05[index_graph['index'][train_pairs[i][1]]], embed_dim:])
    except KeyError:
        k_train += 1
        pass
y_train = np.array(list(zip(*(train_pairs)))[-1])

X_test = np.zeros((len(test_pairs), embed_dim))
for i in range(len(test_pairs)):
    try:
        X_test[i] = l1(hope_embed[index_graph_train_05[index_graph['index'][test_pairs[i][0]]], :embed_dim],
                                    hope_embed[index_graph_train_05[index_graph['index'][test_pairs[i][1]]], embed_dim:])    
    except:
        k_test += 1
        pass
y_test = np.array(list(zip(*(test_pairs)))[-1])

In [55]:
rand_forest = RandomForestClassifier()

rand_forest, y_pred_train, y_pred_test, precision_train, precision_test, recall_train, recall_test, accuracy_train, accuracy_test, f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, roc_auc_train, roc_auc_test, logloss_train, logloss_test = \
classify_model(rand_forest, X_train, y_train, X_test, y_test)

In [56]:
print('Precision: ' + str(precision_test))
print('Recall: ' + str(recall_test))
print('Accuracy: ' + str(accuracy_test))
print('F1-macro: ' + str(f1_macro_test))
print('F1-micro: ' + str(f1_micro_test))
print('Logloss: ' + str(logloss_test))
print('ROC-AUC: ' + str(roc_auc_test))

Precision: 0.6562645011600928
Recall: 0.494061135371179
Accuracy: 0.61764192139738
F1-macro: 0.6117119037895156
F1-micro: 0.61764192139738
Logloss: 13.206283638472646
ROC-AUC: 0.61764192139738


In [57]:
print(classification_report(y_test, y_pred_test))
print(confusion_matrix(y_test, y_pred_test))

             precision    recall  f1-score   support

          0       0.59      0.74      0.66     11450
          1       0.66      0.49      0.56     11450

avg / total       0.63      0.62      0.61     22900

[[8487 2963]
 [5793 5657]]


#### L2

In [58]:
embed_dim = 100

k_train = 0
k_test = 0

X_train = np.zeros((len(train_pairs), embed_dim))
for i in range(len(train_pairs)):
    try:
        X_train[i] = l2(hope_embed[index_graph_train_05[index_graph['index'][train_pairs[i][0]]], :embed_dim],
                                    hope_embed[index_graph_train_05[index_graph['index'][train_pairs[i][1]]], embed_dim:])
    except KeyError:
        k_train += 1
        pass
y_train = np.array(list(zip(*(train_pairs)))[-1])

X_test = np.zeros((len(test_pairs), embed_dim))
for i in range(len(test_pairs)):
    try:
        X_test[i] = l2(hope_embed[index_graph_train_05[index_graph['index'][test_pairs[i][0]]], :embed_dim],
                                    hope_embed[index_graph_train_05[index_graph['index'][test_pairs[i][1]]], embed_dim:])    
    except:
        k_test += 1
        pass
y_test = np.array(list(zip(*(test_pairs)))[-1])

In [59]:
rand_forest = RandomForestClassifier()

rand_forest, y_pred_train, y_pred_test, precision_train, precision_test, recall_train, recall_test, accuracy_train, accuracy_test, f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, roc_auc_train, roc_auc_test, logloss_train, logloss_test = \
classify_model(rand_forest, X_train, y_train, X_test, y_test)

In [60]:
print('Precision: ' + str(precision_test))
print('Recall: ' + str(recall_test))
print('Accuracy: ' + str(accuracy_test))
print('F1-macro: ' + str(f1_macro_test))
print('F1-micro: ' + str(f1_micro_test))
print('Logloss: ' + str(logloss_test))
print('ROC-AUC: ' + str(roc_auc_test))

Precision: 0.6719362890764293
Recall: 0.4937117903930131
Accuracy: 0.6263318777292577
F1-macro: 0.6196421098512551
F1-micro: 0.6263318777292577
Logloss: 12.906136091709957
ROC-AUC: 0.6263318777292577


In [61]:
print(classification_report(y_test, y_pred_test))
print(confusion_matrix(y_test, y_pred_test))

             precision    recall  f1-score   support

          0       0.60      0.76      0.67     11450
          1       0.67      0.49      0.57     11450

avg / total       0.64      0.63      0.62     22900

[[8690 2760]
 [5797 5653]]


* Best result: Random Forest + product edge function

### 0.3 links used for training

In [70]:
aan_train, aan_test = train_test_split_by_year(aan_links_connected, 2013, 0.3)

In [71]:
len(aan_test)/(len(aan_links_connected))

0.2004942559444296

In [63]:
G_train = nx.DiGraph()

for i in tqdm_notebook(aan_train.index):
    G_train.add_edge(aan_train.iloc[i]['citing'], aan_train.iloc[i]['cited'])
    
print(nx.info(G_train))

HBox(children=(IntProgress(value=0, max=59851), HTML(value='')))


Name: 
Type: DiGraph
Number of nodes: 13490
Number of edges: 59851
Average in degree:   4.4367
Average out degree:   4.4367


In [73]:
aan_test.to_csv('test_2013_03.csv', index=False)
aan_train.to_csv('train_2013_03.csv', index=False)

In [62]:
aan_test = pd.read_csv('test_2013_03.csv')
aan_train = pd.read_csv('train_2013_03.csv')

In [64]:
hope_embed = np.load('hope_embeddings_aan_200_hidden_test_07.npy')
with open('hope_train_03_index.pkl', 'rb') as f:
    index_graph_train_03 = pickle.load(f)

In [65]:
aan_train_neg = generate_negative_edges(G_train, len(G_train.edges()), 0.5)
aan_test_neg = generate_negative_edges_test(G, list(aan_test['citing']), len(aan_test))

train_pairs = train_test_preprocess(aan_train, aan_train_neg)
test_pairs = train_test_preprocess(aan_test, aan_test_neg)

In [66]:
embed_dim = 100

k_train = 0
k_test = 0

X_train = np.zeros((len(train_pairs), embed_dim))
for i in range(len(train_pairs)):
    try:
        X_train[i] = product(hope_embed[index_graph_train_03[index_graph['index'][train_pairs[i][0]]], :embed_dim],
                                    hope_embed[index_graph_train_03[index_graph['index'][train_pairs[i][1]]], embed_dim:])
    except KeyError:
        k_train += 1
        pass
y_train = np.array(list(zip(*(train_pairs)))[-1])

X_test = np.zeros((len(test_pairs), embed_dim))
for i in range(len(test_pairs)):
    try:
        X_test[i] = product(hope_embed[index_graph_train_03[index_graph['index'][test_pairs[i][0]]], :embed_dim],
                                    hope_embed[index_graph_train_03[index_graph['index'][test_pairs[i][1]]], embed_dim:])    
    except:
        k_test += 1
        pass
y_test = np.array(list(zip(*(test_pairs)))[-1])

In [67]:
rand_forest = RandomForestClassifier()

rand_forest, y_pred_train, y_pred_test, precision_train, precision_test, recall_train, recall_test, accuracy_train, accuracy_test, f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, roc_auc_train, roc_auc_test, logloss_train, logloss_test = \
classify_model(rand_forest, X_train, y_train, X_test, y_test)

In [68]:
print('Precision: ' + str(precision_test))
print('Recall: ' + str(recall_test))
print('Accuracy: ' + str(accuracy_test))
print('F1-macro: ' + str(f1_macro_test))
print('F1-micro: ' + str(f1_micro_test))
print('Logloss: ' + str(logloss_test))
print('ROC-AUC: ' + str(roc_auc_test))

Precision: 0.9331011464228705
Recall: 0.4284096208941302
Accuracy: 0.6988473582517156
F1-macro: 0.6750841257388842
F1-micro: 0.6988473582517156
Logloss: 10.401456033859947
ROC-AUC: 0.6988473582517156


In [69]:
print(classification_report(y_test, y_pred_test))
print(confusion_matrix(y_test, y_pred_test))

             precision    recall  f1-score   support

          0       0.63      0.97      0.76     15009
          1       0.93      0.43      0.59     15009

avg / total       0.78      0.70      0.68     30018

[[14548   461]
 [ 8579  6430]]


In [79]:
aan_train, aan_test = train_test_split_by_year(aan_links_connected, 2013, 0.1)

In [80]:
len(aan_test)/(len(aan_links_connected))

0.2612343040341972

In [81]:
G_train = nx.DiGraph()

for i in tqdm_notebook(aan_train.index):
    G_train.add_edge(aan_train.iloc[i]['citing'], aan_train.iloc[i]['cited'])
    
print(nx.info(G_train))

HBox(children=(IntProgress(value=0, max=55304), HTML(value='')))


Name: 
Type: DiGraph
Number of nodes: 13489
Number of edges: 55304
Average in degree:   4.0999
Average out degree:   4.0999


In [83]:
aan_test.to_csv('test_2013_01.csv', index=False)
aan_train.to_csv('train_2013_01.csv', index=False)

In [84]:
aan_train_neg = generate_negative_edges(G_train, len(G_train.edges()), 0.5)
aan_test_neg = generate_negative_edges_test(G, list(aan_test['citing']), len(aan_test))

train_pairs = train_test_preprocess(aan_train, aan_train_neg)
test_pairs = train_test_preprocess(aan_test, aan_test_neg)

In [85]:
embed_dim = 100

k_train = 0
k_test = 0

X_train = np.zeros((len(train_pairs), embed_dim))
for i in range(len(train_pairs)):
    try:
        X_train[i] = product(hope_embed[index_graph['index'][train_pairs[i][0]], :embed_dim],
                                    hope_embed[index_graph['index'][train_pairs[i][1]], embed_dim:])
    except KeyError:
        k_train += 1
        pass
y_train = np.array(list(zip(*(train_pairs)))[-1])

X_test = np.zeros((len(test_pairs), embed_dim))
for i in range(len(test_pairs)):
    try:
        X_test[i] = product(hope_embed[index_graph['index'][test_pairs[i][0]], :embed_dim],
                                    hope_embed[index_graph['index'][test_pairs[i][1]], embed_dim:])    
    except:
        k_test += 1
        pass
y_test = np.array(list(zip(*(test_pairs)))[-1])

In [86]:
rand_forest = RandomForestClassifier()

rand_forest, y_pred_train, y_pred_test, precision_train, precision_test, recall_train, recall_test, accuracy_train, accuracy_test, f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, roc_auc_train, roc_auc_test, logloss_train, logloss_test = \
classify_model(rand_forest, X_train, y_train, X_test, y_test)

In [87]:
print('Precision: ' + str(precision_test))
print('Recall: ' + str(recall_test))
print('Accuracy: ' + str(accuracy_test))
print('F1-macro: ' + str(f1_macro_test))
print('F1-micro: ' + str(f1_micro_test))
print('Logloss: ' + str(logloss_test))
print('ROC-AUC: ' + str(roc_auc_test))

Precision: 0.940093671713321
Recall: 0.8826958478216405
Accuracy: 0.9132235631008386
F1-macro: 0.913142617109813
F1-micro: 0.9132235631008386
Logloss: 2.997174438573842
ROC-AUC: 0.9132235631008386
