In [1]:
import pandas as pd
import networkx as nx
import numpy as np
import random

In [2]:
from gensim.models.keyedvectors import KeyedVectors



In [3]:
import pickle

In [4]:
from tqdm import tqdm_notebook

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [6]:
def classify_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
                     
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    
    precision_train = precision_score(y_train, y_pred_train)
    precision_test = precision_score(y_test, y_pred_test)
    recall_train = recall_score(y_train, y_pred_train)
    recall_test = recall_score(y_test, y_pred_test)
    accuracy_train = accuracy_score(y_train, y_pred_train)
    accuracy_test = accuracy_score(y_test, y_pred_test)
    f1_macro_train = f1_score(y_train, y_pred_train, average='macro')
    f1_macro_test = f1_score(y_test, y_pred_test, average='macro')
    f1_micro_train = f1_score(y_train, y_pred_train, average='micro')
    f1_micro_test = f1_score(y_test, y_pred_test, average='micro')
    logloss_train = log_loss(y_train, y_pred_train)
    logloss_test = log_loss(y_test, y_pred_test)
    roc_auc_train = roc_auc_score(y_train, y_pred_train)
    roc_auc_test = roc_auc_score(y_test, y_pred_test)

    return model, y_pred_train, y_pred_test, \
        precision_train, precision_test, recall_train, recall_test, accuracy_train, accuracy_test, \
        f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, \
        roc_auc_train, roc_auc_test, logloss_train, logloss_test 

In [7]:
aan_links_connected = pd.read_csv('aan_links_connected.csv')

In [8]:
aan_links_connected.shape

(74860, 9)

In [9]:
aan_links_connected.head()

Unnamed: 0,citing,cited,year_citing,year_cited,out_cites_count,in_cites_count,cite_rank,node_citing,node_cited
0,C08-3004,A00-1002,2008,2000,1,10,1.0,560,0
1,D09-1141,A00-1002,2009,2000,14,10,1.0,1682,0
2,D12-1027,A00-1002,2012,2000,14,10,1.0,1995,0
3,E06-1047,A00-1002,2006,2000,4,10,1.0,2562,0
4,H05-1110,A00-1002,2005,2000,2,10,1.0,3150,0


In [10]:
G = nx.DiGraph()

for i in tqdm_notebook(aan_links_connected.index):
    G.add_edge(aan_links_connected.iloc[i]['citing'], aan_links_connected.iloc[i]['cited'])
    
print(nx.info(G))

HBox(children=(IntProgress(value=0, max=74860), HTML(value='')))


Name: 
Type: DiGraph
Number of nodes: 13506
Number of edges: 74860
Average in degree:   5.5427
Average out degree:   5.5427


In [9]:
G = nx.read_gpickle('G_aan_13506')

In [10]:
G_train = nx.read_gpickle('G_aan_13506_train_05')

In [15]:
def generate_negative_edges(graph, count_gen_edges, part_neg_directed):
    negative_edges = set()
    nodes = list(graph.nodes())
    edges = list(graph.edges())

    count_neg_directed = int(part_neg_directed*count_gen_edges)
    for a, b in edges:
        if len(negative_edges) >= count_neg_directed:
            break
        if not graph.has_edge(b, a):
            negative_edges.add((b, a))       
    
    while len(negative_edges) < count_gen_edges:
        i = random.randint(0, len(nodes) - 1)
        j = random.randint(0, len(nodes) - 1)
        if (i != j) and not graph.has_edge(nodes[i], nodes[j]):
            negative_edges.add((nodes[i], nodes[j]))
    return list(negative_edges)

In [16]:
def generate_negative_edges_test(graph, test_nodes, count_gen_edges):
    negative_edges = set()
    nodes = list(graph.nodes())
    while len(negative_edges) < count_gen_edges:
        i = random.randint(0, len(test_nodes) - 1)
        j = random.randint(0, len(nodes) - 1)
        if i == j:
            continue
        if graph.has_edge(test_nodes[i], nodes[j]):
            continue
        negative_edges.add((test_nodes[i], nodes[j]))
    return list(negative_edges)

In [17]:
def link_to_time(links_df, meta_df):
    years = meta_df[['id','year']]
    years.columns = ['citing', 'year_citing']
    links_df = links_df.merge(years, how = 'left', on = 'citing')
    years.columns = ['cited', 'year_cited']
    links_df = links_df.merge(years, how='left', on = 'cited')
    links_df['out_cites_count'] = links_df.groupby('citing')['cited'].transform(lambda x: x.count())
    links_df['in_cites_count'] = links_df.groupby('cited')['citing'].transform(lambda x: x.count())
    links_df['cite_rank'] = links_df.groupby('citing')['cited'].transform(lambda x: x.rank())
    return links_df

In [18]:
def train_test_split_by_year(links_years_df, year, part=None):
    """
    links_years_df - cite edges dataframe with year of citing paper
    year - first year of test period
    part - part of test period edges to include into train"""
    if part:
        train = links_years_df[(links_years_df.year_citing < year)|\
                               ((links_years_df.year_citing >= year)&\
                                ((links_years_df.cite_rank < part*links_years_df.out_cites_count + 1)|\
                                (links_years_df.in_cites_count == 1)))]
        test = links_years_df[((links_years_df.year_citing >= year)&\
                                (links_years_df.cite_rank >= part*links_years_df.out_cites_count + 1)&\
                              (links_years_df.in_cites_count != 1))]
    else:
        train = links_years_df[(links_years_df.year_citing < year)]
        test = links_years_df[(links_years_df.year_citing >= year)]
    return train.reset_index(drop=True), test.reset_index(drop=True)
    

In [19]:
def product(u,v):
    return u*v
def mean(u,v):
    return (u+v)/2
def l1(u,v):
    return np.abs(u-v)
def l2(u,v):
    return (u-v)**2

In [16]:
aan_links_connected.head()

Unnamed: 0,citing,cited,year_citing,year_cited,out_cites_count,in_cites_count,cite_rank,node_citing,node_cited
0,C08-3004,A00-1002,2008,2000,1,10,1.0,560,0
1,D09-1141,A00-1002,2009,2000,14,10,1.0,1682,0
2,D12-1027,A00-1002,2012,2000,14,10,1.0,1995,0
3,E06-1047,A00-1002,2006,2000,4,10,1.0,2562,0
4,H05-1110,A00-1002,2005,2000,2,10,1.0,3150,0


## Undirected asym_proj

### 0.5 links for test nodes ( > 2013) used in training

In [17]:
aan_train, aan_test = train_test_split_by_year(aan_links_connected, 2013, 0.5)

In [23]:
aan_test.to_csv('test_2013_05_no_isolated.csv', index=False)
aan_train.to_csv('train_2013_05_no_isolated.csv', index=False)

In [11]:
aan_test = pd.read_csv('test_2013_05_no_isolated.csv')
aan_train = pd.read_csv('train_2013_05_no_isolated.csv')

In [13]:
len(aan_test)/(len(aan_test) + len(aan_train))

0.14475020037403152

In [20]:
G_train = nx.DiGraph()

for i in tqdm_notebook(aan_train.index):
    G_train.add_edge(aan_train.iloc[i]['citing'], aan_train.iloc[i]['cited'])
    
# print(nx.info(G))

HBox(children=(IntProgress(value=0, max=63410), HTML(value='')))




In [21]:
print(nx.info(G_train))

Name: 
Type: DiGraph
Number of nodes: 13444
Number of edges: 63410
Average in degree:   4.7166
Average out degree:   4.7166


In [20]:
aan_train_neg = generate_negative_edges(G_train, len(G_train.edges()), 0)
aan_test_neg = generate_negative_edges_test(G, list(aan_test['citing']), len(aan_test))

In [21]:
def train_test_preprocess(positive_df, neg_list):
    pairs = list(zip(list(positive_df['citing']), list(positive_df['cited']), [1]*len(positive_df)))
    neg_pairs = list(zip(list(zip(*neg_list))[0],list(zip(*neg_list))[1], [0]*len(neg_list)))
    pairs += neg_pairs
    random.shuffle(pairs)
    return pairs

In [22]:
train_pairs = train_test_preprocess(aan_train, aan_train_neg)
test_pairs = train_test_preprocess(aan_test, aan_test_neg)

In [32]:
asym_emb = np.load('asymproj_edge_dnn-master/datasets/aan_graph/dumps/test.d100_f100x100_g32_embeddings.npy.best')

# asym_emb = np.load('asymproj_edge_dnn-master/datasets/aan_custom_datasets/dumps/dumps/test.d100_f100x100_g32_embeddings.npy.best')

In [24]:
asym_emb.shape

(13506, 100)

In [25]:
with open('./asymproj_edge_dnn-master/datasets/aan_graph/index.pkl', 'rb') as f:
    index_graph = pickle.load(f)

#### Mean edge aggregating

In [33]:
embed_dim = 100

k_train = 0
k_test = 0

X_train = np.zeros((len(train_pairs), embed_dim))
for i in range(len(train_pairs)):
    try:
        X_train[i] = mean(asym_emb[index_graph['index'][train_pairs[i][0]]],
                                    asym_emb[index_graph['index'][train_pairs[i][1]]])
    except KeyError:
        k_train += 1
        pass
y_train = np.array(list(zip(*(train_pairs)))[-1])

X_test = np.zeros((len(test_pairs), embed_dim))
for i in range(len(test_pairs)):
    try:
        X_test[i] = mean(asym_emb[index_graph['index'][test_pairs[i][0]]],
                                    asym_emb[index_graph['index'][test_pairs[i][1]]])    
    except:
        k_test += 1
        pass
y_test = np.array(list(zip(*(test_pairs)))[-1])

In [None]:
### RandForest

In [34]:
rand_forest = RandomForestClassifier()

rand_forest, y_pred_train, y_pred_test, precision_train, precision_test, recall_train, recall_test, accuracy_train, accuracy_test, f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, roc_auc_train, roc_auc_test, logloss_train, logloss_test = \
classify_model(rand_forest, X_train, y_train, X_test, y_test)

In [28]:
# new test, no isolated, emb on custon train/test
print('Precision: ' + str(precision_test))
print('Recall: ' + str(recall_test))
print('Accuracy: ' + str(accuracy_test))
print('F1-macro: ' + str(f1_macro_test))
print('F1-micro: ' + str(f1_micro_test))
print('Logloss: ' + str(logloss_test))
print('ROC-AUC: ' + str(roc_auc_test))

Precision: 0.5301375144582958
Recall: 0.3806755260243632
Accuracy: 0.521640826873385
F1-macro: 0.5119425314746969
F1-micro: 0.521640826873385
Logloss: 16.522075406711142
ROC-AUC: 0.521640826873385


In [35]:
# new train/test no isolated
print('Precision: ' + str(precision_test))
print('Recall: ' + str(recall_test))
print('Accuracy: ' + str(accuracy_test))
print('F1-macro: ' + str(f1_macro_test))
print('F1-micro: ' + str(f1_micro_test))
print('Logloss: ' + str(logloss_test))
print('ROC-AUC: ' + str(roc_auc_test))

Precision: 0.5333248698743176
Recall: 0.3876891842008121
Accuracy: 0.5242248062015504
F1-macro: 0.5151869325981351
F1-micro: 0.5242248062015504
Logloss: 16.432828660394865
ROC-AUC: 0.5242248062015504


In [40]:
print('Precision: ' + str(precision_test))
print('Recall: ' + str(recall_test))
print('Accuracy: ' + str(accuracy_test))
print('F1-macro: ' + str(f1_macro_test))
print('F1-micro: ' + str(f1_micro_test))
print('Logloss: ' + str(logloss_test))
print('ROC-AUC: ' + str(roc_auc_test))

Precision: 0.539189837008629
Recall: 0.39292576419213976
Accuracy: 0.5285589519650655
F1-macro: 0.5197236138859492
F1-micro: 0.5285589519650655
Logloss: 16.283131197012004
ROC-AUC: 0.5285589519650655


In [41]:
print(classification_report(y_test, y_pred_test))
print(confusion_matrix(y_test, y_pred_test))

             precision    recall  f1-score   support

          0       0.52      0.66      0.58     11450
          1       0.54      0.39      0.45     11450

avg / total       0.53      0.53      0.52     22900

[[7605 3845]
 [6951 4499]]


#### Product aggregating

In [29]:
embed_dim = 100

k_train = 0
k_test = 0

X_train = np.zeros((len(train_pairs), embed_dim))
for i in range(len(train_pairs)):
    try:
        X_train[i] = product(asym_emb[index_graph['index'][train_pairs[i][0]]],
                                    asym_emb[index_graph['index'][train_pairs[i][1]]])
    except KeyError:
        k_train += 1
        pass
y_train = np.array(list(zip(*(train_pairs)))[-1])

X_test = np.zeros((len(test_pairs), embed_dim))
for i in range(len(test_pairs)):
    try:
        X_test[i] = product(asym_emb[index_graph['index'][test_pairs[i][0]]],
                                    asym_emb[index_graph['index'][test_pairs[i][1]]])    
    except:
        k_test += 1
        pass
y_test = np.array(list(zip(*(test_pairs)))[-1])

In [30]:
rand_forest = RandomForestClassifier()

rand_forest, y_pred_train, y_pred_test, precision_train, precision_test, recall_train, recall_test, accuracy_train, accuracy_test, f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, roc_auc_train, roc_auc_test, logloss_train, logloss_test = \
classify_model(rand_forest, X_train, y_train, X_test, y_test)

In [31]:
# new emb on custom train/test
print('Precision: ' + str(precision_test))
print('Recall: ' + str(recall_test))
print('Accuracy: ' + str(accuracy_test))
print('F1-macro: ' + str(f1_macro_test))
print('F1-micro: ' + str(f1_micro_test))
print('Logloss: ' + str(logloss_test))
print('ROC-AUC: ' + str(roc_auc_test))

Precision: 0.5166193006301742
Recall: 0.38584348468069396
Accuracy: 0.5124123292727943
F1-macro: 0.504474167764043
F1-micro: 0.5124123292727943
Logloss: 16.840825867025103
ROC-AUC: 0.5124123292727943


In [44]:
print('Precision: ' + str(precision_test))
print('Recall: ' + str(recall_test))
print('Accuracy: ' + str(accuracy_test))
print('F1-macro: ' + str(f1_macro_test))
print('F1-micro: ' + str(f1_micro_test))
print('Logloss: ' + str(logloss_test))
print('ROC-AUC: ' + str(roc_auc_test))

Precision: 0.5106407995296884
Recall: 0.3793013100436681
Accuracy: 0.5079039301310043
F1-macro: 0.499628468561906
F1-micro: 0.5079039301310043
Logloss: 16.996541446233756
ROC-AUC: 0.5079039301310044


In [45]:
print(classification_report(y_test, y_pred_test))
print(confusion_matrix(y_test, y_pred_test))

             precision    recall  f1-score   support

          0       0.51      0.64      0.56     11450
          1       0.51      0.38      0.44     11450

avg / total       0.51      0.51      0.50     22900

[[7288 4162]
 [7107 4343]]


#### L1 

In [46]:
embed_dim = 100

k_train = 0
k_test = 0

X_train = np.zeros((len(train_pairs), embed_dim))
for i in range(len(train_pairs)):
    try:
        X_train[i] = l1(asym_emb[index_graph['index'][train_pairs[i][0]]],
                                    asym_emb[index_graph['index'][train_pairs[i][1]]])
    except KeyError:
        k_train += 1
        pass
y_train = np.array(list(zip(*(train_pairs)))[-1])

X_test = np.zeros((len(test_pairs), embed_dim))
for i in range(len(test_pairs)):
    try:
        X_test[i] = l1(asym_emb[index_graph['index'][test_pairs[i][0]]],
                                    asym_emb[index_graph['index'][test_pairs[i][1]]])    
    except:
        k_test += 1
        pass
y_test = np.array(list(zip(*(test_pairs)))[-1])

In [47]:
rand_forest = RandomForestClassifier()

rand_forest, y_pred_train, y_pred_test, precision_train, precision_test, recall_train, recall_test, accuracy_train, accuracy_test, f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, roc_auc_train, roc_auc_test, logloss_train, logloss_test = \
classify_model(rand_forest, X_train, y_train, X_test, y_test)

In [48]:
print('Precision: ' + str(precision_test))
print('Recall: ' + str(recall_test))
print('Accuracy: ' + str(accuracy_test))
print('F1-macro: ' + str(f1_macro_test))
print('F1-micro: ' + str(f1_micro_test))
print('Logloss: ' + str(logloss_test))
print('ROC-AUC: ' + str(roc_auc_test))

Precision: 0.5063750146215932
Recall: 0.37807860262008736
Accuracy: 0.504759825327511
F1-macro: 0.49668251987460815
F1-micro: 0.504759825327511
Logloss: 17.10513700418504
ROC-AUC: 0.504759825327511


In [49]:
print(classification_report(y_test, y_pred_test))
print(confusion_matrix(y_test, y_pred_test))

             precision    recall  f1-score   support

          0       0.50      0.63      0.56     11450
          1       0.51      0.38      0.43     11450

avg / total       0.51      0.50      0.50     22900

[[7230 4220]
 [7121 4329]]


#### L2

In [50]:
embed_dim = 100

k_train = 0
k_test = 0

X_train = np.zeros((len(train_pairs), embed_dim))
for i in range(len(train_pairs)):
    try:
        X_train[i] = l2(asym_emb[index_graph['index'][train_pairs[i][0]]],
                                    asym_emb[index_graph['index'][train_pairs[i][1]]])
    except KeyError:
        k_train += 1
        pass
y_train = np.array(list(zip(*(train_pairs)))[-1])

X_test = np.zeros((len(test_pairs), embed_dim))
for i in range(len(test_pairs)):
    try:
        X_test[i] = l2(asym_emb[index_graph['index'][test_pairs[i][0]]],
                                    asym_emb[index_graph['index'][test_pairs[i][1]]])    
    except:
        k_test += 1
        pass
y_test = np.array(list(zip(*(test_pairs)))[-1])

In [51]:
rand_forest = RandomForestClassifier()

rand_forest, y_pred_train, y_pred_test, precision_train, precision_test, recall_train, recall_test, accuracy_train, accuracy_test, f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, roc_auc_train, roc_auc_test, logloss_train, logloss_test = \
classify_model(rand_forest, X_train, y_train, X_test, y_test)

In [52]:
print('Precision: ' + str(precision_test))
print('Recall: ' + str(recall_test))
print('Accuracy: ' + str(accuracy_test))
print('F1-macro: ' + str(f1_macro_test))
print('F1-micro: ' + str(f1_micro_test))
print('Logloss: ' + str(logloss_test))
print('ROC-AUC: ' + str(roc_auc_test))

Precision: 0.5046544100535257
Recall: 0.37877729257641923
Accuracy: 0.5034934497816594
F1-macro: 0.4956487093513078
F1-micro: 0.5034934497816594
Logloss: 17.14887735792117
ROC-AUC: 0.5034934497816593


In [53]:
print(classification_report(y_test, y_pred_test))
print(confusion_matrix(y_test, y_pred_test))

             precision    recall  f1-score   support

          0       0.50      0.63      0.56     11450
          1       0.50      0.38      0.43     11450

avg / total       0.50      0.50      0.50     22900

[[7193 4257]
 [7113 4337]]


* Best result: Random Forest + product edge function

### 0.3 links used for training

In [54]:
aan_train, aan_test = train_test_split_by_year(aan_links_connected, 2013, 0.3)

In [55]:
len(aan_test)/(len(aan_links_connected))

0.2004942559444296

In [56]:
G_train = nx.DiGraph()

for i in tqdm_notebook(aan_train.index):
    G_train.add_edge(aan_train.iloc[i]['citing'], aan_train.iloc[i]['cited'])
    
print(nx.info(G_train))

HBox(children=(IntProgress(value=0, max=59851), HTML(value='')))


Name: 
Type: DiGraph
Number of nodes: 13490
Number of edges: 59851
Average in degree:   4.4367
Average out degree:   4.4367


In [73]:
aan_test.to_csv('test_2013_03.csv', index=False)
aan_train.to_csv('train_2013_03.csv', index=False)

In [62]:
aan_test = pd.read_csv('test_2013_03.csv')
aan_train = pd.read_csv('train_2013_03.csv')

In [57]:
aan_train_neg = generate_negative_edges(G_train, len(G_train.edges()), 0)
aan_test_neg = generate_negative_edges_test(G, list(aan_test['citing']), len(aan_test))

train_pairs = train_test_preprocess(aan_train, aan_train_neg)
test_pairs = train_test_preprocess(aan_test, aan_test_neg)

In [66]:
embed_dim = 100

k_train = 0
k_test = 0

X_train = np.zeros((len(train_pairs), embed_dim))
for i in range(len(train_pairs)):
    try:
        X_train[i] = mean(asym_emb[index_graph['index'][train_pairs[i][0]]],
                                    asym_emb[index_graph['index'][train_pairs[i][1]]])
    except KeyError:
        k_train += 1
        pass
y_train = np.array(list(zip(*(train_pairs)))[-1])

X_test = np.zeros((len(test_pairs), embed_dim))
for i in range(len(test_pairs)):
    try:
        X_test[i] = mean(asym_emb[index_graph['index'][test_pairs[i][0]]],
                                    asym_emb[index_graph['index'][test_pairs[i][1]]])    
    except:
        k_test += 1
        pass
y_test = np.array(list(zip(*(test_pairs)))[-1])

In [67]:
rand_forest = RandomForestClassifier()

rand_forest, y_pred_train, y_pred_test, precision_train, precision_test, recall_train, recall_test, accuracy_train, accuracy_test, f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, roc_auc_train, roc_auc_test, logloss_train, logloss_test = \
classify_model(rand_forest, X_train, y_train, X_test, y_test)

In [68]:
print('Precision: ' + str(precision_test))
print('Recall: ' + str(recall_test))
print('Accuracy: ' + str(accuracy_test))
print('F1-macro: ' + str(f1_macro_test))
print('F1-micro: ' + str(f1_micro_test))
print('Logloss: ' + str(logloss_test))
print('ROC-AUC: ' + str(roc_auc_test))

Precision: 0.5480158364791455
Recall: 0.39656206276234257
Accuracy: 0.5347458191751616
F1-macro: 0.5256889683491461
F1-micro: 0.5347458191751616
Logloss: 16.06944088064186
ROC-AUC: 0.5347458191751615


In [69]:
print(classification_report(y_test, y_pred_test))
print(confusion_matrix(y_test, y_pred_test))

             precision    recall  f1-score   support

          0       0.53      0.67      0.59     15009
          1       0.55      0.40      0.46     15009

avg / total       0.54      0.53      0.53     30018

[[10100  4909]
 [ 9057  5952]]


In [70]:
aan_train, aan_test = train_test_split_by_year(aan_links_connected, 2013, 0.1)

In [103]:
len(aan_test)/(len(aan_links_connected))

0.2612343040341972

In [72]:
G_train = nx.DiGraph()

for i in tqdm_notebook(aan_train.index):
    G_train.add_edge(aan_train.iloc[i]['citing'], aan_train.iloc[i]['cited'])
    
print(nx.info(G_train))

HBox(children=(IntProgress(value=0, max=55304), HTML(value='')))


Name: 
Type: DiGraph
Number of nodes: 13489
Number of edges: 55304
Average in degree:   4.0999
Average out degree:   4.0999


In [83]:
aan_test.to_csv('test_2013_01.csv', index=False)
aan_train.to_csv('train_2013_01.csv', index=False)

In [73]:
aan_train_neg = generate_negative_edges(G_train, len(G_train.edges()), 0)
aan_test_neg = generate_negative_edges_test(G, list(aan_test['citing']), len(aan_test))

train_pairs = train_test_preprocess(aan_train, aan_train_neg)
test_pairs = train_test_preprocess(aan_test, aan_test_neg)

In [74]:
embed_dim = 100

k_train = 0
k_test = 0

X_train = np.zeros((len(train_pairs), embed_dim))
for i in range(len(train_pairs)):
    try:
        X_train[i] = mean(asym_emb[index_graph['index'][train_pairs[i][0]]],
                                    asym_emb[index_graph['index'][train_pairs[i][1]]])
    except KeyError:
        k_train += 1
        pass
y_train = np.array(list(zip(*(train_pairs)))[-1])

X_test = np.zeros((len(test_pairs), embed_dim))
for i in range(len(test_pairs)):
    try:
        X_test[i] = mean(asym_emb[index_graph['index'][test_pairs[i][0]]],
                                    asym_emb[index_graph['index'][test_pairs[i][1]]])    
    except:
        k_test += 1
        pass
y_test = np.array(list(zip(*(test_pairs)))[-1])

In [75]:
rand_forest = RandomForestClassifier()

rand_forest, y_pred_train, y_pred_test, precision_train, precision_test, recall_train, recall_test, accuracy_train, accuracy_test, f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, roc_auc_train, roc_auc_test, logloss_train, logloss_test = \
classify_model(rand_forest, X_train, y_train, X_test, y_test)

In [76]:
print('Precision: ' + str(precision_test))
print('Recall: ' + str(recall_test))
print('Accuracy: ' + str(accuracy_test))
print('F1-macro: ' + str(f1_macro_test))
print('F1-micro: ' + str(f1_micro_test))
print('Logloss: ' + str(logloss_test))
print('ROC-AUC: ' + str(roc_auc_test))

Precision: 0.5461166798875027
Recall: 0.3872468807527102
Accuracy: 0.5327009613417877
F1-macro: 0.522600676580169
F1-micro: 0.5327009613417877
Logloss: 16.140065678973418
ROC-AUC: 0.5327009613417877


## Directed

In [36]:
# asym_emb = np.load('asymproj_edge_dnn-master/datasets/aan_graph/dumps/test.d100_f100x100_g32_embeddings.npy.best')

asym_emb = np.load('asymproj_edge_dnn-master/datasets/aan_custom_datasets/directed/dumps/test.d100_f100x100_g32_embeddings.npy.best')

In [37]:
# with open('./asymproj_edge_dnn-master/datasets/aan_graph/dumps/test.d100_f100x100_g32_net.pkl.best', 'rb') as f:
#     test_net = pickle.load(f)

with open('./asymproj_edge_dnn-master/datasets/aan_custom_datasets/directed/dumps/test.d100_f100x100_g32_net.pkl.best', 'rb') as f:
    test_net = pickle.load(f)

In [38]:
test_net = list(test_net)

In [39]:
for i in test_net:
    print(i)
    print(i[1].shape)

('fully_connected/weights:0', array([[ 0.10851946, -0.14215657,  0.05383222, ..., -0.116536  ,
         0.1275872 , -0.00754973],
       [-0.11152562, -0.09869028,  0.01061374, ...,  0.2220751 ,
         0.09539881, -0.02197587],
       [ 0.06320386, -0.12298661, -0.03535729, ..., -0.04209198,
        -0.05899743, -0.11664581],
       ...,
       [-0.03499449,  0.03586356, -0.0738299 , ..., -0.01834275,
        -0.11007538, -0.12166001],
       [ 0.06282356, -0.15001814,  0.11422365, ...,  0.15422584,
         0.04278898, -0.06598209],
       [-0.11736944,  0.01918805, -0.05994429, ..., -0.09034131,
        -0.09074124, -0.04744608]], dtype=float32))
(100, 100)
('fully_connected/BatchNorm/beta:0', array([-6.68858047e-05, -9.59280660e-05, -3.73361290e-05, -4.82153810e-05,
       -4.86363206e-06, -8.79082581e-05,  5.07633195e-05, -7.27905281e-05,
        2.11354145e-05, -6.15264507e-05,  1.32817404e-05, -1.89321290e-05,
       -7.24329730e-05, -4.26166980e-06, -5.79720836e-05, -9.2065398

         0.06116296, -0.22983904]], dtype=float32))
(32, 100)


In [40]:
R = np.zeros((32, 100))
L = np.zeros((100, 32))

In [41]:
for i in test_net:
    if i[0] == 'g_right:0':
        R = i[1]
    if i[0] == 'g_left:0':
        L = i[1]

In [42]:
aan_train_neg = generate_negative_edges(G_train, len(G_train.edges()), 0.5)
aan_test_neg = generate_negative_edges_test(G, list(aan_test['citing']), len(aan_test))

train_pairs = train_test_preprocess(aan_train, aan_train_neg)
test_pairs = train_test_preprocess(aan_test, aan_test_neg)

In [43]:
np.array([asym_emb[index_graph['index'][train_pairs[0][0]]]]).shape

(1, 100)

In [44]:
(np.array([asym_emb[index_graph['index'][train_pairs[0][0]]]]).dot(L)).dot(R.dot(asym_emb[index_graph['index'][train_pairs[0][1]]]))

array([-0.10851453], dtype=float32)

In [45]:
L.transpose().dot(asym_emb[index_graph['index'][train_pairs[0][0]]]).shape

(32,)

In [46]:
R.dot(asym_emb[index_graph['index'][train_pairs[0][1]]]).shape

(32,)

In [47]:
mean(L.transpose().dot(asym_emb[index_graph['index'][train_pairs[0][0]]]), R.dot(asym_emb[index_graph['index'][train_pairs[0][1]]]))

array([ 0.01485016,  0.10077673,  0.01040589, -0.02123228,  0.06818286,
       -0.11407711, -0.04643111,  0.02280432,  0.018169  , -0.09937138,
        0.02358127, -0.05552548,  0.03697368, -0.06532729,  0.04324254,
       -0.01721333,  0.05355959, -0.07544747, -0.04646574, -0.04331411,
        0.07283127, -0.00477601,  0.02687544,  0.02302699,  0.03585663,
       -0.08086304, -0.00852232, -0.0351202 ,  0.03220697, -0.05776621,
        0.12200874, -0.00615464], dtype=float32)

In [48]:
embed_dim = 32

k_train = 0
k_test = 0

X_train = np.zeros((len(train_pairs), embed_dim))
for i in range(len(train_pairs)):
    try:
        X_train[i] = mean(L.transpose().dot(asym_emb[index_graph['index'][train_pairs[i][0]]]), 
                         R.dot(asym_emb[index_graph['index'][train_pairs[i][1]]]))
    except KeyError:
        k_train += 1
        pass
y_train = np.array(list(zip(*(train_pairs)))[-1])

X_test = np.zeros((len(test_pairs), embed_dim))
for i in range(len(test_pairs)):
    try:
        X_test[i] = mean(L.transpose().dot(asym_emb[index_graph['index'][test_pairs[i][0]]]), 
                         R.dot(asym_emb[index_graph['index'][test_pairs[i][1]]]))    
    except:
        k_test += 1
        pass
y_test = np.array(list(zip(*(test_pairs)))[-1])

In [49]:
rand_forest = RandomForestClassifier()

rand_forest, y_pred_train, y_pred_test, precision_train, precision_test, recall_train, recall_test, accuracy_train, accuracy_test, f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, roc_auc_train, roc_auc_test, logloss_train, logloss_test = \
classify_model(rand_forest, X_train, y_train, X_test, y_test)

In [50]:
# new emb, new train/test
print('Precision: ' + str(precision_test))
print('Recall: ' + str(recall_test))
print('Accuracy: ' + str(accuracy_test))
print('F1-macro: ' + str(f1_macro_test))
print('F1-micro: ' + str(f1_micro_test))
print('Logloss: ' + str(logloss_test))
print('ROC-AUC: ' + str(roc_auc_test))

Precision: 0.521570996978852
Recall: 0.3983019564414913
Accuracy: 0.5164728682170543
F1-macro: 0.5096250943468668
F1-micro: 0.5164728682170543
Logloss: 16.700581554470986
ROC-AUC: 0.5164728682170543


In [135]:
print('Precision: ' + str(precision_test))
print('Recall: ' + str(recall_test))
print('Accuracy: ' + str(accuracy_test))
print('F1-macro: ' + str(f1_macro_test))
print('F1-micro: ' + str(f1_micro_test))
print('Logloss: ' + str(logloss_test))
print('ROC-AUC: ' + str(roc_auc_test))

Precision: 0.5372179416620804
Recall: 0.3993147883002659
Accuracy: 0.5276641439967273
F1-macro: 0.5197527637241232
F1-micro: 0.5276641439967273
Logloss: 16.314040039156595
ROC-AUC: 0.5276641439967273
