In [8]:
import pandas as pd

In [1]:
import pickle

In [13]:
import networkx as nx

In [47]:
import numpy as np

In [26]:
import random

In [2]:
from tqdm import tqdm_notebook

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [4]:
import sklearn

In [5]:
r = RandomForestClassifier()

In [6]:
def classify_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
                     
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    
    precision_train = precision_score(y_train, y_pred_train)
    precision_test = precision_score(y_test, y_pred_test)
    recall_train = recall_score(y_train, y_pred_train)
    recall_test = recall_score(y_test, y_pred_test)
    accuracy_train = accuracy_score(y_train, y_pred_train)
    accuracy_test = accuracy_score(y_test, y_pred_test)
    f1_macro_train = f1_score(y_train, y_pred_train, average='macro')
    f1_macro_test = f1_score(y_test, y_pred_test, average='macro')
    f1_micro_train = f1_score(y_train, y_pred_train, average='micro')
    f1_micro_test = f1_score(y_test, y_pred_test, average='micro')
    logloss_train = log_loss(y_train, y_pred_train)
    logloss_test = log_loss(y_test, y_pred_test)
    roc_auc_train = roc_auc_score(y_train, y_pred_train)
    roc_auc_test = roc_auc_score(y_test, y_pred_test)

    return model, y_pred_train, y_pred_test, \
        precision_train, precision_test, recall_train, recall_test, accuracy_train, accuracy_test, \
        f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, \
        roc_auc_train, roc_auc_test, logloss_train, logloss_test 

In [9]:
aan_links_connected = pd.read_csv('aan_links_connected.csv')

In [10]:
aan_links_connected.shape

(74860, 9)

In [11]:
aan_links_connected.head()

Unnamed: 0,citing,cited,year_citing,year_cited,out_cites_count,in_cites_count,cite_rank,node_citing,node_cited
0,C08-3004,A00-1002,2008,2000,1,10,1.0,560,0
1,D09-1141,A00-1002,2009,2000,14,10,1.0,1682,0
2,D12-1027,A00-1002,2012,2000,14,10,1.0,1995,0
3,E06-1047,A00-1002,2006,2000,4,10,1.0,2562,0
4,H05-1110,A00-1002,2005,2000,2,10,1.0,3150,0


In [14]:
G = nx.DiGraph()

for i in tqdm_notebook(aan_links_connected.index):
    G.add_edge(aan_links_connected.iloc[i]['citing'], aan_links_connected.iloc[i]['cited'])
    
print(nx.info(G))

HBox(children=(IntProgress(value=0, max=74860), HTML(value='')))


Name: 
Type: DiGraph
Number of nodes: 13506
Number of edges: 74860
Average in degree:   5.5427
Average out degree:   5.5427


In [15]:
G = nx.read_gpickle('G_aan_13506')
G_train = nx.read_gpickle('G_aan_13506_train_05')

In [16]:
def generate_negative_edges(graph, count_gen_edges, part_neg_directed):
    negative_edges = set()
    nodes = list(graph.nodes())
    edges = list(graph.edges())

    count_neg_directed = int(part_neg_directed*count_gen_edges)
    for a, b in edges:
        if len(negative_edges) >= count_neg_directed:
            break
        if not graph.has_edge(b, a):
            negative_edges.add((b, a))       
    
    while len(negative_edges) < count_gen_edges:
        i = random.randint(0, len(nodes) - 1)
        j = random.randint(0, len(nodes) - 1)
        if (i != j) and not graph.has_edge(nodes[i], nodes[j]):
            negative_edges.add((nodes[i], nodes[j]))
    return list(negative_edges)

In [17]:
def generate_negative_edges_test(graph, test_nodes, count_gen_edges):
    negative_edges = set()
    nodes = list(graph.nodes())
    while len(negative_edges) < count_gen_edges:
        i = random.randint(0, len(test_nodes) - 1)
        j = random.randint(0, len(nodes) - 1)
        if i == j:
            continue
        if graph.has_edge(test_nodes[i], nodes[j]):
            continue
        negative_edges.add((test_nodes[i], nodes[j]))
    return list(negative_edges)

In [18]:
def link_to_time(links_df, meta_df):
    years = meta_df[['id','year']]
    years.columns = ['citing', 'year_citing']
    links_df = links_df.merge(years, how = 'left', on = 'citing')
    years.columns = ['cited', 'year_cited']
    links_df = links_df.merge(years, how='left', on = 'cited')
    links_df['out_cites_count'] = links_df.groupby('citing')['cited'].transform(lambda x: x.count())
    links_df['in_cites_count'] = links_df.groupby('cited')['citing'].transform(lambda x: x.count())
    links_df['cite_rank'] = links_df.groupby('citing')['cited'].transform(lambda x: x.rank())
    return links_df

In [19]:
def train_test_split_by_year(links_years_df, year, part=None):
    """
    links_years_df - cite edges dataframe with year of citing paper
    year - first year of test period
    part - part of test period edges to include into train"""
    if part:
        train = links_years_df[(links_years_df.year_citing < year)|\
                               ((links_years_df.year_citing >= year)&\
                                ((links_years_df.cite_rank < part*links_years_df.out_cites_count + 1)|\
                                (links_years_df.in_cites_count == 1)))]
        test = links_years_df[((links_years_df.year_citing >= year)&\
                                (links_years_df.cite_rank >= part*links_years_df.out_cites_count + 1)&\
                              (links_years_df.in_cites_count != 1))]
    else:
        train = links_years_df[(links_years_df.year_citing < year)]
        test = links_years_df[(links_years_df.year_citing >= year)]
    return train.reset_index(drop=True), test.reset_index(drop=True)
    

In [20]:
def product(u,v):
    return u*v
def mean(u,v):
    return (u+v)/2
def l1(u,v):
    return np.abs(u-v)
def l2(u,v):
    return (u-v)**2

In [21]:
aan_links_connected.head()

Unnamed: 0,citing,cited,year_citing,year_cited,out_cites_count,in_cites_count,cite_rank,node_citing,node_cited
0,C08-3004,A00-1002,2008,2000,1,10,1.0,560,0
1,D09-1141,A00-1002,2009,2000,14,10,1.0,1682,0
2,D12-1027,A00-1002,2012,2000,14,10,1.0,1995,0
3,E06-1047,A00-1002,2006,2000,4,10,1.0,2562,0
4,H05-1110,A00-1002,2005,2000,2,10,1.0,3150,0


### 0.5 links for test nodes ( > 2013) used in training

In [88]:
aan_train, aan_test = train_test_split_by_year(aan_links_connected, 2013, 0.5)

In [23]:
aan_test.to_csv('test_2013_05.csv', index=False)
aan_train.to_csv('train_2013_05.csv', index=False)

In [17]:
aan_test = pd.read_csv('test_2013_05.csv')
aan_train = pd.read_csv('train_2013_05.csv')

In [22]:
aan_test = pd.read_csv('test_2013_05_no_isolated.csv')
aan_train = pd.read_csv('train_2013_05_no_isolated.csv')

In [23]:
len(aan_test)/(len(aan_links_connected))

0.14475020037403152

In [19]:
G_train = nx.DiGraph()

for i in tqdm_notebook(aan_train.index):
    G_train.add_edge(aan_train.iloc[i]['citing'], aan_train.iloc[i]['cited'])
    
# print(nx.info(G))

HBox(children=(IntProgress(value=0, max=63410), HTML(value='')))




In [24]:
print(nx.info(G_train))

Name: 
Type: DiGraph
Number of nodes: 13493
Number of edges: 64024
Average in degree:   4.7450
Average out degree:   4.7450


In [27]:
aan_train_neg = generate_negative_edges(G_train, len(G_train.edges()), 0)
aan_test_neg = generate_negative_edges_test(G, list(aan_test['citing']), len(aan_test))

In [28]:
def train_test_preprocess(positive_df, neg_list):
    pairs = list(zip(list(positive_df['citing']), list(positive_df['cited']), [1]*len(positive_df)))
    neg_pairs = list(zip(list(zip(*neg_list))[0],list(zip(*neg_list))[1], [0]*len(neg_list)))
    pairs += neg_pairs
    random.shuffle(pairs)
    return pairs

In [29]:
train_pairs = train_test_preprocess(aan_train, aan_train_neg)
test_pairs = train_test_preprocess(aan_test, aan_test_neg)

In [45]:
with open('aan_node2vec_128_pq4.txt', 'r') as f:
    n2v_emb = f.read()

In [48]:
n2v_emb = [(emb.split()[0],np.array([float(num) for num in emb.split()[1:]])) for emb in n2v_emb.split('\n')[1:] if emb != '']

In [49]:
len(n2v_emb)

13506

In [41]:
train_pairs[0]

('P11-2069', 'W13-1715', 0)

In [50]:
n2v_dict = dict(n2v_emb)

In [51]:
n2v_dict['P11-2069']

array([ 0.02412121, -0.01680263,  0.03892118,  0.02126633, -0.03291467,
        0.01908346, -0.01083011,  0.001769  ,  0.01433998, -0.04676037,
       -0.06441756,  0.00492129,  0.01265838, -0.02210326,  0.02850015,
       -0.08230983,  0.05349142,  0.02193242,  0.01935636,  0.01810938,
       -0.00658814,  0.01739219, -0.01938708,  0.00232244, -0.00019859,
        0.03019859,  0.01274948, -0.0202236 ,  0.02031925,  0.03294629,
       -0.01076858, -0.02412132, -0.03906479,  0.06376281,  0.07908904,
       -0.0249356 , -0.00804732, -0.04933048,  0.0073052 ,  0.01831379,
       -0.0031514 ,  0.01841727, -0.02172787, -0.05733622,  0.03863462,
        0.00191617, -0.00265193, -0.02926864, -0.00222213,  0.01306296,
       -0.00373002, -0.0259631 ,  0.03665261,  0.00354773, -0.00531284,
        0.0129802 , -0.0552841 ,  0.02216919, -0.08386905,  0.03618599,
        0.01265107, -0.02113252, -0.03763052, -0.06579472,  0.04618946,
        0.02085893,  0.00604036,  0.00811745, -0.05084214,  0.01

In [54]:
with open('n2v_pq4_128.pkl', 'wb') as f:
    pickle.dump(n2v_dict, f)

In [53]:
with open('./asymproj_edge_dnn-master/datasets/aan_graph/index.pkl', 'rb') as f:
    index_graph = pickle.load(f)

In [66]:
index_graph['index'][train_pairs[0][0]]

5896

#### Mean edge aggregating

In [55]:
embed_dim = 128

k_train = 0
k_test = 0

X_train = np.zeros((len(train_pairs), embed_dim))
for i in range(len(train_pairs)):
    try:
        X_train[i] = mean(n2v_dict[train_pairs[i][0]],
                                    n2v_dict[train_pairs[i][1]])
    except KeyError:
        k_train += 1
        pass
y_train = np.array(list(zip(*(train_pairs)))[-1])

X_test = np.zeros((len(test_pairs), embed_dim))
for i in range(len(test_pairs)):
    try:
        X_test[i] = mean(n2v_dict[test_pairs[i][0]],
                                    n2v_dict[test_pairs[i][1]])    
    except:
        k_test += 1
        pass
y_test = np.array(list(zip(*(test_pairs)))[-1])

In [57]:
rand_forest = RandomForestClassifier()

rand_forest, y_pred_train, y_pred_test, precision_train, precision_test, recall_train, recall_test, accuracy_train, accuracy_test, f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, roc_auc_train, roc_auc_test, logloss_train, logloss_test = \
classify_model(rand_forest, X_train, y_train, X_test, y_test)

In [58]:
print('Precision: ' + str(precision_test))
print('Recall: ' + str(recall_test))
print('Accuracy: ' + str(accuracy_test))
print('F1-macro: ' + str(f1_macro_test))
print('F1-micro: ' + str(f1_micro_test))
print('Logloss: ' + str(logloss_test))
print('ROC-AUC: ' + str(roc_auc_test))

Precision: 0.8014227334645073
Recall: 0.48864894795127356
Accuracy: 0.6837855297157622
F1-macro: 0.6712679779116432
F1-micro: 0.6837855297157622
Logloss: 10.921709288766674
ROC-AUC: 0.6837855297157622


In [59]:
print(classification_report(y_test, y_pred_test))
print(confusion_matrix(y_test, y_pred_test))

             precision    recall  f1-score   support

          0       0.63      0.88      0.74     10836
          1       0.80      0.49      0.61     10836

avg / total       0.72      0.68      0.67     21672

[[9524 1312]
 [5541 5295]]


#### Product aggregating

In [60]:
embed_dim = 128

k_train = 0
k_test = 0

X_train = np.zeros((len(train_pairs), embed_dim))
for i in range(len(train_pairs)):
    try:
        X_train[i] = product(n2v_dict[train_pairs[i][0]],
                                    n2v_dict[train_pairs[i][1]])
    except KeyError:
        k_train += 1
        pass
y_train = np.array(list(zip(*(train_pairs)))[-1])

X_test = np.zeros((len(test_pairs), embed_dim))
for i in range(len(test_pairs)):
    try:
        X_test[i] = product(n2v_dict[test_pairs[i][0]],
                                    n2v_dict[test_pairs[i][1]])    
    except:
        k_test += 1
        pass
y_test = np.array(list(zip(*(test_pairs)))[-1])

In [61]:
rand_forest = RandomForestClassifier()

rand_forest, y_pred_train, y_pred_test, precision_train, precision_test, recall_train, recall_test, accuracy_train, accuracy_test, f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, roc_auc_train, roc_auc_test, logloss_train, logloss_test = \
classify_model(rand_forest, X_train, y_train, X_test, y_test)

In [62]:
# new train/test, no isolated
print('Precision: ' + str(precision_test))
print('Recall: ' + str(recall_test))
print('Accuracy: ' + str(accuracy_test))
print('F1-macro: ' + str(f1_macro_test))
print('F1-micro: ' + str(f1_micro_test))
print('Logloss: ' + str(logloss_test))
print('ROC-AUC: ' + str(roc_auc_test))

Precision: 0.8284247092584202
Recall: 0.5061830933923958
Accuracy: 0.7006736803248431
F1-macro: 0.6889060578185546
F1-micro: 0.7006736803248431
Logloss: 10.338406737563046
ROC-AUC: 0.7006736803248431


In [63]:
print(classification_report(y_test, y_pred_test))
print(confusion_matrix(y_test, y_pred_test))

             precision    recall  f1-score   support

          0       0.64      0.90      0.75     10836
          1       0.83      0.51      0.63     10836

avg / total       0.74      0.70      0.69     21672

[[9700 1136]
 [5351 5485]]


#### L1 

In [64]:
embed_dim = 128

k_train = 0
k_test = 0

X_train = np.zeros((len(train_pairs), embed_dim))
for i in range(len(train_pairs)):
    try:
        X_train[i] = l1(n2v_dict[train_pairs[i][0]],
                                    n2v_dict[train_pairs[i][1]])
    except KeyError:
        k_train += 1
        pass
y_train = np.array(list(zip(*(train_pairs)))[-1])

X_test = np.zeros((len(test_pairs), embed_dim))
for i in range(len(test_pairs)):
    try:
        X_test[i] = l1(n2v_dict[test_pairs[i][0]],
                                    n2v_dict[test_pairs[i][1]])    
    except:
        k_test += 1
        pass
y_test = np.array(list(zip(*(test_pairs)))[-1])

In [65]:
rand_forest = RandomForestClassifier()

rand_forest, y_pred_train, y_pred_test, precision_train, precision_test, recall_train, recall_test, accuracy_train, accuracy_test, f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, roc_auc_train, roc_auc_test, logloss_train, logloss_test = \
classify_model(rand_forest, X_train, y_train, X_test, y_test)

In [66]:
print('Precision: ' + str(precision_test))
print('Recall: ' + str(recall_test))
print('Accuracy: ' + str(accuracy_test))
print('F1-macro: ' + str(f1_macro_test))
print('F1-micro: ' + str(f1_micro_test))
print('Logloss: ' + str(logloss_test))
print('ROC-AUC: ' + str(roc_auc_test))

Precision: 0.7560429722470904
Recall: 0.4676079734219269
Accuracy: 0.6583610188261351
F1-macro: 0.6454604873135743
F1-micro: 0.6583610188261351
Logloss: 11.799852702552466
ROC-AUC: 0.6583610188261352


In [78]:
print(classification_report(y_test, y_pred_test))
print(confusion_matrix(y_test, y_pred_test))

             precision    recall  f1-score   support

          0       0.76      0.95      0.84     11450
          1       0.93      0.71      0.80     11450

avg / total       0.85      0.83      0.82     22900

[[10823   627]
 [ 3359  8091]]


#### L2

In [67]:
embed_dim = 128

k_train = 0
k_test = 0

X_train = np.zeros((len(train_pairs), embed_dim))
for i in range(len(train_pairs)):
    try:
        X_train[i] = l2(n2v_dict[train_pairs[i][0]],
                                    n2v_dict[train_pairs[i][1]])
    except KeyError:
        k_train += 1
        pass
y_train = np.array(list(zip(*(train_pairs)))[-1])

X_test = np.zeros((len(test_pairs), embed_dim))
for i in range(len(test_pairs)):
    try:
        X_test[i] = l2(n2v_dict[test_pairs[i][0]],
                                    n2v_dict[test_pairs[i][1]])    
    except:
        k_test += 1
        pass
y_test = np.array(list(zip(*(test_pairs)))[-1])

In [68]:
rand_forest = RandomForestClassifier()

rand_forest, y_pred_train, y_pred_test, precision_train, precision_test, recall_train, recall_test, accuracy_train, accuracy_test, f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, roc_auc_train, roc_auc_test, logloss_train, logloss_test = \
classify_model(rand_forest, X_train, y_train, X_test, y_test)

In [69]:
print('Precision: ' + str(precision_test))
print('Recall: ' + str(recall_test))
print('Accuracy: ' + str(accuracy_test))
print('F1-macro: ' + str(f1_macro_test))
print('F1-micro: ' + str(f1_micro_test))
print('Logloss: ' + str(logloss_test))
print('ROC-AUC: ' + str(roc_auc_test))

Precision: 0.7546634830622295
Recall: 0.4666851236618679
Accuracy: 0.657484311554079
F1-macro: 0.6445441805673713
F1-micro: 0.657484311554079
Logloss: 11.83013343104454
ROC-AUC: 0.6574843115540789


In [70]:
print(classification_report(y_test, y_pred_test))
print(confusion_matrix(y_test, y_pred_test))

             precision    recall  f1-score   support

          0       0.61      0.85      0.71     10836
          1       0.75      0.47      0.58     10836

avg / total       0.68      0.66      0.64     21672

[[9192 1644]
 [5779 5057]]


* Best result: Random Forest + product edge function

### 0.3 links used for training

In [71]:
aan_train, aan_test = train_test_split_by_year(aan_links_connected, 2013, 0.3)

In [72]:
len(aan_test)/(len(aan_links_connected))

0.2004942559444296

In [73]:
G_train = nx.DiGraph()

for i in tqdm_notebook(aan_train.index):
    G_train.add_edge(aan_train.iloc[i]['citing'], aan_train.iloc[i]['cited'])
    
print(nx.info(G_train))

HBox(children=(IntProgress(value=0, max=59851), HTML(value='')))


Name: 
Type: DiGraph
Number of nodes: 13490
Number of edges: 59851
Average in degree:   4.4367
Average out degree:   4.4367


In [73]:
aan_test.to_csv('test_2013_03.csv', index=False)
aan_train.to_csv('train_2013_03.csv', index=False)

In [62]:
aan_test = pd.read_csv('test_2013_03.csv')
aan_train = pd.read_csv('train_2013_03.csv')

In [74]:
aan_train_neg = generate_negative_edges(G_train, len(G_train.edges()), 0)
aan_test_neg = generate_negative_edges_test(G, list(aan_test['citing']), len(aan_test))

train_pairs = train_test_preprocess(aan_train, aan_train_neg)
test_pairs = train_test_preprocess(aan_test, aan_test_neg)

In [75]:
embed_dim = 128

k_train = 0
k_test = 0

X_train = np.zeros((len(train_pairs), embed_dim))
for i in range(len(train_pairs)):
    try:
        X_train[i] = product(n2v_dict[train_pairs[i][0]],
                                    n2v_dict[train_pairs[i][1]])
    except KeyError:
        k_train += 1
        pass
y_train = np.array(list(zip(*(train_pairs)))[-1])

X_test = np.zeros((len(test_pairs), embed_dim))
for i in range(len(test_pairs)):
    try:
        X_test[i] = product(n2v_dict[test_pairs[i][0]],
                                    n2v_dict[test_pairs[i][1]])    
    except:
        k_test += 1
        pass
y_test = np.array(list(zip(*(test_pairs)))[-1])

In [76]:
rand_forest = RandomForestClassifier()

rand_forest, y_pred_train, y_pred_test, precision_train, precision_test, recall_train, recall_test, accuracy_train, accuracy_test, f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, roc_auc_train, roc_auc_test, logloss_train, logloss_test = \
classify_model(rand_forest, X_train, y_train, X_test, y_test)

In [77]:
print('Precision: ' + str(precision_test))
print('Recall: ' + str(recall_test))
print('Accuracy: ' + str(accuracy_test))
print('F1-macro: ' + str(f1_macro_test))
print('F1-micro: ' + str(f1_micro_test))
print('Logloss: ' + str(logloss_test))
print('ROC-AUC: ' + str(roc_auc_test))

Precision: 0.8387206532834297
Recall: 0.49270437737357586
Accuracy: 0.6989806116330202
F1-macro: 0.6856030579614383
F1-micro: 0.6989806116330202
Logloss: 10.39687922353117
ROC-AUC: 0.6989806116330202


In [78]:
print(classification_report(y_test, y_pred_test))
print(confusion_matrix(y_test, y_pred_test))

             precision    recall  f1-score   support

          0       0.64      0.91      0.75     15009
          1       0.84      0.49      0.62     15009

avg / total       0.74      0.70      0.69     30018

[[13587  1422]
 [ 7614  7395]]


In [79]:
aan_train, aan_test = train_test_split_by_year(aan_links_connected, 2013, 0.1)

In [80]:
len(aan_test)/(len(aan_links_connected))

0.2612343040341972

In [81]:
G_train = nx.DiGraph()

for i in tqdm_notebook(aan_train.index):
    G_train.add_edge(aan_train.iloc[i]['citing'], aan_train.iloc[i]['cited'])
    
print(nx.info(G_train))

HBox(children=(IntProgress(value=0, max=55304), HTML(value='')))


Name: 
Type: DiGraph
Number of nodes: 13489
Number of edges: 55304
Average in degree:   4.0999
Average out degree:   4.0999


In [83]:
aan_test.to_csv('test_2013_01.csv', index=False)
aan_train.to_csv('train_2013_01.csv', index=False)

In [82]:
aan_train_neg = generate_negative_edges(G_train, len(G_train.edges()), 0)
aan_test_neg = generate_negative_edges_test(G, list(aan_test['citing']), len(aan_test))

train_pairs = train_test_preprocess(aan_train, aan_train_neg)
test_pairs = train_test_preprocess(aan_test, aan_test_neg)

In [83]:
embed_dim = 128

k_train = 0
k_test = 0

X_train = np.zeros((len(train_pairs), embed_dim))
for i in range(len(train_pairs)):
    try:
        X_train[i] = product(n2v_dict[train_pairs[i][0]],
                                    n2v_dict[train_pairs[i][1]])
    except KeyError:
        k_train += 1
        pass
y_train = np.array(list(zip(*(train_pairs)))[-1])

X_test = np.zeros((len(test_pairs), embed_dim))
for i in range(len(test_pairs)):
    try:
        X_test[i] = product(n2v_dict[test_pairs[i][0]],
                                    n2v_dict[test_pairs[i][1]])    
    except:
        k_test += 1
        pass
y_test = np.array(list(zip(*(test_pairs)))[-1])

In [84]:
rand_forest = RandomForestClassifier()

rand_forest, y_pred_train, y_pred_test, precision_train, precision_test, recall_train, recall_test, accuracy_train, accuracy_test, f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, roc_auc_train, roc_auc_test, logloss_train, logloss_test = \
classify_model(rand_forest, X_train, y_train, X_test, y_test)

In [85]:
print('Precision: ' + str(precision_test))
print('Recall: ' + str(recall_test))
print('Accuracy: ' + str(accuracy_test))
print('F1-macro: ' + str(f1_macro_test))
print('F1-micro: ' + str(f1_micro_test))
print('Logloss: ' + str(logloss_test))
print('ROC-AUC: ' + str(roc_auc_test))

Precision: 0.844543811245362
Recall: 0.4539271834731029
Accuracy: 0.6851861321333607
F1-macro: 0.6673983633394358
F1-micro: 0.6851861321333607
Logloss: 10.873319193412154
ROC-AUC: 0.6851861321333607


In [100]:
import scipy.sparse as sp

In [None]:
def normalize_adj(adj):
    """Symmetrically normalize adjacency matrix."""
    adj = sp.coo_matrix(adj)
    rowsum = np.array(adj.sum(1))
    d_inv_sqrt = np.power(rowsum, -0.5).flatten()
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.
    d_mat_inv_sqrt = sp.diags(d_inv_sqrt)
    return adj.dot(d_mat_inv_sqrt).transpose().dot(d_mat_inv_sqrt).tocoo()

In [143]:
def normalize_adj(adj):
    """Symmetrically normalize adjacency matrix."""
    adj = sp.coo_matrix(adj)
    rowsum = np.array(adj.sum(1))
    d_inv = np.power(rowsum.astype(float), -1).flatten()
    d_inv[np.isinf(d_inv)] = 0.
    d_mat_inv = sp.diags(d_inv)
    return d_mat_inv.dot(adj).tocoo()

In [144]:
adj = np.array([[1,1,0,0],
            [1,1,0,0],
            [0,0,1,0],
             [0,0,0,0]])

In [145]:
norm = normalize_adj(adj)

  """


In [146]:
norm.todense()

matrix([[0.5, 0.5, 0. , 0. ],
        [0.5, 0.5, 0. , 0. ],
        [0. , 0. , 1. , 0. ],
        [0. , 0. , 0. , 0. ]])

In [125]:
adj = sp.coo_matrix(adj)

In [126]:
adj

<4x4 sparse matrix of type '<class 'numpy.int32'>'
	with 5 stored elements in COOrdinate format>

In [127]:
rowsum = np.array(adj.sum(1))

In [128]:
rowsum

array([[2],
       [2],
       [1],
       [0]], dtype=int32)

In [129]:
rowsum.astype(float)

array([[2.],
       [2.],
       [1.],
       [0.]])

In [130]:
d_inv = np.power(rowsum.astype(float), -1).flatten()

  """Entry point for launching an IPython kernel.


In [131]:
d_inv

array([0.5, 0.5, 1. , inf])

In [132]:
d_inv[np.isinf(d_inv)] = 0.

In [133]:
d_inv

array([0.5, 0.5, 1. , 0. ])

In [134]:
d_mat_inv = sp.diags(d_inv)

In [135]:
d_mat_inv

<4x4 sparse matrix of type '<class 'numpy.float64'>'
	with 4 stored elements (1 diagonals) in DIAgonal format>

In [141]:
d_mat_inv.dot(adj).todense()

matrix([[0.5, 0.5, 0. , 0. ],
        [0.5, 0.5, 0. , 0. ],
        [0. , 0. , 1. , 0. ],
        [0. , 0. , 0. , 0. ]])

In [142]:
adj.dot(d_mat_inv).todense()

matrix([[0.5, 0.5, 0. , 0. ],
        [0.5, 0.5, 0. , 0. ],
        [0. , 0. , 1. , 0. ],
        [0. , 0. , 0. , 0. ]])

In [None]:
D^(-1)*A instead of D^(-1/2)*A*D^(-1/2)

In [107]:
d_inv_sqrt

array([0.57735027, 0.70710678, 0.70710678, 0.57735027])

In [108]:
d = rowsum.flatten()

In [110]:
d

array([3, 2, 2, 3], dtype=int32)