In [1]:
import pandas as pd
import networkx as nx
import numpy as np
import random

In [2]:
from gensim.models.keyedvectors import KeyedVectors



In [3]:
import pickle

In [4]:
from tqdm import tqdm_notebook

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [6]:
def classify_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
                     
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    
    precision_train = precision_score(y_train, y_pred_train)
    precision_test = precision_score(y_test, y_pred_test)
    recall_train = recall_score(y_train, y_pred_train)
    recall_test = recall_score(y_test, y_pred_test)
    accuracy_train = accuracy_score(y_train, y_pred_train)
    accuracy_test = accuracy_score(y_test, y_pred_test)
    f1_macro_train = f1_score(y_train, y_pred_train, average='macro')
    f1_macro_test = f1_score(y_test, y_pred_test, average='macro')
    f1_micro_train = f1_score(y_train, y_pred_train, average='micro')
    f1_micro_test = f1_score(y_test, y_pred_test, average='micro')
    logloss_train = log_loss(y_train, y_pred_train)
    logloss_test = log_loss(y_test, y_pred_test)
    roc_auc_train = roc_auc_score(y_train, y_pred_train)
    roc_auc_test = roc_auc_score(y_test, y_pred_test)

    return model, y_pred_train, y_pred_test, \
        precision_train, precision_test, recall_train, recall_test, accuracy_train, accuracy_test, \
        f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, \
        roc_auc_train, roc_auc_test, logloss_train, logloss_test 

In [7]:
aan_links_connected = pd.read_csv('aan_links_connected.csv')

In [8]:
aan_links_connected.shape

(74860, 9)

In [9]:
aan_links_connected.head()

Unnamed: 0,citing,cited,year_citing,year_cited,out_cites_count,in_cites_count,cite_rank,node_citing,node_cited
0,C08-3004,A00-1002,2008,2000,1,10,1.0,560,0
1,D09-1141,A00-1002,2009,2000,14,10,1.0,1682,0
2,D12-1027,A00-1002,2012,2000,14,10,1.0,1995,0
3,E06-1047,A00-1002,2006,2000,4,10,1.0,2562,0
4,H05-1110,A00-1002,2005,2000,2,10,1.0,3150,0


In [11]:
G = nx.DiGraph()

for i in tqdm_notebook(aan_links_connected.index):
    G.add_edge(aan_links_connected.iloc[i]['citing'], aan_links_connected.iloc[i]['cited'])
    
# print(nx.info(G))

In [12]:
G = nx.read_gpickle('G_aan_13506')

In [13]:
G_train = nx.read_gpickle('G_aan_13506_train_05')

In [14]:
def generate_negative_edges(graph, count_gen_edges, part_neg_directed):
    negative_edges = set()
    nodes = list(graph.nodes())
    edges = list(graph.edges())

    count_neg_directed = int(part_neg_directed*count_gen_edges)
    for a, b in edges:
        if len(negative_edges) >= count_neg_directed:
            break
        if not graph.has_edge(b, a):
            negative_edges.add((b, a))       
    
    while len(negative_edges) < count_gen_edges:
        i = random.randint(0, len(nodes) - 1)
        j = random.randint(0, len(nodes) - 1)
        if (i != j) and not graph.has_edge(nodes[i], nodes[j]):
            negative_edges.add((nodes[i], nodes[j]))
    return list(negative_edges)

In [15]:
def generate_negative_edges_test(graph, test_nodes, count_gen_edges):
    negative_edges = set()
    nodes = list(graph.nodes())
    while len(negative_edges) < count_gen_edges:
        i = random.randint(0, len(test_nodes) - 1)
        j = random.randint(0, len(nodes) - 1)
        if i == j:
            continue
        if graph.has_edge(test_nodes[i], nodes[j]):
            continue
        negative_edges.add((test_nodes[i], nodes[j]))
    return list(negative_edges)

In [16]:
def link_to_time(links_df, meta_df):
    years = meta_df[['id','year']]
    years.columns = ['citing', 'year_citing']
    links_df = links_df.merge(years, how = 'left', on = 'citing')
    years.columns = ['cited', 'year_cited']
    links_df = links_df.merge(years, how='left', on = 'cited')
    links_df['out_cites_count'] = links_df.groupby('citing')['cited'].transform(lambda x: x.count())
    links_df['in_cites_count'] = links_df.groupby('cited')['citing'].transform(lambda x: x.count())
    links_df['cite_rank'] = links_df.groupby('citing')['cited'].transform(lambda x: x.rank())
    return links_df

In [17]:
def train_test_split_by_year(links_years_df, year, part=None):
    """
    links_years_df - cite edges dataframe with year of citing paper
    year - first year of test period
    part - part of test period edges to include into train"""
    if part:
        train = links_years_df[(links_years_df.year_citing < year)|\
                               ((links_years_df.year_citing >= year)&\
                                ((links_years_df.cite_rank < part*links_years_df.out_cites_count + 1)|\
                                (links_years_df.in_cites_count == 1)))]
        test = links_years_df[((links_years_df.year_citing >= year)&\
                                (links_years_df.cite_rank >= part*links_years_df.out_cites_count + 1)&\
                              (links_years_df.in_cites_count != 1))]
    else:
        train = links_years_df[(links_years_df.year_citing < year)]
        test = links_years_df[(links_years_df.year_citing >= year)]
    return train.reset_index(drop=True), test.reset_index(drop=True)
    

In [18]:
def product(u,v):
    return u*v
def mean(u,v):
    return (u+v)/2
def l1(u,v):
    return np.abs(u-v)
def l2(u,v):
    return (u-v)**2

In [16]:
aan_links_connected.head()

Unnamed: 0,citing,cited,year_citing,year_cited,out_cites_count,in_cites_count,cite_rank,node_citing,node_cited
0,C08-3004,A00-1002,2008,2000,1,10,1.0,560,0
1,D09-1141,A00-1002,2009,2000,14,10,1.0,1682,0
2,D12-1027,A00-1002,2012,2000,14,10,1.0,1995,0
3,E06-1047,A00-1002,2006,2000,4,10,1.0,2562,0
4,H05-1110,A00-1002,2005,2000,2,10,1.0,3150,0


### 0.5 links for test nodes ( > 2013) used in training

In [88]:
aan_train, aan_test = train_test_split_by_year(aan_links_connected, 2013, 0.5)

In [23]:
aan_test.to_csv('test_2013_05.csv', index=False)
aan_train.to_csv('train_2013_05.csv', index=False)

In [19]:
# aan_test = pd.read_csv('test_2013_05.csv')
# aan_train = pd.read_csv('train_2013_05.csv')

aan_test = pd.read_csv('test_2013_05_no_isolated.csv')
aan_train = pd.read_csv('train_2013_05_no_isolated.csv')

In [18]:
len(aan_test)/(len(aan_links_connected))

0.15295217739780925

In [19]:
G_train = nx.DiGraph()

for i in tqdm_notebook(aan_train.index):
    G_train.add_edge(aan_train.iloc[i]['citing'], aan_train.iloc[i]['cited'])
    
# print(nx.info(G))

HBox(children=(IntProgress(value=0, max=63410), HTML(value='')))




In [20]:
print(nx.info(G_train))

Name: 
Type: DiGraph
Number of nodes: 13493
Number of edges: 64024
Average in degree:   4.7450
Average out degree:   4.7450


In [21]:
aan_train_neg = generate_negative_edges(G_train, len(G_train.edges()), 0.5)
aan_test_neg = generate_negative_edges_test(G, list(aan_test['citing']), len(aan_test))

In [22]:
def train_test_preprocess(positive_df, neg_list):
    pairs = list(zip(list(positive_df['citing']), list(positive_df['cited']), [1]*len(positive_df)))
    neg_pairs = list(zip(list(zip(*neg_list))[0],list(zip(*neg_list))[1], [0]*len(neg_list)))
    pairs += neg_pairs
    random.shuffle(pairs)
    return pairs

In [23]:
train_pairs = train_test_preprocess(aan_train, aan_train_neg)
test_pairs = train_test_preprocess(aan_test, aan_test_neg)

In [78]:
test_x_y = pd.DataFrame(data = test_pairs, columns = ['citing', 'cited', 'class'])

In [80]:
test_x_y.to_csv('test_x_y_05.csv', index=False)

In [24]:
with open('pctadw.embedding', 'r') as f:
    pctadw_emb = f.read()

In [25]:
pctadw_emb = [[float(num) for num in emb.split()] for emb in pctadw_emb.split('\n') if emb != '']

In [26]:
len(pctadw_emb)

13506

In [27]:
pctadw_emb_matr = np.array(pctadw_emb)

In [28]:
pctadw_emb_matr

array([[-0.00089386,  0.16098227, -0.21167406, ...,  0.21111514,
         0.02308364, -0.07383676],
       [ 0.29594016,  0.13280925, -0.1268356 , ...,  0.46518382,
        -0.03072343,  0.65745521],
       [-0.13309586,  0.400181  , -0.06074005, ..., -0.80929017,
        -0.26320201,  0.01463345],
       ...,
       [-0.1279396 , -0.14702646, -0.25501603, ..., -0.10139858,
         0.0052771 ,  0.21767963],
       [ 0.15952791,  0.21629976,  0.11390562, ...,  0.32770464,
        -0.04475396, -0.13116851],
       [ 0.36052322,  0.01876117,  0.03143245, ...,  0.04357247,
        -0.20669529, -0.2407275 ]])

In [29]:
pctadw_emb_matr.shape

(13506, 200)

In [30]:
with open('./asymproj_edge_dnn-master/datasets/aan_graph/index.pkl', 'rb') as f:
    index_graph = pickle.load(f)

In [31]:
index_graph['index'][train_pairs[i][0]]

425

#### Mean edge aggregating

In [32]:
embed_dim = 100

k_train = 0
k_test = 0

X_train = np.zeros((len(train_pairs), embed_dim))
for i in range(len(train_pairs)):
    try:
        X_train[i] = mean(pctadw_emb_matr[index_graph['index'][train_pairs[i][0]], :embed_dim],
                                    pctadw_emb_matr[index_graph['index'][train_pairs[i][1]], embed_dim:])
    except KeyError:
        k_train += 1
        pass
y_train = np.array(list(zip(*(train_pairs)))[-1])

X_test = np.zeros((len(test_pairs), embed_dim))
for i in range(len(test_pairs)):
    try:
        X_test[i] = mean(pctadw_emb_matr[index_graph['index'][test_pairs[i][0]], :embed_dim],
                                    pctadw_emb_matr[index_graph['index'][test_pairs[i][1]], embed_dim:])    
    except:
        k_test += 1
        pass
y_test = np.array(list(zip(*(test_pairs)))[-1])

In [None]:
### RandForest

In [33]:
rand_forest = RandomForestClassifier()

rand_forest, y_pred_train, y_pred_test, precision_train, precision_test, recall_train, recall_test, accuracy_train, accuracy_test, f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, roc_auc_train, roc_auc_test, logloss_train, logloss_test = \
classify_model(rand_forest, X_train, y_train, X_test, y_test)

In [34]:
# new train/test
print('Precision: ' + str(precision_test))
print('Recall: ' + str(recall_test))
print('Accuracy: ' + str(accuracy_test))
print('F1-macro: ' + str(f1_macro_test))
print('F1-micro: ' + str(f1_micro_test))
print('Logloss: ' + str(logloss_test))
print('ROC-AUC: ' + str(roc_auc_test))

Precision: 0.6962891669340178
Recall: 0.6008674787744556
Accuracy: 0.6693890734588409
F1-macro: 0.667829463386362
F1-micro: 0.6693890734588409
Logloss: 11.419001648497453
ROC-AUC: 0.6693890734588409


In [48]:
print('Precision: ' + str(precision_test))
print('Recall: ' + str(recall_test))
print('Accuracy: ' + str(accuracy_test))
print('F1-macro: ' + str(f1_macro_test))
print('F1-micro: ' + str(f1_micro_test))
print('Logloss: ' + str(logloss_test))
print('ROC-AUC: ' + str(roc_auc_test))

Precision: 0.6854469643038748
Recall: 0.588646288209607
Accuracy: 0.6592576419213974
F1-macro: 0.6575501997947237
F1-micro: 0.6592576419213974
Logloss: 11.768932111979895
ROC-AUC: 0.6592576419213975


In [49]:
print(classification_report(y_test, y_pred_test))
print(confusion_matrix(y_test, y_pred_test))

             precision    recall  f1-score   support

          0       0.64      0.73      0.68     11450
          1       0.69      0.59      0.63     11450

avg / total       0.66      0.66      0.66     22900

[[8357 3093]
 [4710 6740]]


#### Product aggregating

In [50]:
embed_dim = 100

k_train = 0
k_test = 0

X_train = np.zeros((len(train_pairs), embed_dim))
for i in range(len(train_pairs)):
    try:
        X_train[i] = product(pctadw_emb_matr[index_graph['index'][train_pairs[i][0]], :embed_dim],
                                    pctadw_emb_matr[index_graph['index'][train_pairs[i][1]], embed_dim:])
    except KeyError:
        k_train += 1
        pass
y_train = np.array(list(zip(*(train_pairs)))[-1])

X_test = np.zeros((len(test_pairs), embed_dim))
for i in range(len(test_pairs)):
    try:
        X_test[i] = product(pctadw_emb_matr[index_graph['index'][test_pairs[i][0]], :embed_dim],
                                    pctadw_emb_matr[index_graph['index'][test_pairs[i][1]], embed_dim:])    
    except:
        k_test += 1
        pass
y_test = np.array(list(zip(*(test_pairs)))[-1])

In [51]:
rand_forest = RandomForestClassifier()

rand_forest, y_pred_train, y_pred_test, precision_train, precision_test, recall_train, recall_test, accuracy_train, accuracy_test, f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, roc_auc_train, roc_auc_test, logloss_train, logloss_test = \
classify_model(rand_forest, X_train, y_train, X_test, y_test)

In [52]:
print('Precision: ' + str(precision_test))
print('Recall: ' + str(recall_test))
print('Accuracy: ' + str(accuracy_test))
print('F1-macro: ' + str(f1_macro_test))
print('F1-micro: ' + str(f1_micro_test))
print('Logloss: ' + str(logloss_test))
print('ROC-AUC: ' + str(roc_auc_test))

Precision: 0.6403202164843839
Recall: 0.4959825327510917
Accuracy: 0.6086899563318777
F1-macro: 0.6036552029082989
F1-micro: 0.6086899563318777
Logloss: 13.515481484305544
ROC-AUC: 0.6086899563318778


In [53]:
print(classification_report(y_test, y_pred_test))
print(confusion_matrix(y_test, y_pred_test))

             precision    recall  f1-score   support

          0       0.59      0.72      0.65     11450
          1       0.64      0.50      0.56     11450

avg / total       0.61      0.61      0.60     22900

[[8260 3190]
 [5771 5679]]


#### L1 

In [55]:
embed_dim = 100

k_train = 0
k_test = 0

X_train = np.zeros((len(train_pairs), embed_dim))
for i in range(len(train_pairs)):
    try:
        X_train[i] = l1(pctadw_emb_matr[index_graph['index'][train_pairs[i][0]], :embed_dim],
                                    pctadw_emb_matr[index_graph['index'][train_pairs[i][1]], embed_dim:])
    except KeyError:
        k_train += 1
        pass
y_train = np.array(list(zip(*(train_pairs)))[-1])

X_test = np.zeros((len(test_pairs), embed_dim))
for i in range(len(test_pairs)):
    try:
        X_test[i] = l1(pctadw_emb_matr[index_graph['index'][test_pairs[i][0]], :embed_dim],
                                    pctadw_emb_matr[index_graph['index'][test_pairs[i][1]], embed_dim:])    
    except:
        k_test += 1
        pass
y_test = np.array(list(zip(*(test_pairs)))[-1])

In [56]:
rand_forest = RandomForestClassifier()

rand_forest, y_pred_train, y_pred_test, precision_train, precision_test, recall_train, recall_test, accuracy_train, accuracy_test, f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, roc_auc_train, roc_auc_test, logloss_train, logloss_test = \
classify_model(rand_forest, X_train, y_train, X_test, y_test)

In [57]:
print('Precision: ' + str(precision_test))
print('Recall: ' + str(recall_test))
print('Accuracy: ' + str(accuracy_test))
print('F1-macro: ' + str(f1_macro_test))
print('F1-micro: ' + str(f1_micro_test))
print('Logloss: ' + str(logloss_test))
print('ROC-AUC: ' + str(roc_auc_test))

Precision: 0.5673087387093264
Recall: 0.4552838427947598
Accuracy: 0.5540174672489083
F1-macro: 0.5496270840728776
F1-micro: 0.5540174672489083
Logloss: 15.403829804393242
ROC-AUC: 0.5540174672489083


In [58]:
print(classification_report(y_test, y_pred_test))
print(confusion_matrix(y_test, y_pred_test))

             precision    recall  f1-score   support

          0       0.55      0.65      0.59     11450
          1       0.57      0.46      0.51     11450

avg / total       0.56      0.55      0.55     22900

[[7474 3976]
 [6237 5213]]


#### L2

In [59]:
embed_dim = 100

k_train = 0
k_test = 0

X_train = np.zeros((len(train_pairs), embed_dim))
for i in range(len(train_pairs)):
    try:
        X_train[i] = l2(pctadw_emb_matr[index_graph['index'][train_pairs[i][0]], :embed_dim],
                                    pctadw_emb_matr[index_graph['index'][train_pairs[i][1]], embed_dim:])
    except KeyError:
        k_train += 1
        pass
y_train = np.array(list(zip(*(train_pairs)))[-1])

X_test = np.zeros((len(test_pairs), embed_dim))
for i in range(len(test_pairs)):
    try:
        X_test[i] = l2(pctadw_emb_matr[index_graph['index'][test_pairs[i][0]], :embed_dim],
                                    pctadw_emb_matr[index_graph['index'][test_pairs[i][1]], embed_dim:])    
    except:
        k_test += 1
        pass
y_test = np.array(list(zip(*(test_pairs)))[-1])

In [60]:
rand_forest = RandomForestClassifier()

rand_forest, y_pred_train, y_pred_test, precision_train, precision_test, recall_train, recall_test, accuracy_train, accuracy_test, f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, roc_auc_train, roc_auc_test, logloss_train, logloss_test = \
classify_model(rand_forest, X_train, y_train, X_test, y_test)

In [61]:
print('Precision: ' + str(precision_test))
print('Recall: ' + str(recall_test))
print('Accuracy: ' + str(accuracy_test))
print('F1-macro: ' + str(f1_macro_test))
print('F1-micro: ' + str(f1_micro_test))
print('Logloss: ' + str(logloss_test))
print('ROC-AUC: ' + str(roc_auc_test))

Precision: 0.5669017905588714
Recall: 0.45624454148471616
Accuracy: 0.5538427947598253
F1-macro: 0.5495520896570568
F1-micro: 0.5538427947598253
Logloss: 15.409863302189796
ROC-AUC: 0.5538427947598252


In [62]:
print(classification_report(y_test, y_pred_test))
print(confusion_matrix(y_test, y_pred_test))

             precision    recall  f1-score   support

          0       0.55      0.65      0.59     11450
          1       0.57      0.46      0.51     11450

avg / total       0.56      0.55      0.55     22900

[[7459 3991]
 [6226 5224]]


* Best result: Random Forest + product edge function

### 0.3 links used for training

In [63]:
aan_train, aan_test = train_test_split_by_year(aan_links_connected, 2013, 0.3)

In [64]:
len(aan_test)/(len(aan_links_connected))

0.2004942559444296

In [65]:
G_train = nx.DiGraph()

for i in tqdm_notebook(aan_train.index):
    G_train.add_edge(aan_train.iloc[i]['citing'], aan_train.iloc[i]['cited'])
    
print(nx.info(G_train))

HBox(children=(IntProgress(value=0, max=59851), HTML(value='')))


Name: 
Type: DiGraph
Number of nodes: 13490
Number of edges: 59851
Average in degree:   4.4367
Average out degree:   4.4367


In [73]:
aan_test.to_csv('test_2013_03.csv', index=False)
aan_train.to_csv('train_2013_03.csv', index=False)

In [62]:
aan_test = pd.read_csv('test_2013_03.csv')
aan_train = pd.read_csv('train_2013_03.csv')

In [66]:
aan_train_neg = generate_negative_edges(G_train, len(G_train.edges()), 0.5)
aan_test_neg = generate_negative_edges_test(G, list(aan_test['citing']), len(aan_test))

train_pairs = train_test_preprocess(aan_train, aan_train_neg)
test_pairs = train_test_preprocess(aan_test, aan_test_neg)

In [67]:
embed_dim = 100

k_train = 0
k_test = 0

X_train = np.zeros((len(train_pairs), embed_dim))
for i in range(len(train_pairs)):
    try:
        X_train[i] = mean(pctadw_emb_matr[index_graph['index'][train_pairs[i][0]], :embed_dim],
                                    pctadw_emb_matr[index_graph['index'][train_pairs[i][1]], embed_dim:])
    except KeyError:
        k_train += 1
        pass
y_train = np.array(list(zip(*(train_pairs)))[-1])

X_test = np.zeros((len(test_pairs), embed_dim))
for i in range(len(test_pairs)):
    try:
        X_test[i] = mean(pctadw_emb_matr[index_graph['index'][test_pairs[i][0]], :embed_dim],
                                    pctadw_emb_matr[index_graph['index'][test_pairs[i][1]], embed_dim:])    
    except:
        k_test += 1
        pass
y_test = np.array(list(zip(*(test_pairs)))[-1])

In [68]:
rand_forest = RandomForestClassifier()

rand_forest, y_pred_train, y_pred_test, precision_train, precision_test, recall_train, recall_test, accuracy_train, accuracy_test, f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, roc_auc_train, roc_auc_test, logloss_train, logloss_test = \
classify_model(rand_forest, X_train, y_train, X_test, y_test)

In [69]:
print('Precision: ' + str(precision_test))
print('Recall: ' + str(recall_test))
print('Accuracy: ' + str(accuracy_test))
print('F1-macro: ' + str(f1_macro_test))
print('F1-micro: ' + str(f1_micro_test))
print('Logloss: ' + str(logloss_test))
print('ROC-AUC: ' + str(roc_auc_test))

Precision: 0.6908632928784062
Recall: 0.6030381770937437
Accuracy: 0.6666000399760144
F1-macro: 0.6652476033324394
F1-micro: 0.6666000399760144
Logloss: 11.515334550265122
ROC-AUC: 0.6666000399760145


In [70]:
print(classification_report(y_test, y_pred_test))
print(confusion_matrix(y_test, y_pred_test))

             precision    recall  f1-score   support

          0       0.65      0.73      0.69     15009
          1       0.69      0.60      0.64     15009

avg / total       0.67      0.67      0.67     30018

[[10959  4050]
 [ 5958  9051]]


In [71]:
aan_train, aan_test = train_test_split_by_year(aan_links_connected, 2013, 0.1)

In [72]:
len(aan_test)/(len(aan_links_connected))

0.2612343040341972

In [73]:
G_train = nx.DiGraph()

for i in tqdm_notebook(aan_train.index):
    G_train.add_edge(aan_train.iloc[i]['citing'], aan_train.iloc[i]['cited'])
    
print(nx.info(G_train))

HBox(children=(IntProgress(value=0, max=55304), HTML(value='')))


Name: 
Type: DiGraph
Number of nodes: 13489
Number of edges: 55304
Average in degree:   4.0999
Average out degree:   4.0999


In [83]:
aan_test.to_csv('test_2013_01.csv', index=False)
aan_train.to_csv('train_2013_01.csv', index=False)

In [74]:
aan_train_neg = generate_negative_edges(G_train, len(G_train.edges()), 0.5)
aan_test_neg = generate_negative_edges_test(G, list(aan_test['citing']), len(aan_test))

train_pairs = train_test_preprocess(aan_train, aan_train_neg)
test_pairs = train_test_preprocess(aan_test, aan_test_neg)

In [75]:
embed_dim = 100

k_train = 0
k_test = 0

X_train = np.zeros((len(train_pairs), embed_dim))
for i in range(len(train_pairs)):
    try:
        X_train[i] = mean(pctadw_emb_matr[index_graph['index'][train_pairs[i][0]], :embed_dim],
                                    pctadw_emb_matr[index_graph['index'][train_pairs[i][1]], embed_dim:])
    except KeyError:
        k_train += 1
        pass
y_train = np.array(list(zip(*(train_pairs)))[-1])

X_test = np.zeros((len(test_pairs), embed_dim))
for i in range(len(test_pairs)):
    try:
        X_test[i] = mean(pctadw_emb_matr[index_graph['index'][test_pairs[i][0]], :embed_dim],
                                    pctadw_emb_matr[index_graph['index'][test_pairs[i][1]], embed_dim:])    
    except:
        k_test += 1
        pass
y_test = np.array(list(zip(*(test_pairs)))[-1])

In [76]:
rand_forest = RandomForestClassifier()

rand_forest, y_pred_train, y_pred_test, precision_train, precision_test, recall_train, recall_test, accuracy_train, accuracy_test, f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, roc_auc_train, roc_auc_test, logloss_train, logloss_test = \
classify_model(rand_forest, X_train, y_train, X_test, y_test)

In [77]:
print('Precision: ' + str(precision_test))
print('Recall: ' + str(recall_test))
print('Accuracy: ' + str(accuracy_test))
print('F1-macro: ' + str(f1_macro_test))
print('F1-micro: ' + str(f1_micro_test))
print('Logloss: ' + str(logloss_test))
print('ROC-AUC: ' + str(roc_auc_test))

Precision: 0.6968371680294099
Recall: 0.5621804049907957
Accuracy: 0.6588003681734506
F1-macro: 0.6555851114697098
F1-micro: 0.6588003681734506
Logloss: 11.784715572320303
ROC-AUC: 0.6588003681734507
