In [3]:
import pickle

In [1]:
from tqdm import tqdm_notebook

In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [4]:
import sklearn

In [6]:
r = RandomForestClassifier()

In [7]:
r.get_params

<bound method BaseEstimator.get_params of RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)>

In [6]:
# https://github.com/houchengbin/OpenANE

In [7]:
def classify_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
                     
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    
    precision_train = precision_score(y_train, y_pred_train)
    precision_test = precision_score(y_test, y_pred_test)
    recall_train = recall_score(y_train, y_pred_train)
    recall_test = recall_score(y_test, y_pred_test)
    accuracy_train = accuracy_score(y_train, y_pred_train)
    accuracy_test = accuracy_score(y_test, y_pred_test)
    f1_macro_train = f1_score(y_train, y_pred_train, average='macro')
    f1_macro_test = f1_score(y_test, y_pred_test, average='macro')
    f1_micro_train = f1_score(y_train, y_pred_train, average='micro')
    f1_micro_test = f1_score(y_test, y_pred_test, average='micro')
    logloss_train = log_loss(y_train, y_pred_train)
    logloss_test = log_loss(y_test, y_pred_test)
    roc_auc_train = roc_auc_score(y_train, y_pred_train)
    roc_auc_test = roc_auc_score(y_test, y_pred_test)

    return model, y_pred_train, y_pred_test, \
        precision_train, precision_test, recall_train, recall_test, accuracy_train, accuracy_test, \
        f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, \
        roc_auc_train, roc_auc_test, logloss_train, logloss_test 

In [7]:
aan_links_connected = pd.read_csv('aan_links_connected.csv')

In [8]:
aan_links_connected.shape

(74860, 9)

In [9]:
aan_links_connected.head()

Unnamed: 0,citing,cited,year_citing,year_cited,out_cites_count,in_cites_count,cite_rank,node_citing,node_cited
0,C08-3004,A00-1002,2008,2000,1,10,1.0,560,0
1,D09-1141,A00-1002,2009,2000,14,10,1.0,1682,0
2,D12-1027,A00-1002,2012,2000,14,10,1.0,1995,0
3,E06-1047,A00-1002,2006,2000,4,10,1.0,2562,0
4,H05-1110,A00-1002,2005,2000,2,10,1.0,3150,0


In [10]:
G = nx.DiGraph()

for i in tqdm_notebook(aan_links_connected.index):
    G.add_edge(aan_links_connected.iloc[i]['citing'], aan_links_connected.iloc[i]['cited'])
    
print(nx.info(G))

HBox(children=(IntProgress(value=0, max=74860), HTML(value='')))


Name: 
Type: DiGraph
Number of nodes: 13506
Number of edges: 74860
Average in degree:   5.5427
Average out degree:   5.5427


In [8]:
G = nx.read_gpickle('G_aan_13506')
G_train = nx.read_gpickle('G_aan_13506_train_05')

In [9]:
def generate_negative_edges(graph, count_gen_edges, part_neg_directed):
    negative_edges = set()
    nodes = list(graph.nodes())
    edges = list(graph.edges())

    count_neg_directed = int(part_neg_directed*count_gen_edges)
    for a, b in edges:
        if len(negative_edges) >= count_neg_directed:
            break
        if not graph.has_edge(b, a):
            negative_edges.add((b, a))       
    
    while len(negative_edges) < count_gen_edges:
        i = random.randint(0, len(nodes) - 1)
        j = random.randint(0, len(nodes) - 1)
        if (i != j) and not graph.has_edge(nodes[i], nodes[j]):
            negative_edges.add((nodes[i], nodes[j]))
    return list(negative_edges)

In [10]:
def generate_negative_edges_test(graph, test_nodes, count_gen_edges):
    negative_edges = set()
    nodes = list(graph.nodes())
    while len(negative_edges) < count_gen_edges:
        i = random.randint(0, len(test_nodes) - 1)
        j = random.randint(0, len(nodes) - 1)
        if i == j:
            continue
        if graph.has_edge(test_nodes[i], nodes[j]):
            continue
        negative_edges.add((test_nodes[i], nodes[j]))
    return list(negative_edges)

In [11]:
def link_to_time(links_df, meta_df):
    years = meta_df[['id','year']]
    years.columns = ['citing', 'year_citing']
    links_df = links_df.merge(years, how = 'left', on = 'citing')
    years.columns = ['cited', 'year_cited']
    links_df = links_df.merge(years, how='left', on = 'cited')
    links_df['out_cites_count'] = links_df.groupby('citing')['cited'].transform(lambda x: x.count())
    links_df['in_cites_count'] = links_df.groupby('cited')['citing'].transform(lambda x: x.count())
    links_df['cite_rank'] = links_df.groupby('citing')['cited'].transform(lambda x: x.rank())
    return links_df

In [12]:
def train_test_split_by_year(links_years_df, year, part=None):
    """
    links_years_df - cite edges dataframe with year of citing paper
    year - first year of test period
    part - part of test period edges to include into train"""
    if part:
        train = links_years_df[(links_years_df.year_citing < year)|\
                               ((links_years_df.year_citing >= year)&\
                                ((links_years_df.cite_rank < part*links_years_df.out_cites_count + 1)|\
                                (links_years_df.in_cites_count == 1)))]
        test = links_years_df[((links_years_df.year_citing >= year)&\
                                (links_years_df.cite_rank >= part*links_years_df.out_cites_count + 1)&\
                              (links_years_df.in_cites_count != 1))]
    else:
        train = links_years_df[(links_years_df.year_citing < year)]
        test = links_years_df[(links_years_df.year_citing >= year)]
    return train.reset_index(drop=True), test.reset_index(drop=True)
    

In [13]:
def product(u,v):
    return u*v
def mean(u,v):
    return (u+v)/2
def l1(u,v):
    return np.abs(u-v)
def l2(u,v):
    return (u-v)**2

In [16]:
aan_links_connected.head()

Unnamed: 0,citing,cited,year_citing,year_cited,out_cites_count,in_cites_count,cite_rank,node_citing,node_cited
0,C08-3004,A00-1002,2008,2000,1,10,1.0,560,0
1,D09-1141,A00-1002,2009,2000,14,10,1.0,1682,0
2,D12-1027,A00-1002,2012,2000,14,10,1.0,1995,0
3,E06-1047,A00-1002,2006,2000,4,10,1.0,2562,0
4,H05-1110,A00-1002,2005,2000,2,10,1.0,3150,0


### 0.5 links for test nodes ( > 2013) used in training

In [88]:
aan_train, aan_test = train_test_split_by_year(aan_links_connected, 2013, 0.5)

In [23]:
aan_test.to_csv('test_2013_05.csv', index=False)
aan_train.to_csv('train_2013_05.csv', index=False)

In [17]:
aan_test = pd.read_csv('test_2013_05.csv')
aan_train = pd.read_csv('train_2013_05.csv')

In [14]:
aan_test = pd.read_csv('test_2013_05_no_isolated.csv')
aan_train = pd.read_csv('train_2013_05_no_isolated.csv')

In [18]:
len(aan_test)/(len(aan_links_connected))

0.15295217739780925

In [19]:
G_train = nx.DiGraph()

for i in tqdm_notebook(aan_train.index):
    G_train.add_edge(aan_train.iloc[i]['citing'], aan_train.iloc[i]['cited'])
    
# print(nx.info(G))

HBox(children=(IntProgress(value=0, max=63410), HTML(value='')))




In [20]:
print(nx.info(G_train))

Name: 
Type: DiGraph
Number of nodes: 13444
Number of edges: 63410
Average in degree:   4.7166
Average out degree:   4.7166


In [15]:
aan_train_neg = generate_negative_edges(G_train, len(G_train.edges()), 0)
aan_test_neg = generate_negative_edges_test(G, list(aan_test['citing']), len(aan_test))

In [16]:
def train_test_preprocess(positive_df, neg_list):
    pairs = list(zip(list(positive_df['citing']), list(positive_df['cited']), [1]*len(positive_df)))
    neg_pairs = list(zip(list(zip(*neg_list))[0],list(zip(*neg_list))[1], [0]*len(neg_list)))
    pairs += neg_pairs
    random.shuffle(pairs)
    return pairs

In [17]:
train_pairs = train_test_preprocess(aan_train, aan_train_neg)
test_pairs = train_test_preprocess(aan_test, aan_test_neg)

In [18]:
with open('aan_abrw_emb_128.txt', 'r') as f:
    abrw_emb = f.read()

In [19]:
abrw_emb = [(int(emb.split()[0]),[float(num) for num in emb.split()[1:]]) for emb in abrw_emb.split('\n')[1:] if emb != '']

In [20]:
len(abrw_emb)

13506

In [21]:
abrw_emb = sorted(abrw_emb, key = lambda x: x[0])

In [22]:
ids, vecs = zip(*abrw_emb)

In [23]:
abrw_emb_matr = np.array(vecs)

In [24]:
abrw_emb_matr

array([[ 0.2062004 , -0.19029757,  0.137299  , ..., -0.0541796 ,
         0.01296847, -0.12731244],
       [ 0.10474256,  0.1137826 , -0.26420864, ..., -0.14131752,
         0.07429265,  0.37535784],
       [ 0.23076214, -0.12822674,  0.18455397, ...,  0.36645988,
        -0.1084687 , -0.17594132],
       ...,
       [-0.01035675, -0.02917514, -0.00813602, ..., -0.32923788,
         0.32808638, -0.10091236],
       [-0.15382831,  0.03035559,  0.13049498, ...,  0.02312758,
        -0.3340426 ,  0.02827339],
       [ 0.1580649 ,  0.13279736,  0.00079761, ...,  0.20646344,
        -0.53030384,  0.18626145]])

In [25]:
abrw_emb_matr.shape

(13506, 128)

In [63]:
np.save('abrw_emb_matr.npy', abrw_emb_matr)

In [26]:
with open('./asymproj_edge_dnn-master/datasets/aan_graph/index.pkl', 'rb') as f:
    index_graph = pickle.load(f)

In [66]:
index_graph['index'][train_pairs[0][0]]

5896

#### Mean edge aggregating

In [67]:
embed_dim = 128

k_train = 0
k_test = 0

X_train = np.zeros((len(train_pairs), embed_dim))
for i in range(len(train_pairs)):
    try:
        X_train[i] = mean(abrw_emb_matr[index_graph['index'][train_pairs[i][0]]],
                                    abrw_emb_matr[index_graph['index'][train_pairs[i][1]]])
    except KeyError:
        k_train += 1
        pass
y_train = np.array(list(zip(*(train_pairs)))[-1])

X_test = np.zeros((len(test_pairs), embed_dim))
for i in range(len(test_pairs)):
    try:
        X_test[i] = mean(abrw_emb_matr[index_graph['index'][test_pairs[i][0]]],
                                    abrw_emb_matr[index_graph['index'][test_pairs[i][1]]])    
    except:
        k_test += 1
        pass
y_test = np.array(list(zip(*(test_pairs)))[-1])

In [None]:
### RandForest

In [68]:
rand_forest = RandomForestClassifier()

rand_forest, y_pred_train, y_pred_test, precision_train, precision_test, recall_train, recall_test, accuracy_train, accuracy_test, f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, roc_auc_train, roc_auc_test, logloss_train, logloss_test = \
classify_model(rand_forest, X_train, y_train, X_test, y_test)

In [69]:
print('Precision: ' + str(precision_test))
print('Recall: ' + str(recall_test))
print('Accuracy: ' + str(accuracy_test))
print('F1-macro: ' + str(f1_macro_test))
print('F1-micro: ' + str(f1_micro_test))
print('Logloss: ' + str(logloss_test))
print('ROC-AUC: ' + str(roc_auc_test))

Precision: 0.8107529075698837
Recall: 0.6940611353711791
Accuracy: 0.7660262008733625
F1-macro: 0.7648081491802047
F1-micro: 0.7660262008733626
Logloss: 8.08123350118622
ROC-AUC: 0.7660262008733626


In [70]:
print(classification_report(y_test, y_pred_test))
print(confusion_matrix(y_test, y_pred_test))

             precision    recall  f1-score   support

          0       0.73      0.84      0.78     11450
          1       0.81      0.69      0.75     11450

avg / total       0.77      0.77      0.76     22900

[[9595 1855]
 [3503 7947]]


#### Product aggregating

In [27]:
embed_dim = 128

k_train = 0
k_test = 0

X_train = np.zeros((len(train_pairs), embed_dim))
for i in range(len(train_pairs)):
    try:
        X_train[i] = product(abrw_emb_matr[index_graph['index'][train_pairs[i][0]]],
                                    abrw_emb_matr[index_graph['index'][train_pairs[i][1]]])
    except KeyError:
        k_train += 1
        pass
y_train = np.array(list(zip(*(train_pairs)))[-1])

X_test = np.zeros((len(test_pairs), embed_dim))
for i in range(len(test_pairs)):
    try:
        X_test[i] = product(abrw_emb_matr[index_graph['index'][test_pairs[i][0]]],
                                    abrw_emb_matr[index_graph['index'][test_pairs[i][1]]])    
    except:
        k_test += 1
        pass
y_test = np.array(list(zip(*(test_pairs)))[-1])

In [28]:
rand_forest = RandomForestClassifier()

rand_forest, y_pred_train, y_pred_test, precision_train, precision_test, recall_train, recall_test, accuracy_train, accuracy_test, f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, roc_auc_train, roc_auc_test, logloss_train, logloss_test = \
classify_model(rand_forest, X_train, y_train, X_test, y_test)

In [29]:
# new train/test, no isolated
print('Precision: ' + str(precision_test))
print('Recall: ' + str(recall_test))
print('Accuracy: ' + str(accuracy_test))
print('F1-macro: ' + str(f1_macro_test))
print('F1-micro: ' + str(f1_micro_test))
print('Logloss: ' + str(logloss_test))
print('ROC-AUC: ' + str(roc_auc_test))

Precision: 0.9359451290397582
Recall: 0.7429863418235512
Accuracy: 0.8460686600221484
F1-macro: 0.8444154238666042
F1-micro: 0.8460686600221485
Logloss: 5.316620461037565
ROC-AUC: 0.8460686600221484


In [73]:
print('Precision: ' + str(precision_test))
print('Recall: ' + str(recall_test))
print('Accuracy: ' + str(accuracy_test))
print('F1-macro: ' + str(f1_macro_test))
print('F1-micro: ' + str(f1_micro_test))
print('Logloss: ' + str(logloss_test))
print('ROC-AUC: ' + str(roc_auc_test))

Precision: 0.9381544502617801
Recall: 0.75117903930131
Accuracy: 0.8508296943231441
F1-macro: 0.849333538232202
F1-micro: 0.8508296943231441
Logloss: 5.152179630426106
ROC-AUC: 0.8508296943231441


In [74]:
print(classification_report(y_test, y_pred_test))
print(confusion_matrix(y_test, y_pred_test))

             precision    recall  f1-score   support

          0       0.79      0.95      0.86     11450
          1       0.94      0.75      0.83     11450

avg / total       0.87      0.85      0.85     22900

[[10883   567]
 [ 2849  8601]]


#### L1 

In [75]:
embed_dim = 128

k_train = 0
k_test = 0

X_train = np.zeros((len(train_pairs), embed_dim))
for i in range(len(train_pairs)):
    try:
        X_train[i] = l1(abrw_emb_matr[index_graph['index'][train_pairs[i][0]]],
                                    abrw_emb_matr[index_graph['index'][train_pairs[i][1]]])
    except KeyError:
        k_train += 1
        pass
y_train = np.array(list(zip(*(train_pairs)))[-1])

X_test = np.zeros((len(test_pairs), embed_dim))
for i in range(len(test_pairs)):
    try:
        X_test[i] = l1(abrw_emb_matr[index_graph['index'][test_pairs[i][0]]],
                                    abrw_emb_matr[index_graph['index'][test_pairs[i][1]]])    
    except:
        k_test += 1
        pass
y_test = np.array(list(zip(*(test_pairs)))[-1])

In [76]:
rand_forest = RandomForestClassifier()

rand_forest, y_pred_train, y_pred_test, precision_train, precision_test, recall_train, recall_test, accuracy_train, accuracy_test, f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, roc_auc_train, roc_auc_test, logloss_train, logloss_test = \
classify_model(rand_forest, X_train, y_train, X_test, y_test)

In [77]:
print('Precision: ' + str(precision_test))
print('Recall: ' + str(recall_test))
print('Accuracy: ' + str(accuracy_test))
print('F1-macro: ' + str(f1_macro_test))
print('F1-micro: ' + str(f1_micro_test))
print('Logloss: ' + str(logloss_test))
print('ROC-AUC: ' + str(roc_auc_test))

Precision: 0.9280798348245011
Recall: 0.7066375545851529
Accuracy: 0.8259388646288209
F1-macro: 0.8234257177288802
F1-micro: 0.8259388646288209
Logloss: 6.011880526537237
ROC-AUC: 0.8259388646288209


In [78]:
print(classification_report(y_test, y_pred_test))
print(confusion_matrix(y_test, y_pred_test))

             precision    recall  f1-score   support

          0       0.76      0.95      0.84     11450
          1       0.93      0.71      0.80     11450

avg / total       0.85      0.83      0.82     22900

[[10823   627]
 [ 3359  8091]]


#### L2

In [79]:
embed_dim = 128

k_train = 0
k_test = 0

X_train = np.zeros((len(train_pairs), embed_dim))
for i in range(len(train_pairs)):
    try:
        X_train[i] = l2(abrw_emb_matr[index_graph['index'][train_pairs[i][0]]],
                                    abrw_emb_matr[index_graph['index'][train_pairs[i][1]]])
    except KeyError:
        k_train += 1
        pass
y_train = np.array(list(zip(*(train_pairs)))[-1])

X_test = np.zeros((len(test_pairs), embed_dim))
for i in range(len(test_pairs)):
    try:
        X_test[i] = l2(abrw_emb_matr[index_graph['index'][test_pairs[i][0]]],
                                    abrw_emb_matr[index_graph['index'][test_pairs[i][1]]])    
    except:
        k_test += 1
        pass
y_test = np.array(list(zip(*(test_pairs)))[-1])

In [80]:
rand_forest = RandomForestClassifier()

rand_forest, y_pred_train, y_pred_test, precision_train, precision_test, recall_train, recall_test, accuracy_train, accuracy_test, f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, roc_auc_train, roc_auc_test, logloss_train, logloss_test = \
classify_model(rand_forest, X_train, y_train, X_test, y_test)

In [81]:
print('Precision: ' + str(precision_test))
print('Recall: ' + str(recall_test))
print('Accuracy: ' + str(accuracy_test))
print('F1-macro: ' + str(f1_macro_test))
print('F1-micro: ' + str(f1_micro_test))
print('Logloss: ' + str(logloss_test))
print('ROC-AUC: ' + str(roc_auc_test))

Precision: 0.9249547920433996
Recall: 0.7147598253275109
Accuracy: 0.8283842794759826
F1-macro: 0.8261396525778912
F1-micro: 0.8283842794759826
Logloss: 5.927420181864307
ROC-AUC: 0.8283842794759825


In [82]:
print(classification_report(y_test, y_pred_test))
print(confusion_matrix(y_test, y_pred_test))

             precision    recall  f1-score   support

          0       0.77      0.94      0.85     11450
          1       0.92      0.71      0.81     11450

avg / total       0.85      0.83      0.83     22900

[[10786   664]
 [ 3266  8184]]


* Best result: Random Forest + product edge function

### 0.3 links used for training

In [83]:
aan_train, aan_test = train_test_split_by_year(aan_links_connected, 2013, 0.3)

In [84]:
len(aan_test)/(len(aan_links_connected))

0.2004942559444296

In [85]:
G_train = nx.DiGraph()

for i in tqdm_notebook(aan_train.index):
    G_train.add_edge(aan_train.iloc[i]['citing'], aan_train.iloc[i]['cited'])
    
print(nx.info(G_train))

HBox(children=(IntProgress(value=0, max=59851), HTML(value='')))


Name: 
Type: DiGraph
Number of nodes: 13490
Number of edges: 59851
Average in degree:   4.4367
Average out degree:   4.4367


In [73]:
aan_test.to_csv('test_2013_03.csv', index=False)
aan_train.to_csv('train_2013_03.csv', index=False)

In [62]:
aan_test = pd.read_csv('test_2013_03.csv')
aan_train = pd.read_csv('train_2013_03.csv')

In [86]:
aan_train_neg = generate_negative_edges(G_train, len(G_train.edges()), 0)
aan_test_neg = generate_negative_edges_test(G, list(aan_test['citing']), len(aan_test))

train_pairs = train_test_preprocess(aan_train, aan_train_neg)
test_pairs = train_test_preprocess(aan_test, aan_test_neg)

In [87]:
embed_dim = 128

k_train = 0
k_test = 0

X_train = np.zeros((len(train_pairs), embed_dim))
for i in range(len(train_pairs)):
    try:
        X_train[i] = product(abrw_emb_matr[index_graph['index'][train_pairs[i][0]]],
                                    abrw_emb_matr[index_graph['index'][train_pairs[i][1]]])
    except KeyError:
        k_train += 1
        pass
y_train = np.array(list(zip(*(train_pairs)))[-1])

X_test = np.zeros((len(test_pairs), embed_dim))
for i in range(len(test_pairs)):
    try:
        X_test[i] = product(abrw_emb_matr[index_graph['index'][test_pairs[i][0]]],
                                    abrw_emb_matr[index_graph['index'][test_pairs[i][1]]])    
    except:
        k_test += 1
        pass
y_test = np.array(list(zip(*(test_pairs)))[-1])

In [88]:
rand_forest = RandomForestClassifier()

rand_forest, y_pred_train, y_pred_test, precision_train, precision_test, recall_train, recall_test, accuracy_train, accuracy_test, f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, roc_auc_train, roc_auc_test, logloss_train, logloss_test = \
classify_model(rand_forest, X_train, y_train, X_test, y_test)

In [89]:
print('Precision: ' + str(precision_test))
print('Recall: ' + str(recall_test))
print('Accuracy: ' + str(accuracy_test))
print('F1-macro: ' + str(f1_macro_test))
print('F1-micro: ' + str(f1_micro_test))
print('Logloss: ' + str(logloss_test))
print('ROC-AUC: ' + str(roc_auc_test))

Precision: 0.9380396140172677
Recall: 0.7383569858085149
Accuracy: 0.8447931241255247
F1-macro: 0.843014692277462
F1-micro: 0.8447931241255247
Logloss: 5.360675079259369
ROC-AUC: 0.8447931241255247


In [90]:
print(classification_report(y_test, y_pred_test))
print(confusion_matrix(y_test, y_pred_test))

             precision    recall  f1-score   support

          0       0.78      0.95      0.86     15009
          1       0.94      0.74      0.83     15009

avg / total       0.86      0.84      0.84     30018

[[14277   732]
 [ 3927 11082]]


In [91]:
aan_train, aan_test = train_test_split_by_year(aan_links_connected, 2013, 0.1)

In [92]:
len(aan_test)/(len(aan_links_connected))

0.2612343040341972

In [93]:
G_train = nx.DiGraph()

for i in tqdm_notebook(aan_train.index):
    G_train.add_edge(aan_train.iloc[i]['citing'], aan_train.iloc[i]['cited'])
    
print(nx.info(G_train))

HBox(children=(IntProgress(value=0, max=55304), HTML(value='')))


Name: 
Type: DiGraph
Number of nodes: 13489
Number of edges: 55304
Average in degree:   4.0999
Average out degree:   4.0999


In [83]:
aan_test.to_csv('test_2013_01.csv', index=False)
aan_train.to_csv('train_2013_01.csv', index=False)

In [94]:
aan_train_neg = generate_negative_edges(G_train, len(G_train.edges()), 0.5)
aan_test_neg = generate_negative_edges_test(G, list(aan_test['citing']), len(aan_test))

train_pairs = train_test_preprocess(aan_train, aan_train_neg)
test_pairs = train_test_preprocess(aan_test, aan_test_neg)

In [95]:
embed_dim = 128

k_train = 0
k_test = 0

X_train = np.zeros((len(train_pairs), embed_dim))
for i in range(len(train_pairs)):
    try:
        X_train[i] = product(abrw_emb_matr[index_graph['index'][train_pairs[i][0]]],
                                    abrw_emb_matr[index_graph['index'][train_pairs[i][1]]])
    except KeyError:
        k_train += 1
        pass
y_train = np.array(list(zip(*(train_pairs)))[-1])

X_test = np.zeros((len(test_pairs), embed_dim))
for i in range(len(test_pairs)):
    try:
        X_test[i] = product(abrw_emb_matr[index_graph['index'][test_pairs[i][0]]],
                                    abrw_emb_matr[index_graph['index'][test_pairs[i][1]]])    
    except:
        k_test += 1
        pass
y_test = np.array(list(zip(*(test_pairs)))[-1])

In [96]:
rand_forest = RandomForestClassifier()

rand_forest, y_pred_train, y_pred_test, precision_train, precision_test, recall_train, recall_test, accuracy_train, accuracy_test, f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, roc_auc_train, roc_auc_test, logloss_train, logloss_test = \
classify_model(rand_forest, X_train, y_train, X_test, y_test)

In [97]:
print('Precision: ' + str(precision_test))
print('Recall: ' + str(recall_test))
print('Accuracy: ' + str(accuracy_test))
print('F1-macro: ' + str(f1_macro_test))
print('F1-micro: ' + str(f1_micro_test))
print('Logloss: ' + str(logloss_test))
print('ROC-AUC: ' + str(roc_auc_test))

Precision: 0.9200088144557074
Recall: 0.6404683984454899
Accuracy: 0.7923910820208632
F1-macro: 0.7874861558619524
F1-micro: 0.7923910820208631
Logloss: 7.170580258955722
ROC-AUC: 0.7923910820208632
