In [1]:
import pandas as pd
import networkx as nx
import numpy as np

from sklearn.linear_model import LogisticRegression

In [2]:
from tqdm import tqdm

In [3]:
import random

In [4]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss

In [5]:
def class_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
                     
    y_pred_train = model.predict_proba(X_train)
    y_pred_test = model.predict_proba(X_test)
    
    precision_train = precision_score(y_train, y_pred_train)
    precision_test = precision_score(y_test, y_pred_test)
    accuracy_train = accuracy_score(y_train, y_pred_train)
    accuracy_test = accuracy_score(y_test, y_pred_test)
    f1_macro_train = f1_score(y_train, y_pred_train, average='macro')
    f1_macro_test = f1_score(y_test, y_pred_test, average='macro')
    f1_micro_train = f1_score(y_train, y_pred_train, average='micro')
    f1_micro_test = f1_score(y_test, y_pred_test, average='micro')
    logloss_train = log_loss(y_train, y_pred_train)
    logloss_test = log_loss(y_test, y_pred_test)
    roc_auc_train = roc_auc_score(y_train, y_pred_train)
    roc_auc_test = roc_auc_score(y_test, y_pred_test)

    return model, y_pred_train, y_pred_test, \
        precision_train, precision_test, accuracy_train, accuracy_test, \
        f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, \
        roc_auc_train, roc_auc_test, logloss_train, logloss_test 

In [6]:
# LogReg
model = LogisticRegression()

In [21]:
aan_links = pd.read_csv('links_with_abstract.csv')

In [22]:
aan_links.head()

Unnamed: 0,citing,cited
0,C08-3004,A00-1002
1,D09-1141,A00-1002
2,D12-1027,A00-1002
3,E06-1047,A00-1002
4,H05-1110,A00-1002


In [9]:
aan_links.shape

(75009, 2)

In [15]:
aan_meta = pd.read_csv('metadata_with_abstract.csv')

In [16]:
aan_meta.head()

Unnamed: 0,id,author,title,venue,year
0,E03-1062,"Piwek, Paul",A Flexible Pragmatics-Driven Language Generato...,EACL,2003
1,E06-1001,"McConville, Mark",Inheritance And The CCG Lexicon,EACL,2006
2,E06-1002,"Bunescu, Razvan C.; Pa&scedil;ca, Marius",Using Encyclopedic Knowledge For Named Entity ...,EACL,2006
3,E06-1003,"Tanev, Hristo; Magnini, Bernardo",Weakly Supervised Approaches For Ontology Popu...,EACL,2006
4,E06-1004,"Udupa, Raghavendra; Maji, Hemanta K.",Computational Complexity Of Statistical Machin...,EACL,2006


In [24]:
years = aan_meta[['id','year']]

In [25]:
years.columns = ['citing', 'year_citing']

In [27]:
aan_links = aan_links.merge(years, how = 'left', on = 'citing')

In [32]:
aan_links[aan_links.year_citing > 0].shape

(75009, 3)

In [35]:
aan_links.groupby('year_citing').size()

year_citing
1982        2
1983        1
1984        2
1986        5
1987       11
1988       33
1989       18
1990       55
1991       33
1992       49
1993       31
1994       76
1995       73
1996      133
1997      163
1998      167
1999      137
2000      305
2001      290
2002      569
2003      856
2004     1849
2005     1983
2006     3721
2007     3922
2008     4505
2009     5746
2010     9371
2011     7454
2012     8929
2013    12915
2014    11580
2015       22
2016        3
dtype: int64

In [8]:
G = nx.DiGraph()

for i in tqdm(aan_links.index):
    G.add_edge(aan_links.iloc[i]['citing'], aan_links.iloc[i]['cited'])

100%|██████████████████████████████████| 75009/75009 [00:41<00:00, 1793.08it/s]


In [9]:
print(nx.info(G))

Name: 
Type: DiGraph
Number of nodes: 13757
Number of edges: 75009
Average in degree:   5.4524
Average out degree:   5.4524


In [40]:
nodes = list(G.nodes())

In [41]:
len(nodes)

13757

In [38]:
G_train = nx.DiGraph()

for i in tqdm(aan_links[aan_links['year_citing'] < 2013].index):
    G_train.add_edge(aan_links.iloc[i]['citing'], aan_links.iloc[i]['cited'])

100%|███████████████████████████████████| 50489/50489 [00:53<00:00, 952.05it/s]


In [42]:
nodes_train = list(G_train.nodes())

In [43]:
len(nodes_train)

10588

In [54]:
cites = pd.DataFrame({'cites_count': aan_links.groupby('citing').size()}).reset_index()

In [56]:
cites = cites.merge(years, how='left', on='citing')

In [58]:
cites[cites.year_citing > 2012]

Unnamed: 0,citing,cites_count,year_citing
793,C14-1002,14,2014
794,C14-1003,12,2014
795,C14-1004,5,2014
796,C14-1005,12,2014
797,C14-1006,6,2014
798,C14-1008,10,2014
799,C14-1009,18,2014
800,C14-1010,7,2014
801,C14-1011,3,2014
802,C14-1012,4,2014


In [61]:
aan_links

Unnamed: 0,citing,cited,year_citing
0,C08-3004,A00-1002,2008
1,D09-1141,A00-1002,2009
2,D12-1027,A00-1002,2012
3,E06-1047,A00-1002,2006
4,H05-1110,A00-1002,2005
5,N13-1036,A00-1002,2013
6,P13-2001,A00-1002,2013
7,P13-2073,A00-1002,2013
8,W11-2602,A00-1002,2011
9,W13-2702,A00-1002,2013


In [62]:
aan_links.groupby(['citing','year_citing'])['cited'].transform(lambda x: x.rank())

0         1.0
1         1.0
2         1.0
3         1.0
4         1.0
5         1.0
6         1.0
7         1.0
8         1.0
9         1.0
10        1.0
11        1.0
12        1.0
13        1.0
14        1.0
15        1.0
16        1.0
17        1.0
18        1.0
19        1.0
20        1.0
21        1.0
22        1.0
23        1.0
24        1.0
25        1.0
26        1.0
27        1.0
28        1.0
29        1.0
         ... 
74979     1.0
74980     9.0
74981    13.0
74982     1.0
74983     1.0
74984     3.0
74985     5.0
74986     8.0
74987     1.0
74988     1.0
74989     3.0
74990     4.0
74991    11.0
74992     2.0
74993     5.0
74994     5.0
74995     1.0
74996     4.0
74997    13.0
74998    10.0
74999     1.0
75000    20.0
75001     1.0
75002     7.0
75003     8.0
75004    14.0
75005     7.0
75006     1.0
75007     5.0
75008     3.0
Name: cited, Length: 75009, dtype: float64

In [63]:
# df['Rank'] = df.groupby(by=['C1'])['C2'].transform(lambda x: x.rank())
aan_links['cite_rank'] = aan_links.groupby('citing')['cited'].transform(lambda x: x.rank())

In [68]:
years.columns = ['cited', 'year_cited']

In [69]:
aan_links = aan_links.merge(years, how='left', on = 'cited')

In [70]:
aan_links.sort_values(by = ['year_citing','citing'])

Unnamed: 0,citing,cited,year_citing,cite_rank,year_cited
18505,P82-1015,J79-1061,1982,1.0,1979
52144,P82-1015,P82-1015,1982,2.0,1982
52139,A83-1015,P82-1015,1983,1.0,1982
585,P84-1023,A83-1007,1984,1.0,1983
52146,P84-1065,P83-1012,1984,1.0,1983
4904,C86-1041,C86-1019,1986,1.0,1986
52138,C86-1138,P81-1032,1986,1.0,1981
52188,C86-1139,P85-1019,1986,1.0,1985
52156,C86-1147,P85-1004,1986,1.0,1985
52192,H86-1007,P86-1036,1986,1.0,1986


In [71]:
aan_links[aan_links.cite_rank%2 == 0]

Unnamed: 0,citing,cited,year_citing,cite_rank,year_cited
41,P01-1037,A00-1023,2001,2.0,2000
64,W03-0808,A00-1034,2003,2.0,2000
99,W03-1211,A00-1044,2003,2.0,2000
182,E09-1017,A00-2018,2009,2.0,2000
193,H05-1078,A00-2018,2005,2.0,2000
279,P06-1048,A00-2018,2006,2.0,2000
399,W06-2607,A00-2018,2006,2.0,2000
479,N10-1002,A00-2022,2010,2.0,2000
487,C00-2140,A00-2025,2000,2.0,2000
496,D07-1028,A00-2026,2007,2.0,2000


In [76]:
aan_links_train = aan_links[(aan_links.year_citing < 2013)|((aan_links.year_citing >= 2013)&(aan_links.cite_rank%2 == 0))]

In [77]:
aan_links_test = aan_links[(aan_links.year_citing >= 2013)&(aan_links.cite_rank%2 == 1)]

In [78]:
aan_links_train.shape

(61941, 5)

In [79]:
aan_links_test.shape

(13068, 5)

In [81]:
aan_links.to_csv('aan_links_years.csv', index=False)

In [82]:
aan_links_train.to_csv('aan_links_train_2013.csv', index=False)

In [83]:
aan_links_test.to_csv('aan_links_test_2013.csv', index=False)

In [34]:
def generate_negative_edges(graph, num_edges):
    negative_edges = set()
    nodes = list(graph.nodes())
    while len(negative_edges) < num_edges:
        i = random.randint(0, len(nodes) - 1)
        j = random.randint(0, len(nodes) - 1)
        if graph.has_edge(nodes[i], nodes[j]):
            continue
        negative_edges.add((nodes[i], nodes[j]))
    return negative_edges

In [35]:
negative_edges = generate_negative_edges(G, 100)

In [37]:
list(negative_edges)

[('W08-0315', 'E14-1017'),
 ('W14-0401', 'W06-2930'),
 ('W12-1640', 'C94-2149'),
 ('W01-0703', 'W11-0604'),
 ('W10-3707', 'C04-1164'),
 ('W97-0311', 'P13-2138'),
 ('W10-2410', 'D10-1098'),
 ('W97-1304', 'P06-2067'),
 ('W10-3713', 'C10-2011'),
 ('W12-4210', 'E14-1052'),
 ('W12-6212', 'W05-1201'),
 ('W14-1817', 'W04-0405'),
 ('P08-1086', 'C10-2056'),
 ('D11-1148', 'W10-1823'),
 ('S13-2019', 'P09-1042'),
 ('S13-2058', 'P10-4012'),
 ('W11-1418', 'P07-1053'),
 ('C94-2110', 'P11-1052'),
 ('P11-1062', 'W06-1601'),
 ('W13-2204', 'D13-1152'),
 ('I08-2086', 'W10-1612'),
 ('C04-1025', 'S12-1043'),
 ('W14-5814', 'W09-0211'),
 ('C94-2133', 'W10-4343'),
 ('N10-1095', 'P13-1088'),
 ('W12-2503', 'S13-1026'),
 ('N13-1036', 'S12-1098'),
 ('P11-1100', 'P14-1126'),
 ('W02-0905', 'D10-1113'),
 ('I08-3010', 'E09-1046'),
 ('P14-2082', 'W06-1658'),
 ('D07-1035', 'D10-1039'),
 ('N09-2025', 'P06-1083'),
 ('W08-2132', 'C10-1051'),
 ('W04-3242', 'P15-1002'),
 ('W11-1919', 'A94-1009'),
 ('C04-1047', 'P01-1053'),
 