In [1]:
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import gensim, logging
import re
import collections
import networkx as nx
from networkx.algorithms import approximation as approx

In [2]:
# Paper metadata. Column names are supplied explicitly, so pandas treats the
# first line of the file as data rather than as a header row.
node_info = pd.read_csv(
    "./node_information.csv",
    names=["ID", "year", "title", "authors", "journal", "abstracts"],
)
# Training pairs as a numeric array; the last column is split off as the label below.
data = np.loadtxt("./training_set.txt")
# Empty undirected graph; its nodes and edges are added in a later cell.
graph = nx.Graph()

In [3]:
# Every column but the last describes the pair; the last column is the label.
X = data[:, :-1]
Y = data[:, -1]
n = Y.shape[0]
#X_pd = pd.DataFrame(X.astype(int),columns=["ID1", "ID2"])
# Load the cached training feature table (the file written by X_pd.to_csv further down).
X_pd = pd.read_csv("./train_pd.csv", index_col=0)

In [4]:
# Load the abstract document vectors from disk (the same path the Doc2Vec training
# cell below saves to), so training does not have to be re-run.
docvecs_abst = gensim.models.KeyedVectors.load("./docvec_abstract")

In [None]:
# One TaggedDocument per paper: the tokens come from gensim's default preprocessing
# and the tag is the paper's position in node_info.
# (Alternative tokenisation that was tried: clean_str(gensim.parsing.preprocessing.remove_stopwords(t)).)
abstracts = [
    gensim.models.doc2vec.TaggedDocument(gensim.parsing.preprocess_string(text), [doc_idx])
    for doc_idx, text in enumerate(node_info["abstracts"])
]
# Mean number of tokens per abstract, displayed as the cell output.
avg_len = sum(len(doc.words) for doc in abstracts) / len(node_info["abstracts"])
avg_len

In [None]:
# Train a 20-dimensional Doc2Vec model on the tagged abstracts (vocabulary must be
# built before training), then persist only the document vectors.
model_abst = gensim.models.doc2vec.Doc2Vec(vector_size=20, min_count=5, epochs=100)
model_abst.build_vocab(abstracts)
%time model_abst.train(abstracts, total_examples=model_abst.corpus_count, epochs=model_abst.epochs)
# NOTE(review): `.docvecs` is the accessor used here; newer gensim releases may expose
# the document vectors under a different attribute -- confirm against the installed version.
docvecs_abst = model_abst.docvecs
docvecs_abst.save("./docvec_abstract")

In [None]:
# Disabled sanity check for the Doc2Vec model: for every abstract, re-infer its vector,
# fetch the 10 most similar training documents and record the rank at which the abstract
# itself appears (-1 if it is not among them); the Counter then summarises those ranks.
# ranks = []
# second_ranks = []
# for doc_id in range(len(abstracts)):
#     inferred_vector = model_abst.infer_vector(abstracts[doc_id].words)
#     sims = model_abst.docvecs.most_similar([inferred_vector], topn=10)
#     tmp = [docid for docid, sim in sims]
#     if doc_id in tmp:
#         rank = tmp.index(doc_id)
#     else:
#         rank = -1
#     ranks.append(rank)
#     second_ranks.append(sims[1])
# collections.Counter(ranks)

In [5]:
def clean_str(string):
    """Normalise ``string`` and split it into a list of tokens.

    Steps, in order:
      1. Every character other than ASCII letters, digits and ``( ) , ! ? ' ` -``
         is replaced by a space.
      2. The clitics ``'s 've n't 're 'd 'll`` are separated from the preceding word.
      3. ``, ! ( ) ?`` are padded with spaces so each becomes a token of its own.
      4. Whitespace runs are collapsed and the result is stripped and split.

    Case is left untouched.

    Args:
        string: the text to tokenise.

    Returns:
        list of str tokens; an empty list for an empty or blank input.
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`\-]", " ", string)
    for clitic in ("'s", "'ve", "n't", "'re", "'d", "'ll"):
        string = string.replace(clitic, " " + clitic)
    # Plain string replacement on purpose: the previous replacement templates
    # (" \( ", " \) ", " \? ") kept the backslash, so the emitted tokens were
    # "\(", "\)" and "\?" instead of the bare punctuation marks.
    for mark in (",", "!", "(", ")", "?"):
        string = string.replace(mark, " " + mark + " ")
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().split()

In [None]:
%time X_pd["year1"] = X_pd.apply(lambda row: node_info.loc[row["ID1"] == node_info.ID, "year"].values[0] ,axis=1)
%time X_pd["year2"] = X_pd.apply(lambda row: node_info.loc[row["ID2"] == node_info.ID, "year"].values[0] ,axis=1)
%time X_pd["pos1"] = X_pd.apply(lambda row: node_info.loc[row["ID1"] == node_info.ID].index[0] ,axis=1)
%time X_pd["pos2"] = X_pd.apply(lambda row: node_info.loc[row["ID2"] == node_info.ID].index[0] ,axis=1)
%time X_pd["simi_abst"] = X_pd.apply(lambda row: docvecs_abst.similarity(int(row["pos1"]),int(row["pos2"])),axis=1)

authors = []
avg_len = 0
for i, t in enumerate(node_info.loc[:,"authors"]):
    if not isinstance(t, str):
        authors.append([])
    else:
        arr = t.split(",")
        avg_len += len(arr)
        authors.append(arr)
avg_len /= len(node_info.loc[:,"title"])
avg_len

%time X_pd["author_overlap"] = X_pd.apply(lambda row: len(set(authors[int(row["pos1"])]).intersection(set(authors[int(row["pos2"])]))),axis=1)
%time X_pd["is_self_cite"] = X_pd.apply(lambda row: row["author_overlap"] > 0 ,axis=1)


titles = []
avg_len = 0
for i, t in enumerate(node_info.loc[:,"title"]):
    if not isinstance(t, str):
        titles.append([])
    else:
        arr = clean_str(gensim.parsing.preprocessing.remove_stopwords(t))
        avg_len += len(arr)
        titles.append(arr)
avg_len /= len(node_info.loc[:,"title"])
avg_len
%time X_pd["title_overlap"]= X_pd.apply(lambda row: len(set(titles[int(row["pos1"])]).intersection(set(titles[int(row["pos2"])]))), axis = 1)



In [6]:
# One node per paper, identified by its row position in node_info.
graph.add_nodes_from(range(len(node_info)))
# One edge per positively labelled training pair.
# NOTE(review): the graph features computed for X_pd later are derived from these same
# labelled rows, so they can encode the label of the row itself -- confirm this is intended.
for i in range(n):
    if Y[i] != 1:
        continue
    graph.add_edge(X_pd.loc[i, "pos1"], X_pd.loc[i, "pos2"])

In [None]:
# %time betwen_centrality = nx.algorithms.centrality.betweenness_centrality(graph)

In [None]:
# Journal names split on "." into components, indexed by row position in node_info;
# a missing (non-string) journal becomes an empty list.
# NOTE(review): str.split(".") yields an empty string after a trailing ".", so two
# abbreviated names that both end in "." would share the "" component -- verify on the data.
journals = [name.split(".") if isinstance(name, str) else [] for name in node_info["journal"]]
# Mean number of components per paper (papers without a journal count as zero).
avg_len = sum(map(len, journals)) / len(node_info["journal"])
avg_len

def journal_overlap(row,journals,node_info):
    """Count the distinct journal components shared by the two papers of ``row``.

    Args:
        row: mapping with "pos1" and "pos2"; both are cast to int and used as
            indices into ``journals``.
        journals: sequence of per-paper component lists.
        node_info: accepted but not used.

    Returns:
        int: size of the intersection of the two component sets.
    """
    first = set(journals[int(row["pos1"])])
    second = journals[int(row["pos2"])]
    return len(first.intersection(second))
%time X_pd["journal_overlap"]= X_pd.apply(lambda row: journal_overlap(row,journals,node_info), axis = 1)
str_journal = list(node_info["journal"])
%time X_pd["journal_overlap_original"]= X_pd.apply(lambda row: int(str_journal[int(row["pos1"])] == str_journal[int(row["pos2"])]), axis = 1)
%time X_pd["neighbors_1"] = X_pd.apply(lambda row: graph.degree(row["pos1"]),axis = 1)
%time X_pd["neighbors_2"] = X_pd.apply(lambda row: graph.degree(row["pos2"]),axis = 1)
%time X_pd["common_neighbors"] = X_pd.apply(lambda row: len(list(nx.common_neighbors(graph,row["pos1"],row["pos2"]))), axis =1)
def jaccard_cal(row):
    """Jaccard coefficient of the two endpoints' neighbourhoods in the global ``graph``.

    Args:
        row: mapping with "pos1", "pos2" (node identifiers) and "common_neighbors"
            (the already computed number of shared neighbours).

    Returns:
        ``row["common_neighbors"]`` divided by the size of the union of both
        neighbour sets, or 0 when that union is empty.
    """
    neighbourhood = set(graph[row["pos1"]]).union(graph[row["pos2"]])
    if not neighbourhood:
        return 0
    return row["common_neighbors"] / len(neighbourhood)

%time X_pd["jaccard"] = X_pd.apply(lambda row: jaccard_cal(row), axis =1)
%time X_pd["resource_allocation"] = X_pd.apply(lambda row: sum(1./graph.degree(w) for w in nx.common_neighbors(graph,row["pos1"],row["pos2"])), axis = 1)
%time X_pd["adamic_adar_index"] = X_pd.apply(lambda row: sum(1./np.log(graph.degree(w)) for w in nx.common_neighbors(graph,row["pos1"],row["pos2"])), axis = 1)
# %time X_pd["connectivity"] = X_pd.apply(lambda row: approx.local_node_connectivity(graph, row["pos1"], row["pos2"]),axis = 1)
X_pd.to_csv('train_pd.csv', index = True)

In [None]:
# Approximate local node connectivity between the two endpoints of every training pair
# (networkx approximation module). Kept in its own cell; the same line is commented out
# in the feature cell above.
%time X_pd["connectivity"] = X_pd.apply(lambda row: approx.local_node_connectivity(graph, row["pos1"], row["pos2"]),axis = 1)

In [7]:
# Test pairs as a numeric array, plus the cached test feature table.
data_test = np.loadtxt("./testing_set.txt")
n_test = data_test.shape[0]
# test_pd = pd.DataFrame(data_test.astype(int),columns=["ID1", "ID2"])
test_pd = pd.read_csv("./test_pd.csv", index_col=0)
# docvecs_abst = gensim.models.KeyedVectors.load("../input/citation-network/docvec_abstract")
# NOTE(review): this writes the in-memory vectors back to disk instead of loading them
# (the load is the commented-out line above) -- confirm the save is still needed here.
docvecs_abst.save("./docvec_abstract")

In [None]:
%time test_pd["year1"] = test_pd.apply(lambda row: node_info.loc[row["ID1"] == node_info.ID, "year"].values[0] ,axis=1)
%time test_pd["year2"] = test_pd.apply(lambda row: node_info.loc[row["ID2"] == node_info.ID, "year"].values[0] ,axis=1)
%time test_pd["pos1"] = test_pd.apply(lambda row: node_info.loc[row["ID1"] == node_info.ID].index[0] ,axis=1)
%time test_pd["pos2"] = test_pd.apply(lambda row: node_info.loc[row["ID2"] == node_info.ID].index[0] ,axis=1)
%time test_pd["simi_abst"] = test_pd.apply(lambda row: docvecs_abst.similarity(int(row["pos1"]),int(row["pos2"])),axis=1)

authors = []
avg_len = 0
for i, t in enumerate(node_info.loc[:,"authors"]):
    if not isinstance(t, str):
        authors.append([])
    else:
        arr = t.split(",")
        avg_len += len(arr)
        authors.append(arr)
avg_len /= len(node_info.loc[:,"title"])
avg_len

%time test_pd["author_overlap"] = test_pd.apply(lambda row: len(set(authors[int(row["pos1"])]).intersection(set(authors[int(row["pos2"])]))),axis=1)
%time test_pd["is_self_cite"] = test_pd.apply(lambda row: row["author_overlap"] > 0 ,axis=1)


titles = []
avg_len = 0
for i, t in enumerate(node_info.loc[:,"title"]):
    if not isinstance(t, str):
        titles.append([])
    else:
        arr = clean_str(gensim.parsing.preprocessing.remove_stopwords(t))
        avg_len += len(arr)
        titles.append(arr)
avg_len /= len(node_info.loc[:,"title"])
avg_len
%time test_pd["title_overlap"]= test_pd.apply(lambda row: len(set(titles[int(row["pos1"])]).intersection(set(titles[int(row["pos2"])]))), axis = 1)

# Journal names split on "." into components, indexed by row position in node_info;
# a missing (non-string) journal becomes an empty list.
# NOTE(review): str.split(".") yields an empty string after a trailing ".", so two
# abbreviated names that both end in "." would share the "" component -- verify on the data.
journals = [name.split(".") if isinstance(name, str) else [] for name in node_info["journal"]]
# Mean number of components per paper (papers without a journal count as zero).
avg_len = sum(map(len, journals)) / len(node_info["journal"])
avg_len

def journal_overlap(row,journals,node_info):
    """Return how many distinct journal components the two papers of ``row`` share.

    ``row["pos1"]`` and ``row["pos2"]`` are cast to int and used to index
    ``journals`` (a sequence of per-paper component lists). ``node_info`` is
    accepted but not used. The result is the size of the set intersection.
    """
    left_idx, right_idx = int(row["pos1"]), int(row["pos2"])
    shared = set(journals[left_idx]) & set(journals[right_idx])
    return len(shared)

%time test_pd["journal_overlap"]= test_pd.apply(lambda row: journal_overlap(row,journals,node_info), axis = 1)
str_journal = list(node_info["journal"])
%time test_pd["journal_overlap_original"]= test_pd.apply(lambda row: int(str_journal[int(row["pos1"])] == str_journal[int(row["pos2"])]), axis = 1)


%time test_pd["neighbors_1"] = test_pd.apply(lambda row: graph.degree(row["pos1"]),axis = 1)
%time test_pd["neighbors_2"] = test_pd.apply(lambda row: graph.degree(row["pos2"]),axis = 1)
%time test_pd["common_neighbors"] = test_pd.apply(lambda row: len(list(nx.common_neighbors(graph,row["pos1"],row["pos2"]))), axis =1)
def jaccard_cal(row):
    """Jaccard coefficient of the two endpoints' neighbourhoods in the global ``graph``.

    Reads "pos1" and "pos2" (node identifiers) and the precomputed
    "common_neighbors" count from ``row``. Returns that count divided by the
    number of distinct neighbours of either endpoint, or 0 if there are none.
    """
    either_side = set(graph[row["pos1"]])
    either_side.update(graph[row["pos2"]])
    total = len(either_side)
    return row["common_neighbors"] / total if total != 0 else 0

%time test_pd["jaccard"] = test_pd.apply(lambda row: jaccard_cal(row), axis =1)
%time test_pd["resource_allocation"] = test_pd.apply(lambda row: sum(1./graph.degree(w) for w in nx.common_neighbors(graph,row["pos1"],row["pos2"])), axis = 1)
%time test_pd["adamic_adar_index"] = test_pd.apply(lambda row: sum(1./np.log(graph.degree(w)) for w in nx.common_neighbors(graph,row["pos1"],row["pos2"])), axis = 1)


# %time test_pd["connectivity"] = test_pd.apply(lambda row: approx.local_node_connectivity(graph, row["pos1"], row["pos2"]),axis = 1)

test_pd.to_csv('test_pd.csv', index = True)

In [8]:
# Preview the test feature table.
# NOTE(review): the saved output already lists columns (e.g. connectivity, community1)
# that are only assigned in later cells, so this frame presumably came from a
# test_pd.csv written in an earlier session -- re-check after a clean Run All.
test_pd.head()

Unnamed: 0,ID1,ID2,year1,year2,pos1,pos2,simi_abst,author_overlap,title_overlap,journal_overlap,...,neighbors_1,neighbors_2,common_neighbors,jaccard,resource_allocation,connectivity,preferential_attachement,community1,community2,same_community
0,9807076,9807139,1998,1998,23774,23835,0.346607,0,0,2,...,59,18,0,0.0,0.0,18,1062,0,0,1
1,109162,1182,2001,2000,5227,172,0.473994,0,1,0,...,302,45,24,0.074303,0.311535,45,13590,6,6,1
2,9702187,9510135,1997,1995,20185,16838,0.520707,0,1,3,...,223,739,59,0.065338,1.342594,223,164797,12,12,1
3,111048,110115,2001,2001,5621,5397,0.280709,0,1,1,...,51,65,21,0.221053,0.298419,51,3315,13,13,1
4,9910176,9410073,1999,1994,27159,14643,0.581315,0,0,1,...,7,150,0,0.0,0.0,7,1050,12,12,1


In [9]:
# Preview the training feature table.
# NOTE(review): in the saved output year1/year2/pos1/pos2 are floats here but integers
# in the test table above -- confirm the dtype difference is harmless downstream.
X_pd.head()

Unnamed: 0,ID1,ID2,year1,year2,pos1,pos2,simi_abst,author_overlap,title_overlap,journal_overlap,...,neighbors_1,neighbors_2,common_neighbors,jaccard,resource_allocation,connectivity,preferential_attachement,community1,community2,same_community
0,9510123,9502114,1995.0,1995.0,16827.0,15446.0,0.228531,0.0,2.0,3,...,6,12,1,0.058824,0.142857,6,72,5,5,1
1,9707075,9604178,1997.0,1996.0,21154.0,18059.0,0.473765,0.0,0.0,0,...,79,147,20,0.097087,0.226401,79,11613,0,0,1
2,9312155,9506142,1993.0,1995.0,13074.0,16171.0,0.240585,0.0,0.0,1,...,1,5,0,0.0,0.0,1,5,14,4,0
3,9911255,302165,1999.0,2003.0,27486.0,9702.0,0.460808,0.0,0.0,0,...,20,14,0,0.0,0.0,14,280,13,13,1
4,9701033,209076,1997.0,2002.0,19856.0,8212.0,0.664643,0.0,0.0,0,...,7,24,0,0.0,0.0,7,168,5,1,0


In [10]:
# Preferential attachment score: product of the two endpoints' degrees in the graph.
%time X_pd["preferential_attachement"] = X_pd.apply(lambda row: graph.degree(row["pos1"])*graph.degree(row["pos2"]), axis = 1)

CPU times: user 22.1 s, sys: 334 ms, total: 22.4 s
Wall time: 22.9 s


In [11]:
# Preferential attachment score: product of the two endpoints' degrees in the graph.
%time test_pd["preferential_attachement"] = test_pd.apply(lambda row: graph.degree(row["pos1"])*graph.degree(row["pos2"]), axis = 1)

CPU times: user 1.41 s, sys: 20.1 ms, total: 1.43 s
Wall time: 1.32 s


In [12]:
import community #pip install python-louvain

In [13]:
# Community detection on the citation graph (python-louvain); the result is indexed by
# node below, i.e. it maps each node to a community id.
# NOTE(review): no random_state is passed, so community ids may differ between runs -- confirm.
partition = community.best_partition(graph)

In [17]:
# Community id of each endpoint, and a 0/1 flag for "both endpoints in the same community".
%time X_pd["community1"] = X_pd.apply(lambda row: partition[row["pos1"]], axis = 1)
%time X_pd["community2"] = X_pd.apply(lambda row: partition[row["pos2"]], axis = 1)
%time X_pd["same_community"] = X_pd.apply(lambda row: int(partition[row["pos1"]] == partition[row["pos2"]]), axis = 1)

CPU times: user 9.81 s, sys: 365 ms, total: 10.2 s
Wall time: 10.3 s
CPU times: user 9.6 s, sys: 222 ms, total: 9.82 s
Wall time: 9.65 s
CPU times: user 14.9 s, sys: 269 ms, total: 15.1 s
Wall time: 15 s


In [18]:
# Community id of each endpoint, and a 0/1 flag for "both endpoints in the same community".
%time test_pd["community1"] = test_pd.apply(lambda row: partition[row["pos1"]], axis = 1)
%time test_pd["community2"] = test_pd.apply(lambda row: partition[row["pos2"]], axis = 1)
%time test_pd["same_community"] = test_pd.apply(lambda row: int(partition[row["pos1"]] == partition[row["pos2"]]), axis = 1)

CPU times: user 698 ms, sys: 23.8 ms, total: 722 ms
Wall time: 543 ms
CPU times: user 500 ms, sys: 8.75 ms, total: 509 ms
Wall time: 511 ms
CPU times: user 775 ms, sys: 9.95 ms, total: 785 ms
Wall time: 788 ms


In [None]:
def shortest_path_dijk(row,graph):
    """Return the shortest-path length between the two papers of ``row`` in ``graph``.

    Args:
        row: mapping with "pos1" and "pos2" holding the two node identifiers.
        graph: graph passed to ``nx.dijkstra_path_length``.

    Returns:
        The length reported by networkx, or -1 when the two nodes are not
        connected or a node is not part of the graph.
    """
    try:
        return nx.dijkstra_path_length(graph, row["pos1"], row["pos2"])
    except (nx.NetworkXNoPath, nx.NodeNotFound):
        # Only "no route" conditions map to the -1 sentinel. The previous bare
        # ``except:`` also hid KeyboardInterrupt, missing row keys and genuine
        # programming errors behind the same value.
        return -1

In [None]:
# Shortest-path length between the two endpoints of every training pair (-1 sentinel
# from shortest_path_dijk when no length could be computed).
# NOTE(review): positively labelled training pairs are themselves edges of the graph,
# so this feature can encode their label -- confirm this is intended.
%time X_pd["shortest_path"] = X_pd.apply(lambda row: shortest_path_dijk(row,graph), axis = 1)

In [None]:
# Shortest-path length between the two endpoints of every test pair (-1 sentinel from
# shortest_path_dijk when no length could be computed).
%time test_pd["shortest_path"] = test_pd.apply(lambda row: shortest_path_dijk(row,graph), axis = 1)

In [21]:
# Persist both feature tables, index included; these are the file names the notebook
# reads back with index_col=0 in its loading cells.
X_pd.to_csv('train_pd.csv', index = True)
test_pd.to_csv('test_pd.csv', index = True)

In [11]:
# Recomputes is_self_cite for the test pairs from the existing author_overlap column.
# NOTE(review): this cell sits after the CSV export but carries an earlier execution
# count; the same assignment already exists in the test feature cell -- confirm whether
# this copy is still needed.
%time test_pd["is_self_cite"] = test_pd.apply(lambda row: row["author_overlap"] > 0 ,axis=1)

CPU times: user 473 ms, sys: 16.7 ms, total: 490 ms
Wall time: 492 ms


In [13]:
# Recomputes the Adamic-Adar index (sum of 1/log(degree) over common neighbours) for the
# training pairs.
# NOTE(review): this cell sits after the CSV export but carries an earlier execution
# count; the same assignment already exists in the training feature cell -- confirm
# whether this copy is still needed.
%time X_pd["adamic_adar_index"] = X_pd.apply(lambda row: sum(1./np.log(graph.degree(w)) for w in nx.common_neighbors(graph,row["pos1"],row["pos2"])), axis = 1)

CPU times: user 1min 46s, sys: 487 ms, total: 1min 46s
Wall time: 1min 46s
