In [3]:
import numpy as np
import pandas as pd
import gensim, logging
import re
import networkx as nx
from sklearn.model_selection import train_test_split
from networkx.algorithms import approximation as approx
import community

In [4]:

############################################
# first step: data preprocessing
############################################

# Load the paper metadata and the training / testing pair files.
node_info = pd.read_csv(
    "node_information.csv",
    names=["ID", "year", "title", "authors", "journal", "abstracts"],
)
data_train = np.loadtxt("training_set.txt")
data_test = np.loadtxt("testing_set.txt")

# Every column but the last goes to `data`; the last column is the label.
data = data_train[:, :-1]
label = data_train[:, -1]

In [1]:
############################################
# some useful functions

def clean_str(string):
    """Normalise raw text into a single-space-separated token string.

    Characters other than letters, digits and ``( ) , ! ? ' ` -`` are replaced
    by spaces, English contractions ('s, 've, n't, 're, 'd, 'll) are split off
    the preceding word, and ``, ! ( ) ?`` are surrounded by spaces so that
    splitting on whitespace yields them as separate tokens.

    Parameters
    ----------
    string : str
        Text to clean. Non-string input makes ``re.sub`` raise ``TypeError``.

    Returns
    -------
    str
        The cleaned text with runs of whitespace collapsed and both ends
        stripped. Case is left untouched.
    """
    # Drop every character outside the kept alphabet.
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`\-]", " ", string)

    # Detach contractions from the word they follow.
    string = re.sub(r"\'s", " 's", string)
    string = re.sub(r"\'ve", " 've", string)
    string = re.sub(r"n\'t", " n't", string)
    string = re.sub(r"\'re", " 're", string)
    string = re.sub(r"\'d", " 'd", string)
    string = re.sub(r"\'ll", " 'll", string)

    # Isolate punctuation. The replacements are plain text: a backslash here
    # would be copied verbatim into the output (producing tokens such as "\(")
    # and is an invalid escape sequence in a normal string literal.
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)

    # Collapse the extra whitespace introduced above.
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip()

In [2]:
def node_information_preprocess(node_info):
    """Add tokenised helper columns to the paper metadata table.

    The frame is modified in place and also returned. Cells that are not
    strings (for example missing values) produce an empty list in every
    derived column.

    Added columns
    -------------
    title_           : tokens of the cleaned title (split on single spaces).
    title_remove_stw : tokens of the title after gensim stop-word removal.
    authors_         : author names split on commas, surrounding whitespace
                       stripped and empty names dropped.
    journal_         : one-element list holding the journal string.
    journal_divide   : journal string split on "." (outer dots stripped first).
    abstracts_       : whitespace tokens of the cleaned abstract.

    Parameters
    ----------
    node_info : pandas.DataFrame
        Must contain the columns "title", "authors", "journal", "abstracts".

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the columns above added.
    """
    def _guarded(tokenise):
        # Wrap a tokeniser so that non-string cells yield an empty list.
        return lambda value: tokenise(value) if isinstance(value, str) else []

    def _split_authors(text):
        # Strip each name so "A, B" and "B" refer to the same author "B";
        # without this the second author keeps a leading space and never
        # matches the same person listed first on another paper.
        return [name.strip() for name in text.split(",") if name.strip()]

    # The title previously had no type guard, unlike every other column, so a
    # missing title raised inside clean_str.
    node_info["title_"] = node_info["title"].apply(
        _guarded(lambda text: clean_str(text).split(" ")))
    node_info["title_remove_stw"] = node_info["title"].apply(
        _guarded(lambda text: clean_str(
            gensim.parsing.preprocessing.remove_stopwords(text)).split(" ")))
    node_info["authors_"] = node_info["authors"].apply(_guarded(_split_authors))
    node_info["journal_"] = node_info["journal"].apply(_guarded(lambda text: [text]))
    node_info["journal_divide"] = node_info["journal"].apply(
        _guarded(lambda text: text.strip(".").split(".")))
    node_info["abstracts_"] = node_info["abstracts"].apply(
        _guarded(lambda text: clean_str(text).split()))
    return node_info

In [5]:
############################################
# second step: feature engineering
############################################


def feature_engineering(data,node_info,graph,graph2,graph_auth,model_abst,model_title,seed=42):
    """Compute attribute, topological and semantic features for paper pairs.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per candidate pair, with columns "pos1" and "pos2" holding
        index labels of ``node_info`` (also used as node identifiers in
        ``graph`` / ``graph2`` and as document tags of the two models).
        The frame is modified in place: all feature columns are added to it.
    node_info : pandas.DataFrame
        Paper metadata containing "year", "title_", "title_remove_stw",
        "authors_", "journal_" and "journal_divide".
    graph : networkx.Graph
        Undirected citation graph.
    graph2 : networkx.DiGraph
        Directed citation graph; only ``in_edges`` is used.
    graph_auth : networkx.Graph
        Co-authorship graph whose edges carry a 'weight' attribute.
    model_abst, model_title
        Trained models exposing ``.docvecs.similarity(tag1, tag2)``.
    seed : int or None, optional
        Seed forwarded to the Louvain partitioning (``random_state``) and to
        the pivot sampling of the betweenness approximation, so that separate
        calls on the same graph (train / validation / test) use the same
        partition and centrality values. Pass ``None`` for unseeded behaviour.
        Requires a python-louvain release that supports ``random_state``.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself, with the feature columns appended.
    """
    # Plain Python ints, reused by every per-pair computation below.
    pos1 = data["pos1"].tolist()
    pos2 = data["pos2"].tolist()
    pairs = list(zip(pos1, pos2))

    ##############################################
    ### Per-paper attributes copied next to each pair
    ##############################################

    # One vectorised label lookup per column instead of one .loc call per row.
    paper_columns = (
        ("title_", "title"),
        ("title_remove_stw", "title_remove_stw"),
        ("authors_", "authors"),
        ("journal_", "journal"),
        ("journal_divide", "journal_divide"),
        ("year", "year"),
    )
    for source, target in paper_columns:
        data[target + "1"] = node_info.loc[pos1, source].values
        data[target + "2"] = node_info.loc[pos2, source].values

    ##############################################
    ### Attribute Features
    ##############################################

    def _overlap(left, right):
        # Number of distinct items shared by two list-valued columns.
        return [len(set(a).intersection(b)) for a, b in zip(data[left], data[right])]

    ### Absolute difference between the publication years of the two papers.
    data["diff_year"] = np.abs(data["year1"] - data["year2"])

    ### Number of common words in the stop-word-free titles.
    data["title_overlap"] = _overlap("title_remove_stw1", "title_remove_stw2")

    ### Number of common authors. MacRoberts and MacRoberts (1989) pointed out
    # that self-citations are often observed because researchers are more
    # familiar with their own research than with the research of others.
    data["author_overlap"] = _overlap("authors1", "authors2")

    ### 1 when the two papers share at least one author, else 0.
    data["is_self_cite"] = [int(shared > 0) for shared in data["author_overlap"]]

    ### Number of common dot-separated parts of the journal names.
    data["journal_overlap"] = _overlap("journal_divide1", "journal_divide2")

    ### 1 when both papers carry the same journal string, else 0.
    data["journal_common"] = _overlap("journal1", "journal2")

    ##############################################
    ### Topological Features
    ##############################################

    ##################### For citation network #######################

    def _ratio(numerator, denominator):
        # Division that yields 0 instead of failing on an empty denominator.
        return numerator / denominator if denominator else 0

    ### Degree of each paper in the undirected citation graph.
    degree1 = [graph.degree(u) for u in pos1]
    degree2 = [graph.degree(v) for v in pos2]
    data["neighbors_1"] = degree1
    data["neighbors_2"] = degree2

    ### Common neighbours, computed once and reused by the indices below.
    shared_nodes = [list(nx.common_neighbors(graph, u, v)) for u, v in pairs]
    n_shared = [len(nodes) for nodes in shared_nodes]
    data["common_neighbors"] = n_shared

    ### Link-based Jaccard coefficient: common neighbours relative to the size
    # of the union of both neighbourhoods.
    data["jaccard"] = [
        _ratio(count, len(set(graph[u]) | set(graph[v])))
        for count, (u, v) in zip(n_shared, pairs)
    ]

    ### Common neighbours divided by the product of the degrees.
    # NOTE(review): there is no square root in the denominator, so this is the
    # Leicht-Holme-Newman form rather than the Salton cosine index; the column
    # name and formula are kept unchanged for downstream compatibility.
    data["cosine"] = [
        _ratio(count, d1 * d2) for count, d1, d2 in zip(n_shared, degree1, degree2)
    ]

    ### Adamic-Adar index: shared neighbours weighted by 1 / log(degree), so
    # rarely connected neighbours count more. Degree-1 neighbours are skipped
    # because log(1) == 0 would divide by zero (only reachable when both ends
    # of the pair are the same node).
    data["adamic_adar_index"] = [
        sum(1 / np.log(graph.degree(w)) for w in nodes if graph.degree(w) > 1)
        for nodes in shared_nodes
    ]

    ### Resource allocation index: shared neighbours weighted by 1 / degree.
    data["resource_allocation"] = [
        sum(1. / graph.degree(w) for w in nodes) for nodes in shared_nodes
    ]

    ### Preferential attachment: product of the two degrees.
    data["preferential_attachement"] = [d1 * d2 for d1, d2 in zip(degree1, degree2)]

    ### same_community: 1 when both papers fall in the same Louvain community.
    partition = community.best_partition(graph, random_state=seed)
    data["same_community"] = [int(partition[u] == partition[v]) for u, v in pairs]

    ### Difference of (approximate) betweenness centrality, target minus source.
    # The sample size is capped at the number of nodes because sampling more
    # pivots than there are nodes raises an error.
    n_pivots = min(1000, graph.number_of_nodes())
    btw = nx.betweenness_centrality(graph, k=n_pivots, seed=seed)
    data["betweeness_diff"] = [btw[v] - btw[u] for u, v in pairs]

    ### Difference in the number of incoming citations, target minus source.
    data["in_link_diff"] = [
        len(graph2.in_edges(v)) - len(graph2.in_edges(u)) for u, v in pairs
    ]

    ##################### For co-authorship network #######################

    ### Although we are analysing the citation model, it is the authors who
    # decide what to cite, so author-level properties are included as well.

    def _collaboration_weights(authors_a, authors_b):
        # Edge weights for every (author of A, author of B) pair that has
        # an edge in the co-authorship graph.
        return [
            graph_auth[a][b]['weight']
            for a in authors_a
            for b in authors_b
            if graph_auth.has_edge(a, b)
        ]

    weights_per_pair = [
        _collaboration_weights(a, b) for a, b in zip(data["authors1"], data["authors2"])
    ]

    # Largest collaboration weight between any author of paper 1 and any
    # author of paper 2 (0 when there is none).
    data["max_author_collaboration"] = [max([0] + weights) for weights in weights_per_pair]

    # Sum of collaboration weights divided by the number of author pairs
    # (0 when either paper has no authors).
    data["mean_author_collaboration"] = [
        _ratio(sum(weights), len(a) * len(b))
        for weights, a, b in zip(weights_per_pair, data["authors1"], data["authors2"])
    ]

    ###########################################
    # Semantic Features
    ###########################################

    ### Document-vector similarity between the two papers, for titles and for
    # abstracts. Papers with similar content tend to cite each other.

    docvecs_title = model_title.docvecs
    data["simi_title"] = [docvecs_title.similarity(int(u), int(v)) for u, v in pairs]

    docvecs_abst = model_abst.docvecs
    data["simi_abst"] = [docvecs_abst.similarity(int(u), int(v)) for u, v in pairs]

    return data

In [6]:
def feature_creation(data,label,data_test,node_info,validation = False):
    """Build the graphs and document models, then featurise all pair sets.

    Parameters
    ----------
    data : numpy.ndarray
        Training pairs, two columns of paper IDs.
    label : array-like
        One value per training pair; pairs whose label equals 1 become edges
        of the citation graphs.
    data_test : numpy.ndarray
        Testing pairs, two columns of paper IDs.
    node_info : pandas.DataFrame
        Paper metadata with an "ID" column; it is passed through
        ``node_information_preprocess``. Its index labels are used as node
        identifiers, and nodes ``0 .. len(node_info) - 1`` are created, so a
        default integer index is assumed.
    validation : bool, optional
        When True, 20% of the training pairs are held out (fixed
        ``random_state=42``) and featurised separately; the graphs are built
        from the remaining 80% only.

    Returns
    -------
    tuple
        ``(data, label, data_test)``, or
        ``(data, label, data_val, label_val, data_test)`` when ``validation``
        is True. The ``data*`` items are feature DataFrames.

    Raises
    ------
    IndexError
        If a pair refers to a paper ID that is absent from ``node_info``.
    """
    node_info = node_information_preprocess(node_info)

    ### Creation of features
    data = pd.DataFrame(data.astype(int), columns=["ID1", "ID2"])
    data_test = pd.DataFrame(data_test.astype(int), columns=["ID1", "ID2"])

    ### Find the position of the papers in node_information.
    # Build the ID -> index-label map once instead of scanning node_info for
    # every row; setdefault keeps the first occurrence of a duplicated ID.
    id_to_pos = {}
    for position, paper_id in zip(node_info.index, node_info["ID"]):
        id_to_pos.setdefault(paper_id, position)

    def _positions(paper_ids):
        # Translate a column of paper IDs into node_info index labels.
        try:
            return [id_to_pos[paper_id] for paper_id in paper_ids]
        except KeyError as missing:
            raise IndexError("paper ID %s not found in node_info" % missing.args[0])

    for pairs in (data, data_test):
        pairs["pos1"] = _positions(pairs["ID1"])
        pairs["pos2"] = _positions(pairs["ID2"])

    if validation:
        data, data_val, label, label_val = train_test_split(data, label, test_size=0.2, random_state=42)
        # Work on independent frames so the later column assignments do not
        # target slices of the pre-split table.
        data = data.copy()
        data_val = data_val.copy()

    ### Citation graphs (undirected and directed) over all papers.
    graph = nx.Graph()
    graph2 = nx.DiGraph()
    node_ids = range(len(node_info))
    graph.add_nodes_from(node_ids)
    graph2.add_nodes_from(node_ids)

    # Walk rows and labels together by position. After the split the frame
    # keeps its shuffled original index while `label` is a plain array, so
    # pairing label[i] with data.loc[i] would mismatch or raise KeyError.
    for source, target, is_cited in zip(data["pos1"].tolist(), data["pos2"].tolist(), label):
        if is_cited == 1:
            graph.add_edge(source, target)
            graph2.add_edge(source, target)

    ### Co-authorship graph: edge weight = number of papers written together.
    graph_auth = nx.Graph()
    for authors in node_info["authors_"]:
        for i in range(len(authors)):
            for j in range(i):
                first, second = authors[i], authors[j]
                if graph_auth.has_edge(first, second):
                    graph_auth[first][second]['weight'] += 1
                else:
                    graph_auth.add_edge(first, second, weight=1)

    def _train_doc2vec(texts):
        # Train a Doc2Vec model in which document i is tagged with i.
        # Non-string cells (e.g. missing values) are treated as empty text.
        corpus = [
            gensim.models.doc2vec.TaggedDocument(
                gensim.parsing.preprocess_string(text if isinstance(text, str) else ""),
                [tag])
            for tag, text in enumerate(texts)
        ]
        model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=5, epochs=100)
        model.build_vocab(corpus)
        model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)
        return model

    ### Document-vector models for abstracts and titles.
    model_abst = _train_doc2vec(node_info["abstracts"])
    model_title = _train_doc2vec(node_info["title"])

    data = feature_engineering(data, node_info, graph, graph2, graph_auth, model_abst, model_title)
    data_test = feature_engineering(data_test, node_info, graph, graph2, graph_auth, model_abst,
                                    model_title)
    if validation:
        data_val = feature_engineering(data_val, node_info, graph, graph2, graph_auth, model_abst, model_title)
        return data,label,data_val,label_val,data_test

    return data,label,data_test

In [None]:
# Featurise the first 100 training pairs and the first 100 testing pairs
# without a validation split.
# Note: this rebinds `data_test` to the value returned by feature_creation.
data_, label_, data_test = feature_creation(
    data=data[:100, :],
    label=label[:100],
    data_test=data_test[:100],
    node_info=node_info,
)

In [None]:
# Same sample of 100 training / 100 testing pairs, this time holding out a
# validation split.
# Note: this rebinds `data_test` to the value returned by feature_creation.
data_, label_, data_val, label_val, data_test = feature_creation(
    data=data[:100, :],
    label=label[:100],
    data_test=data_test[:100],
    node_info=node_info,
    validation=True,
)

In [7]:
############################################
# third step: model selection
############################################