In [None]:
import random
import numpy as np

import igraph
import networkx as nx
from tqdm import tqdm_notebook,tqdm
import pandas as pd

import nltk
from gensim.models import Word2Vec, Doc2Vec
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Folder holding the input files; paths are built by plain concatenation, so keep the trailing slash.
path_to_data = "data/"

# Column names assigned to the input files (passed through `names=`).
pair_columns = ["source", "target"]
node_columns = ["id", "year", "title", "author", "classification", "description"]

# Space-separated (source, target[, label]) pairs.
training_set = pd.read_table(path_to_data + "training_set.txt", sep=" ", names=pair_columns + ["label"])
testing_set = pd.read_table(path_to_data + "testing_set.txt", sep=" ", names=pair_columns)

# One row of attributes per node.
node_info = pd.read_csv(path_to_data + "node_information.csv", names=node_columns)

In [None]:
# Hook tqdm into pandas: the cells below call `progress_apply` on Series and DataFrames.
# NOTE(review): "Desc" is passed positionally; confirm it really ends up as the bar description.
tqdm_notebook().pandas("Desc")

## Semantic Features

In [None]:
# Number of dimensions of the Doc2Vec vectors trained below.
N_VECTORIZATION = 30

# Porter stemmer applied to the tokens of the Doc2Vec corpora.
stemmer = nltk.stem.PorterStemmer()

# English stop words handed to the TF-IDF vectorizers.
english_stopwords = nltk.corpus.stopwords.words("english")
stpwds = set(english_stopwords)

### Description

In [None]:
# TF-IDF over uni- to tri-grams, capped at 2000 features, rows L2-normalised.
# `stop_words` is passed as a list: recent scikit-learn releases validate this parameter and
# only accept 'english', a list or None, so handing over the `stpwds` set raises an error.
tfidf = TfidfVectorizer(stop_words=sorted(stpwds), ngram_range=(1,3), max_features=2000, norm="l2")

In [None]:
# TF-IDF matrix of the descriptions: one row per node, in `node_info` row order.
description_texts = node_info["description"].tolist()
vectorized_desc = tfidf.fit_transform(description_texts)

In [None]:
# Split every description on single spaces, drop purely numeric tokens and tokens
# shorter than 3 or longer than 14 characters, then stem what is left.
corpus = [
    [
        stemmer.stem(token)
        for token in text.split(" ")
        if not token.isdigit() and 2 < len(token) < 15
    ]
    for text in tqdm_notebook(node_info["description"])
]

In [None]:
from collections import namedtuple

# Lightweight document type exposing `words` and `tags`, as consumed by Doc2Vec below.
analyzedDocument = namedtuple('AnalyzedDocument', 'words tags')

# Each document is tagged with its position in `corpus`.
docs = [
    analyzedDocument(words, [i])
    for i, words in tqdm_notebook(enumerate(corpus))
]

In [None]:
# Train Doc2Vec on the description documents.
# NOTE(review): `size=` and `model.docvecs` look like the pre-4.0 gensim spellings; confirm the installed version.
model = Doc2Vec(docs, size=N_VECTORIZATION, window=8, min_count=5, workers=4)

# Collect one vector per document tag as a plain list.
n_documents = len(model.docvecs)
liste = [list(model.docvecs[i]) for i in tqdm_notebook(range(n_documents))]

# Attach the vectors to node_info as description_d2v_<k> columns, aligned on the row index.
description_d2v_columns = ["description_d2v_"+str(i) for i in range(N_VECTORIZATION)]
description_d2v = pd.DataFrame(liste, columns=description_d2v_columns)
node_info = node_info.merge(description_d2v, how='outer', left_index=True, right_index=True)

### Title

In [None]:
# Fresh TF-IDF vectorizer for the titles: uni- to tri-grams, 2000 features max, rows L2-normalised.
# `stop_words` is passed as a list: recent scikit-learn releases validate this parameter and
# only accept 'english', a list or None, so handing over the `stpwds` set raises an error.
tfidf = TfidfVectorizer(stop_words=sorted(stpwds), ngram_range=(1,3), max_features=2000, norm="l2")

In [None]:
# TF-IDF matrix of the titles: one row per node, in `node_info` row order.
title_texts = node_info["title"].tolist()
vectorized_title = tfidf.fit_transform(title_texts)

In [None]:
# Split every title on single spaces, drop purely numeric tokens and tokens
# shorter than 3 or longer than 14 characters, then stem what is left.
corpus = [
    [
        stemmer.stem(token)
        for token in text.split(" ")
        if not token.isdigit() and 2 < len(token) < 15
    ]
    for text in tqdm_notebook(node_info["title"])
]

In [None]:
from collections import namedtuple

# Lightweight document type exposing `words` and `tags`, as consumed by Doc2Vec below.
analyzedDocument = namedtuple('AnalyzedDocument', 'words tags')

# Each title document is tagged with its position in `corpus`.
docs = [
    analyzedDocument(words, [i])
    for i, words in tqdm_notebook(enumerate(corpus))
]

In [None]:
# Train Doc2Vec on the title documents.
# NOTE(review): `size=` and `model.docvecs` look like the pre-4.0 gensim spellings; confirm the installed version.
model = Doc2Vec(docs, size=N_VECTORIZATION, window=8, min_count=5, workers=4)

# Collect one vector per document tag as a plain list.
n_documents = len(model.docvecs)
liste = [list(model.docvecs[i]) for i in range(n_documents)]

# Attach the vectors to node_info as title_d2v_<k> columns, aligned on the row index.
title_d2v_columns = ["title_d2v_"+str(i) for i in range(N_VECTORIZATION)]
title_d2v = pd.DataFrame(liste, columns=title_d2v_columns)
node_info = node_info.merge(title_d2v, how='outer', left_index=True, right_index=True)

## Data Cleaning

The author field of `node_info` is cleaned here, and the training and testing sets are then joined with the node attributes so that later features can be computed directly from each (source, target) row.

In [None]:
def clear_name(val):
    """Normalise a raw comma-separated author string.

    Each author has one leading space removed, inner spaces replaced by ";"
    and parentheses stripped; the authors are then joined with single spaces.

    Parameters
    ----------
    val : str or float
        Raw author field; NaN marks a missing value.

    Returns
    -------
    str
        The cleaned author string, "" for a missing value.
    """
    if val != val:  # NaN is the only value that differs from itself
        return ""
    cleaned = []
    for name in val.split(","):
        # startswith() is safe on empty segments ("A,,B", trailing comma, empty field),
        # where indexing the first character would raise IndexError.
        if len(name) > 1 and name.startswith(" "):
            name = name[1:]
        name = ";".join(name.split(" "))
        cleaned.append(name.replace("(", "").replace(")", ""))
    return " ".join(cleaned)

# Replace every raw author field by its cleaned form.
cleaned_authors = node_info["author"].progress_apply(clear_name)
node_info["author"] = cleaned_authors

In [None]:
# Row position of each node; later cells use it as igraph vertex index and TF-IDF row number.
# NOTE(review): the Series is aligned on the index, so this assumes node_info has a 0..n-1 index.
n_nodes = len(node_info)
node_info["pos"] = pd.Series(range(n_nodes))

In [None]:
def _with_node_info(pairs):
    """Left-join the node attributes onto `pairs`, first for the source id, then for the target id.

    Columns present on both sides of the second join get the "_source" / "_target" suffixes.
    """
    with_source = pairs.merge(node_info, how="left", left_on="source", right_on="id")
    return with_source.merge(
        node_info, how="left", left_on="target", right_on="id", suffixes=["_source", "_target"]
    )


training_set = _with_node_info(training_set)
testing_set = _with_node_info(testing_set)

In [None]:
# Drop the intermediate text-processing objects; the later cells do not use them.
del tfidf, corpus
del model, analyzedDocument, docs

## Features based on graph topology

In [None]:
# Only the positively labelled training pairs are edges of the graphs.
edges = training_set[training_set["label"]==1]

# igraph: vertices are added in node_info order, so edges are expressed with the `pos_*` columns.
# NOTE(review): the graph is created without `directed=True`; confirm that an undirected
# graph is intended given the later indegree() / mode="IN" calls.
ig = igraph.Graph()
ig.add_vertices(node_info.id)
ig.add_edges(list(zip(edges.pos_source, edges.pos_target)))

# networkx: nodes are the ids themselves; same edges once undirected, once directed.
id_pairs = list(zip(edges.source, edges.target))

g = nx.Graph()
g.add_nodes_from(node_info.id)
g.add_edges_from(id_pairs)

dg = nx.DiGraph()
dg.add_nodes_from(node_info.id)
dg.add_edges_from(id_pairs)

In [None]:
def common_neighbor(line):
    """Tell whether the source and target vertices share at least one neighbour in `ig`."""
    neighbours_source = ig.neighbors(line["pos_source"], mode="ALL")
    neighbours_target = ig.neighbors(line["pos_target"], mode="ALL")
    return not set(neighbours_source).isdisjoint(neighbours_target)

# Same feature on both sets.
for frame in (training_set, testing_set):
    frame["common_neighbor"] = frame.progress_apply(common_neighbor, axis=1)

In [None]:
def jaccard(line):
    """Jaccard index of the source and target neighbourhoods in `ig`.

    Returns 1 when neither vertex has any neighbour.
    """
    neighbours_source = set(ig.neighbors(line["pos_source"], mode="ALL"))
    neighbours_target = set(ig.neighbors(line["pos_target"], mode="ALL"))
    union = neighbours_source | neighbours_target
    if not union:  # both neighbourhoods are empty
        return 1
    shared = neighbours_source & neighbours_target
    return len(shared) / len(union)

# Same feature on both sets.
for frame in (training_set, testing_set):
    frame["jaccard"] = frame.progress_apply(jaccard, axis=1)

In [None]:
# Betweenness of every vertex of `ig`, indexed by vertex position.
betweenness = ig.betweenness()

def _betweenness_of(position):
    """Look up the betweenness of the vertex at `position`."""
    return betweenness[position]

for frame in (training_set, testing_set):
    frame["betweenness_source"] = frame["pos_source"].progress_apply(_betweenness_of)
    frame["betweenness_target"] = frame["pos_target"].progress_apply(_betweenness_of)
    # Signed gap between the two endpoints.
    frame["diff_in_bc"] = frame["betweenness_target"] - frame["betweenness_source"]

del betweenness

In [None]:
# indegree() of every vertex of `ig`, indexed by vertex position.
# NOTE(review): `ig` appears to be built as an undirected graph, in which case this is the
# total degree rather than a count of incoming citations; confirm.
inlinks = ig.indegree()

def diff_in_inlinks(line):
    """Inlink count of the target minus inlink count of the source."""
    inlinks_target = inlinks[line["pos_target"]]
    inlinks_source = inlinks[line["pos_source"]]
    return inlinks_target - inlinks_source

# Same feature on both sets.
for frame in (training_set, testing_set):
    frame["diff_in_inlinks"] = frame.progress_apply(diff_in_inlinks, axis=1)

In [None]:
# Cluster = connected component

# Component id of every vertex of `ig`, indexed by vertex position.
cluster = ig.clusters().membership

def same_cluster(line):
    """Return 1 when source and target lie in the same connected component, else 0.

    Component ids are arbitrary labels, so only their equality is meaningful;
    the numeric difference between two ids carries no information.
    """
    return int(cluster[line["pos_target"]] == cluster[line["pos_source"]])

training_set["same_cluster"] = training_set.progress_apply(same_cluster,axis=1)
testing_set["same_cluster"] = testing_set.progress_apply(same_cluster,axis=1)

del(cluster)

In [None]:
# Eccentricity: the largest shortest-path distance between a vertex and any other vertex it is connected to.

eccentricity = ig.eccentricity(mode="IN")

def target_eccentricty(line):
    """Look up the eccentricity of the row's target vertex."""
    position = line["pos_target"]
    return eccentricity[position]

# Same feature on both sets.
for frame in (training_set, testing_set):
    frame["target_eccentricty"] = frame.progress_apply(target_eccentricty, axis=1)

del eccentricity

In [None]:
# Distance
# Directed shortest-path lengths between all pairs of nodes at most PATH_CUTOFF hops apart.
PATH_CUTOFF = 5

# Recent networkx versions return an iterator of (node, {target: length}) pairs here;
# dict() turns it into the nested mapping that shortest_path() subscripts
# (and is a no-op copy on versions that already return a dict).
dijsktra = dict(nx.all_pairs_dijkstra_path_length(dg, cutoff=PATH_CUTOFF))

def shortest_path(line):
    """Return the directed distance from source to target, or PATH_CUTOFF + 1 when none was found.

    NOTE(review): `dg` contains the positive training edges themselves, so a positive
    training row is always at distance 1; confirm this is acceptable for training.
    """
    try:
        return dijsktra[line["source"]][line["target"]]
    except KeyError:
        # Target not reachable from source within the cutoff. Only the missing-key case
        # is handled so that genuine errors are no longer silently turned into a distance.
        return PATH_CUTOFF + 1

training_set["shortest_path"] = training_set.progress_apply(shortest_path,axis=1)
testing_set["shortest_path"] = testing_set.progress_apply(shortest_path,axis=1)

del(dijsktra)

## Features based on graph topology for the author graph

In [None]:
#creation graph for author -> two authors are linked if one cites another
auth_list = []

for author_names in node_info["author"]:
    for name in author_names.split("_"):
        auth_list.append(name)
        
auth_list = set(auth_list)
auth_list.remove("")
auth_list = list(auth_list)

author_to_index = dict(zip(auth_list, range(len(auth_list))))

g_author = igraph.Graph()
g_author.add_vertices(len(auth_list))

lst_edges = []

auth_src_l, auth_tgt_l = training_set["author_source"], training_set["author_target"]
for i in range(len(auth_src_l)):
    auth_src = auth_src_l[i]
    auth_tgt = auth_tgt_l[i]
    
    if auth_src != "" and auth_tgt != "":
        auth_src = auth_src.split("_")
        auth_tgt = auth_tgt.split("_")
        for name_scr in auth_src:
            for name_tgt in auth_tgt:
                if name_scr != "" and name_tgt!="" and not g_author.are_connected(author_to_index[name_scr], author_to_index[name_tgt]):
                    lst_edges.append((author_to_index[name_scr], author_to_index[name_tgt]))

lst_edges = list(set(lst_edges))
g_author.add_edges(lst_edges)    
print("author graph created")

In [None]:
betweenness = g_author.betweenness()

def auth_betweenness(elt):
    auth_list = []
    for name in elt.split("_"):
        auth_list.append(name)
    l = []
    for auth in auth_list:
        if auth != "" and auth in author_to_index:
            l.append(betweenness[author_to_index[auth]])
        else:
            l.append(0)
    return max(l)

training_set["betweenness_author_target"] = training_set["author_target"].progress_apply(auth_betweenness)
testing_set["betweenness_author_target"] = testing_set["author_target"].progress_apply(auth_betweenness)

In [None]:
inlinks = g_author.indegree()

def auth_inlinks(elt):
    auth_list = []
    for name in elt.split("_"):
        auth_list.append(name)
    l = []
    for auth in auth_list:
        if auth != "" and auth in author_to_index:
            l.append(inlinks[author_to_index[auth]])
        else:
            l.append(0)
    return max(l)

training_set["inlinks_author_target"] = training_set["author_target"].progress_apply(auth_inlinks)
testing_set["inlinks_author_target"] = testing_set["author_target"].progress_apply(auth_inlinks)

## Semantic Features

In [None]:
# Cosine similarity between the TF-IDF vectors of the two titles (rows of `vectorized_title`)

def cos_similarity_title(line):
    """Return the cosine similarity of the source and target title vectors.

    Returns 0.0 when either vector is null (no retained term in the title);
    dividing by a zero norm would otherwise produce NaN.
    """
    vec_source = vectorized_title[line["pos_source"],:]
    vec_target = vectorized_title[line["pos_target"],:]
    scal = vec_source.dot(vec_target.T)[0,0]
    source = vec_source.dot(vec_source.T)[0,0]
    target = vec_target.dot(vec_target.T)[0,0]
    if source == 0 or target == 0:
        return 0.0
    return scal/np.sqrt(target*source)

training_set["cos_similarity_title"] = training_set.progress_apply(cos_similarity_title,axis=1)
testing_set["cos_similarity_title"] = testing_set.progress_apply(cos_similarity_title,axis=1)

In [None]:
# Cosine similarity between the TF-IDF vectors of the two descriptions (rows of `vectorized_desc`)

def cos_similarity_description(line):
    """Return the cosine similarity of the source and target description vectors.

    Returns 0.0 when either vector is null (no retained term in the description);
    dividing by a zero norm would otherwise produce NaN.
    """
    vec_source = vectorized_desc[line["pos_source"],:]
    vec_target = vectorized_desc[line["pos_target"],:]
    scal = vec_source.dot(vec_target.T)[0,0]
    source = vec_source.dot(vec_source.T)[0,0]
    target = vec_target.dot(vec_target.T)[0,0]
    if source == 0 or target == 0:
        return 0.0
    return scal/np.sqrt(target*source)

training_set["cos_similarity_desc"] = training_set.progress_apply(cos_similarity_description,axis=1)
testing_set["cos_similarity_desc"] = testing_set.progress_apply(cos_similarity_description,axis=1)

## Attributes Features

In [None]:
def diff_in_year(line):
    """Return the target's year minus the source's year."""
    year_source = line["year_source"]
    year_target = line["year_target"]
    return year_target - year_source

# Same feature on both sets.
for frame in (training_set, testing_set):
    frame["diff_in_year"] = frame.progress_apply(diff_in_year, axis=1)

In [None]:
def nb_common_author(line):
    """Count the distinct source author names that match some target author name.

    Author fields are split on single spaces; two names match when one is a
    substring of the other.
    """
    names_source = {name for name in line["author_source"].split(" ") if name not in ("", " ")}
    names_target = {name for name in line["author_target"].split(" ") if name not in ("", " ")}
    return sum(
        1
        for name in names_source
        if any(name in other or other in name for other in names_target)
    )


# Same feature on both sets.
for frame in (training_set, testing_set):
    frame["author_nb_common"] = frame.progress_apply(nb_common_author, axis=1)

In [None]:
def one_common_author(line):
    """Return 1 when some source author name matches some target author name, else 0.

    Author fields are split on single spaces; two names match when one is a
    substring of the other.
    """
    names_source = {name for name in line["author_source"].split(" ") if name not in ("", " ")}
    names_target = {name for name in line["author_target"].split(" ") if name not in ("", " ")}
    found = any(
        name in other or other in name
        for name in names_source
        for other in names_target
    )
    return 1 if found else 0

# Same feature on both sets.
for frame in (training_set, testing_set):
    frame["author_is_one_common"] = frame.progress_apply(one_common_author, axis=1)

In [None]:
def nb_common_classification(line):
    """Return an overlap score in [0, 1] between the two classification fields.

    Each field is split on "."; a source part matches when it is a substring of
    a target part or the other way round. The score is
    2 * matches / (number of source parts + number of target parts).

    Returns 0 when either field is missing (NaN / non-string) or when neither
    field contains a usable part, which would otherwise divide by zero.
    """
    field_source = line["classification_source"]
    field_target = line["classification_target"]
    if not isinstance(field_source, str) or not isinstance(field_target, str):
        return 0

    parts_source = {part for part in field_source.split(".") if part not in ("", " ")}
    parts_target = {part for part in field_target.split(".") if part not in ("", " ")}

    total = len(parts_source) + len(parts_target)
    if total == 0:  # e.g. both fields are "." or blank
        return 0

    count = sum(
        1
        for part in parts_source
        if any(part in other or other in part for other in parts_target)
    )
    return 2*count/total

# Same feature on both sets.
for frame in (training_set, testing_set):
    frame["common_classification"] = frame.progress_apply(nb_common_classification, axis=1)

In [None]:
def one_common_word(line):
    """Return 1 when the two titles share at least one space-separated word, else 0."""
    words_source = {word for word in line["title_source"].split(" ") if word not in ("", " ")}
    words_target = {word for word in line["title_target"].split(" ") if word not in ("", " ")}
    if words_source.isdisjoint(words_target):
        return 0
    return 1

# Same feature on both sets.
for frame in (training_set, testing_set):
    frame["title_is_one_common"] = frame.progress_apply(one_common_word, axis=1)

In [None]:
def nb_common_word(line):
    """Return the number of distinct space-separated words present in both titles."""
    words_source = {word for word in line["title_source"].split(" ") if word not in ("", " ")}
    words_target = {word for word in line["title_target"].split(" ") if word not in ("", " ")}
    shared = words_source.intersection(words_target)
    return len(shared)

# Same feature on both sets.
for frame in (training_set, testing_set):
    frame["title_nb_common_word"] = frame.progress_apply(nb_common_word, axis=1)

In [None]:
# indegree() list taken directly from the paper graph `ig`. A dedicated name is used instead of
# the shared global `inlinks`, which is not guaranteed to still describe `ig` when this cell runs
# (the name is also used for author-graph values in this notebook).
paper_inlinks = ig.indegree()

def nb_target_cited(line):
    """Return the indegree() value of the row's target vertex in the paper graph `ig`."""
    return paper_inlinks[line["pos_target"]]

training_set["nb_target_cited"] = training_set.progress_apply(nb_target_cited,axis=1)
testing_set["nb_target_cited"] = testing_set.progress_apply(nb_target_cited,axis=1)

## Output

In [None]:
# Write both enriched sets next to the input data.
outputs = (
    (training_set, "improved_training_set.csv"),
    (testing_set, "improved_testing_set.csv"),
)
for frame, filename in outputs:
    frame.to_csv(path_to_data + filename)