In [63]:
import networkx as nx
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.metrics import roc_auc_score

from tqdm import tqdm

coau_df = pd.read_csv("data/coauthorsCG-edges.csv",delimiter=";")
#El dataset solo contiene edges del  2015 al  2019

G = nx.from_pandas_edgelist(coau_df,"Source","Target",['Weight','year'],create_using=nx.Graph())
nodos = list(G.nodes)


In [64]:
def connected_component_subgraphs(G):
    for c in nx.connected_components(G):
        yield G.subgraph(c)

In [65]:
coau_df2 = coau_df[coau_df["year"]<2018]
G2 = nx.from_pandas_edgelist(coau_df2,"Source","Target",['Weight','year'],create_using=nx.Graph())
nodos2 = list(G2.nodes)
nodosNuevos = set(nodos)-set(nodos2)
print(len(nodosNuevos))#Cantidad de nodos nuevos en el 2018

219


In [66]:
#Obteniendo dataframe de pares de nodos que no tienen coneccion

adj_G = nx.to_numpy_matrix(G, nodelist = nodos)
all_unconnected_pairs = []

offset = 0
for i in range(adj_G.shape[0]):
  for j in range(offset,adj_G.shape[1]):
    if i != j and adj_G[i,j] == 0:
        all_unconnected_pairs.append((nodos[i],nodos[j]))

  offset = offset + 1

node_1_unlinked = [i[0] for i in all_unconnected_pairs]
node_2_unlinked = [i[1] for i in all_unconnected_pairs]

#Creando dataframe para almacenar a los pares de nodos no conectados que se recogieron
data = pd.DataFrame({'Source':node_1_unlinked,'Target':node_2_unlinked})
data['Weight'] = 0
data['link'] = 0

indices = range(len(data['Source']))

#Separando el dataframe en datos para train y para test

noRemoveIn,removeIn = train_test_split(indices,test_size=0.30,random_state=32)

test= data.copy()
test = test.drop(index=noRemoveIn)

data = data.drop(index=removeIn)

data.shape #Cantidad de ejemplos negativos 

(71159, 4)

In [67]:
#Trabajando con los pares de nodos que si estan conectados

#Obteniendo lista de los links que se pueden borrar (que no eliminan nodos, o dividen al grafo en mas de una componente) y que
#estan despues del año establecido

year = 2018
initial_node_count = len(G.nodes)

coau_df_temp = coau_df.copy()

omissible_links_index = []
for i in tqdm(coau_df.index.values):
    if G.adj[coau_df.values[i,0]][coau_df.values[i,1]]['year']>=year:
        # Eliminar una arista y construir un nuevo grafo sin esa unica arista
        G_temp = nx.from_pandas_edgelist(coau_df_temp.drop(index=i), "Source", "Target",["Weight"], create_using=nx.Graph())

        # Verificando que al eliminar este par, no parte el grafo, y que el numero de nodos siga siendo el mismo

        if (nx.number_connected_components(G_temp) == 1) and (len(G_temp.nodes) == initial_node_count):
            omissible_links_index.append(i)
            coau_df_temp = coau_df_temp.drop(i)

#creando dataframe de edges que se pueden remover

coau_df_temp2 = coau_df.copy()
coau_df_temp2["link"]=1

#Separando dataframe en datos para train y test

testlinks = coau_df_temp2.loc[omissible_links_index]
coau_df_temp = coau_df_temp2.drop(index=omissible_links_index)

data = data.append(coau_df_temp[['Source', 'Target', 'link','Weight']], ignore_index=True)
data["Weight"] = data["Weight"].astype('int64')

test = test.append(testlinks[['Source', 'Target', 'link','Weight']], ignore_index=True)
data.shape

100%|█████████████████████████████████████████████████████████████████████████████| 1175/1175 [00:04<00:00, 278.01it/s]


(71922, 4)

In [68]:
#Definiendo varias metricas para link prediction

def commonNeighbors(G,source,target):
    adj = dict(G.adj[source])
    adj2 = dict(G.adj[target])
    commonNeighbors = set(adj.keys()) & set(adj2.keys())
    return commonNeighbors

def jaccardCoeff(G,source,target):
    adj = dict(G.adj[source])
    adj2 = dict(G.adj[target])
    unionNeighbors = set(adj.keys()) | set(adj2.keys())
    commonNeighbors = set(adj.keys()) & set(adj2.keys())
    return len(commonNeighbors)/len(unionNeighbors)

def adamicAdar(G,source,target):
    cn = commonNeighbors(G,source,target)
    coeff = 0
    for neighbor in cn:
        coeff += 1/np.log(len(dict(G.adj[neighbor]).keys()))
    return coeff

def similarity(G,source,target):
    adj = dict(G.adj[source])
    adj2 = dict(G.adj[target])
    y=0.5
    val = 0
    for a in set(adj.keys()):
        for b in set(adj2.keys()):
            if a==b:
                val+= 1
            else:
                val+= similarity(G,a,b)
    return val/(len(adj.keys())*len(adj2.keys()))

In [69]:
#Creando nuevo grafo sin los links eliminados aleatoriamente
G_data = nx.from_pandas_edgelist(coau_df_temp, "Source", "Target",["Weight"], create_using=nx.Graph())


In [72]:
#Obteniendo las features, y armando los arrays para usarlo en la regresion logistica

Xtrain =[]
Ytrain =[]
for i in data.values:
    Xtrain.append([len(commonNeighbors(G_data,i[0],i[1])),adamicAdar(G_data,i[0],i[1])])
    Ytrain.append(i[3])

Xtest = []
Ytest = []
for i in test.values:
    Xtest.append([len(commonNeighbors(G_data,i[0],i[1])),adamicAdar(G_data,i[0],i[1])])
    Ytest.append(i[3])


In [73]:
lr = LogisticRegression(class_weight="balanced")

lr.fit(Xtrain, Ytrain)

predictions = lr.predict(Xtest)
print("Regresion logistica",roc_auc_score(Ytest, predictions))

Regresion logistica 0.8202119434953176


In [74]:
# Accuracy de varios tipos de aciertos 

aciertosTotales = 0
unosAcertados = 0
cerosAcertados = 0

suma = 0
for tst, y in zip(Xtest,Ytest):

    predic = lr.predict([tst])[0]    
    if y == predic:
        aciertosTotales+=1
        
    if y == 1 and predic == 1:
        unosAcertados+=1
    #if tst[0]!= 0 and tst[1]!=0 and y==1:
    #    suma+=1
    if y == 0 and predic ==0:
        cerosAcertados+=1

acc = aciertosTotales/len(Ytest)
print("Aciertos totales:",acc)

acc2 = unosAcertados/sum(Ytest)
print("unos acertados:",acc2)

acc3 = cerosAcertados/(len(Ytest)-sum(Ytest))
print("ceros acertados:",acc3)


Aciertos totales: 0.9688763790481737
unos acertados: 0.6674757281553398
ceros acertados: 0.9729481588352953


In [75]:
from sklearn.svm import SVC
clf = SVC(gamma='scale')

clf.fit(Xtrain, Ytrain)
predictions = clf.predict(Xtest)
print("SVM: ",roc_auc_score(Ytest, predictions))

SVM:  0.5191387597888827
