In [None]:
import networkx as nx
import numpy as np
import pandas as pd
import json

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier

from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

from tqdm import tqdm

In [None]:
#Defining several metrics for link prediction
def commonNeighbors(G,source,target):
  """Return the set of nodes adjacent to both ``source`` and ``target``.

  ``G`` only needs an ``adj`` mapping from each node to a mapping keyed by
  its neighbours (the interface this file uses on NetworkX graphs).
  """
  source_neighbors = set(G.adj[source])
  target_neighbors = set(G.adj[target])
  return source_neighbors.intersection(target_neighbors)

def jaccardCoeff(G,source,target):
  """Return the Jaccard coefficient of the neighbourhoods of two nodes.

  The coefficient is ``|N(source) & N(target)| / |N(source) | N(target)|``.

  Parameters:
    G: graph-like object exposing ``adj[node]`` as a mapping keyed by neighbours.
    source, target: nodes present in ``G``.

  Returns:
    A float in ``[0, 1]``. When neither node has any neighbour the union is
    empty and ``0.0`` is returned instead of dividing by zero.

  Raises:
    KeyError: if ``source`` or ``target`` is not a node of ``G``.
  """
  source_neighbors = set(G.adj[source])
  target_neighbors = set(G.adj[target])
  union = source_neighbors | target_neighbors
  if not union:
    # Two isolated nodes share nothing; avoid ZeroDivisionError.
    return 0.0
  return len(source_neighbors & target_neighbors)/len(union)

def adamicAdar(G,source,target):
  """Return the Adamic-Adar index of ``source`` and ``target``.

  Sums ``1 / log(degree)`` over every common neighbour of the two nodes.
  Common neighbours with degree 1 or less are skipped, since ``log(1) == 0``
  would make the term undefined.
  """
  degrees = (len(G.adj[shared]) for shared in commonNeighbors(G,source,target))
  return sum(1/np.log(degree) for degree in degrees if degree > 1)

def similarity(G,source,target,max_depth=2,decay=1.0):
    """Return a depth-bounded recursive neighbourhood similarity of two nodes.

    The score of a pair is the average, over every pair ``(a, b)`` drawn from
    the neighbours of ``source`` and of ``target``, of ``1`` when ``a == b``
    and otherwise of the score of ``(a, b)`` one level deeper. Without a bound
    this recursion never terminates on most graphs (neighbours point back to
    the nodes they came from), so it is cut off after ``max_depth`` levels;
    unequal pairs at the last level contribute ``0``.

    Parameters:
        G: graph-like object exposing ``adj[node]`` as a mapping keyed by neighbours.
        source, target: nodes present in ``G``.
        max_depth: number of neighbourhood levels to expand (must be >= 1).
        decay: factor applied to the average at every level. The default ``1.0``
            leaves the plain average untouched; the unused local ``y = 0.5`` that
            used to live here suggests a damping factor was intended, so pass
            ``decay=0.5`` for that variant.

    Returns:
        A float; ``0.0`` when either node has no neighbours.

    Raises:
        ValueError: if ``max_depth`` is smaller than 1.
        KeyError: if a node is not in ``G``.
    """
    if max_depth < 1:
        raise ValueError("max_depth must be >= 1")

    # Scores are memoised per (pair, remaining depth) so shared sub-problems
    # are evaluated once instead of once per path that reaches them.
    cache = {}

    def _score(u, v, depth):
        key = (u, v, depth)
        if key in cache:
            return cache[key]
        nbrs_u = set(G.adj[u])
        nbrs_v = set(G.adj[v])
        if not nbrs_u or not nbrs_v:
            # No neighbour pairs to average over; avoid dividing by zero.
            result = 0.0
        else:
            total = 0.0
            for a in nbrs_u:
                for b in nbrs_v:
                    if a == b:
                        total += 1
                    elif depth > 1:
                        total += _score(a, b, depth - 1)
            result = decay * total / (len(nbrs_u) * len(nbrs_v))
        cache[key] = result
        return result

    return _score(source, target, max_depth)

def commonKeywords(nodes_df,source,target):
  """Return how many keywords two nodes share.

  Keywords are read from the ``KeywordsB2018`` column of the row whose ``ID``
  equals the node, as a comma-separated string.

  Parameters:
    nodes_df: DataFrame with ``ID`` and ``KeywordsB2018`` columns.
    source, target: node identifiers to look up in ``ID``.

  Returns:
    The number of distinct keywords present for both nodes. A node that is
    missing from ``nodes_df`` or whose keyword cell is empty/NaN has no
    keywords, so it shares none (previously two NaN cells both became the
    string ``"nan"`` and counted as one shared keyword).
  """
  def _keywords(node_id):
    cells = nodes_df.loc[nodes_df["ID"]==node_id, "KeywordsB2018"]
    if cells.empty or pd.isna(cells.iloc[0]):
      return set()
    # Strip surrounding whitespace so "a, b" and "b" agree on "b", and drop
    # empty fragments produced by stray commas.
    return {kw.strip() for kw in str(cells.iloc[0]).split(",") if kw.strip()}

  return len(_keywords(source) & _keywords(target))

#Binary counter over a list of booleans (steps down by one; index 0 is the least-significant bit)
def contador(l):
    """Step a boolean counter down by one, in place, and return the same list.

    Index 0 is the least-significant bit. Bits are flipped from index 0 up to
    and including the first one that was true; an all-false list wraps around
    to all-true.
    """
    for position, bit in enumerate(l):
        l[position] = not bit
        if bit:
            # A true bit turned false absorbs the borrow; higher bits stay put.
            break
    return l

In [None]:
def _classification_metrics(y_true, y_pred):
  """Return accuracy, ROC AUC, recall and precision for one set of predictions."""
  # NOTE(review): roc_auc is computed from hard class labels (as this analysis
  # always did), not from decision scores or probabilities.
  return {
      'accuracy':accuracy_score(y_true,y_pred),
      'roc_auc':roc_auc_score(y_true, y_pred),
      'recall':recall_score(y_true,y_pred),
      'precision':precision_score(y_true,y_pred)
  }


def _link_features(G_data, nodos_df, pares):
  """Build the feature matrix and label vector for a frame of node pairs.

  ``pares`` must have ``Source``, ``Target`` and ``link`` columns. The feature
  columns are, in order: common-neighbour count, Jaccard coefficient,
  Adamic-Adar index and shared-keyword count.
  """
  X = [[len(commonNeighbors(G_data,u,v)),jaccardCoeff(G_data,u,v),
        adamicAdar(G_data,u,v),commonKeywords(nodos_df,u,v)]
       for u, v in zip(pares["Source"], pares["Target"])]
  return np.array(X), pares["link"].to_numpy()


universidades = ['ESPOL']
predictionsDic = {}
cont=0  # not used in this cell; kept in case another cell reads it
for univ in tqdm(universidades,desc='Universidades'):
  archivo_edges = "data/grafo_completo/coauthors-edges-"+univ+".csv"
  archivo_nodes = "data/grafo_completo/coauthors-nodes-"+univ+".csv"
  coau_df = pd.read_csv(archivo_edges,delimiter=";")
  nodos_df = pd.read_csv(archivo_nodes,delimiter=";")
  G = nx.from_pandas_edgelist(coau_df,"Source","Target",['Weight','Year'],create_using=nx.Graph())
  nodos = list(G.nodes)

  # Collect every unordered pair of distinct nodes that has no edge.
  # G.has_edge replaces the dense nx.to_numpy_matrix lookup (removed in
  # NetworkX 3.0); pairs are produced in the same (i < j) order as before so
  # the seeded splits below select the same rows.
  all_unconnected_pairs = []
  for i in range(len(nodos)):
    for j in range(i + 1, len(nodos)):
      if not G.has_edge(nodos[i], nodos[j]):
        all_unconnected_pairs.append((nodos[i],nodos[j]))

  node_1_unlinked = [pair[0] for pair in all_unconnected_pairs]
  node_2_unlinked = [pair[1] for pair in all_unconnected_pairs]

  # DataFrame holding the unconnected pairs (negative examples).
  data = pd.DataFrame({'Source':node_1_unlinked,'Target':node_2_unlinked})
  data['Weight'] = 0
  data['link'] = 0

  indices = range(len(data['Source']))
  # Split the negatives into train and test. When there are more than five
  # negatives per known edge, keep only a 15% sample first so the training
  # data is not overwhelmingly unbalanced.
  if len(indices) > len(coau_df['Source'])*5:
    _, indices = train_test_split(indices,test_size=0.15,random_state=73)
  removeIn, noRemoveIn = train_test_split(indices,test_size=0.25,random_state=32)

  test = data.loc[noRemoveIn]
  data = data.loc[removeIn]

  # Now the connected pairs (positive examples).
  # Find the links that can be removed: dated `year` or later, and whose
  # removal neither drops a node nor splits the graph into several components.
  year = 2018
  initial_node_count = len(G.nodes)

  coau_df_temp = coau_df.copy()

  # Hoisted: DataFrame.values builds a new array on every access.
  edge_values = coau_df.values

  omissible_links_index = []
  for i in tqdm(coau_df.index.values):
    # Columns 0 and 1 are read as the edge endpoints, as in the original cell.
    if G.adj[edge_values[i,0]][edge_values[i,1]]['Year']>=year:
      # Rebuild the graph without this single row (and without the rows
      # already accepted as omissible).
      G_temp = nx.from_pandas_edgelist(coau_df_temp.drop(index=i), "Source", "Target",["Weight"], create_using=nx.Graph())

      # Accept the removal only if the graph stays in one piece with all nodes.
      if (nx.number_connected_components(G_temp) == 1) and (len(G_temp.nodes) == initial_node_count):
        omissible_links_index.append(i)
        coau_df_temp = coau_df_temp.drop(i)

  # Label every known edge as a positive example.
  coau_df_temp2 = coau_df.copy()
  coau_df_temp2["link"]=1

  # Removable edges become test positives; the rest stay as train positives.
  testlinks = coau_df_temp2.loc[omissible_links_index]
  coau_df_temp = coau_df_temp2.drop(index=omissible_links_index)

  # pd.concat replaces DataFrame.append (removed in pandas 2.0); columns are
  # aligned by name, so the order stays Source, Target, Weight, link.
  data = pd.concat([data, coau_df_temp[['Source', 'Target', 'link','Weight']]], ignore_index=True)
  data["Weight"] = data["Weight"].astype('int64')

  test = pd.concat([test, testlinks[['Source', 'Target', 'link','Weight']]], ignore_index=True)

  # Graph without the held-out links; all features are computed on it.
  G_data = nx.from_pandas_edgelist(coau_df_temp, "Source", "Target",["Weight"], create_using=nx.Graph())

  # Feature extraction.
  Xtrain, Ytrain = _link_features(G_data, nodos_df, data)
  Xtest, Ytest = _link_features(G_data, nodos_df, test)

  # Train and score every non-empty combination of features. `l` is a boolean
  # column mask that starts with all features enabled and is stepped by
  # contador() after each round.
  lista = ['CommonNeighbors',"JaccardCoeff","AdamicAdar","CommonKeywords"]
  lista = np.array(lista)
  l = [True]*len(lista)
  predictionsDic[univ] ={}
  for _ in range(2**len(lista)-1):
    t = "-".join(lista[l])

    lr = LogisticRegression(class_weight="balanced")
    lr.fit(Xtrain[:,l], Ytrain)
    predictions = lr.predict(Xtest[:,l])
    predictionsDic[univ][t] = {
        'regresionLogistica':_classification_metrics(Ytest, predictions)
    }

    clf = SVC(gamma='scale')
    clf.fit(Xtrain[:,l], Ytrain)
    predictions = clf.predict(Xtest[:,l])
    predictionsDic[univ][t]['SVC'] = _classification_metrics(Ytest, predictions)

    # The wrapped estimator is passed positionally: its keyword was renamed
    # from `base_estimator` to `estimator` in scikit-learn 1.2, and the
    # positional form works on both sides of that change.
    bagging = BaggingClassifier(LogisticRegression(class_weight="balanced"),n_estimators=10, random_state=0).fit(Xtrain[:,l], Ytrain)
    predictions = bagging.predict(Xtest[:,l])
    predictionsDic[univ][t]['Bagging'] = _classification_metrics(Ytest, predictions)

    l = contador(l)



In [None]:
# Display the collected results: university -> feature combination -> model -> metrics.
predictionsDic