FEATURE ENGINEERING

In [4]:
import numpy as np
import csv
import pandas as pd
#pd.set_option('display.max_columns', None)
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
import networkx.algorithms.community as nx_community
import community

In [3]:
# Directory holding the raw inputs, relative to the notebook's working
# directory. This is the same location the final feature tables are written
# to, so the notebook no longer depends on one user's absolute home path.
DATA_DIR = "../data"

# node_information.csv has no header row, so pandas labels its columns 0..N.
node_info = pd.read_csv(f"{DATA_DIR}/node_information.csv", header=None)
# Space-separated node pairs; train.txt carries an additional 'label' column.
test_set = pd.read_csv(f"{DATA_DIR}/test.txt", sep=" ", header=None, names=['source', 'target'])
train_set = pd.read_csv(f"{DATA_DIR}/train.txt", sep=" ", header=None, names=['source', 'target', 'label'])

# Graph creation
# NOTE(review): every row of train_set becomes an edge, whatever its label;
# if label 0 marks a non-link these rows should presumably be filtered out - confirm.
G = nx.from_pandas_edgelist(train_set, 'source', 'target', 'label')

In [5]:
def salton_similarity(G, edges):
    """Yield the Salton index of every node pair in ``edges``.

    The score of a pair (u, v) is
    ``|N(u) & N(v)| / sqrt(deg(u) * deg(v))``, where the common neighbours
    come from ``nx.common_neighbors``.

    Parameters
    ----------
    G : graph
        Graph accepted by ``nx.common_neighbors`` and exposing ``degree(node)``.
    edges : iterable of (u, v)
        Node pairs to score.

    Yields
    ------
    tuple
        ``(u, v, score)``. The score is 0.0 when either endpoint has
        degree 0: the ratio is undefined there and the pair cannot share a
        neighbour, so 0.0 is returned instead of raising ZeroDivisionError.
    """
    for u, v in edges:
        shared = sum(1 for _ in nx.common_neighbors(G, u, v))
        norm = (G.degree(u) * G.degree(v)) ** 0.5
        yield u, v, (shared / norm if norm else 0.0)

def sorenson_similarity(G, edges):
    """Yield the Sorensen index of every node pair in ``edges``.

    The score of a pair (u, v) is
    ``2 * |N(u) & N(v)| / (deg(u) + deg(v))``, where the common neighbours
    come from ``nx.common_neighbors``.

    Parameters
    ----------
    G : graph
        Graph accepted by ``nx.common_neighbors`` and exposing ``degree(node)``.
    edges : iterable of (u, v)
        Node pairs to score.

    Yields
    ------
    tuple
        ``(u, v, score)``. The score is 0.0 when both endpoints have
        degree 0 (undefined ratio) instead of raising ZeroDivisionError.
    """
    for u, v in edges:
        shared = sum(1 for _ in nx.common_neighbors(G, u, v))
        degree_sum = G.degree(u) + G.degree(v)
        yield u, v, (2 * shared / degree_sum if degree_sum else 0.0)


def hub_promoted_similarity(G, edges):
    """Yield the hub-promoted index of every node pair in ``edges``.

    The score of a pair (u, v) is
    ``|N(u) & N(v)| / min(deg(u), deg(v))``, where the common neighbours
    come from ``nx.common_neighbors``.

    Parameters
    ----------
    G : graph
        Graph accepted by ``nx.common_neighbors`` and exposing ``degree(node)``.
    edges : iterable of (u, v)
        Node pairs to score.

    Yields
    ------
    tuple
        ``(u, v, score)``. The score is 0.0 when either endpoint has
        degree 0 (undefined ratio) instead of raising ZeroDivisionError.
    """
    for u, v in edges:
        shared = sum(1 for _ in nx.common_neighbors(G, u, v))
        smaller_degree = min(G.degree(u), G.degree(v))
        yield u, v, (shared / smaller_degree if smaller_degree else 0.0)

def hub_depressed_similarity(G, edges):
    """Yield the hub-depressed index of every node pair in ``edges``.

    The score of a pair (u, v) is
    ``|N(u) & N(v)| / max(deg(u), deg(v))``, where the common neighbours
    come from ``nx.common_neighbors``.

    Parameters
    ----------
    G : graph
        Graph accepted by ``nx.common_neighbors`` and exposing ``degree(node)``.
    edges : iterable of (u, v)
        Node pairs to score.

    Yields
    ------
    tuple
        ``(u, v, score)``. The score is 0.0 when both endpoints have
        degree 0 (undefined ratio) instead of raising ZeroDivisionError.
    """
    for u, v in edges:
        shared = sum(1 for _ in nx.common_neighbors(G, u, v))
        larger_degree = max(G.degree(u), G.degree(v))
        yield u, v, (shared / larger_degree if larger_degree else 0.0)

def adamic_adar_index(G, edges):
    """Yield ``(u, v, score)`` for every pair in ``edges``.

    The score adds ``1 / log(deg(w))`` over each common neighbour ``w`` of
    ``u`` and ``v`` (as returned by ``nx.common_neighbors``). Neighbours
    whose degree is not greater than 1 are left out, which keeps
    ``log(1) == 0`` out of the denominator. A pair with no contributing
    neighbour scores 0.
    """
    for u, v in edges:
        neighbour_degrees = (G.degree(w) for w in nx.common_neighbors(G, u, v))
        yield u, v, sum(1 / np.log(deg) for deg in neighbour_degrees if deg > 1)

# Helper: left-join the node attribute table onto the pair table for one endpoint.
def _merge_node_info(df, node_info, key, suffix):
    """Join ``node_info`` onto ``df`` by ``df[key] == node_info['node_id']``.

    Every attribute column of ``node_info`` (all columns but the first) is
    renamed to ``str(column) + suffix`` before the merge; the join key
    ``node_id`` is dropped from the result.
    """
    attributes = node_info.rename(
        columns={col: str(col) + suffix for col in node_info.columns[1:]}
    )
    merged = df.merge(attributes, left_on=key, right_on='node_id', how='left')
    return merged.drop(columns='node_id')


# Build graph-based features (degree, centrality, communities, similarity
# indices, ...) for every (source, target) pair and merge node_info onto them.
def add_node_attributes(df, node_info):
    """Return a copy of ``df`` enriched with graph and node features.

    A graph is built from the rows of ``df`` (columns ``source`` and
    ``target``; ``label`` is stored as an edge attribute when present), and
    the following columns are added, in this order:

    * ``degree_*`` / ``centrality_*`` (degree centrality) for both endpoints,
    * ``community_*``: index of the greedy-modularity community of each endpoint,
    * ``jaccard``, ``salton``, ``sorenson``, ``hub_promoted``, ``hub_depressed``,
      ``leicht_holme_newman``, ``adamic_adar``, ``resource_allocation``,
      ``common_neighbors``,
    * ``shortest_path_length`` (-1 when the target is unreachable),
    * ``eigenvector_centrality_*`` for both endpoints,
    * every attribute column of ``node_info`` twice, suffixed ``_source`` and
      ``_target``.

    Parameters
    ----------
    df : pandas.DataFrame
        Pair table with ``source`` and ``target`` columns, optionally ``label``.
    node_info : pandas.DataFrame
        Node attribute table whose first column is the node identifier.

    Returns
    -------
    pandas.DataFrame
        New frame with a fresh index produced by the merges. Neither ``df``
        nor ``node_info`` is modified.

    Notes
    -----
    Changes relative to the previous implementation:

    * ``shortest_path_length`` used to look up ``(source, target)`` tuples in
      a dict keyed by source node, so it was always -1; it now holds the
      real path length.
    * ``leicht_holme_newman`` used to hold ``nx.preferential_attachment``
      (the degree product alone); it now holds the common-neighbour count
      divided by the degree product.
    * Degree centrality is computed once instead of once per row, and the
      duplicated ``resource_allocation`` / ``common_neighbors`` passes are gone.

    NOTE(review): the graph contains every row of ``df`` as an edge,
    including rows whose ``label`` may denote a non-link, and for a frame
    without labels it contains exactly the pairs being scored. Confirm this
    is intended; otherwise the graph should presumably be built from known
    positive links only.
    """
    # Work on copies so the caller's frames are left untouched.
    df = df.copy()
    node_info = node_info.rename(columns={node_info.columns[0]: 'node_id'})

    # Build the graph from df; keep 'label' as an edge attribute when available.
    edge_attr = 'label' if 'label' in df.columns else None
    G = nx.from_pandas_edgelist(df, 'source', 'target', edge_attr)

    pairs = df[['source', 'target']].values.tolist()

    # Node-level measures are computed once for the whole graph, then mapped.
    degree = dict(G.degree())
    centrality = nx.degree_centrality(G)
    df['degree_source'] = df['source'].map(degree)
    df['centrality_source'] = df['source'].map(centrality)
    df['degree_target'] = df['target'].map(degree)
    df['centrality_target'] = df['target'].map(centrality)

    # Detect communities and expose the community index of each endpoint.
    communities = nx_community.greedy_modularity_communities(G)
    community_of = {
        node: index
        for index, members in enumerate(communities)
        for node in members
    }
    df['community_source'] = df['source'].map(community_of)
    df['community_target'] = df['target'].map(community_of)

    # Number of common neighbours per pair (reused for Leicht-Holme-Newman).
    common_counts = [len(list(nx.common_neighbors(G, u, v))) for u, v in pairs]

    # Pairwise similarity indices.
    df['jaccard'] = [score for _, _, score in nx.jaccard_coefficient(G, pairs)]
    df['salton'] = [score for _, _, score in salton_similarity(G, pairs)]
    df['sorenson'] = [score for _, _, score in sorenson_similarity(G, pairs)]
    df['hub_promoted'] = [score for _, _, score in hub_promoted_similarity(G, pairs)]
    df['hub_depressed'] = [score for _, _, score in hub_depressed_similarity(G, pairs)]

    # Leicht-Holme-Newman: common neighbours over the product of the degrees.
    df['leicht_holme_newman'] = [
        count / (degree[u] * degree[v]) if degree[u] and degree[v] else 0.0
        for count, (u, v) in zip(common_counts, pairs)
    ]

    df['adamic_adar'] = [score for _, _, score in adamic_adar_index(G, pairs)]
    df['resource_allocation'] = [
        score for _, _, score in nx.resource_allocation_index(G, pairs)
    ]
    df['common_neighbors'] = common_counts

    # all_pairs_shortest_path_length yields (source, {target: length}), so the
    # lookup has to be nested; unreachable targets get -1.
    shortest_paths = dict(nx.all_pairs_shortest_path_length(G))
    df['shortest_path_length'] = [
        shortest_paths.get(u, {}).get(v, -1) for u, v in pairs
    ]

    # Eigenvector centrality of both endpoints (0 for nodes missing from the result).
    eigenvector_centrality = nx.eigenvector_centrality_numpy(G)
    df['eigenvector_centrality_source'] = df['source'].map(eigenvector_centrality).fillna(0)
    df['eigenvector_centrality_target'] = df['target'].map(eigenvector_centrality).fillna(0)

    # Attach the node attributes of the source, then of the target.
    df = _merge_node_info(df, node_info, 'source', '_source')
    df = _merge_node_info(df, node_info, 'target', '_target')
    return df

# Compute the engineered features for the training pairs, then the test pairs
train_set = add_node_attributes(df=train_set, node_info=node_info)
test_set = add_node_attributes(df=test_set, node_info=node_info)

# Preview the first rows of the enriched training table
train_set.head()


In [None]:
# Persist both feature tables as CSV files, without the pandas index column
train_set.to_csv(path_or_buf='../data/train_set_final.csv', index=False)
test_set.to_csv(path_or_buf='../data/test_set_final.csv', index=False)