# Conception data for Experiment 2 and 3

In [None]:
# importation of the packages

import numpy as np
from graph_tool.all import *
import random
import graph_tool.topology as gt
import graph_tool.clustering as gc
import graph_tool.centrality as gcent
import graph_tool.generation as gg
import graph_tool.stats as gs

import matplotlib.pyplot as plt
import pickle
import matplotlib.colors as colors
import matplotlib
import copy
from matplotlib.lines import Line2D 
import matplotlib.pyplot as plt

from scipy.stats.stats import pearsonr
import pandas as pd

%matplotlib inline
import seaborn as sns
from scipy.stats import gaussian_kde
from collections import Counter
import math
import scipy.stats
import collections

import matplotlib.ticker as mtick
from matplotlib.ticker import FormatStrFormatter
from scipy.stats import norm 
from sklearn.neighbors import KernelDensity 
from sklearn.utils.fixes import parse_version 

from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error, r2_score

from collections import Counter
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.svm import LinearSVC

from multiprocessing import Pool
from  matplotlib.colors import LinearSegmentedColormap

from scipy.stats.stats import pearsonr 
from collections import defaultdict
import itertools

import networkx as nx
import os
import random

## A - Useful functions

### 1-Function which creates a network

In [None]:
def make_random_network(network,N,mean_degree):
    
    """
    Function which builds a graph of the requested family
    
    Parameters:
    - network: type of network between 'BA', 'SBM', 'ER', 'WS', 'kregular', 'bimodal_distribution', 'lognormal', 'cm_<suffix>' and 'twitter'
      ('cm_<suffix>' is a configuration model built from a pickled degree sequence whose file name contains <suffix>;
      'twitter' is not a synthetic network but is loaded from a pickle file)
    - N: number of nodes (not used by 'cm_<suffix>' and 'twitter', whose size comes from the loaded data)
    - mean_degree: average degree (not used by 'cm_<suffix>' and 'twitter')
    
    Return:
    - g: the network
    
    Raises:
    - ValueError: if `network` is not one of the names listed above
    
    """
    
    if network == 'BA':
        # number of edges attached to each new node so that E ~ mean_degree*N/2
        E = mean_degree*N/2
        m = E/(N-1)
        m = round(m)
        g = gg.price_network(N, m=m, c=None, gamma=1.5, directed=False, seed_graph=None)

    elif network == 'SBM':
        nber_group = 20
        # NOTE(review): int(N/nber_group) truncates, so the graph has fewer than N
        # nodes when N is not a multiple of nber_group -- confirm this is intended
        b = [[k]*int(N/nber_group) for k in range(nber_group)]
        b = list(itertools.chain.from_iterable(b))
        p_between_groups = 0.001
        E = mean_degree*N/2
        nber_intergroup = nber_group*(nber_group-1)/2
        nber_edges_between_groups = p_between_groups*E*nber_intergroup
        nber_edges_in_groups = E - nber_edges_between_groups
        nber_edges_one_group = nber_edges_in_groups / nber_group
        # off-diagonal entries: p_between_groups*E ; diagonal entries: 2*nber_edges_one_group
        ers = np.ones((nber_group, nber_group))*p_between_groups*E + np.eye(nber_group)*(nber_edges_one_group*2 - p_between_groups*E)
        g = gg.generate_sbm(b, ers)
        gs.remove_parallel_edges(g)
        gs.remove_self_loops(g)

    elif network == 'ER':
        E = mean_degree*N/2
        p = E*2/N/(N-1)
        # degrees are sampled from a binomial distribution
        g = random_graph(N, lambda: np.random.binomial(N, p, size=None), directed=False)
        
    elif network == 'WS':
        # p proportion of edges to rewire
        p = 0.05
        # NOTE(review): the ring is always built with k=2, independently of
        # mean_degree; mean_degree only sets the number of rewired edges below.
        # Presumably only mean_degree = 4 is consistent here -- TODO confirm
        g = gg.circular_graph(N, k=2)
        E = mean_degree*N/2
        list_edges = list(g.edges())
        list_nodes = list(g.vertices())

        # list of edges to rewire
        deleted_edges = random.sample(list_edges,int(p*E))
        
        # rewiring: each selected edge is replaced by an edge between two distinct random nodes
        for e in deleted_edges:
            g.remove_edge(e)
            new_edge_nodes = random.sample(list_nodes,2)
            g.add_edge(new_edge_nodes[0], new_edge_nodes[1]) 
        
    elif network == 'kregular':
        g = random_graph(N, lambda: mean_degree, directed=False)
        
    elif network == 'bimodal_distribution' :
        mu = mean_degree
        delta = 0.5
        sigma = 0.5
        # means of the two modes
        w1 = mu + sigma*math.sqrt((1-delta)/delta)
        w2 = mu - sigma*math.sqrt(delta/(1-delta))
        
        E1 = w1*N/2
        p1 = E1*2/N/(N-1)

        E2 = w2*N/2
        p2 = E2*2/N/(N-1)
        
        def deg_sample():
            # each node picks one of the two binomial modes with probability 1/2

            if random.uniform(0, 1) > 0.5:
                return np.random.binomial(N, p1, size=None)
            else:
                return np.random.binomial(N, p2, size=None)
        
        g = random_graph(N, deg_sample, directed = False)
        
    elif (network == 'lognormal'):

        # NOTE(review): np.random.lognormal returns a float and `mean` is the mean of
        # the underlying normal distribution, not of the sampled degrees -- confirm
        # that this sampler gives the intended degree distribution
        g = random_graph(N, lambda: np.random.lognormal(mean=mean_degree, sigma=1.0, size=None), directed=False)
            
    elif (network[:2] == 'cm'): # configuration model
        
        # the name must be of the form 'cm_<suffix>': <suffix> selects the pickled degree sequence
        with open('/mnt/sdb1/elsa/ML_6/twitter_egonet/degreedistribution_'+network.split('_')[1]+'_oneweek.pickle', 'rb') as handle:
            degree_distribution_oneweek = pickle.load(handle)
            
        # the sum of the degrees must be even to build a configuration model
        if ((np.sum(degree_distribution_oneweek) % 2) != 0):
            degree_distribution_oneweek[-1] = degree_distribution_oneweek[-1] + 1
            
        G_oneweek = nx.configuration_model([int(k) for k in degree_distribution_oneweek])
        G_oneweek.remove_edges_from(nx.selfloop_edges(G_oneweek))
        # the set removes the repeated (parallel) edges
        list_edges = list(set(list(G_oneweek.edges())))
        g = Graph(directed=False)
        for edge in list_edges:
            node1=edge[0]
            node2=edge[1]
            g.add_edge(node1, node2)
            
    elif (network == 'twitter'):
        
        with open('g_twitter_mention.pickle', 'rb') as handle:
            g = pickle.load(handle)
            
    else:
        # previously a message was printed and `return(g)` failed with an
        # UnboundLocalError; fail explicitly instead
        raise ValueError(f'It is not a name of network: {network!r}')
        
    return(g)

### 2-Function to compute the propagation

In [None]:
# Class which takes as an input the network g and runs a CP propagation on it.
# After the run, g has as node properties the different characteristics of each node's contamination

class Mixed_Propagation:
    
    """
    This class makes a synthetic propagation in which every node follows one of two contagion rules

    At construction every node draws at random a type of contagion (0 or 1), a value of beta in L_beta
    and a threshold phi in L_phi, and the seed is infected at time 0.
    `run` then iterates over discrete time steps until more than 90% of the nodes are infected.
    At each time step, a susceptible node:
    - is infected spontaneously with probability `rate`;
    - otherwise, if its type of contagion is 0, makes one trial of probability beta per neighbor
      infected at the previous step and gets infected at the first success;
    - otherwise (type 1), gets infected if the proportion of its neighbors infected at the
      previous step is >= its threshold phi.

    Parameters:
    - g: network; the node properties created here are registered in g.vp
    - seed: the node (a single vertex of g) which is the seed
    - L_beta: list of the values of beta used in this experiment
    - L_phi: list of the values of phi (threshold) used in this experiment
    - rate: probability of getting infected through spontaneous adoption

    Main attributes after `run`:
    - T: array (number of nodes, number of time steps + 1), column t is the state of the nodes at time t
    - is_seed: 1 for the seed, 2 for a spontaneous adoption, 0 for an infection by the neighbors,
      nan for the nodes which were never infected
    - rank: order in which the nodes got infected (0 for the seed)

    """
    
    def __init__(self,g,seed,L_beta,L_phi,rate = 0.005):

        self.g = g
        self.rate = rate

        # definiton of the quantities
        self.N = g.num_vertices() # number of nodes

        # initialization of the trajectory: one row per node, one column per time step
        self.T = np.zeros((self.N,1))

        # node property: threshold, drawn uniformly in L_phi
        self.threshold = self._new_node_property("threshold", "double", np.random.choice(L_phi, size = self.N, replace = True))

        # node property: beta, drawn uniformly in L_beta
        self.beta = self._new_node_property("beta", "double", np.random.choice(L_beta, size = self.N, replace = True))

        # node property: type contagion (0: rule based on beta, 1: rule based on the threshold)
        self.type_contagion = self._new_node_property("type_contagion", "double", np.random.choice([0,1], size = self.N, replace = True))

        # node property: state (0: susceptible, 1: infected); all the states are now null
        self.state = self._new_node_property("state", "int")
        self.previous_state = [0 for i in range(self.N)]

        # node property: time of infection; all the times of infection are now nan
        self.time_infection = self._new_node_property("time_infection", "double", self._nan_values())

        # role of the node: 0 for seed, 1 for vulnerable, 2 for the others
        # a node is vulnerable if one infected neighbor is enough to reach its threshold
        # (nodes without neighbor are considered as vulnerable)
        self.role = self._new_node_property("role", "int")
        self.condition_vulnerable = [1/v.out_degree()>=self.threshold[v] if v.out_degree()!=0 else True for v in self.g.vertices()]
        self.role.a = np.where(self.condition_vulnerable,1,2)

        # node property: nber stimuli; all the numbers of stimuli are now nan
        self.nber_stimuli = self._new_node_property("nber_stimuli", "double", self._nan_values())

        # node property: nber stimuli by neighbors; all the values are now nan
        self.nber_stimuli_by_neighbor = self._new_node_property("nber_stimuli_by_neighbor", "double", self._nan_values())

        # node property: nber infected neighbors; all the values are now nan
        self.nber_infected_neighbors = self._new_node_property("nber_infected_neighbors", "double", self._nan_values())

        # node property: prop infected neighbors; all the values are now nan
        self.prop_infected_neighbors = self._new_node_property("prop_infected_neighbors", "double", self._nan_values())
            
        # node property: is_seed; all the values are now nan
        self.is_seed = self._new_node_property("is_seed", "double", self._nan_values())

        # characteristics infection neighbors: one list per infected neighbor (see infect_node)
        self.charac_infected_neighbors = self._new_node_property("charac_infected_neighbors", "object")

        # node property: waiting time: list of the difference between the time of contamination of a node and
        # the contamination of its neighbors
        self.waiting_time = self._new_node_property("waiting_time", "object")

        # node property: rank of infection; all the ranks are now nan
        self.rank = self._new_node_property("rank", "double", self._nan_values())
        self.r = 0 # rank which will be given to the next infected node

        # contamination of the seeds
        self.state[seed] = 1
        self.time_infection[seed] = 0
        self.role[seed] = 0
        self.nber_stimuli[seed] = 0
        self.nber_stimuli_by_neighbor[seed] = 0
        self.nber_infected_neighbors[seed] = 0
        self.prop_infected_neighbors[seed] = 0
        self.waiting_time[seed] = [0]
        self.charac_infected_neighbors[seed] = []
        self.is_seed[seed] = 1
        self.T[int(seed),0] = 1
        self.rank[seed] = self.r
        self.r+=1

    def _nan_values(self):
        # array of nan, one value per node, used to initialize the node properties
        return np.full(self.N, np.nan)

    def _new_node_property(self, name, value_type, values=None):
        # create a node property of type `value_type`, register it in g.vp under `name`
        # and, if `values` is given, use it as initial values
        prop = self.g.new_vp(value_type)
        self.g.vp[name] = prop
        if values is not None:
            prop.a = values
        return prop
        
    def run(self):
        
        """
        Run the propagation until more than 90% of the nodes are infected.
        One column (state of the nodes at the end of the step) is added to self.T at each time step.
        """
        
        # initialisation of the time
        t = 1

        while (np.count_nonzero(self.state.a == 1) <= self.N*0.9) :

            # the infections of this step only depend on the states at the end of the previous step
            self.previous_state = self.state.a.copy()
            susceptible_nodes = [v for v in self.g.vertices() if self.state[v]==0]

            for v in susceptible_nodes:

                # spontaneous adoption
                if (random.random() <= self.rate):
                    self.infect_node(v,t,2)
                    continue

                neighbors = list(v.out_neighbors())
                nber_infected = len([n for n in neighbors if self.previous_state[self.g.vertex_index[n]]==1])

                if (self.type_contagion[v] == 0):
                    beta_v = self.beta[v]
                    # one trial per infected neighbor; we stop at the first success
                    for _ in range(nber_infected):
                        if random.random() <= beta_v:
                            # with a probability beta, we infect the susceptible node
                            self.infect_node(v,t,0)
                            break

                else:
                    if len(neighbors)!=0:
                        prop_infected_neighbors_v = nber_infected/len(neighbors)
                    else:
                        prop_infected_neighbors_v = 0
                    if (prop_infected_neighbors_v>=self.threshold[v]): # if the threshold is reached, we infect it
                        self.infect_node(v,t,0)

            # the states at the end of the step are added to the trajectory
            state_t = np.asarray(self.state.a).reshape((self.N,1))
            self.T = np.concatenate((self.T, state_t), axis=1)
            t+=1

    def infect_node(self,v,t,spontaneous):
        
        """
        Infect the node v at time t and record the characteristics of its contamination.

        Parameters:
        - v: node to infect
        - t: time of the infection
        - spontaneous: value stored in is_seed (run uses 2 for a spontaneous adoption and 0 otherwise)
        """
        
        self.state[v] = 1
        self.time_infection[v] = t

        # neighbors which were already infected at the end of the previous step
        neighbors = list(v.out_neighbors())
        infected_neighbors = [m for m in neighbors if self.previous_state[self.g.vertex_index[m]]==1]
        # time elapsed since the infection of each infected neighbor
        waiting_time = [t-self.time_infection[m] for m in infected_neighbors]

        self.nber_stimuli[v] = np.sum(waiting_time)
        self.nber_infected_neighbors[v] = len(infected_neighbors)
        
        if (v.out_degree() != 0) :
            self.nber_stimuli_by_neighbor[v] = self.nber_stimuli[v] / v.out_degree() 
            self.prop_infected_neighbors[v] = self.nber_infected_neighbors[v]/v.out_degree()
        else: 
            self.nber_stimuli_by_neighbor[v] = 0
            self.prop_infected_neighbors[v] = 0
                    
        self.waiting_time[v] = waiting_time

        # for each infected neighbor m: [time of infection of m - t, t - time of infection of m, -1,
        # number of neighbors shared by v and m], sorted by the first value
        set_neighbors = set(neighbors)
        charac = [[self.time_infection[m]-t,t-self.time_infection[m],-1,len(set_neighbors & set(m.out_neighbors()))] for m in infected_neighbors]
        self.charac_infected_neighbors[v] = sorted(charac, key=lambda x: x[0])

        self.is_seed[v] = spontaneous
        self.rank[v] = self.r
        self.r+=1
        
def from_adjmatrix_to_adjlist(A):
    
    """
    Function which takes as an input an adjacency matrix of a network and returns an adjacency list
    
    Parameters:
    - A: adjacency matrix, either a sparse matrix (any object with a `toarray` method) or a dense 2D array
    
    Return:
    - adjList: defaultdict(list) such that adjList[i] is the list, in increasing order, of the
      indices j with A[i][j] == 1 (a row without such entry has no key until it is accessed)
    
    """
    # the sparse matrices are converted to a dense array, the dense inputs are used as they are
    dense = A.toarray() if hasattr(A, "toarray") else np.asarray(A)

    adjList = defaultdict(list)
    # NOTE(review): only the entries exactly equal to 1 are kept, as before; the entries
    # with another non-zero value are ignored -- confirm this is intended for multigraphs
    rows, cols = np.nonzero(dense == 1)
    # np.nonzero goes through the matrix row by row, so the neighbors of a node stay sorted
    for i, j in zip(rows.tolist(), cols.tolist()):
        adjList[i].append(j)
    return(adjList)

### 3-Function to make the data base

In [None]:
def conception_data_base(network, mean_degree, N_network, N, L_beta, L_phi, rate, N_propagation=1):
    
    """
    Function which builds the raw data base: networks are generated, a mixed propagation is
    simulated on each of them and one row per node and per propagation is stored

    Parameters:
    - network : type of network between 'BA', 'SBM', 'ER', 'WS', 'kregular', 'bimodal_distribution', 'lognormal', 'cm' and 'twitter'
    - mean_degree: mean degree of the network
    - N_network: number of different network in the sample
    - N: number of nodes
    - L_beta: list of the values of beta used in this experiment
    - L_phi: list of the values of phi used in this experiment
    - rate: probability of getting infected through spontaneous adoption
    - N_propagation: number of propagations simulated on each network (1 by default)

    Return:
    - df_set: DataFrame with one row per node and per propagation; the column 'trajectory'
      contains the whole trajectory matrix of the propagation (the same object for all the
      nodes of a propagation)

    """
   
    name_columns = ['degree',
                    'clustering_coefficient',
                    'nber_infected_neighbors',
                    'prop_infected_neighbors',
                    'charac_infected_neighbors',
                    'is_seed',
                    'beta',
                    'phi',
                    'contagion',
                    'trajectory',
                    'adjacency_list',
                    'index_node',
                    'rank']
    
    # one DataFrame per propagation, concatenated only once at the end
    list_df = []

    for it in range(N_network): # for a certain number of networks

        g = make_random_network(network,N,mean_degree)
        
        A = graph_tool.spectral.adjacency(g)
        adj_list = from_adjmatrix_to_adjlist(A)

        # the real number of nodes can differ from N (e.g. for the networks loaded from a file)
        nber_nodes = g.num_vertices()

        # features on the structure of the ego-network (they do not depend on the propagation)
        degree = np.array(g.get_out_degrees(g.get_vertices()))
        clustering_coefficient = np.array(list(gc.local_clustering(g, weight=None, prop=None, undirected=True)))

        # node index
        index_node = list(g.vertex_index)

        # node adjacency list
        adj_list_node = [adj_list[i] for i in index_node]
            
        for i_propagation in range(N_propagation):

            # the propagation is made on a copy, so that its node properties are not stored in g
            g_work = g.copy()
            seed = random.choice(list(g_work.vertices())) # pick up a seed
            
            # rate is given explicitly (the default value of the class was silently used before)
            M=Mixed_Propagation(g_work,seed,L_beta,L_phi,rate=rate)
            M.run()

            # features on the propagation in the ego-network
            nber_infected_neighbors = M.nber_infected_neighbors.a
            prop_infected_neighbors = M.prop_infected_neighbors.a

            # features on every alters    
            # list infected neighbors [[list of infected neighbors of node 0],[list of infected neighbors of node 1], ...]
            charac_infected_neighbors = list(M.g.vp.charac_infected_neighbors)
                
            # is_seed
            is_seed = M.is_seed.a
            
            # phi
            phi_nodes = M.threshold.a
            
            # beta
            beta_nodes = M.beta.a
            
            # type contagion
            contagion = M.type_contagion.a
                
            # rank of infection
            rank = M.rank.a
            
            # the rows are given directly to pandas: np.array on these rows (lists and arrays of
            # different shapes) raises a ValueError with the recent versions of NumPy
            rows_one_propagation = [[degree[k], clustering_coefficient[k], nber_infected_neighbors[k], prop_infected_neighbors[k], charac_infected_neighbors[k], is_seed[k], beta_nodes[k], phi_nodes[k], contagion[k], M.T, adj_list_node[k], index_node[k], rank[k]] for k in range(nber_nodes)]

            list_df.append(pd.DataFrame(rows_one_propagation, columns=name_columns))

    if len(list_df) == 0:
        # no network or no propagation asked: empty data base with the expected columns
        return(pd.DataFrame(columns = name_columns))

    df_set = pd.concat(list_df, ignore_index=True)
                
    return(df_set)

### 4-Function to process the data base

In [None]:
def preprocessing_df(df_set):
    
    """
    Function which pre-processes the data
    
    Input: the raw data (one row per node, as built by conception_data_base); it is not modified
    
    Return: the processed data; the nodes without value of 'nber_infected_neighbors'
    (nan) are removed and the features on the received stimuli are added
    """

    df = df_set.copy()
    
    df.dropna(subset = ["nber_infected_neighbors"], inplace=True)
    # the old index is not needed anymore, so it is dropped instead of being kept as a column
    df.reset_index(drop = True, inplace = True)
    
    # in each element k of 'charac_infected_neighbors', k[1] is the time elapsed since the
    # infection of the neighbor and k[0] is its opposite

    # creation of the column 'total number of stimuli received'
    df['sum_stimuli'] = df['charac_infected_neighbors'].apply(lambda x:sum([k[1] for k in x]))
    
    # creation of the column 'std on the number of received stimuli'
    df['std_stimuli'] = df['charac_infected_neighbors'].apply(lambda x:np.std([k[1] for k in x]) if len(x)!=0 else 0)
    # np.std of an empty list is NaN
    
    # creation of the column 'number of stimuli by degree'
    df['nber_stimuli_by_neighbors'] = [i / j if j!=0 else 0 for i, j in zip(list(df['sum_stimuli']), list(df['degree']))]
    
    # creation of the column 'time since last infected neighbor get infected'
    df['time_last_infected_neighbor'] = df['charac_infected_neighbors'].apply(lambda x:min([-k[0] for k in x]) if len(x)!=0 else 0)
    
    # creation of the column 'time since first infected neighbor get infected'
    df['time_first_infected_neighbor'] = df['charac_infected_neighbors'].apply(lambda x:max([-k[0] for k in x]) if len(x)!=0 else 0)
    
    # trajectory of the node: its row in the trajectory matrix
    df['trajectory_node'] = [traj[index,:] for traj, index in zip(list(df['trajectory']), list(df['index_node']))]
    
    # trajectories of the neighbors of the node
    df['trajectory_neighbors'] = [[traj[index,:] for index in list_index] for traj, list_index in zip(list(df['trajectory']), list(df['adjacency_list']))]
    
    # parameter of the contagion followed by the node: beta if the contagion is 0, phi otherwise
    # (the columns are read directly instead of building one row with df.iloc for every node)
    df['parameter'] = [beta if contagion == 0 else phi for beta, phi, contagion in zip(list(df['beta']), list(df['phi']), list(df['contagion']))]
    
    # selection of the features we are interested in
    df_final = df[["degree","clustering_coefficient","nber_infected_neighbors","prop_infected_neighbors","sum_stimuli","std_stimuli","nber_stimuli_by_neighbors","time_last_infected_neighbor","time_first_infected_neighbor",'is_seed','parameter','contagion',"trajectory_node","trajectory_neighbors", "index_node","rank"]]
    
    return(df_final)


## B - Test of the codes

In [None]:
# quick test of the whole pipeline on a few very small ER networks
network = 'ER'
N = 10 # number of nodes
N_network = 20 # 20 
mean_degree = 4
rate = 0.005 # probability of spontaneous adoption
L_beta = [0.1, 0.3, 0.5, 0.7, 0.9]
L_phi = [0.1, 0.3, 0.5, 0.7, 0.9]

# raw data base, then pre-processed data base
df_set = conception_data_base(network=network,
                              mean_degree=mean_degree,
                              N_network=N_network,
                              N=N,
                              L_beta=L_beta,
                              L_phi=L_phi,
                              rate=rate)
df = preprocessing_df(df_set)

## C - Make the whole data set

In [None]:
# parameters of the whole data set
rate = 0.005
mean_degree = 4
N_network = 20 # 20 
N = 1000
L_beta = [0.1, 0.3, 0.5, 0.7, 0.9]
L_phi = [0.1, 0.3, 0.5, 0.7, 0.9]
network = 'ER'

# name used in the output file; it was used below without being defined (NameError)
# NOTE(review): the type of network is assumed to be the intended name -- confirm
name_dataset = network

df_set = conception_data_base(network, mean_degree, N_network, N, L_beta, L_phi, rate)
df = preprocessing_df(df_set)
    
# the output directory is created if it does not exist yet, otherwise open() fails
os.makedirs('df_experiment2_3', exist_ok=True)

with open('df_experiment2_3/df_'+name_dataset+'_premix_different_parameter.pickle', 'wb') as handle:
    pickle.dump(df, handle)

In [None]:
# end of the code :)