In [None]:
import pandas as pd
import numpy as np
import networkx as nx
import seaborn as sns
from statistics import mean
import community
import matplotlib.cm as cm
import matplotlib.pyplot as plt

from collections import Counter
from collections import defaultdict

import random
import copy


from deap import base
from deap import creator
from deap import tools
from math import dist



In [None]:
# A node counts as marginalized when the absolute value of its score exceeds this.
threshold = 0.3
# Radius passed to nx.ego_graph when collecting candidate links for a marginalized node.
egoradius = 2

# AMARE - Attribute-aware MARginalization Estimator

In [None]:
def get_weight(node, node_to_com, com_to_nodes):
    """Return one minus the share of `node`'s own gender inside its community.

    Relies on the notebook-level graph `g` and on `attrs` (node -> gender label).

    Parameters
    ----------
    node : hashable
        Node whose weight is requested.
    node_to_com : mapping
        Maps each node to the id of its community.
    com_to_nodes : mapping
        Maps each community id to the collection of its member nodes.

    Returns
    -------
    float
        ``1 - females / len(com)`` when the node's label is '1',
        otherwise ``1 - males / len(com)``.
    """
    com = com_to_nodes[node_to_com[node]]
    net = nx.subgraph(g, com)
    count = Counter(nx.get_node_attributes(net, 'gender').values())
    # A Counter yields 0 for labels that never occur, so no membership test is needed.
    males = count['0']
    females = count['1']

    if attrs[node] == '1':
        return 1 - females / len(com)
    return 1 - males / len(com)

In [None]:
# Build the undirected graph: one edge per CSV row, header line skipped.
g = nx.Graph()
g.name = 'copenhagen'
with open('bt_symmetric.csv') as edge_file:
    edge_rows = edge_file.readlines()[1:]
    for row in edge_rows:
        tid, a, b, rssi = row.rstrip().split(',')
        g.add_edge(int(a), int(b), tid=tid)
print('loaded')

# Every node starts with gender None, so nodes absent from genders.csv keep a None label.
attrs = dict.fromkeys(g.nodes())
with open('genders.csv') as gender_file:
    gender_rows = gender_file.readlines()[1:]
    for row in gender_rows:
        node, gender = row.rstrip().split(',')
        attrs[int(node)] = gender
nx.set_node_attributes(g, attrs, name='gender')
print('attributes')

In [None]:
# Drop every node whose gender label is still None.
to_remove = [n for n, gender_label in attrs.items() if gender_label is None]
g.remove_nodes_from(to_remove)

In [None]:
# nx.info() was removed in NetworkX 3.0; print the summary from the graph's own API instead.
print(f"Graph '{g.name}': {g.number_of_nodes()} nodes, {g.number_of_edges()} edges")

In [None]:
# Rebuild the node -> gender mapping from the graph, so it only lists nodes currently in g.
attrs = nx.get_node_attributes(g, 'gender')

In [None]:
# Per-label node counts, with the '0' and '1' entries converted to fractions of the graph size.
node_total = len(g)
sizes = dict(Counter(nx.get_node_attributes(g, 'gender').values()))
for label in ('0', '1'):
    sizes[label] = sizes[label] / node_total

In [None]:
# Per-label node counts, with '0' and '1' overwritten by the complement of their share in `sizes`.
weights = dict(Counter(nx.get_node_attributes(g, 'gender').values()))
weights.update({label: 1 - sizes[label] for label in ('0', '1')})

In [None]:
def homogeneity(node, attr, center=True):
    """Return the fraction of `node`'s ego network whose gender label equals `attr`.

    Relies on the notebook-level graph `g`.

    Parameters
    ----------
    node : hashable
        Centre of the ego network.
    attr : str
        Gender label to count.
    center : bool, default True
        Forwarded to ``nx.ego_graph``; when False the centre node is excluded.

    Returns
    -------
    float or None
        ``count / size`` when the ego network has more than two nodes, otherwise None.
    """
    egonet = nx.ego_graph(g, node, center=center)
    # Counter (unlike a plain dict) returns 0 for a label that does not occur in the
    # ego network, which can happen e.g. when the centre node is excluded.
    count = Counter(nx.get_node_attributes(egonet, 'gender').values())[attr]
    size = len(egonet)
    if size > 2:
        return count / size
    return None


In [None]:
margs = []      # raw scores of every node whose ego network is large enough
marg_dict = {}  # node -> score, forced to 0 when the score does not pass the threshold

for node in g.nodes():
    attr = attrs[node]

    ego = nx.ego_graph(g, node, center=True)
    ego_genders = nx.get_node_attributes(ego, 'gender').values()
    same = dict(Counter(ego_genders))[attr]
    ego_size = len(ego)

    # Ego networks with at most two nodes are not scored.
    if ego_size <= 2:
        marg_dict[node] = 0
        continue

    w = weights[attr]
    # Weighted share of same-label nodes; (x - 0.5) * 2 recentres the ratio around 0.
    marg = ((same * w / (same * w + (ego_size - same) * (1 - w))) - 0.5) * 2
    margs.append(marg)
    marg_dict[node] = marg if abs(marg) > threshold else 0

disc_nodes = [node_id for node_id, score in marg_dict.items() if abs(score) > threshold]
disc = len(disc_nodes)

In [None]:
plausible = nx.Graph()  # stores plausible links
disc_set = set(disc_nodes)  # constant-time membership tests instead of scanning the list
for node in disc_nodes:
    egonet = nx.ego_graph(g, node, center=True)
    egonet2 = nx.ego_graph(g, node, center=True, radius=egoradius)
    # Keep only the nodes within `egoradius` hops that are not already in the radius-1 ego network.
    egonet2.remove_nodes_from(egonet)
    for n in egonet2.nodes():
        if node != n and n in disc_set:
            if marg_dict[node] > 0:
                # Positive score: propose links towards nodes with a different label.
                if attrs[n] != attrs[node]:
                    plausible.add_edge(node, n)
            elif marg_dict[node] < 0:
                # Negative score: propose links towards nodes with the same label.
                if attrs[n] == attrs[node]:
                    plausible.add_edge(node, n)
            else:
                print ("ERROR")
links = list(plausible.edges)

In [None]:
# Summary of the marginalization analysis, followed by the density plot of the raw scores.
print("=== STATS ===")
print("Marginalized nodes:", disc)
disc_pct = disc * 100 / len(g.nodes())
print("Global Discrimination:", disc_pct)
abs_scores = [abs(score) for score in marg_dict.values()]
print("Overall Marginalization Score:", mean(abs_scores))
sns.kdeplot(margs)

# MASK - MArginalization Shrinking using linK

In [None]:
def random_individual(links):
    """Return a list with one random bit (0 or 1) per element of `links`."""
    return [random.randint(0, 1) for _ in links]

In [None]:
def evaluate(individual, g):
    """Score a candidate set of added links.

    Relies on the notebook-level `links`, `disc_nodes`, `attrs`, `weights` and `threshold`.

    Parameters
    ----------
    individual : list
        DEAP individual wrapping a single bit list; bit ``i == 1`` means ``links[i]`` is added.
    g : networkx.Graph
        Graph to evaluate against. It is not modified: edges are added to a deep copy.

    Returns
    -------
    tuple
        ``(eva, budget)`` where
        fitness 1 (``eva``) is the number of nodes in `disc_nodes` that are still marginalized
        after adding the selected links, and
        fitness 2 (``budget``) is the number of links used.
        With an equal number of remaining marginalized nodes (as few as possible),
        the solution that uses fewer links is the better one.
    """
    bits = individual[0]  # the individual wraps the bit list in an outer list (DEAP setup)

    eva_g = copy.deepcopy(g)
    for position, bit in enumerate(bits):
        if bit == 1:
            link = links[position]
            eva_g.add_edge(link[0], link[1])

    eva = 0
    for node in disc_nodes:
        # Use this node's own label; reading a module-level `attr` would score every
        # node against whichever label a previous cell's loop happened to leave behind.
        attr = attrs[node]
        egonet = nx.ego_graph(eva_g, node, center=True)
        count = Counter(nx.get_node_attributes(egonet, 'gender').values())[attr]
        size = len(egonet)
        if size > 2:
            w = weights[attr]
            marg = ((count * w / (count * w + (size - count) * (1 - w))) - 0.5) * 2
            if abs(marg) > threshold:
                eva += 1

    budget = sum(bits)
    return eva, budget

In [None]:
creator.create("Fitness", base.Fitness, weights=(-1.0,-1.0)) # -1 because both fitness values are to be minimized
creator.create("Individual", list, fitness=creator.Fitness) # the individual is defined as a list

toolbox = base.Toolbox() # create the toolbox

toolbox.register("random_individual", random_individual, links) 
# arguments: name under which DEAP exposes the function, the actual Python function, parameters passed to it

toolbox.register("individual", tools.initRepeat, creator.Individual, 
                 toolbox.random_individual, n=1) 
# n = number of times random_individual is called to fill one Individual; kept at 1,
# so each Individual wraps exactly one bit list (hence the `[0]` indexing in other cells)

toolbox.register("population", tools.initRepeat, list, toolbox.individual)

toolbox.register("evaluate", evaluate, g=g) # evaluation function (see its definition)
toolbox.register("mate", tools.cxTwoPoint) # crossover function
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05) # mutation function (bit flip)
toolbox.register("select", tools.selTournament, tournsize=3)
# alternative selection function that was tried: tools.selNSGA2

In [None]:
# Genetic search for the set of plausible links that leaves the fewest marginalized nodes.
print ('Marginalized nodes:', disc, '· Available links:', len(links))
NUM_GENERATIONS = 50   # number of generations
POPULATION_SIZE = 150  # individuals per generation

CXPB, MUTPB = 0.5, 0.25  # crossover and mutation probabilities

n_HOF = 10  # number of top solutions kept in DEAP's Hall of Fame

pop = toolbox.population(n=POPULATION_SIZE)

hof = tools.HallOfFame(n_HOF)

# Statistics are computed on the first fitness value only (remaining marginalized nodes).
stats = tools.Statistics(lambda ind: ind.fitness.values[0])
stats.register('min', np.min, axis=0)
stats.register('avg', np.mean, axis=0)

logbook = tools.Logbook()
logbook.header = ['gen', 'nevals'] + stats.fields

# Evaluate the initial population.
invalid_individuals = [ind for ind in pop if not ind.fitness.valid]
fitnesses = toolbox.map(toolbox.evaluate, invalid_individuals)
for ind, fit in zip(invalid_individuals, fitnesses):
    ind.fitness.values = fit

hof.update(pop)

record = stats.compile(pop)
logbook.record(gen=0, best="-", nevals=len(invalid_individuals), **record)
print(logbook.stream)

for gen in range(1, NUM_GENERATIONS + 1):

    # Select the next generation individuals
    offspring = toolbox.select(pop, len(pop))
    # Clone the selected individuals
    offspring = list(map(toolbox.clone, offspring))

    # Apply crossover and mutation on the offspring; operators act on the wrapped
    # bit list (index 0), and any touched individual has its fitness invalidated.
    for child1, child2 in zip(offspring[::2], offspring[1::2]):
        if random.random() < CXPB:
            toolbox.mate(child1[0], child2[0])
            del child1.fitness.values
            del child2.fitness.values

    for mutant in offspring:
        if random.random() < MUTPB:
            toolbox.mutate(mutant[0])
            del mutant.fitness.values

    # Evaluate the individuals with an invalid fitness
    invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
    fitnesses = toolbox.map(toolbox.evaluate, invalid_ind)
    for ind, fit in zip(invalid_ind, fitnesses):
        ind.fitness.values = fit

    # Update the hall of fame with the generated individuals
    hof.update(offspring)

    # Replace the current population by the offspring
    pop[:] = offspring

    # Append the current generation statistics to the logbook
    record = stats.compile(pop)
    logbook.record(gen=gen, nevals=len(invalid_ind), **record)
    print(logbook.stream)


# The Hall of Fame is updated with the final population (i.e. its best individuals w.r.t. fitness).
hof.update(pop)

# Plot the per-generation minimum and average of the first fitness value on a single figure.
minFitnessValues, meanFitnessValues = logbook.select("min", "avg")
sns.set_style("whitegrid")
fig, ax = plt.subplots()
ax.plot(minFitnessValues, color='blue', label='min')
ax.plot(meanFitnessValues, color='green', label='avg')
ax.set_xlabel('Generation')
ax.set_ylabel('Fitness Value')
ax.set_title('Avg and Min Fitness')
ax.legend()
plt.show()

In [None]:
# Show each Hall of Fame solution: its two fitness values, then its bit list.
for solution in hof.items:
    fitness_values = solution.fitness.values
    print('Marginalized nodes:', fitness_values[0], '· Links:', fitness_values[1])
    print(solution[0])
    print("")

In [None]:
# Decode the first Hall of Fame individual into the list of links it switches on.
best = hof.items[0][0]

new_links = [links[position] for position, gene in enumerate(best) if gene == 1]

new_links

Random Benchmark

In [None]:
# Baseline: score random link selections to compare against the GA result.
N_RANDOM_TRIALS = 100

rans = []
print ("Marginalized nodes:", end = ' ')
for _ in range(N_RANDOM_TRIALS):
    ran = random_individual(links)
    # evaluate() is deterministic for a given individual, so compute it once and reuse the value.
    remaining = evaluate([ran], g)[0]
    print (remaining, end = ' ')
    rans.append(remaining)
print("")
print("")
print("Avg:", mean(rans), "· Min:", min(rans))