In [300]:
from datetime import datetime
import pandas as pd
import numpy as np
import networkx as nx
from tqdm import tqdm
from pprint import pprint
import pickle
import heapq as heap
import sys
import math

In [4]:
with open('data/graph_with_loops.pkl', 'rb') as f:
    G = pickle.load(f)

with open('data/graph_without_loops.pkl', 'rb') as f:
    G_loops = pickle.load(f)

In [72]:
with open('data/graph_simple.pkl', 'rb') as f:
    G_simple = pickle.load(f)

In [301]:
with open('data/graph_total.pkl', 'rb') as f:
    G_total = pickle.load(f)

In [302]:
def interval_time(G, time_inter):
    '''
    :param G: initial graph, with u, v and weights and time_list as attributes
    :param time_inter: [start_date, end_date]
    :return: new_G: graph with only u, v and weight, which is the length of all
                    the dates inside the time interval
    '''
    start = time_inter[0]
    end = time_inter[1]
    new_G = nx.DiGraph()
    for u, v, data in tqdm(G.edges(data=True)):
        time_list = data['time_list']
        # leave only the dates inside the time interval
        new_time_list = [time for time in time_list if start <= time <= end]
        # new weight, which is the length of the new_time_list
        w = len(new_time_list)
        # if the weight differ from 0, then the edge exists
        if w != 0:
            new_G.add_edge(u, v, weight=w)
    return new_G

In [None]:
def functionality_N(params, time_inter):
    new_g = interval_time(G, time_inter)

In [None]:
nodes_list = list(G.nodes)
df = nx.to_pandas_edgelist(G, nodelist=nodes_list)

In [344]:
def in_degree(df_g, node):
    # number of edges where node is target
    return df_g[df_g['target']==node].target.count()

def out_degree(df_g, node):
    # number of edges where node is source
    return df_g[df_g['source']==node].source.count()

def degree_centrality(df_g, node, G):
    degree = in_degree(df_g, node) + out_degree(df_g, node)
    num_nodes = len(list(G.nodes))
    return degree/(num_nodes-1)

## Functionality 2
- **Betweenness**
- **PageRank**
- **ClosenessCentrality**
- **DegreeCentrality** of a node v is defined as  $degree\_centrality(v) = \frac{degree(v)}{n-1} $ ,  where degree(v) is the number of neighbors of v.


In [340]:
def dijkstra(df, source, target, G):
    '''
    :param df: dataframe of the graph: source, target, weight
    :param source: source node
    :param target: target node
    :return: final_weight, path
    '''
    # init the distance dictionary with the source node as key and as value None (parent node) and 0 (distance)
    distance_par = {source: (None, 0)}
    visited = set()
    # store the current node
    curr_node = source

    while curr_node != target:
        visited.add(curr_node)
        dest_list = df[df['source']==curr_node].target.to_list()
        # taking only the weight of curr_node
        curr_weight = distance_par[curr_node][1]
        for node in dest_list:
            weight = G[curr_node][node]['weight'] + curr_weight
            if node not in distance_par:
                distance_par[node] = (curr_node, weight)
            else:
                node_weight = distance_par[node][1]
                if node_weight > weight:
                    distance_par[node] = (curr_node, weight)

        # create a list of nodes to visit
        next_dest_list = {n: distance_par[n] for n in distance_par if n not in visited}

        # check if there are nodes to visit
        if not next_dest_list:
            return 'No possible path'

        # next curr node is the one in next_dest_list (nodes still to visit) with the lowest weight
        curr_node = min(next_dest_list, key=lambda k: next_dest_list[k][1])

    # now we wont to compute the list of the shortest path from source to target
    short_path = []
    # at the beginning, curr_node is the target node, since the first while is finished
    final_weight = distance_par[curr_node][1]
    while curr_node is not None:
        short_path.append(curr_node)
        par_node = distance_par[curr_node][0]
        curr_node = par_node

    # reverse the list now
    short_path = short_path[::-1]

    return [final_weight, short_path]

In [317]:
new_g = interval_time(G_total, ["2015-08-01 ","2015-08-02"])
# 9927 nodi

100%|██████████| 10300590/10300590 [00:09<00:00, 1041504.51it/s]


In [343]:
nodes_list = list(new_g.nodes)
df = nx.to_pandas_edgelist(new_g, nodelist=nodes_list)
df.head()

Unnamed: 0,source,target,weight
0,149341,5181076,1
1,149341,1391026,1
2,149341,5182088,1
3,149341,1800840,1
4,149341,366782,1


In [390]:
g = new_g.subgraph(nodes_list[800:850])

In [391]:
nodes_list_g = nodes_list[0:30]
dfg = nx.to_pandas_edgelist(g, nodelist=nodes_list)
dfg

Unnamed: 0,source,target,weight
0,3501958,928007,1
1,3501958,2581174,1
2,3501958,5011413,1
3,3501958,4795214,1
4,3501958,5055411,2
5,3501958,2403836,1
6,928007,3501958,1
7,2581174,3501958,1
8,418556,3988057,2
9,418556,4090458,2


In [392]:
# for each node v, we store the shortest paths between v and all the other nodes in new_g
# as dictionary of dictionaries
shortest_paths = {}
for v in g.nodes:
    shortest_paths[v] = {}
    for u in g.nodes:
        shortest_paths[v][u] = dijkstra(dfg, v, u, g)
        
#with open('data/shortest_paths.pkl', 'wb') as f:
    #shortest_paths = pickle.dump(shortest_paths, f)

In [447]:
shortest_paths

{4786688: {4786688: [0, [4786688]],
  2464386: 'No possible path',
  2065540: 'No possible path',
  3501958: 'No possible path',
  928007: 'No possible path',
  2770572: 'No possible path',
  3919629: 'No possible path',
  4391054: 'No possible path',
  52239: 'No possible path',
  2464528: 'No possible path',
  3788176: 'No possible path',
  2406926: 'No possible path',
  4519059: 'No possible path',
  895245: 'No possible path',
  258961: 'No possible path',
  3216793: 'No possible path',
  1630618: 'No possible path',
  4135581: 'No possible path',
  1565982: 'No possible path',
  4755876: 'No possible path',
  5182501: 'No possible path',
  2410148: 'No possible path',
  5055411: 'No possible path',
  895795: 'No possible path',
  2581174: 'No possible path',
  4013751: 'No possible path',
  5016888: 'No possible path',
  48503: 'No possible path',
  2180161: 'No possible path',
  364100: 'No possible path',
  429002: 'No possible path',
  4795214: 'No possible path',
  5005262: 'N

In [455]:
# AGGIUNGERE INTERVALLO TEMPO IN INPUT
def BestUsers(G, v, metric):
    output = None
    #G = interval_time(G, time_inter)
    shortest_paths = {}
    for v in g.nodes:
        shortest_paths[v] = {}
        for u in g.nodes:
            shortest_paths[v][u] = dijkstra(dfg, v, u, g)
            
    if metric == "Betweenness":
        sp_total = len(shortest_paths) # all the shortest paths of node v
        paths_via_v = 0
        for target in shortest_paths.values():
            for path in target.values():
                if path != 'No possible path':
                    if v in path and path[0].all() != v and path[-1].all() != v:
                        paths_via_v += 1
        
        Betweenness = ((paths_via_v/sp_total)/math.comb(len(G.nodes)-1,2))
        output = Betweenness
    
    if metric == "PageRank":
        a = 0.15
        output = PageRank(G, a)[v]
                          
    if metric == "ClosenessCentrality":
        d = 0
        nodes = list(G.nodes).remove(v)
        for n in nodes:
            d += shortest_path(G,n,v)[1]
        Closeness = (len(G.nodes)-1)/d
        output = Closeness
    
    if metric == "DegreeCentrality":
        
        DegreeC = G.degree(v)/(len(G.nodes)-1)
        output = DegreeC  
                       
    return output                    

In [456]:
BestUsers(g, 2464386,"Betweenness")

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

Given:
- The transition matrix $P$, where the generic element $p_{ij} = Pr$(to go from node i to node j)
- probability of teleporting $a$
- probability vector $\vec{q^t}$, where the generic element $q_v^t = Pr$(to be in state v), in general: 
$$\begin{aligned}
\vec{q^t} = \vec{q^{t-1}}P
\end{aligned}$$

After a high number of steps, it converges to the stationary distribution $\vec{\pi}$, corresponding to the (left) eigenvector with eigenvalue 1:  

$$\begin{aligned}
\vec{\pi} = \vec{\pi}P \implies \pi_i = \sum_{j=1}^{n}\pi_jP{ji}
\end{aligned}$$

where $n$ is the number of nodes.

In [415]:
def PageRank(G, a):
    '''
    input:
    G = graph
    a = probability of teleporting
    
    output:
    pi_i = pagerank score of node i
    '''
    # initialize pagerank score at 1 for each node
    n = len(G.nodes)
    nodes_list = list(G.nodes)
    pi = {v:1 for v in nodes_list}
    
    # build adjacency matrix
    A = np.zeros((n, n))
    for v in nodes_list:
        for u in nodes_list:
            if (v,u) in G.edges:
                A[nodes_list.index(v),nodes_list.index(u)] += 1
    
    # add links if a node doesn't have and normalize
    for row in range(n):
        if np.all(A[row,:] == 0):
            A[row,:] == [(1/n)*n]
        
        else:
            non_zeros = 0
            for i in range(len(A)):
                for j in range(len(A[i])):
                    if A[i,j] != 0:
                        non_zeros += 1
            
            A[row,:] = np.where(A[row,:] != 0, 1/non_zeros, 0)
    
    # build P matrix
    P = (a/n)*np.ones((n, n)) + (1-a)*A
    
    for i in nodes_list:
        for j in nodes_list:
            pi[i] = pi[j]*P[nodes_list.index(j),nodes_list.index(i)]
            
    return pi

In [417]:
p = PageRank(g, 0.15)
sum(p.values())

0.1905897705897437