### In this notebook we find the nodes with the highest degree and/or betweenness centrality in each community and manually investigate those nodes to get a better understanding of what that community is about

In [1]:
import snap

from joblib import Parallel, delayed
from datetime import datetime

import ast, operator
from copy import deepcopy

import time, pandas as pd, pickle, json, networkx as nx, numpy as np
from networkx.readwrite import json_graph

In [2]:
# Load the serialized user-to-user network and rebuild it as a directed graph.
# The context manager closes the file handle as soon as the JSON is parsed,
# instead of leaving it open for the lifetime of the kernel.
with open("../REST/static/networks/latest_tw_ntw.json") as network_file:
    data = json.load(network_file)
graph = json_graph.node_link_graph(data, directed=True)
del data  # the raw node-link dict is not needed once the graph exists

In [3]:
# Count the directed edges whose reverse edge is also present in the graph.
# Every directed edge is inspected, so a mutual pair (a->b, b->a) adds 2.
total_edges = len(graph.edges)
bidir_edges = sum(int(graph.has_edge(t, f)) for f, t in graph.edges)

# Guard against an edgeless graph, which would otherwise divide by zero.
bidir_share = float(bidir_edges) / total_edges * 100 if total_edges else 0.0

# str.format does not treat '%' specially, so a single '%' is the literal sign
# (the previous '%%' was printed verbatim as two characters).
print("There are {0} ({1}%) bidirectional connections out of {2}".format(bidir_edges, bidir_share, total_edges))

There are 670 (29.3088363955%%) bidirectional connections out of 2286


In [4]:
# Peek at one (node_id, attributes) pair to see which attributes are stored on
# a node; take it straight from the iterator rather than copying every node
# into a list first.
next(iter(graph.nodes(data=True)))

(396662786,
 {u'betweenness': 0.0,
  u'closeness_centrality': 0.0,
  u'clustering_coefficient': 1.0,
  u'community': u'foci',
  u'degree': 2,
  u'eigenvector_centrality': 2.4260626929564234e-24,
  u'followers_count': 33.0,
  u'friends_count': 284.0,
  u'in_degree': 0,
  u'lang': u'tr',
  u'match_name': u' Sehir MBA',
  u'name': u'nemasehir',
  u'out_degree': 2,
  u'pagerank': 0.00018450710615505045,
  u'screen_name': u'nemasehir'})

In [5]:
def users_to_community(graph, key="screen_name"):
    ''' Index the users of ``graph`` by one of their node attributes.

    graph: object whose ``nodes(data=True)`` yields (user_id, attribute dict) pairs;
           every attribute dict must contain ``key`` and "community"
    key: name of the node attribute used as the dictionary key

    returns: {key: (user_id, community_id)}
    If several users share the same ``key`` value, the last one iterated wins.
    '''
    lookup = {}
    for node_id, attrs in graph.nodes(data=True):
        lookup[attrs[key]] = (node_id, attrs["community"])
    return lookup

In [6]:
# Two lookups are filled in a single pass over the user graph:
#   communities_dict: {community: {user_id: (betweenness, degree)}}
#   uc:               {screen_name: (user_id, community_id)}
communities_dict = {}
uc = {}

for user_id, user_data in graph.nodes(data=True):
    community = user_data["community"]
    # Create the per-community bucket on first sight, then reuse it.
    members = communities_dict.setdefault(community, {})

    btw, deg = user_data["betweenness"], user_data["degree"]
    members[user_id] = (btw, deg)

    uc[user_data['screen_name']] = (user_id, community)

### Getting top 5 users by betweenness, and then by degree, in each community

In [7]:
def sort_com_by_users_metric(communities_dict, metric_idx, top_k=2, name='screen_name', users_graph=None):
    """Rank the users of every group by one metric and keep the top_k of each.

    communities_dict: {community: {user_id: (metric 0, metric 1, ...)}}
    metric_idx: position inside the metrics tuple to sort by (descending)
    top_k: maximum number of users kept per community
    name: node attribute that replaces user_id in the result; pass None to
          keep the raw user_ids
    users_graph: graph whose ``nodes[user_id][name]`` supplies that attribute;
                 when omitted, the notebook-level ``graph`` is used

    returns {community: [top k (user, metrics) pairs sorted by metric at metric_idx]}
    e.g. with name=None: [(179250667, (0.005290261280310729, 17)), (231908931, (0.0, 4))]
    """
    sorted_coms = {
        comm: sorted(users_data.items(), key=lambda item: item[1][metric_idx], reverse=True)[:top_k]
        for comm, users_data in communities_dict.items()
    }

    # Nothing to relabel: either raw ids were requested or there are no groups
    # (in which case no graph needs to be consulted at all).
    if name is None or not sorted_coms:
        return sorted_coms

    node_attrs = (graph if users_graph is None else users_graph).nodes

    # Build a fresh mapping instead of reassigning entries of the dict that is
    # being iterated.
    return {
        comm: [(node_attrs[user_id][name], metrics) for user_id, metrics in top_users]
        for comm, top_users in sorted_coms.items()
    }

In [8]:
# Metric index 0 is betweenness in the (betweenness, degree) tuples; keep the
# five highest-ranked users of every community and show community '10'.
sorted_coms_btw = sort_com_by_users_metric(communities_dict, metric_idx=0, top_k=5)
sorted_coms_btw['10']

[(u'akalikoc', (0.005290261280310729, 17)),
 (u'bayramogluali', (0.0, 4)),
 (u'celikmusta', (0.0, 1)),
 (u'cemyav', (0.0, 1)),
 (u'HALILYILDIZ63', (0.0, 1))]

In [9]:
# Metric index 1 is degree in the (betweenness, degree) tuples. The result gets
# its own name so that it neither overwrites the betweenness ranking nor
# carries a "btw" label for a degree-sorted ranking.
sorted_coms_deg = sort_com_by_users_metric(communities_dict, 1, 5)
sorted_coms_deg['10']

[(u'akalikoc', (0.005290261280310729, 17)),
 (u'bayramogluali', (0.0, 4)),
 (u'hilalebruuuu', (0.0, 2)),
 (u'burakkayhan11', (0.0, 2)),
 (u'salihcolak96', (0.0, 2))]

### Investigating SCC dominant communities
#### 1)  Investigating the top nodes in _the biggest_ community in each SCC

In [10]:
# Load the serialized SCC-level network and rebuild it as a directed graph.
# The context manager closes the file handle as soon as the JSON is parsed.
with open("../REST/static/networks/SCC_graph.json") as scc_file:
    scc_data = json.load(scc_file)
scc_graph = json_graph.node_link_graph(scc_data, directed=True)
del scc_data  # the raw node-link dict is not needed once the graph exists

In [11]:
def influential_nodes_dict_to_df(influential_nodes_dict, user_to_community, metrics=("betweenness", "degree"), user_def='user_id'):
    """Flatten the per-SCC top users into a DataFrame indexed by SCC id.

    influential_nodes_dict: {scc_id: [(user, (metric 0, metric 1, ...))] ... }
    user_to_community: {user: (user_id, community_id)}, keyed by the same user
                       identifier that appears in influential_nodes_dict
    metrics: column names for the metric values, in tuple order
    user_def: column name given to the user identifier

    returns a DataFrame with index "SCC_id" and columns
    [user_def, "user_community"] + metrics, one row per (SCC, user) pair.
    raises KeyError when a user is missing from user_to_community.
    """
    # .items() is available on both Python 2 and 3 (dict.iteritems() is not),
    # and tuple() lets the metric values be passed as a list as well.
    rows = [
        (scc_id, user, user_to_community[user][1]) + tuple(metrics_vals)
        for scc_id, users in influential_nodes_dict.items()
        for user, metrics_vals in users
    ]

    columns = ['SCC_id', user_def, 'user_community'] + list(metrics)
    return pd.DataFrame(rows, columns=columns).set_index("SCC_id")

In [12]:
def get_top_k_nodes_in_scc(communities_dict, scc_graph, metric_idx, top_k=2, min_degree=1,
                           by_community=True, users_graph=None, as_df=True, user_def='screen_name'):
    """Pick the top_k users for every SCC that passes the size filter.

    communities_dict: {community: {user_id: (metric 0, metric 1, ...)}}
    scc_graph: graph whose nodes are SCCs; each node carries 'degree',
               'biggest_community' and 'nodes' (the member user ids)
    metric_idx: position inside the metrics tuple to rank by (descending)
    top_k: number of users kept per SCC
    min_degree: threshold an SCC must exceed to be reported; it is compared
                with the SCC node's 'degree' when by_community is True and
                with the number of member users otherwise
    by_community: True  -> report the top users of the SCC's biggest community
                  False -> rank the SCC's own member users
    users_graph: user-to-user graph supplying per-user attributes; when
                 omitted, the notebook-level ``graph`` is used
    as_df: return a DataFrame (see influential_nodes_dict_to_df) instead of a dict
    user_def: node attribute used to label users in the result

    returns {scc_id: [(user, (metric 0, metric 1, ...))]} with the top k users
    sorted by metric_idx, or the equivalent DataFrame when as_df is True.
    """
    # One graph is used for metrics, labels and the community lookup alike, so
    # a caller-supplied users_graph is honoured everywhere; omitting it keeps
    # the previous behaviour of reading the notebook-level graph.
    if users_graph is None:
        users_graph = graph
    user_attrs = users_graph.nodes

    if by_community:
        # Rank on raw user ids first; labels are applied once, further below.
        sorted_coms = sort_com_by_users_metric(communities_dict, metric_idx, top_k, name=None)
        top_users_per_scc = {
            scc_id: sorted_coms[scc_data['biggest_community']]
            for scc_id, scc_data in scc_graph.nodes(data=True)
            if scc_data['degree'] > min_degree
        }
    else:
        # {SCC: {user_id: (betweenness, degree) in the user-to-user graph}}
        scc_users = dict()
        for scc_id, scc_data in scc_graph.nodes(data=True):
            members = scc_data['nodes']
            if len(members) > min_degree:
                member_metrics = {
                    user: (user_attrs[user]["betweenness"], user_attrs[user]["degree"])
                    for user in members
                }
                # An SCC without members gets no entry at all.
                if member_metrics:
                    scc_users[scc_id] = member_metrics
        top_users_per_scc = sort_com_by_users_metric(scc_users, metric_idx, top_k, name=None)

    if user_def is not None:
        # Swap the raw user ids for the requested node attribute.
        top_users_per_scc = {
            scc_id: [(user_attrs[user_id][user_def], metrics) for user_id, metrics in top_users]
            for scc_id, top_users in top_users_per_scc.items()
        }

    if not as_df:
        return top_users_per_scc

    user2community = users_to_community(users_graph, key=user_def)
    return influential_nodes_dict_to_df(top_users_per_scc, user_to_community=user2community,
                                        user_def=user_def)

In [13]:
# Positions of the two metrics inside the (betweenness, degree) tuples.
btw_index = 0
deg_index = 1
k = 2
influential_nodes_per_scc_by_com = get_top_k_nodes_in_scc(communities_dict, scc_graph, deg_index, k, as_df=True)
# The ranking above is driven by deg_index, so the label names degree.
print("Top {} nodes in the biggest community in each SCC (by degree)".format(k))
influential_nodes_per_scc_by_com

Top 2 nodes in the biggest community in each SCC (by betweenness)


Unnamed: 0_level_0,screen_name,user_community,betweenness,degree
SCC_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,hakanbayrakk,6,0.025098,81
0,Cengizbeyza,6,0.018702,62
1,AkinciMehmet,11,0.007097,20
1,aytenkoldemir,11,0.0,3
2,dilaraboyraz,12,0.002963,11
2,SCansuYaman,12,0.001264,4
3,Ali_Ulker,2,0.037181,143
3,babadem,2,0.0,5
537,ahmet_ademoglu,15,9.1e-05,20
537,EbruAdemoglu,15,0.0,2


#### 2) Investigating the top nodes across _all_ communities in each SCC

In [14]:
# Rank the member users of each SCC directly (by_community=False) instead of
# going through the SCC's biggest community.
influential_nodes_per_scc = get_top_k_nodes_in_scc(
    communities_dict, scc_graph, deg_index, k, by_community=False, users_graph=graph, as_df=True, min_degree=1)
# The ranking above is driven by deg_index, so the label names degree.
print("top {} nodes in each SCC (by degree)".format(k))
influential_nodes_per_scc

top 2 nodes in the each SCC (by betweenness)


Unnamed: 0_level_0,screen_name,user_community,betweenness,degree
SCC_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,alimarli_,3,0.059915,202
0,Talha_Kose1,4,0.03791,176
1,AkinciMehmet,11,0.007097,20
1,aytenkoldemir,11,0.0,3
2,dilaraboyraz,12,0.002963,11
2,SCansuYaman,12,0.001264,4
3,Ali_Ulker,2,0.037181,143
3,mehmetkaracaitu,2,0.0,3
