In [90]:
import snap

from joblib import Parallel, delayed
from datetime import datetime

import ast, operator
from copy import deepcopy

import time, pandas as pd, pickle, json, networkx as nx, numpy as np
from networkx.readwrite import json_graph

In [2]:
cons = pd.read_csv("../REST/static/filtered_twitter_connections.csv", index_col="id")
str2dict = lambda d : ast.literal_eval(d)
cons.formation = cons.formation.apply(str2dict)
cons.sample(5)

Unnamed: 0_level_0,from_user_id,to_user_id,formation
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2429,450639507,315899534,{u'2018.05.08': True}
4990,1650479899,3064906390,{u'2018.05.08': True}
6905,3035004082,2545544532,{u'2018.05.08': True}
6042,2529427087,806218540328120320,{u'2018.05.08': True}
3507,630367566,4904375001,{u'2018.05.24': True}


In [3]:
twu_with_orgs = pd.read_csv("../REST/static/filtered_twitter_users.csv", index_col="id")
twu_with_orgs = twu_with_orgs[twu_with_orgs.is_org==False]
twu_with_orgs.sample(5,random_state=42)

Unnamed: 0_level_0,followers_count,friends_count,is_org,lang,match_name,match_ratio,name,screen_name,truncated_id,community
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2507063000.0,2062.0,2010.0,False,tr,elif demirel,91.0,Elif Demir,eelifdem,250706324,1
9.384949e+17,0.0,53.0,False,tr,abdullah kaya,100.0,Abdullah Kaya,Abdulla23830994,938494926,1
222408200.0,151.0,1188.0,False,en,aslan bakirov,100.0,Aslan Bakirov,abekir,222408208,1
610743600.0,88.0,70.0,False,en,mohammed elkhateeb,94.0,Mohammad Elkhateeb,Hatib2014,610743648,1
3004022000.0,58.0,190.0,False,tr,rabia sila aydin,95.0,rabia aydin,benrabiaay,300402232,1


### Filtering the network by link-formation dates
#### Refer to <a href="https://github.com/AmmarRashed/EventOrient/blob/master/notebooks/calculating_closures.ipynb"> Calculating Closures</a>

In [4]:
def present_in_date(changes_dates, queried_date):
    """
    checking if a connection is present in a queried date
    changes_dates: {d1:True, d2:False, d3:True} connection added or removed
    queried_date: e.g. "2018.05.08"
    """
    str2date = lambda strdate: datetime.strptime(strdate, '%Y.%m.%d')  # 2018.05.08
    changes = sorted(changes_dates,key=lambda d: str2date(d))
    queried_date = datetime.strptime(queried_date, '%Y.%m.%d')
    present = False
    for d in changes:
        if queried_date < str2date(d):
            break
        present = changes_dates[d]
    return present

def get_dates(cons):
    all_dates = set()
    str2date = lambda strdate: datetime.strptime(strdate, '%Y.%m.%d')  # 2018.05.08

    for dates in cons.formation.apply(lambda x: list(x)):
        for date in dates:
            all_dates.add(str2date(date))
    return [d.strftime('%Y.%m.%d') for d in sorted(all_dates)]

def get_connections_by_date(cons, date, present=True):
    nw = deepcopy(cons)
    for_col = nw.formation.apply(lambda dates: present_in_date(dates,date))
    return cons[for_col == present]

def networkx_to_snappy(nxg, directed=False):
    if directed:
        g = snap.PNGraph.New()
    else:
        g = snap.PUNGraph.New()
        
    for n in nxg.nodes():
        g.AddNode(n)
    for f,t in nxg.edges():
        g.AddEdge(f, t)
        
    return g

In [5]:
dates = get_dates(cons)
dates

['2018.05.08', '2018.05.24']

In [6]:
date = dates[-1]

In [7]:
ondate_cons = get_connections_by_date(cons, date)
ondate_cons.sample(5)

Unnamed: 0_level_0,from_user_id,to_user_id,formation
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5481,2234861247,3064906390,{u'2018.05.08': True}
8395,806720898036338688,443009716,{u'2018.05.08': True}
8353,796350154639048704,995806488,{u'2018.05.24': True}
6011,2529427087,609301446,{u'2018.05.08': True}
1316,253229470,3064906390,{u'2018.05.08': True}


### Constructing the graph

In [8]:
non_orgs = twu_with_orgs[~twu_with_orgs.is_org]

nxg = nx.DiGraph()
truncate = lambda x: int(str(int(x))[:9])
for _, row in ondate_cons.iterrows():    
    from_ = truncate(row["from_user_id"])
    to = truncate(row["to_user_id"])
    if from_ in non_orgs.truncated_id and to in non_orgs.truncated_id:
        nxg.add_edge(from_, to)
nxg = nxg.to_directed()

### Calculating Strongly-Connected components 
#### <a href="https://www.geeksforgeeks.org/strongly-connected-components/">Kosaraju's algorithm</a>
<img src="https://github.com/AmmarRashed/EventOrient/blob/master/misc/pics/scc.jpeg?raw=true">

In [17]:
snappy_directed = networkx_to_snappy(nxg, True)
components = snap.TCnComV()
sccs = snap.GetSccs(snappy_directed, components)

In [32]:
for CnCom in components:
    if (CnCom.Len()>1):
        print ("Size of component: %d" % CnCom.Len())

Size of component: 163
Size of component: 6
Size of component: 5
Size of component: 3


In [86]:
super_graph = nx.Graph()

In [87]:
for i,c1 in enumerate(components):
#     if c1.Len()>1:
    for j, c2 in enumerate(components):
        if  i != j:
            weight = 0
            for n1 in c1:
                for n2 in c2:
                    if nxg.has_edge(n1, n2):
                        weight += 1
            if weight>0:
                super_graph.add_edge(i, j, weight=weight)

In [88]:
def get_biggest_community(component):
    truncated_twu = twu_with_orgs.set_index('truncated_id')
    coms = dict()
    for node in component:
        com = truncated_twu.loc[node]['community']
        coms.setdefault(com, 0)
        coms[com] += 1
    
    return max(coms.iteritems(), key=operator.itemgetter(1))[0]

In [91]:
for ix,deg in super_graph.degree(super_graph.nodes()):
    nodes = list(set(components[ix]))
    super_graph.node[ix]['degree'] = deg
    super_graph.nodes[ix]['nodes'] = nodes
    super_graph.nodes[ix]['biggest_community'] = get_biggest_community(nodes)

In [92]:
data = nx.node_link_data(super_graph)
with open('../REST/static/networks/SCC_graph.json', 'w') as f:
    json.dump(data, f, indent=4)