In [1]:
import numpy as np
import pandas as pd
import random
import networkx as nx
import matplotlib.pyplot as plt
import json
import pickle

# Interaction Network Construction
## **SNLP team project**
Build three network layers, one for each of the following relationships:
1. **all interactions** (edge weights are the sum of layers 2 and 3)
2. **retweets** 
3. **mentions** (includes replies) 

This notebook outputs pickled lists (one for the full networks, one for their bounded versions), each consisting of 3 `networkx.DiGraph` objects that represent one interaction layer apiece. The edge weight attributes are named `interactions`, `retweets` and `mentions`, respectively.

#### Load data

In [2]:
# Directory that holds the preprocessed edge list read by this notebook.
fpath = '../../tweet_data/preprocessed/network/'
# Date stamp embedded in the input and output file names (presumably YYYYMMDD).
date = '20200812'

In [3]:
# Name of the edge-weight attribute of each layer, in layer order
# (all interactions, retweets, mentions).
attrs = ['interactions', 'retweets', 'mentions']

# Load the raw edge list. The encoding is given explicitly so that decoding
# does not depend on the platform's default locale. `edges` is deliberately not
# pre-initialised: a failed load should stop the notebook instead of silently
# continuing with an empty network.
# Presumably each entry is [source, target, {counter_name: count}] -- TODO confirm.
with open('{}edges_{}.json'.format(fpath, date), 'r', encoding='utf-8') as edge_f:
    edges = json.load(edge_f)

#### Transform routine

In [4]:
def transform_GX(G, g_type='G1', directed=True):
    """
    Project the original "hypergraph" onto one of the three interaction
    layers. Link direction is from the origin node of the interaction
    towards the node that is the target of the interaction.

    G1 = interactions network (retweets + replies + other mentions)
    G2 = retweet network
    G3 = mentions network (replies + other mentions)

    Parameters
    ----------
    G : graph whose edges carry the counters read by the chosen layer
        (``retweets``, ``replies``, ``orig_mentions``). The counters are
        summed and compared against 0, so they are assumed to be numeric.
    g_type : exactly one of ``'G1'``, ``'G2'``, ``'G3'``.
    directed : if False, the result is converted with ``to_undirected()``.

    Returns
    -------
    networkx graph (directed or undirected depending on ``directed``) that
    contains every node of ``G`` and only the edges whose layer weight is
    non-zero. The weight is stored under ``interactions``, ``retweets`` or
    ``mentions`` for G1, G2 and G3 respectively.

    Raises
    ------
    AssertionError
        If ``g_type`` is not exactly one of the three supported layer codes.
    """
    # Layer code -> (name of the output weight, raw counters summed into it).
    layer_spec = {
        'G1': ('interactions', ('replies', 'retweets', 'orig_mentions')),
        'G2': ('retweets', ('retweets',)),
        'G3': ('mentions', ('replies', 'orig_mentions')),
    }

    # Exact membership test: a substring test would let values such as 'G1x'
    # through and fail later with an unrelated error. Raised explicitly so the
    # check is not stripped when Python runs with -O.
    if g_type not in layer_spec:
        raise AssertionError(
            "g_type must be one of {}, got {!r}".format(sorted(layer_spec), g_type)
        )
    attr, counters = layer_spec[g_type]

    # Collapse the raw counters into a single weight per edge and keep only
    # the edges that actually carry an interaction of this type.
    weighted_edges = []
    for source, target, data in G.edges(data=True):
        weight = sum(data[counter] for counter in counters)
        if weight != 0:
            weighted_edges.append((source, target, {attr: weight}))

    G_i = nx.DiGraph()
    G_i.add_edges_from(weighted_edges)
    # Keep nodes that have no edge in this layer so all layers share a node set.
    G_i.add_nodes_from(G.nodes())

    if not directed:
        # NOTE(review): for reciprocal edges (u, v) and (v, u), to_undirected()
        # appears to keep the attributes of one direction rather than summing
        # the weights -- verify this is the intended undirected weight.
        G_i = G_i.to_undirected()

    return G_i

In [5]:
def print_number_of_nodes_edges(GX, attrs):
    """
    Print the node and edge count of each graph under its upper-cased label.

    GX    : iterable of graphs exposing number_of_nodes() / number_of_edges()
    attrs : iterable of labels, paired with GX position by position
            (surplus items on either side are ignored by zip)
    """
    divider = '--' * 50
    for layer, label in zip(GX, attrs):
        heading = label.upper()
        print(heading)
        node_line = 'Number of nodes: {}'.format(layer.number_of_nodes())
        print(node_line)
        edge_line = 'Number of edges: {}'.format(layer.number_of_edges())
        print(edge_line)
        print(divider)

---

## Constructing the network representation

### Full network

In [6]:
# Full directed network: every loaded edge goes into one graph, which the
# layer transforms below read from.
Gs = nx.DiGraph()
Gs.add_edges_from(edges) 

### Directed $G_I$, $G_R$ and $G_M$
Transform the data into three directed diffusion networks.

In [7]:
# One directed layer per interaction type, in the same order as `attrs`
# (G1 = interactions, G2 = retweets, G3 = mentions).
GX = list(map(
    lambda layer_code: transform_GX(Gs, g_type=layer_code, directed=True),
    ('G1', 'G2', 'G3'),
))

In [8]:
# Size of each full layer, before restricting to a single component.
print_number_of_nodes_edges(GX, attrs)

INTERACTIONS
Number of nodes: 809157
Number of edges: 1653004
----------------------------------------------------------------------------------------------------
RETWEETS
Number of nodes: 809157
Number of edges: 1089988
----------------------------------------------------------------------------------------------------
MENTIONS
Number of nodes: 809157
Number of edges: 603392
----------------------------------------------------------------------------------------------------


## Bounded version
We select the largest component and remove any self-loops.

#### Self loops

In [9]:
# Report, per layer, how many edges are self-loops (u -> u) and which share of
# the layer's edges they make up. Layers and labels are paired with zip so the
# report follows GX/attrs instead of a hard-coded layer count.
for layer, label in zip(GX, attrs):
    print(label.upper())
    n_edges = layer.number_of_edges()
    # Count lazily; the self-loop edges themselves are not needed here.
    n_loops = sum(1 for _ in nx.selfloop_edges(layer))
    # An edgeless layer has no self-loops; report 0 % instead of dividing by zero.
    share = n_loops / n_edges * 100 if n_edges else 0.0
    print('Self-edges: {} / ({:.2f} %)'.format(n_loops, share))
    print(50*'--')

INTERACTIONS
Self-edges: 8991 / (0.54 %)
----------------------------------------------------------------------------------------------------
RETWEETS
Self-edges: 3717 / (0.34 %)
----------------------------------------------------------------------------------------------------
MENTIONS
Self-edges: 5771 / (0.96 %)
----------------------------------------------------------------------------------------------------


In [10]:
# Bounded version of every layer: largest connected component (edge direction
# ignored), with self-loops removed.
GX_bounded = []
for G_full in GX:
    # connected_components() yields components in no guaranteed size order, so
    # the largest one is picked explicitly by node count instead of taking the
    # first one found. `default` keeps an empty layer from raising.
    largest_cc = max(
        nx.connected_components(G_full.to_undirected()),
        key=len,
        default=set(),
    )

    # .copy() turns the subgraph view into an independent graph that can be
    # edited without touching the full layer.
    G_lcc = G_full.subgraph(largest_cc).copy()

    # Materialise the self-loops first: edges must not be removed while the
    # graph is being iterated over.
    G_lcc.remove_edges_from(list(nx.selfloop_edges(G_lcc)))

    GX_bounded.append(G_lcc)

In [11]:
# Size of each bounded layer.
print_number_of_nodes_edges(GX_bounded, attrs)

INTERACTIONS
Number of nodes: 745362
Number of edges: 1599989
----------------------------------------------------------------------------------------------------
RETWEETS
Number of nodes: 625893
Number of edges: 1066722
----------------------------------------------------------------------------------------------------
MENTIONS
Number of nodes: 273823
Number of edges: 552618
----------------------------------------------------------------------------------------------------


#### Save

In [12]:
# The output directory gets its own name. Rebinding `fpath` here would leave
# the input path pointing at the graphs folder, so re-running the load cell
# afterwards would look for the edge list in the wrong place.
graphs_dir = '../../tweet_data/graphs/'
fname = 'full_networks_directed-{}.pickle'.format(date)
fname_bounded = 'lc_networks_directed-{}.pickle'.format(date)

# Each pickle holds the list of three layers, in the same order as `attrs`.
pd.to_pickle(GX, graphs_dir + fname)
pd.to_pickle(GX_bounded, graphs_dir + fname_bounded)