## Higgs dataset preprocessing

From the paper:

[M. De Domenico, A. Lima, P. Mougel and M. Musolesi. The Anatomy of a Scientific Rumor. (Nature Open Access) Scientific Reports 3, 2980 (2013).](http://www.nature.com/srep/2013/131018/srep02980/full/srep02980.html)



In [1]:
import os
import time
import numpy as np
import scipy as sp
import pandas as pd
import networkx as nx
import matplotlib as mpl
import matplotlib.pyplot as plt
import datetime as dt


# Customize plot colors for dark backgrounds
%matplotlib inline
mpl.rcParams['axes.edgecolor'] = 'grey'
mpl.rcParams['grid.color'] = '#66CCCC'
mpl.rcParams['text.color'] = '#0EBFE9'
mpl.rcParams['xtick.color'] = '#66CCCC'
mpl.rcParams['ytick.color'] = '#66CCCC'
mpl.rcParams['axes.labelcolor'] = '#0EBFE9'

import IPython.utils.path
DATA_DIR = os.path.join(IPython.utils.path.get_home_dir(), 'local/higgs/')
print 'Data directory:', DATA_DIR
dataset_name = 'higgs'

%load_ext autoreload
%autoreload 2

Data directory: /Users/kikohs/local/higgs/


## Parse data

### Parse activity

In [None]:
def create_graph_from_activity(activity_df, action='RT'):
    """Build a weighted directed graph from one kind of activity.

    Parameters
    ----------
    activity_df : pandas.DataFrame
        Activity log with at least the columns 'src_id', 'tgt_id' and
        'action'.
    action : str
        Value of the 'action' column to keep; all other rows are ignored.

    Returns
    -------
    networkx.DiGraph
        Graph named 'Higgs ' + action with one edge per distinct
        (src_id, tgt_id) pair. The edge attribute 'weight' counts how many
        selected rows share that pair.
    """
    g = nx.DiGraph()
    g.name = 'Higgs ' + action
    selected = activity_df[activity_df['action'] == action]

    # Walk the two id columns as plain Python lists instead of using
    # DataFrame.iterrows(), which materialises a Series for every row and is
    # very slow on large logs. Row order is unchanged, so edges are inserted
    # in the same order as before.
    sources = selected['src_id'].tolist()
    targets = selected['tgt_id'].tolist()
    for src, tgt in zip(sources, targets):
        if g.has_edge(src, tgt):
            g[src][tgt]['weight'] += 1
        else:
            g.add_edge(src, tgt, weight=1)

    return g

# Space-separated activity log without a header row; the third column
# (timestamp) becomes the index.
ACTIVITY = pd.read_csv(os.path.join(DATA_DIR, 'HiggsDiscovery_multiplex_time.txt'),
                       sep=' ', header=None, names=['src_id', 'tgt_id', 'timestamp', 'action'],
                       dtype={'src_id': np.int64, 'tgt_id': np.int64, 'timestamp': np.int64, 'action': str},
                       index_col=2)

ACTIVITY['action'] = ACTIVITY['action'].astype(str)
# The timestamps are treated as epoch seconds. Convert the integer values
# directly with unit='s' instead of scaling them to nanoseconds through a
# float multiplication (`* 1e9`), which only stays exact while the product
# fits in a float64 mantissa.
ACTIVITY.index = pd.to_datetime(ACTIVITY.index.values, unit='s')
# G = create_graph_from_activity(ACTIVITY, 'RT')
# print nx.info(G)

In [29]:
# Build the 'RE' graph from the activity log and print its summary.
G = create_graph_from_activity(ACTIVITY, 'RE')
# Single-argument print(...) gives the same output on Python 2 and Python 3.
print(nx.info(G))

Name: Higgs RE
Type: DiGraph
Number of nodes: 39940
Number of edges: 33728
Average in degree:   0.8445
Average out degree:   0.8445


### Parse retweet graph

In [3]:
start = time.time()
retweet_cache = os.path.join(DATA_DIR, 'retweet.gpickle')
if os.path.exists(retweet_cache):
    # NOTE: read_gpickle unpickles the file; only load caches you created
    # yourself, never a pickle from an untrusted source.
    RETWEET = nx.read_gpickle(retweet_cache)
else:
    # No cache yet (e.g. fresh machine): parse the edge list once and cache
    # the resulting graph for the next run.
    retweet_edges = os.path.join(DATA_DIR, 'HiggsDiscovery_RT.edges.gz')
    RETWEET = nx.read_edgelist(retweet_edges, create_using=nx.DiGraph(),
                               nodetype=int, data=(('weight', int),))
    RETWEET.name = 'Higgs RT'
    nx.write_gpickle(RETWEET, retweet_cache)
# Pre-formatted single argument: same output on Python 2 and Python 3.
print('Retweet graph loaded in: %s' % (time.time() - start))
print(nx.info(RETWEET))

Retweet graph loaded in: 9.10212087631
Name: Higgs RT
Type: DiGraph
Number of nodes: 257827
Number of edges: 334208
Average in degree:   1.2962
Average out degree:   1.2962


### Parse mention graph    

In [25]:
start = time.time()
mention_cache = os.path.join(DATA_DIR, 'mention.gpickle')
if os.path.exists(mention_cache):
    # NOTE: read_gpickle unpickles the file; only load caches you created
    # yourself, never a pickle from an untrusted source.
    MENTION = nx.read_gpickle(mention_cache)
else:
    # No cache yet (e.g. fresh machine): parse the edge list once and cache
    # the resulting graph for the next run.
    MENTION = nx.read_edgelist(os.path.join(DATA_DIR, 'HiggsDiscovery_MT.edges.gz'),
                               create_using=nx.DiGraph(), nodetype=int, data=(('weight', int),))
    MENTION.name = 'Higgs MT'
    nx.write_gpickle(MENTION, mention_cache)

# Pre-formatted single argument: same output on Python 2 and Python 3.
print('Mention graph loaded in: %s' % (time.time() - start))
print(nx.info(MENTION))

Mention graph loaded in: 7.37287902832
Name: Higgs MT
Type: DiGraph
Number of nodes: 118659
Number of edges: 156371
Average in degree:   1.3178
Average out degree:   1.3178


### Parse reply graph

In [28]:
start = time.time()
reply_cache = os.path.join(DATA_DIR, 'reply.gpickle')
if os.path.exists(reply_cache):
    # NOTE: read_gpickle unpickles the file; only load caches you created
    # yourself, never a pickle from an untrusted source.
    REPLY = nx.read_gpickle(reply_cache)
else:
    # No cache yet (e.g. fresh machine): parse the edge list once and cache
    # the resulting graph for the next run.
    REPLY = nx.read_edgelist(os.path.join(DATA_DIR, 'HiggsDiscovery_RE.edges.gz'),
                             create_using=nx.DiGraph(), nodetype=int, data=(('weight', int),))
    REPLY.name = 'Higgs RE'
    nx.write_gpickle(REPLY, reply_cache)

# Pre-formatted single argument: same output on Python 2 and Python 3.
print('Reply graph loaded in: %s' % (time.time() - start))
print(nx.info(REPLY))

Reply graph loaded in: 1.68447113037
Name: Higgs RE
Type: DiGraph
Number of nodes: 39940
Number of edges: 33728
Average in degree:   0.8445
Average out degree:   0.8445


### Parse social network (follower network)

In [2]:
start = time.time()
social_cache = os.path.join(DATA_DIR, 'social.gpickle')
if os.path.exists(social_cache):
    # Reuse the cached graph instead of re-parsing the edge list on every
    # run, in line with the retweet/mention/reply cells.
    # NOTE: read_gpickle unpickles the file; only load caches you created
    # yourself, never a pickle from an untrusted source.
    SOCIAL = nx.read_gpickle(social_cache)
else:
    # No cache yet: parse the edge list once and cache the resulting graph.
    SOCIAL = nx.read_edgelist(os.path.join(DATA_DIR, 'HiggsDiscovery_social.edges.gz'),
                              create_using=nx.DiGraph(), nodetype=int)
    SOCIAL.name = 'Higgs SOCIAL'
    nx.write_gpickle(SOCIAL, social_cache)

# Pre-formatted single argument: same output on Python 2 and Python 3.
print('Social graph loaded in: %s' % (time.time() - start))
print(nx.info(SOCIAL))

Social graph loaded in: 334.526389122
Name: Higgs SOCIAL
Type: DiGraph
Number of nodes: 456626
Number of edges: 14855842
Average in degree:  32.5339
Average out degree:  32.5339


## Analysis

### Overlap between retweet and social network

In [4]:
def overlap_graph(g1, g2):
    """Measure how many edges of ``g1`` are also edges of ``g2``.

    Parameters
    ----------
    g1, g2 : graph objects
        Must provide ``name``, ``has_edge(u, v)``, ``number_of_edges()`` and
        ``edges()`` (``edges_iter()`` is used instead when available).

    Returns
    -------
    (float, int)
        Percentage of ``g1`` edges (u, v) for which ``g2.has_edge(u, v)`` is
        true, and the number of such edges. The percentage is 0.0 when ``g1``
        has no edges. Both values are also printed.
    """
    # networkx 1.x offers the lazy edges_iter(); later versions dropped it
    # and made edges() lazy instead.
    if hasattr(g1, 'edges_iter'):
        edge_pairs = g1.edges_iter()
    else:
        edge_pairs = g1.edges()
    common_edges = sum(1 for u, v in edge_pairs if g2.has_edge(u, v))

    total_edges = g1.number_of_edges()
    if total_edges:
        res = common_edges * 100 / float(total_edges)
    else:
        # An empty g1 would otherwise raise ZeroDivisionError.
        res = 0.0

    # Pre-formatted single arguments: same output on Python 2 and Python 3.
    print('Percentage of overlap ( %s , %s ): %s' % (g1.name, g2.name, res))
    print('Number of common edges: %s' % common_edges)
    return res, common_edges

# overlap_graph(REPLY, SOCIAL)
# Percentage (and count) of RETWEET edges (u, v) that also exist as (u, v) in
# SOCIAL; the returned tuple is displayed as the cell output.
overlap_graph(RETWEET, SOCIAL)
# overlap_graph(MENTION, SOCIAL)

Percentage of overlap ( Higgs RT , Higgs SOCIAL ): 59.1912820758
Number of common edges: 197822


(59.191282075833016, 197822)