In [197]:
from datetime import datetime
import pandas as pd
import numpy as np
import networkx as nx
from tqdm import tqdm
from pprint import pprint
import pickle

In [61]:
def date_parser(date):
    return datetime.fromtimestamp(date).strftime('%Y-%m-%d')

In [137]:
df_a2q = pd.read_csv('data/sx-stackoverflow-a2q.txt', sep=" ", header=None)
df_c2a = pd.read_csv('data/sx-stackoverflow-c2a.txt', sep=" ", header=None)
df_c2q = pd.read_csv('data/sx-stackoverflow-c2q.txt', sep=" ", header=None)

In [138]:
df_a2q.columns = ['u', 'v', 't_a2q']
df_c2a.columns = ['u', 'v', 't_c2a']
df_c2q.columns = ['u', 'v', 't_c2q']

In [4]:
df_a2q.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17823525 entries, 0 to 17823524
Data columns (total 3 columns):
 #   Column  Dtype
---  ------  -----
 0   u       int64
 1   v       int64
 2   t_a2q   int64
dtypes: int64(3)
memory usage: 407.9 MB


In [5]:
df_c2a.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25405374 entries, 0 to 25405373
Data columns (total 3 columns):
 #   Column  Dtype
---  ------  -----
 0   u       int64
 1   v       int64
 2   t_c2a   int64
dtypes: int64(3)
memory usage: 581.5 MB


In [6]:
df_c2q.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20268151 entries, 0 to 20268150
Data columns (total 3 columns):
 #   Column  Dtype
---  ------  -----
 0   u       int64
 1   v       int64
 2   t_c2q   int64
dtypes: int64(3)
memory usage: 463.9 MB


In [63]:
min1, max1 = min(df_a2q.t_a2q), max(df_a2q.t_a2q)
min2, max2 = min(df_c2a.t_c2a), max(df_c2a.t_c2a)
min3, max3 = min(df_c2q.t_c2q), max(df_c2q.t_c2q)

In [64]:
print('from', date_parser(min1), 'to', date_parser(max1))
print('from', date_parser(min2), 'to', date_parser(max2))
print('from', date_parser(min3), 'to', date_parser(max3))

from 2008-08-01 to 2016-03-06
from 2008-08-02 to 2016-03-06
from 2008-08-02 to 2016-03-06


We are going to pick dates from 2014-09-01 to 2016-03-06 to construnct the graph.

In [139]:
def drop_rows_int(df):
    start = '2014-09-01'
    index = []
    for i, row in tqdm(df.iterrows()):
        if start > date_parser(row[2]):
            index.append(i)

    return index

In [140]:
df_a2q = df_a2q.drop(drop_rows_int(df_a2q))
df_c2a = df_c2a.drop(drop_rows_int(df_c2a))
df_c2q = df_c2q.drop(drop_rows_int(df_c2q))

17823525it [07:00, 42410.43it/s]
25405374it [09:48, 43165.35it/s]
20268151it [07:47, 43348.09it/s]


In [180]:
df_a2q.info()
df_c2a.info()
df_c2q.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4740682 entries, 13082828 to 17823524
Data columns (total 3 columns):
 #   Column  Dtype
---  ------  -----
 0   u       int64
 1   v       int64
 2   t_a2q   int64
dtypes: int64(3)
memory usage: 144.7 MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 7095961 entries, 18309232 to 25405373
Data columns (total 3 columns):
 #   Column  Dtype
---  ------  -----
 0   u       int64
 1   v       int64
 2   t_c2a   int64
dtypes: int64(3)
memory usage: 216.6 MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 7515487 entries, 12750641 to 20268150
Data columns (total 3 columns):
 #   Column  Dtype
---  ------  -----
 0   u       int64
 1   v       int64
 2   t_c2q   int64
dtypes: int64(3)
memory usage: 229.4 MB


Saving the new dataframes into txt files, because in this way we can just upload them without "cleaning" the original datasets.

In [145]:
df_a2q.to_csv('data/df_a2q.txt', header=None, index=None, sep=' ', mode='w')
df_c2a.to_csv('data/df_c2a.txt', header=None, index=None, sep=' ', mode='w')
df_c2q.to_csv('data/df_c2q.txt', header=None, index=None, sep=' ', mode='w')


The intention is to create

In [None]:
df_a2q = pd.read_csv('data/df_a2q.txt', sep=" ", header=None)
df_c2a = pd.read_csv('data/df_c2a.txt', sep=" ", header=None)
df_c2q = pd.read_csv('data/df_c2q.txt', sep=" ", header=None)

In [200]:
def create_graph(G, df, type, self_loops=True):
    for _, row in tqdm(df.iterrows()):
        if G.has_edge(row[0], row[1]) and ((row[0] != row[1]) or self_loops):
            if G.has_edge(row[0], row[1], key=type):
                G[row[0]][row[1]][type]['time_list'].append(date_parser(row[2]))
                G[row[0]][row[1]][type]['weight'] += 1
            else:
                G.add_edge(row[0], row[1], key=type, weight=1, time_list=[date_parser(row[2])])
        else:
            G.add_edge(row[0], row[1], key=type, weight=1, time_list=[date_parser(row[2])])

In [184]:
G = nx.MultiDiGraph()
create_graph(G, df_a2q, 'a2q')

4740682it [02:52, 27540.64it/s]


This is to visualize how the graph is made of just for a single node only after it was added just one dataset

In [194]:
node_prova = list(nx.to_dict_of_dicts(G).keys())[27]
print(node_prova)
pprint(nx.to_dict_of_dicts(G)[node_prova])

3994425
{39656: {'a2q': {'time_list': ['2015-01-06'], 'weight': 1}},
 80002: {'a2q': {'time_list': ['2014-09-04'], 'weight': 1}},
 124201: {'a2q': {'time_list': ['2014-09-03'], 'weight': 1}},
 145173: {'a2q': {'time_list': ['2015-09-18'], 'weight': 1}},
 181771: {'a2q': {'time_list': ['2014-12-16'], 'weight': 1}},
 226927: {'a2q': {'time_list': ['2014-09-05'], 'weight': 1}},
 249991: {'a2q': {'time_list': ['2014-09-03'], 'weight': 1}},
 416631: {'a2q': {'time_list': ['2014-09-02'], 'weight': 1}},
 427155: {'a2q': {'time_list': ['2014-11-08'], 'weight': 1}},
 436493: {'a2q': {'time_list': ['2014-12-11'], 'weight': 1}},
 589119: {'a2q': {'time_list': ['2015-09-17'], 'weight': 1}},
 638443: {'a2q': {'time_list': ['2014-09-03'], 'weight': 1}},
 659570: {'a2q': {'time_list': ['2014-09-03'], 'weight': 1}},
 698971: {'a2q': {'time_list': ['2014-09-02'], 'weight': 1}},
 950413: {'a2q': {'time_list': ['2014-09-01'], 'weight': 1}},
 969241: {'a2q': {'time_list': ['2014-09-03'], 'weight': 1}},
 1

In [195]:
create_graph(G, df_c2a, 'c2a')
create_graph(G, df_c2q, 'c2q')

7095961it [04:22, 27052.78it/s]
7515487it [04:30, 27793.97it/s]


This are the outer connection of the node `node_prova` of before, after the addition of the other two dataset to the initial graph.

In [196]:
pprint(nx.to_dict_of_dicts(G)[node_prova])

{1228: {'c2a': {'time_list': ['2014-10-10'], 'weight': 1}},
 39656: {'a2q': {'time_list': ['2015-01-06'], 'weight': 1}},
 80002: {'a2q': {'time_list': ['2014-09-04'], 'weight': 1}},
 124201: {'a2q': {'time_list': ['2014-09-03'], 'weight': 1}},
 145173: {'a2q': {'time_list': ['2015-09-18'], 'weight': 1},
          'c2q': {'time_list': ['2015-09-18', '2015-09-18'], 'weight': 2}},
 178757: {'c2q': {'time_list': ['2016-02-17'], 'weight': 1}},
 181771: {'a2q': {'time_list': ['2014-12-16'], 'weight': 1}},
 197913: {'c2a': {'time_list': ['2014-09-03'], 'weight': 1}},
 209406: {'c2a': {'time_list': ['2014-11-04', '2014-11-04'], 'weight': 2}},
 226927: {'a2q': {'time_list': ['2014-09-05'], 'weight': 1}},
 249991: {'a2q': {'time_list': ['2014-09-03'], 'weight': 1}},
 369072: {'c2a': {'time_list': ['2016-02-21', '2016-02-22'], 'weight': 2}},
 416631: {'a2q': {'time_list': ['2014-09-02'], 'weight': 1}},
 427155: {'a2q': {'time_list': ['2014-11-08'], 'weight': 1}},
 436493: {'a2q': {'time_list': ['

Saving the graph $G$ with self loops on a pickle file, to load everytime needed.

In [198]:
with open('data/graph_with_loops.pkl', 'wb') as f:
    pickle.dump(G, f)

Before we created the graph with self loops of a node, but it could be useful to have also the graph without the self loops, so now we are going to create this kind of graph, using exactly the same function `create_graph`, but changing the parameter `self_loops=False`. Then, we are going to save also this graph.

In [201]:
G_loops = nx.MultiDiGraph()
create_graph(G_loops, df_a2q, 'a2q', self_loops=False)
create_graph(G_loops, df_c2a, 'c2a', self_loops=False)
create_graph(G_loops, df_c2q, 'c2q', self_loops=False)

4740682it [02:56, 26834.45it/s]
7095961it [04:16, 27642.53it/s]
7515487it [04:34, 27331.83it/s]


In [203]:
with open('data/graph_without_loops.pkl', 'wb') as f:
    pickle.dump(G_loops, f)

## **HOW TO**

**read the graphs:**

In [None]:
with open('data/graph_with_loops.pkl', 'rb') as f:
    G = pickle.load(f)

with open('data/graph_without_loops.pkl', 'rb') as f:
    G_loops = pickle.load(f)