# Building the retweet network (part 3)

An edge list does not guarantee that the underlying graph is connected.  
It may consist of many mutually disconnected subgraphs (components).  
This notebook extracts the largest (giant) weakly connected component.

In [1]:
import pandas as pd
import networkx as nx

In [2]:
# Load the weighted retweet edge list; the first CSV column is used as the row index
edgelist_df = pd.read_csv("edgelist.csv", index_col=0)

In [3]:
# Preview the loaded edge list (columns: source, retweeter, weight)
edgelist_df

Unnamed: 0,source,retweeter,weight
0,1248341835770200064,1248341835770200064,1308
1,150725695,3613396888,379
2,150725695,841701064118284288,326
3,52424550,841701064118284288,320
4,331617619,1084450777748459520,308
...,...,...,...
1006454,1683455144,751020639587332102,1
1006455,1683455144,752255117232054276,1
1006456,1683455144,752845778,1
1006457,1683455144,756947491787010048,1


In [4]:
# Directed, weighted retweet graph: one edge source -> retweeter per row,
# carrying the row's `weight` as an edge attribute
G = nx.from_pandas_edgelist(
    edgelist_df,
    source="source",
    target="retweeter",
    edge_attr="weight",
    create_using=nx.DiGraph,
)

In [5]:
# Number of weakly connected components of G, i.e. the mutually
# disconnected subgraphs the edge list splits into
nx.number_weakly_connected_components(G)

2327

In [6]:
# Materialise every weakly connected component as its own standalone graph
# (`.copy()` turns each subgraph of G into an independent graph object)
S = list(
    G.subgraph(nodes).copy()
    for nodes in nx.weakly_connected_components(G)
)

In [7]:
# Pair each component with its (node count, edge count) and order the
# resulting tuples by node count, largest component first
wcc_rank = sorted(
    (
        (component, component.number_of_nodes(), component.number_of_edges())
        for component in S
    ),
    key=lambda entry: entry[1],
    reverse=True,
)

In [8]:
# Five largest components as (graph, number of nodes, number of edges)
wcc_rank[:5]

[(<networkx.classes.digraph.DiGraph at 0x16646c750>, 168684, 1002330),
 (<networkx.classes.digraph.DiGraph at 0x2842ea9d0>, 118, 132),
 (<networkx.classes.digraph.DiGraph at 0x2844a8490>, 114, 114),
 (<networkx.classes.digraph.DiGraph at 0x2844d5bd0>, 102, 102),
 (<networkx.classes.digraph.DiGraph at 0x2842386d0>, 46, 46)]

NOTE: most components other than the giant one seem to have exactly as many edges as nodes (the second-largest, with 118 nodes and 132 edges, is an exception). TODO: investigate this behaviour.

In [9]:
# Biggest weakly connected component (a.k.a. giant component):
# wcc_rank is sorted by node count in descending order, so it is the graph
# stored in the first tuple
component_nx = wcc_rank[0][0]

In [10]:
# Degree assortativity coefficient of the giant component (Pearson
# correlation between the degrees of the nodes at either end of an edge).
# No optional arguments are passed, so networkx defaults apply.
nx.degree_assortativity_coefficient(component_nx)

-0.118353697247374

In [11]:
# Flatten the giant component back into an edge table; an edge lacking a
# `weight` attribute is given weight 1
component_df = pd.DataFrame(
    component_nx.edges.data("weight", default=1),
    columns=("source", "target", "weight"),
)

In [12]:
# Anonymize data

In [13]:
def anonymize_data_frame(df):
    """Replace the user ids of an edge table with sequential integers.

    Fake ids are handed out in order of first appearance, scanning the whole
    ``source`` column first and the ``target`` column afterwards.

    Parameters
    ----------
    df : pandas.DataFrame
        Edge table with ``source`` and ``target`` columns holding user ids.
        It is not modified.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The lookup table (indexed by the real ``user_id``, with the fake id
        in its ``id`` column) and a copy of ``df`` whose ``source`` and
        ``target`` columns hold the fake ids.
    """
    # Distinct real ids, in order of first appearance
    user_ids = pd.concat([df['source'], df['target']]).unique()

    # real_id -> fake_id lookup table
    id_lookup = pd.DataFrame(
        {"user_id": user_ids, "id": list(range(len(user_ids)))}
    ).set_index("user_id")

    # Translate both endpoint columns on a copy of the input
    result = df.copy()
    for endpoint in ('source', 'target'):
        result[endpoint] = id_lookup.loc[df[endpoint]].values

    return id_lookup, result

In [14]:
# mapping_df: real user_id (index) -> fake id; anonymized_df: giant-component
# edges with both endpoints replaced by their fake ids
mapping_df, anonymized_df = anonymize_data_frame(component_df)

In [15]:
# Giant-component edges, still identified by the real user ids
component_df

Unnamed: 0,source,target,weight
0,1248341835770200064,1248341835770200064,1308
1,1248341835770200064,4160203479,28
2,1248341835770200064,1125440833099112448,22
3,1248341835770200064,850324301450555392,21
4,1248341835770200064,742397759991480320,20
...,...,...,...
1002325,1685861604,2355256140,1
1002326,1685861604,2673130134,1
1002327,168586183,169109426,1
1002328,168586183,534732052,1


In [16]:
# Same edges after anonymization: source/target now hold the fake integer ids
anonymized_df

Unnamed: 0,source,target,weight
0,0,0,1308
1,0,149,28
2,0,32365,22
3,0,32366,21
4,0,184,20
...,...,...,...
1002325,32363,35269,1
1002326,32363,40892,1
1002327,32364,84187,1
1002328,32364,168682,1


In [17]:
# Save the anonymized giant component as an edge list (source, target, weight)
anonymized_df.to_csv("edgelist_component.csv", index=False)
# Save the mapping for original id recovery.
# The real ids live in the `user_id` index of mapping_df, so the index has to
# be written out: with index=False only the fake `id` column reached the file
# and the original ids could not be recovered from it.
# NOTE: this file de-anonymizes the edge list above; do not share it alongside it.
mapping_df.to_csv("edgelist_user_mapping.csv", index=True)