# Network component analysis

In [44]:
import pandas as pd
from pathlib import Path
import networkx as nx
from networkx.algorithms import community
from tqdm import tqdm
import copy
from time import perf_counter
import matplotlib.pyplot as plt
import seaborn as sns

## Load the data (sampled) and create NetworkX graphs

In [2]:
edgelists = {}
nx_graphs = {}

for file in Path("edgelistsSample").iterdir():
    # if file.stem == "edgelists_any_sample_w_data": continue
    tmp_edgelist = pd.read_csv(file)
    edgelists[file.stem] = tmp_edgelist

    tmp_nx_graph = nx.from_pandas_edgelist(tmp_edgelist, source='Source', target='Target', create_using=nx.Graph())
    # Ensure that graphs have a 'weight' edge attribute
    nx.set_edge_attributes(tmp_nx_graph, 1, 'weight')
    nx_graphs[file.stem] = tmp_nx_graph

## Our Louvain implementation

In [3]:
# Louvain implementation
def louvain(G, npassage):
    # Will contain the graph and the communities after each passage
    data = {}
    for i in range(0, npassage):
        print(f"Passage {i+1}", flush=True)
        G, communities, get_community = louvain_step(G, i==0)
        data[i] = get_community
        print(f"There are {len(communities)} communities after passage {i+1}", flush=True)
    return data


def louvain_step(G, is_first_passage):
    # Step 1: Initialization, start with each node being a single community
    communities = {idx: set([node]) for idx, node in enumerate(G.nodes)}
    # To get direct access to the community (it speeds up a bit the algorithm)
    get_community = {node: idx for idx, node in enumerate(G.nodes)}
    # Used in the modularity computation
    neighbors_sets = {node: set(G.neighbors(node)) for node in G.nodes}
    m = len(G.edges)

    # Sum the weights of the incident edges for all nodes inside a community, for all communities
    # Separate first passage and other ones to speed up the algorithm
    if is_first_passage:
        sum_communities = {idx: sum(dict(G.degree(community)).values()) for idx, community in communities.items()}
    else:
        sum_communities = {idx: sum(dict(G.degree(community, 'weight')).values()) for idx, community in communities.items()}

    prev_communities = {}
    i = 0
    # While there are changes in the communities, make another iteration
    while(communities != prev_communities):
        print(f"Iteration {i+1}", flush=True)
        # Deepcopy to have a new object with a new reference
        # Otherwise comparison in while is always True
        prev_communities = copy.deepcopy(communities)
        
        for node in tqdm(G.nodes):
            # Step 2: Remove node from its community
            neighboring_communities = get_neighboring_communities(G, node, get_community)
            if neighboring_communities == set():
                continue
            belong_to = get_community[node]
            communities[belong_to].remove(node)
            sum_communities[belong_to] -= G.degree(node, 'weight')
            if communities[belong_to] == set():
                del communities[belong_to]
                del sum_communities[belong_to]

            # Step 3: Insert the node in the community that maximizes the modularity
            scores = [
                (neighbor_community, modularity_gain(G, node, communities[neighbor_community], sum_communities[neighbor_community], neighbors_sets[node], m, is_first_passage))
                for neighbor_community in neighboring_communities
            ]
            best_community, best_score = max(scores, key=lambda x: x[1])
            communities[best_community].add(node)
            get_community[node] = best_community
            sum_communities[best_community] += G.degree(node, 'weight')
          
        i += 1

    # Step 4: Update the graph
    if is_first_passage:
        return get_new_graph(G, communities, sum_communities, get_community), communities, get_community
    return get_new_graph(G, communities, sum_communities, get_community), communities, get_community


def get_neighboring_communities(G, node, get_community):
    # Use a set to make sure a community only appear once
    neighboring_communities = set()
    for neighbor in G.neighbors(node):
        if neighbor == node: continue
        neighboring_communities.add(get_community[neighbor])
    return neighboring_communities


def modularity_gain(G, node, community, sum_community, neighbor_set, m, is_first_passage):
    # Separate first passage and other ones to speed up the algorithm
    if is_first_passage:
        # Sum the weights of the edges from node into community nodes
        # Using sets allow to use intersection()
        sum_weights_node = len(neighbor_set.intersection(community))
        right_member = (sum_community * G.degree[node]) / (2 * (m**2))
    else:
        # Sum the weights of the edges from node into community nodes
        sum_weights_node = sum([G.get_edge_data(node, member)['weight'] for member in G.neighbors(node) if member in community])
        right_member = (sum_community * G.degree(node, 'weight')) / (2 * (m**2))
    # Compute modularity
    left_member = sum_weights_node / (2 * m)
    return left_member - right_member


def get_new_graph(old_G, communities, sum_communities, get_community):
    print("Constructing new graph", flush=True)
    G = nx.Graph()
    G.add_nodes_from(communities.keys())
    for community in communities:
        G.add_edge(community, community, weight=sum_communities[community])

    for source, dest, weight_dict in old_G.edges(data=True):
        community1 = get_community[source]
        community2 = get_community[dest]
        # If edge not exist, weight initialized at 0
        current_weight = G.get_edge_data(community1, community2, {'weight': 0})['weight']
        new_weight = current_weight + weight_dict['weight']
        G.add_edge(community1, community2, weight=new_weight)
    return G

## Run our Louvain on the graphs

In [7]:
results = {}
for name, G in nx_graphs.items():
    print(f"Louvain on {name}")
    data = louvain(G, 3)
    results[name] = data
    print(f"\n")

Louvain on edgelists_any_sample
Passage 1
Iteration 1


100%|██████████| 221327/221327 [00:03<00:00, 69951.68it/s]

Iteration 2



100%|██████████| 221327/221327 [00:02<00:00, 81280.18it/s]

Iteration 3



100%|██████████| 221327/221327 [00:02<00:00, 81850.99it/s]

Constructing new graph





There are 99 communities after passage 1
Passage 2
Iteration 1


100%|██████████| 99/99 [00:00<00:00, 10961.88it/s]

Iteration 2



100%|██████████| 99/99 [00:00<00:00, 12329.96it/s]

Iteration 3



100%|██████████| 99/99 [00:00<00:00, 12371.10it/s]

Iteration 4



100%|██████████| 99/99 [00:00<00:00, 14093.48it/s]

Iteration 5



100%|██████████| 99/99 [00:00<00:00, 14088.22it/s]

Iteration 6



100%|██████████| 99/99 [00:00<00:00, 16426.13it/s]

Iteration 7



100%|██████████| 99/99 [00:00<00:00, 16541.95it/s]

Iteration 8



100%|██████████| 99/99 [00:00<00:00, 19798.60it/s]

Iteration 9



100%|██████████| 99/99 [00:00<00:00, 24741.47it/s]

Iteration 10



100%|██████████| 99/99 [00:00<00:00, 24760.65it/s]

Iteration 11



100%|██████████| 99/99 [00:00<00:00, 33015.51it/s]

Iteration 12



100%|██████████| 99/99 [00:00<00:00, 24753.27it/s]

Iteration 13



100%|██████████| 99/99 [00:00<00:00, 24726.74it/s]

Iteration 14



100%|██████████| 99/99 [00:00<00:00, 33010.26it/s]

Iteration 15



100%|██████████| 99/99 [00:00<00:00, 24753.27it/s]

Constructing new graph
There are 17 communities after passage 2
Passage 3
Iteration 1



100%|██████████| 17/17 [00:00<?, ?it/s]

Iteration 2



100%|██████████| 17/17 [00:00<?, ?it/s]

Constructing new graph
There are 15 communities after passage 3


Louvain on edgelists_liking_sample
Passage 1





Iteration 1


100%|██████████| 199158/199158 [00:02<00:00, 67488.34it/s]

Iteration 2



100%|██████████| 199158/199158 [00:02<00:00, 81155.74it/s]

Iteration 3



100%|██████████| 199158/199158 [00:02<00:00, 82832.45it/s]

Iteration 4



100%|██████████| 199158/199158 [00:02<00:00, 83258.99it/s]

Iteration 5



100%|██████████| 199158/199158 [00:02<00:00, 81990.80it/s]

Constructing new graph





There are 99 communities after passage 1
Passage 2
Iteration 1


100%|██████████| 99/99 [00:00<00:00, 10966.80it/s]

Iteration 2



100%|██████████| 99/99 [00:00<00:00, 12376.63it/s]

Iteration 3



100%|██████████| 99/99 [00:00<00:00, 12375.16it/s]

Iteration 4



100%|██████████| 99/99 [00:00<00:00, 14145.81it/s]

Iteration 5



100%|██████████| 99/99 [00:00<00:00, 12330.33it/s]

Iteration 6



100%|██████████| 99/99 [00:00<00:00, 12386.23it/s]

Iteration 7



100%|██████████| 99/99 [00:00<00:00, 14213.60it/s]

Iteration 8



100%|██████████| 99/99 [00:00<00:00, 14142.44it/s]

Iteration 9



100%|██████████| 99/99 [00:00<00:00, 9873.88it/s]

Iteration 10



100%|██████████| 99/99 [00:00<00:00, 14146.77it/s]

Iteration 11



100%|██████████| 99/99 [00:00<00:00, 12377.37it/s]

Iteration 12



100%|██████████| 99/99 [00:00<00:00, 14143.40it/s]

Iteration 13



100%|██████████| 99/99 [00:00<00:00, 14142.92it/s]

Iteration 14



100%|██████████| 99/99 [00:00<00:00, 14146.77it/s]

Iteration 15



100%|██████████| 99/99 [00:00<00:00, 12378.11it/s]

Iteration 16



100%|██████████| 99/99 [00:00<00:00, 14147.74it/s]

Iteration 17



100%|██████████| 99/99 [00:00<00:00, 12376.27it/s]

Iteration 18



100%|██████████| 99/99 [00:00<00:00, 12373.68it/s]

Iteration 19



100%|██████████| 99/99 [00:00<00:00, 12375.90it/s]

Iteration 20



100%|██████████| 99/99 [00:00<00:00, 14146.29it/s]

Iteration 21



100%|██████████| 99/99 [00:00<00:00, 12377.74it/s]

Iteration 22



100%|██████████| 99/99 [00:00<00:00, 12378.85it/s]

Iteration 23



100%|██████████| 99/99 [00:00<00:00, 14145.33it/s]

Iteration 24



100%|██████████| 99/99 [00:00<00:00, 12374.42it/s]

Iteration 25



100%|██████████| 99/99 [00:00<00:00, 12374.42it/s]

Iteration 26



100%|██████████| 99/99 [00:00<00:00, 14145.81it/s]

Iteration 27



100%|██████████| 99/99 [00:00<00:00, 14145.33it/s]

Iteration 28



100%|██████████| 99/99 [00:00<00:00, 12372.58it/s]

Iteration 29



100%|██████████| 99/99 [00:00<00:00, 12377.00it/s]

Iteration 30



100%|██████████| 99/99 [00:00<00:00, 14143.40it/s]

Iteration 31



100%|██████████| 99/99 [00:00<00:00, 14146.77it/s]

Iteration 32



100%|██████████| 99/99 [00:00<00:00, 12378.85it/s]

Iteration 33



100%|██████████| 99/99 [00:00<00:00, 14143.40it/s]

Iteration 34



100%|██████████| 99/99 [00:00<00:00, 12376.27it/s]

Iteration 35



100%|██████████| 99/99 [00:00<00:00, 11000.80it/s]

Iteration 36



100%|██████████| 99/99 [00:00<00:00, 14144.85it/s]

Iteration 37



100%|██████████| 99/99 [00:00<00:00, 12379.22it/s]

Iteration 38



100%|██████████| 99/99 [00:00<00:00, 14144.36it/s]

Iteration 39



100%|██████████| 99/99 [00:00<00:00, 12377.00it/s]

Iteration 40



100%|██████████| 99/99 [00:00<00:00, 12373.68it/s]

Iteration 41



100%|██████████| 99/99 [00:00<00:00, 14143.88it/s]

Iteration 42



100%|██████████| 99/99 [00:00<00:00, 14147.26it/s]

Iteration 43



100%|██████████| 99/99 [00:00<00:00, 14144.36it/s]

Iteration 44



100%|██████████| 99/99 [00:00<00:00, 14144.36it/s]

Iteration 45



100%|██████████| 99/99 [00:00<00:00, 14145.81it/s]

Iteration 46



100%|██████████| 99/99 [00:00<00:00, 12377.00it/s]

Iteration 47



100%|██████████| 99/99 [00:00<00:00, 14145.33it/s]

Iteration 48



100%|██████████| 99/99 [00:00<00:00, 12378.85it/s]

Iteration 49



100%|██████████| 99/99 [00:00<00:00, 14144.36it/s]

Iteration 50



100%|██████████| 99/99 [00:00<00:00, 14146.29it/s]

Iteration 51



100%|██████████| 99/99 [00:00<00:00, 10998.17it/s]

Iteration 52



100%|██████████| 99/99 [00:00<00:00, 14144.85it/s]

Iteration 53



100%|██████████| 99/99 [00:00<00:00, 14151.59it/s]

Iteration 54



100%|██████████| 99/99 [00:00<00:00, 12373.68it/s]

Iteration 55



100%|██████████| 99/99 [00:00<00:00, 12375.16it/s]

Iteration 56



100%|██████████| 99/99 [00:00<00:00, 14144.85it/s]

Iteration 57



100%|██████████| 99/99 [00:00<00:00, 14145.33it/s]

Iteration 58



100%|██████████| 99/99 [00:00<00:00, 14145.33it/s]


Iteration 59


100%|██████████| 99/99 [00:00<00:00, 14144.85it/s]

Iteration 60



100%|██████████| 99/99 [00:00<00:00, 14146.29it/s]

Iteration 61



100%|██████████| 99/99 [00:00<00:00, 12376.27it/s]

Iteration 62



100%|██████████| 99/99 [00:00<00:00, 14148.70it/s]


Constructing new graph
There are 37 communities after passage 2
Passage 3
Iteration 1


100%|██████████| 37/37 [00:00<00:00, 12336.19it/s]

Iteration 2



100%|██████████| 37/37 [00:00<00:00, 18512.38it/s]

Iteration 3



100%|██████████| 37/37 [00:00<00:00, 18525.64it/s]

Iteration 4



100%|██████████| 37/37 [00:00<00:00, 18503.55it/s]

Iteration 5



100%|██████████| 37/37 [00:00<00:00, 37020.34it/s]

Iteration 6



100%|██████████| 37/37 [00:00<00:00, 37011.51it/s]

Constructing new graph
There are 18 communities after passage 3


Louvain on edgelists_retweeters_sample
Passage 1





Iteration 1


100%|██████████| 56088/56088 [00:00<00:00, 71543.24it/s]

Iteration 2



100%|██████████| 56088/56088 [00:00<00:00, 84470.69it/s]

Iteration 3



100%|██████████| 56088/56088 [00:00<00:00, 84339.14it/s]

Constructing new graph





There are 99 communities after passage 1
Passage 2
Iteration 1


100%|██████████| 99/99 [00:00<00:00, 19672.91it/s]

Iteration 2



100%|██████████| 99/99 [00:00<00:00, 24750.32it/s]

Iteration 3



100%|██████████| 99/99 [00:00<00:00, 33033.90it/s]

Iteration 4



100%|██████████| 99/99 [00:00<00:00, 24760.65it/s]

Iteration 5



100%|██████████| 99/99 [00:00<00:00, 24750.32it/s]

Iteration 6



100%|██████████| 99/99 [00:00<00:00, 32706.06it/s]

Iteration 7



100%|██████████| 99/99 [00:00<00:00, 32905.63it/s]

Iteration 8



100%|██████████| 99/99 [00:00<00:00, 32929.11it/s]

Iteration 9



100%|██████████| 99/99 [00:00<00:00, 33057.57it/s]

Iteration 10



100%|██████████| 99/99 [00:00<00:00, 24742.94it/s]

Iteration 11



100%|██████████| 99/99 [00:00<00:00, 24561.46it/s]

Iteration 12



100%|██████████| 99/99 [00:00<00:00, 33018.14it/s]

Iteration 13



100%|██████████| 99/99 [00:00<00:00, 32724.10it/s]

Constructing new graph
There are 37 communities after passage 2
Passage 3
Iteration 1



100%|██████████| 37/37 [00:00<?, ?it/s]

Iteration 2



100%|██████████| 37/37 [00:00<?, ?it/s]

Iteration 3



100%|██████████| 37/37 [00:00<00:00, 36967.42it/s]

Constructing new graph
There are 33 communities after passage 3







## Export and save label communities of each node (for Gephi)

In [10]:
Path.mkdir(Path("analysis"), exist_ok=True)

# Get the label communities for the nodes for each passage
for name, result in results.items():
    intermediate_results = []

    for i in range(len(result)-1, -1, -1):
        if i == 0:
            break
        curr = result[i]
        prev = result[i-1]
        intermediate_result =  {node: curr[val] for node, val in prev.items()}
        intermediate_results.append(intermediate_result)
    
    try:
        commu = intermediate_results[-1]
    # If we run only for one passage
    except IndexError:
        commu = result[0]

    communities_df = pd.DataFrame(commu.items(), columns=['Id', 'Community'])
    communities_df["Label"] = communities_df["Id"].map(lambda x: x)
    communities_df["Is_author"] = False

    communities_df.loc[communities_df['Id'].isin(edgelists[name]['Target'].unique()), "Is_author"] = True

    communities_df.to_csv(f"analysis/communities_{name}.csv", index=False)

## Comparing the 3 results

In [18]:
df_result_any = pd.read_csv("analysis/communities_edgelists_any_sample.csv")
df_result_like = pd.read_csv("analysis/communities_edgelists_liking_sample.csv")
df_result_retweet = pd.read_csv("analysis/communities_edgelists_retweeters_sample.csv")

In [19]:
n_any = len(df_result_any['Id'].unique())
n_retweet = len(df_result_retweet['Id'].unique())
n_like = len(df_result_like['Id'].unique())

print(f"Number of nodes in any: {n_any}")
print(f"Number of nodes in retweet: {n_retweet}")
print(f"Number of nodes in like: {n_like}")


Number of nodes in any: 221327
Number of nodes in retweet: 56088
Number of nodes in like: 199158


In [21]:
# number of unique communities in df_result_any
n_unique_any = len(df_result_any['Community'].unique())
n_unique_retweet = len(df_result_retweet['Community'].unique())
n_unique_like = len(df_result_like['Community'].unique())

# print the number of unique communities
print(f"Number of unique communities in any: {n_unique_any}")
print(f"Number of unique communities in retweet: {n_unique_retweet}")
print(f"Number of unique communities in like: {n_unique_like}")

Number of unique communities in any: 17
Number of unique communities in retweet: 37
Number of unique communities in like: 37


In [31]:
# intersection of the IDs between like and retweet
intersection_ids = set(df_result_like['Id'].unique()).intersection(set(df_result_retweet['Id'].unique()))

# print the number of nodes in the intersection
print(f"Number of nodes in the intersection between like and retweet: {len(intersection_ids)}")

Number of nodes in the intersection between like and retweet: 33919


In [38]:
# not same label anyway => maybe check only to see if the same authors end up together

# compare like and retweet communities
df_result_like_retweet = df_result_like.merge(df_result_retweet, on='Id', how='inner')

# check if community_x is the same as community_y
df_result_like_retweet['Is_same_community'] = df_result_like_retweet.apply(lambda x: x['Community_x'] == x['Community_y'], axis=1)

df_result_like_retweet

Unnamed: 0,Id,Community_x,Label_x,Is_author_x,Community_y,Label_y,Is_author_y,Is_same_community
0,300625980,30,300625980,False,30,300625980,False,True
1,1152348175761465344,30,1152348175761465344,True,30,1152348175761465344,True,True
2,1303264019831234560,30,1303264019831234560,False,30,1303264019831234560,False,True
3,1109390623411499008,30,1109390623411499008,False,88,1109390623411499008,False,False
4,139761674,30,139761674,False,30,139761674,False,True
...,...,...,...,...,...,...,...,...
33914,1341075013735202822,36,1341075013735202822,False,98,1341075013735202822,False,False
33915,790351541135867904,36,790351541135867904,False,98,790351541135867904,False,False
33916,1064414690,36,1064414690,False,98,1064414690,False,False
33917,1347713012199788550,36,1347713012199788550,False,98,1347713012199788550,False,False


In [61]:
# df_result_like only keep is_author = True
df_result_like_only_authors = df_result_like[df_result_like['Is_author'] == True]
# df_result_retweet only keep is_author = True
df_result_retweet_only_authors = df_result_retweet[df_result_retweet['Is_author'] == True]

# number of unique communities in df_result_like_only_authors
n_unique_like_only_authors = len(df_result_like_only_authors['Id'].unique())
# number of unique communities in df_result_retweet_only_authors
n_unique_retweet_only_authors = len(df_result_retweet_only_authors['Id'].unique())

# print the number of unique authors 
print(f"Number of unique authors in like only authors: {n_unique_like_only_authors}")
print(f"Number of unique authors in retweet only authors: {n_unique_retweet_only_authors}")

Number of unique authors in like only authors: 100
Number of unique authors in retweet only authors: 100


## Comparison between our implementation and the one from NetworkX

In [6]:
results = {}
for name, G in nx_graphs.items():
    print(f"Graph of {len(G.nodes)} nodes")
    
    start_net = perf_counter()
    louvain_nx = community.louvain_partitions(G)
    first = next(louvain_nx)
    stop_net = perf_counter()
    print(f"After first passage: {len(first)} communities")
    # second = next(louvain_nx)
    # print(f"After second passage: {len(second)} communities")
    # third = next(louvain_nx)
    # print(f"After third passage: {len(third)} communities")
    print(f"Time for NetworkX: {stop_net - start_net}")
    
    start_our = perf_counter()
    print(f"Louvain on {name}")
    data = louvain(G, 1)
    stop_our = perf_counter()
    print(f"Time for our implementation: {stop_our - start_our}")
    results[name] = data

Graph of 221327 nodes
After first passage: 100 communities
Time for NetworkX: 7.501972300000006
Louvain on edgelists_any_sample
Passage 1
Iteration 1


100%|██████████| 221327/221327 [00:03<00:00, 69617.02it/s]

Iteration 2



100%|██████████| 221327/221327 [00:02<00:00, 82769.90it/s]

Iteration 3



100%|██████████| 221327/221327 [00:02<00:00, 78512.72it/s]

Constructing new graph





There are 99 communities after passage 1
Time for our implementation: 13.7340528
Graph of 199158 nodes
After first passage: 100 communities
Time for NetworkX: 8.056387700000016
Louvain on edgelists_liking_sample
Passage 1
Iteration 1


100%|██████████| 199158/199158 [00:02<00:00, 67741.55it/s]

Iteration 2



100%|██████████| 199158/199158 [00:02<00:00, 80143.16it/s]

Iteration 3



100%|██████████| 199158/199158 [00:02<00:00, 81089.56it/s]

Iteration 4



100%|██████████| 199158/199158 [00:02<00:00, 82741.20it/s]

Iteration 5



100%|██████████| 199158/199158 [00:02<00:00, 80500.44it/s]

Constructing new graph





There are 99 communities after passage 1
Time for our implementation: 17.570455100000004
Graph of 56088 nodes
After first passage: 100 communities
Time for NetworkX: 1.9476826000000074
Louvain on edgelists_retweeters_sample
Passage 1
Iteration 1


100%|██████████| 56088/56088 [00:00<00:00, 70997.41it/s]

Iteration 2



100%|██████████| 56088/56088 [00:00<00:00, 84176.51it/s]

Iteration 3



100%|██████████| 56088/56088 [00:00<00:00, 83964.22it/s]

Constructing new graph





There are 99 communities after passage 1
Time for our implementation: 3.2745122000000038
