# Network component analysis

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import networkx as nx
from networkx.algorithms import community
from numba import jit
from tqdm import tqdm
from itertools import tee

In [24]:
# Read csv from edgelistsSample folder
edgelists_sample_df = pd.read_csv("edgelistsSample/edgelists_any_sample.csv")
edgelists_sample_df
G = nx.from_pandas_edgelist(edgelists_sample_df, source='Source', target='Target', create_using=nx.Graph())
# Set all weights to 1
nx.set_edge_attributes(G, 1, 'weight')
print(len(G.nodes))

359703


In [25]:
louvain_nx = community.louvain_partitions(G)
first = next(louvain_nx)
print(f"After first passage: {len(first)} communities")
second = next(louvain_nx)
print(f"After second passage: {len(second)} communities")

After first passage: 43 communities
After second passage: 33 communities


In [26]:
# Louvain implementation
def louvain(G, npassage):
    # Will contain the graph and the communities after each passage
    data = {}
    for i in range(npassage):
        print(f"Passage {i}", flush=True)
        old_G, G, communities = louvain_step(G, G, i==0)
        data[i] = (old_G, communities)
        print(f"There are {len(communities)} communities after passage {i}", flush=True)
    return data


def louvain_step(G, copy, is_first_passage):
    # Step 1: Initialization, start with each node being a single community
    communities = {idx: set([node]) for idx, node in enumerate(G.nodes)}
    # To get direct access to the community (it speeds up a bit the algorithm)
    get_community = {node: idx for idx, node in enumerate(G.nodes)}
    # Used in the modularity computation
    neighbors_sets = {node: set(G.neighbors(node)) for node in G.nodes}
    m = len(G.edges)

    # Sum the weights of the incident edges for all nodes inside a community, for all communities
    # Separate first passage and other ones to speed up the algorithm
    if is_first_passage:
        sum_communities = {idx: sum(dict(G.degree(community)).values()) for idx, community in communities.items()}
    else:
        sum_communities = {idx: sum(dict(G.degree(community, 'weight')).values()) for idx, community in communities.items()}

    # Fixed number of iterations
    # TODO: change it?
    for i in range(4):
        print(f"Iteration {i}", flush=True)
        for node in tqdm(G.nodes):
            # Step 2: Remove node from its community
            # TODO: make sure what we want to do if the node has no neighboring communities
            neighboring_communities = get_neighboring_communities(G, node, get_community)
            if neighboring_communities == set():
                continue
            belong_to = get_community[node]
            communities[belong_to].remove(node)
            sum_communities[belong_to] -= G.degree(node, 'weight')
            if communities[belong_to] == set():
                del communities[belong_to]
                del sum_communities[belong_to]

            # Step 3: Insert the node in the community that maximizes the modularity
            scores = [
                (neighbor_community, modularity_gain(G, node, communities[neighbor_community], sum_communities[neighbor_community], neighbors_sets[node], m, is_first_passage))
                for neighbor_community in neighboring_communities
            ]
            best_community, best_score = max(scores, key=lambda x: x[1])
            communities[best_community].add(node)
            get_community[node] = best_community
            sum_communities[best_community] += G.degree(node, 'weight')

    return G, get_new_graph(G, communities, sum_communities, get_community), communities


def get_neighboring_communities(G, node, get_community):
    # Use a set to make sure a community only appear once
    neighboring_communities = set()
    for neighbor in G.neighbors(node):
        if neighbor == node: continue
        neighboring_communities.add(get_community[neighbor])

    return neighboring_communities


def modularity_gain(G, node, community, sum_community, neighbor_set, m, is_first_passage):
    # Separate first passage and other ones to speed up the algorithm
    if is_first_passage:
        # Sum the weights of the edges from node into community nodes
        # Using sets allow to use intersection()
        sum_weights_node = len(neighbor_set.intersection(community))
        right_member = (sum_community * G.degree[node]) / (2 * (m**2))
    else:
        # Sum the weights of the edges from node into community nodes
        sum_weights_node = sum([G.get_edge_data(node, member)['weight'] for member in G.neighbors(node) if member in community])
        right_member = (sum_community * G.degree(node, 'weight')) / (2 * (m**2))
    # Compute modularity
    left_member = sum_weights_node / (2 * m)
    return left_member - right_member


def get_new_graph(old_G, communities, sum_communities, get_community):
    print("Construct new graph", flush=True)
    G = nx.Graph()
    G.add_nodes_from(communities.keys())
    for community in communities:
        G.add_edge(community, community, weight=sum_communities[community])

    for source, dest, weight_dict in old_G.edges(data=True):
        community1 = get_community[source]
        community2 = get_community[dest]
        current_weight = G.get_edge_data(community1, community2, {'weight': 0})['weight']
        # TODO: is this correct?
        new_weight = current_weight + weight_dict['weight']
        G.add_edge(community1, community2, weight=new_weight)
    return G


data = louvain(G, 2)

Passage 0
Iteration 0


100%|███████████████████████████████████████████████████| 359703/359703 [00:03<00:00, 111285.91it/s]

Iteration 1



100%|███████████████████████████████████████████████████| 359703/359703 [00:02<00:00, 133350.81it/s]

Iteration 2



100%|███████████████████████████████████████████████████| 359703/359703 [00:02<00:00, 134921.01it/s]

Iteration 3



100%|███████████████████████████████████████████████████| 359703/359703 [00:02<00:00, 134068.58it/s]

Construct new graph





There are 43 communities after passage 0
Passage 1
Iteration 0


100%|████████████████████████████████████████████████████████████| 43/43 [00:00<00:00, 62536.43it/s]

Iteration 1



100%|████████████████████████████████████████████████████████████| 43/43 [00:00<00:00, 71711.76it/s]

Iteration 2



100%|████████████████████████████████████████████████████████████| 43/43 [00:00<00:00, 71825.99it/s]

Iteration 3



100%|████████████████████████████████████████████████████████████| 43/43 [00:00<00:00, 72635.95it/s]

Construct new graph
There are 21 communities after passage 1





In [20]:
# get last entry of data
last_entry = data[0]
networkx_graph, communities_dict = last_entry

In [21]:
communities_df = pd.DataFrame(columns=["Id","Label", "Community", "Is_author"])

# tranform communities_dict into a dataframe 
# key is the community, value is the Id
for key, value in communities_dict.items():
    for idx in value:
        communities_df = pd.concat([communities_df, pd.DataFrame({"Id":idx, "Label":idx, "Community":key, "Is_author":False}, index=[0])], ignore_index=True)

In [22]:
communities_df.loc[communities_df['Id'].isin(edgelists_sample_df['Target'].unique()), "Is_author"] = True

In [23]:
communities_df

Unnamed: 0,Id,Label,Community,Is_author
0,911968842548736000,911968842548736000,1,False
1,898353394636566529,898353394636566529,1,False
2,831170686311882753,831170686311882753,1,False
3,1479359505292951555,1479359505292951555,1,True
4,1492606363733737473,1492606363733737473,1,False
...,...,...,...,...
6287,2619827831,2619827831,3603,False
6288,2533360464,2533360464,3603,False
6289,3107281715,3107281715,3603,False
6290,50552213,50552213,3603,False


In [58]:
# save to csv, ignore index
communities_df.to_csv("communities_df.csv", index=False)

## TODO:
- Sample by taking random author_id and take all edges connected to it
- Implement from scratch community detection: Louvain (faster than Girvan)
- Look at how to store the communities (edgelist or so)
- Look at the caracteristics of the huge communities: what is the Tweet, who is the author, what are the hashtags etc...
- Make profiles of users: press, random guy, famous guy, etc...
- Look at outliers
- Load communities to Gephi to vizualise them