# Network component analysis

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import networkx as nx
from networkx.algorithms import community
from numba import jit
from tqdm import tqdm
from itertools import tee

In [5]:
# read csv from edgelistsSample folder
edgelists_sample_df = pd.read_csv("edgelistsSample/edgelists_any_sample.csv")
edgelists_sample_df
G = nx.from_pandas_edgelist(edgelists_sample_df, source='Source', target='Target', create_using=nx.Graph())
print(len(G.nodes))

2778


In [6]:
louvain_nx = community.louvain_partitions(G)
first = next(louvain_nx)
print(f"After first passage: {len(first)} communities")

After first passage: 4 communities


In [8]:
# Louvain implementation
def louvain(G, npassage):
    # Will contain the graph and the communities after each passage
    data = {}
    for i in range(npassage):
        print(f"Passage {i}")
        G, communities = louvain_step(G, i==0)
        data[i] = (G, communities)
        print(f"There are {len(communities)} communities after passage {i}")
    return data


def louvain_step(G, is_first_passage):
    # Communities are stored in a dict
    # Step 1: Initialization, start with each node being a single community
    communities = {idx:[node] for idx, node in enumerate(G.nodes)}
    # To get direct access to the community (it speeds up a bit the algorithm)
    get_community = {node: idx for idx, node in enumerate(G.nodes)}
    m = len(G.edges)

    # TODO: ATM it is a fixed number of iteration, we need to find a way to
    # Stop the iterations whenever the communities are not evolving anymore
    for i in range(2):
        print(f"Iteration {i}")
        for node in tqdm(G.nodes):
            # Step 2: Remove node from its community
            belong_to = get_community[node]
            communities[belong_to].remove(node)
            if communities[belong_to] == []:
                del communities[belong_to]

            # Step 3: Insert the node in the community that maximizes the modularity
            neighboring_communities = get_neighboring_communities(G, node, get_community)
            scores = [(neighbor_community, modularity_gain(G, node, communities[neighbor_community], m, is_first_passage)) for neighbor_community in neighboring_communities]
            best_comm, best_score = max(scores, key=lambda x: x[1])
            communities[best_comm].append(node)
            get_community[node] = best_comm
    return get_new_graph(G, communities), communities
    # return G, communities


def get_neighboring_communities(G, node, get_community):
    # Use a set to make sure a community only appear once
    neighboring_communities = set()
    for neighbor in G[node]:
        if neighbor == node: continue
        neighboring_communities.add(get_community[neighbor])

    assert neighboring_communities != set(), f"No neighboring communities for node {node}"
    return neighboring_communities


def modularity_gain(G, node, community, m, is_first_passage):
    # Separate first passage from other to speed up
    if is_first_passage:
        # Sum the weights of the edges from node into community nodes
        sum_weights_node = sum([1 for member in G.neighbors(node) if member in community])
        # Sum the weights of the incident edges for all nodes in community
        sum_community = sum(dict(G.degree(community)).values())
        right_member = (sum_community * G.degree[node]) / (2 * (m**2))
    else:
        # Sum the weights of the edges from node into community nodes
        sum_weights_node = sum([G.get_edge_data(node, member)['weight'] for member in G.neighbors(node) if member in community])
        # Sum the weights of the incident edges for all nodes in community
        sum_community = sum(dict(G.degree(community, 'weight')).values())
        right_member = (sum_community * G.degree(node, 'weight')) / (2 * (m**2))
    # Compute modularity
    left_member = sum_weights_node / (2 * m)
    return left_member - right_member


def get_new_graph(old_G, communities):
    G = nx.Graph()
    G.add_nodes_from(communities.keys())
    for idx, community in communities.items():
        # Sum the weights of the incident edges for all nodes in community
        sum_community = sum([weight for node, weight in G.degree(community, 'weight')])
        G.add_edge(idx, idx, weight=sum_community)
    between_edge = 0
    for (idx1, community1), (idx2, community2) in pairwise(communities.items()):
        for place, node1 in enumerate(community1):
            for node2 in community2[place:]:
                if G.get_edge_data(node1, node2) is not None:
                    between_edge += G.get_edge_data(node1, node2)['weight']
        G.add_edge(idx1, idx2, weight=between_edge)
    return G


# pairwise() from Itertools Recipes
# To iterate pairwise keys in a dict, from Stackoverflow
# https://stackoverflow.com/questions/37010754/python-loop-dictionary-two-keys-at-a-time
def pairwise(iterable):
    "s -> (s0,s1), (s1,s2), (s2, s3), ..."
    a, b = tee(iterable)
    next(b, None)
    return zip(a, b)


data = louvain(G, 2)

Passage 0
Iteration 0


100%|█████████████████████████████████████████████████████████| 2778/2778 [00:00<00:00, 3700.82it/s]


Iteration 1


100%|█████████████████████████████████████████████████████████| 2778/2778 [00:01<00:00, 2121.98it/s]


There are 4 communities after passage 0
Passage 1
Iteration 0


100%|██████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 17829.13it/s]


Iteration 1


100%|██████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 23899.17it/s]

There are 1 communities after passage 1





In [13]:
# get last entry of data
last_entry = data[-1]
networkx_graph, communities_dict = last_entry

In [14]:
communities_df = pd.DataFrame(columns=["Id","Label", "Community", "Is_author"])

# tranform communities_dict into a dataframe 
# key is the community, value is the Id
for key, value in communities_dict.items():
    for idx in value:
        communities_df = pd.concat([communities_df, pd.DataFrame({"Id":idx, "Label":idx, "Community":key, "Is_author":False}, index=[0])], ignore_index=True)

In [15]:
communities_df.loc[communities_df['Id'].isin(edgelists_sample_df['Target'].unique()), "Is_author"] = True

In [16]:
communities_df

Unnamed: 0,Id,Label,Community,Is_author
0,2439111781,2439111781,1,False
1,1107958498283974656,1107958498283974656,1,True
2,39098651,39098651,1,False
3,976817480,976817480,1,False
4,63679540,63679540,1,False
...,...,...,...,...
2773,1427335175877402626,1427335175877402626,916,False
2774,1125420654,1125420654,916,False
2775,27695425,27695425,916,False
2776,73475075,73475075,916,False


In [58]:
# save to csv, ignore index
communities_df.to_csv("communities_df.csv", index=False)

## TODO:
- Sample by taking random author_id and take all edges connected to it
- Implement from scratch community detection: Louvain (faster than Girvan)
- Look at how to store the communities (edgelist or so)
- Look at the caracteristics of the huge communities: what is the Tweet, who is the author, what are the hashtags etc...
- Make profiles of users: press, random guy, famous guy, etc...
- Look at outliers
- Load communities to Gephi to vizualise them