# Network component analysis

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import networkx as nx
from networkx.algorithms import community
from numba import jit
from tqdm import tqdm
from itertools import tee

In [2]:
# # read all csv files from edgelists folder and save to edgelists_df
# edgelists_df = pd.concat([pd.read_csv(file) for file in Path("edgelists").iterdir()], ignore_index=True)
# # drop duplicates in edgelists_df
# edgelists_df.drop_duplicates(inplace=True)
# print(edgelists_df)
# edgelists_graph_sample = edgelists_df['user_id'].unique()
# author_ids_sample = np.random.choice(edgelists_graph_sample, 20000, replace=False)
# edgelists_df_sample = edgelists_df[edgelists_df['user_id'].isin(author_ids_sample)]

# G = nx.from_pandas_edgelist(edgelists_df_sample, source='user_id', target='author_id', create_using=nx.Graph())
# # print number of nodes in edgelists_graph_sample
# print(f"{len(G.nodes)=}")

# # print number of edges in edgelists_graph_sample
# print(f"{len(G.edges)=}")
# nx.set_edge_attributes(G, 1, 'weight')

                     user_id           author_id
0                 4718799509  755067744333271040
1        1239540883080581120  755067744333271040
2                  495339130  755067744333271040
3                  198514083  755067744333271040
4        1488576875744137218  755067744333271040
...                      ...                 ...
5033369  1165006575939018759           106927705
5033373             88566817          2389823918
5033375           2389823918          2389823918
5033378           1468140559            54168532
5033380  1161924136383369216            54168532

[4466776 rows x 2 columns]
len(G.nodes)=20316
len(G.edges)=31779


In [13]:
# read csv from edgelistsSample folder
edgelists_sample_df = pd.read_csv("edgelistsSample/edgelists_any_sample.csv")
edgelists_sample_df
G = nx.from_pandas_edgelist(edgelists_sample_df, source='Source', target='Target', create_using=nx.Graph())
print(len(G.nodes))

3672


In [14]:
louvain_nx = community.louvain_partitions(G)
first = next(louvain_nx)
print(f"After first passage: {len(first)} communities")

After first passage: 2 communities


In [15]:
# Louvain implementation
def louvain(G, npassage):
    # Ensure there is a 'weight' label
    nx.set_edge_attributes(G, 1, 'weight')

    # Will contain the graph and the communities after each passage
    data = {}
    for i in range(npassage):
        print(f"Passage {i}")
        G, communities = louvain_step(G)
        data[i] = (G, communities)
        print(f"There are {len(communities)} communities after passage {i}")
    return data


def louvain_step(G):
    # Communities are stored in a dict
    # Step 1: Initialization, start with each node being a single community
    communities = {idx:[node] for idx, node in enumerate(G.nodes)}
    m = len(G.edges)

    # TODO: ATM it is a fixed number of iteration, we need to find a way to
    # Stop the iterations whenever the communities are not evolving anymore
    for i in range(2):
        print(f"Iteration {i}")
        for node in tqdm(G.nodes()):
            # Step 2: Remove node from its community
            belong_to = get_community(node, communities)
            communities[belong_to].remove(node)
            if communities[belong_to] == []:
                del communities[belong_to]

            # Step 3: Insert the node in the community that maximizes the modularity
            neighboring_communities = get_neighboring_communities(G, node, communities)
            scores = [(neighbor_community, modularity_gain(G, node, communities[neighbor_community], m)) for neighbor_community in neighboring_communities]
            best_comm, best_score = max(scores, key=lambda x: x[1])
            communities[best_comm].append(node)
    return get_new_graph(G, communities), communities


def get_community(node, communities):
    for idx, community in communities.items():
        if node in community:
            return idx
    assert False, f"Node {node} not found"


def get_neighboring_communities(G, node, communities):
    # Use a set to make sure a community only appear once
    neighboring_communities = set()
    for neighbor in G[node]:
        if neighbor == node: continue
        neighboring_communities.add(get_community(neighbor, communities))

    assert neighboring_communities != set(), f"No neighboring communities for node {node}"
    return neighboring_communities


def modularity_gain(G, node, community, m):
    # Sum the weights of the edges from node into community nodes
    sum_weights_node = sum([G.get_edge_data(node, member)['weight'] for member in community if member in G.neighbors(node)])
    # Sum the weights of the incident edges for all nodes in community
    sum_community = sum([weight for _node, weight in G.degree(community, 'weight')])
    # Compute modularity
    left_member = sum_weights_node / (2 * m)
    right_member = (sum_community * G.degree(node, 'weight')) / (2 * (m**2))
    return left_member - right_member


def get_new_graph(old_G, communities):
    G = nx.Graph()
    G.add_nodes_from(communities.keys())
    for idx, community in communities.items():
        # Sum the weights of the incident edges for all nodes in community
        sum_community = sum([weight for node, weight in G.degree(community, 'weight')])
        G.add_edge(idx, idx, weight=sum_community)
    between_edge = 0
    for (idx1, community1), (idx2, community2) in pairwise(communities.items()):
        for place, node1 in enumerate(community1):
            for node2 in community2[place:]:
                if G.get_edge_data(node1, node2) is not None:
                    between_edge += G.get_edge_data(node1, node2)['weight']
        G.add_edge(idx1, idx2, weight=between_edge)
    return G


# pairwise() from Itertools Recipes
# To iterate pairwise keys in a dict, from Stackoverflow
# https://stackoverflow.com/questions/37010754/python-loop-dictionary-two-keys-at-a-time
def pairwise(iterable):
    "s -> (s0,s1), (s1,s2), (s2, s3), ..."
    a, b = tee(iterable)
    next(b, None)
    return zip(a, b)


data = louvain(G, 2)


Passage 0
Iteration 0


100%|█████████████████████████████████████████████████████████████████████████████████████████| 3672/3672 [00:05<00:00, 625.32it/s]


Iteration 1


100%|█████████████████████████████████████████████████████████████████████████████████████████| 3672/3672 [00:08<00:00, 410.76it/s]


There are 2 communities after passage 0
Passage 1
Iteration 0


100%|█████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 25040.62it/s]


Iteration 1


100%|█████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 30174.85it/s]

There are 1 communities after passage 1





In [25]:
# get last entry of data
last_entry = data[len(data)-1]
networkx_graph, communities_dict = last_entry

In [54]:
communities_df = pd.DataFrame(columns=["Id","Label", "Community", "Is_author"])

# tranform communities_dict into a dataframe 
# key is the community, value is the Id
for key, value in communities_dict.items():
    for idx in value:
        communities_df = pd.concat([communities_df, pd.DataFrame({"Id":idx, "Label":idx, "Community":key, "Is_author":False}, index=[0])], ignore_index=True)

In [55]:
communities_df.loc[communities_df['Id'].isin(edgelists_sample_df['Target'].unique()), "Is_author"] = True

In [57]:
communities_df

Unnamed: 0,Id,Label,Community,Is_author
0,3320242092,3320242092,1,False
1,851678533374136321,851678533374136321,1,True
2,849448663546040320,849448663546040320,1,False
3,743981157881122816,743981157881122816,1,False
4,1123719719851298816,1123719719851298816,1,False
...,...,...,...,...
66,1430307870982107140,1430307870982107140,66,True
67,703213345470550016,703213345470550016,66,False
68,1309546727372922882,1309546727372922882,66,False
69,1425237620049776646,1425237620049776646,66,False


In [58]:
# save to csv, ignore index
communities_df.to_csv("communities_df.csv", index=False)

## TODO:
- Sample by taking random author_id and take all edges connected to it
- Implement from scratch community detection: Louvain (faster than Girvan)
- Look at how to store the communities (edgelist or so)
- Look at the caracteristics of the huge communities: what is the Tweet, who is the author, what are the hashtags etc...
- Make profiles of users: press, random guy, famous guy, etc...
- Look at outliers
- Load communities to Gephi to vizualise them