# Network component analysis

In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
import networkx as nx
from networkx.algorithms import community 

In [2]:
# read all csv files from edgelists folder and save to edgelists_df
edgelists_df = pd.concat([pd.read_csv(file) for file in Path("edgelists").iterdir()], ignore_index=True)
# drop duplicates in edgelists_df
edgelists_df.drop_duplicates(inplace=True)
print(edgelists_df)
edgelists_graph_sample = edgelists_df['user_id'].unique()
author_ids_sample = np.random.choice(edgelists_graph_sample, 200, replace=False)
edgelists_df_sample = edgelists_df[edgelists_df['user_id'].isin(author_ids_sample)]

G = nx.from_pandas_edgelist(edgelists_df_sample, source='user_id', target='author_id', create_using=nx.Graph())
# print number of nodes in edgelists_graph_sample
print(f"{len(G.nodes)=}")

# print number of edges in edgelists_graph_sample
print(f"{len(G.edges)=}")

                     user_id  author_id
0                  159648236  454278875
1                  272581313  454278875
2                  400860401  454278875
3                 1103560652  454278875
4                   43412258  454278875
...                      ...        ...
5033373            325674737  554497503
5033376           4191445095  554497503
5033377            878977598  554497503
5033378            481769384  554497503
5033380  1462535282172084225  554497503

[4466776 rows x 2 columns]
len(G.nodes)=261
len(G.edges)=328


In [17]:
# read csv from edgelistsSample folder
edgelists_sample_df = pd.read_csv("edgelistsSample/edgelists_retweeters_sample.csv")
edgelists_sample_df

Unnamed: 0,Source,Target
0,3320242092,851678533374136321
1,849448663546040320,851678533374136321
2,743981157881122816,851678533374136321
3,1123719719851298816,851678533374136321
4,1523891821,851678533374136321
...,...,...
65,1281655655087198209,1430307870982107140
66,703213345470550016,1430307870982107140
67,1309546727372922882,1430307870982107140
68,1425237620049776646,1430307870982107140


In [18]:
G = nx.from_pandas_edgelist(edgelists_sample_df, source='Source', target='Target', create_using=nx.Graph())

In [20]:
# Louvain implementation
def louvain(G, npassage):
    # Will contain the graph and the communities after each passage
    data = {}
    for i in range(npassage):
        G, communities = louvain_step(G)
        data[i] = (G, communities)
        print(f"There are {len(communities)} communities after passage {i}")
    return data


def louvain_step(G):
    # Communities are stored in a dict
    # Step 1: Initialization, start with each node being a single community
    communities = {idx:[node] for idx, node in enumerate(G.nodes)}
    
    # TODO: ATM it is a fixed number of iteration, we need to find a way to
    # Stop the iterations whenever the communities are not evolving anymore
    for i in range(10):
        print('.', flush=True, end='')
        for node in G.nodes:
            # Step 2: Remove node from its community
            belong_to = get_community(node, communities)
            communities[belong_to].remove(node)
            if communities[belong_to] == []:
                del communities[belong_to]

            # Step 3: Insert the node in the community that maximizes the modularity
            neighboring_communities = get_neighboring_communities(G, node, communities)
            scores = [(neighbor_community, modularity(G, node, communities[neighbor_community])) for neighbor_community in neighboring_communities]
            best_comm, best_score = max(scores, key=lambda x: x[1])
            communities[best_comm].append(node)
            
#     return get_new_graph(G, communities), communities
    return G, communities

                
def get_community(node, communities):
    for idx, community in communities.items():
        if node in community:
            return idx
    assert False, f"Node {node} not found"
    
    
def get_neighboring_communities(G, node, communities):
    neighboring_communities = []
    for idx, community in communities.items():
        for member in community:
            # G[node] returns the neighbors of node
            if member in [neighbor for neighbor in G[node]]:
                neighboring_communities.append(idx)
                continue
    assert neighboring_communities != [], f"No neighboring communities for node {node}"
    return neighboring_communities
    

def modularity(G, node, community):
    shared_links = 0
    deg_sum_community = 0
    for member in community:
        deg_sum_community += G.degree[member]
        if member in [neighbor for neighbor in G[node]]:
            # Shared links are counted in both direction: a -> b and b -> a
            shared_links += 2
    
    return (1 / (2 * len(G.edges))) * (shared_links - (deg_sum_community*G.degree[node]/len(G.edges)))


# def get_new_graph(old_graph, communities):
# TODO: construct this method, IDK how to use the weighted edges in the modularity function/calculation
    
    
data = louvain(G, 3)

..........There are 2 communities after passage 0
..........There are 2 communities after passage 1
..........There are 2 communities after passage 2


In [25]:
# get last entry of data
last_entry = data[len(data)-1]
networkx_graph, communities_dict = last_entry

In [54]:
communities_df = pd.DataFrame(columns=["Id","Label", "Community", "Is_author"])

# tranform communities_dict into a dataframe 
# key is the community, value is the Id
for key, value in communities_dict.items():
    for idx in value:
        communities_df = pd.concat([communities_df, pd.DataFrame({"Id":idx, "Label":idx, "Community":key, "Is_author":False}, index=[0])], ignore_index=True)

In [55]:
communities_df.loc[communities_df['Id'].isin(edgelists_sample_df['Target'].unique()), "Is_author"] = True

In [57]:
communities_df

Unnamed: 0,Id,Label,Community,Is_author
0,3320242092,3320242092,1,False
1,851678533374136321,851678533374136321,1,True
2,849448663546040320,849448663546040320,1,False
3,743981157881122816,743981157881122816,1,False
4,1123719719851298816,1123719719851298816,1,False
...,...,...,...,...
66,1430307870982107140,1430307870982107140,66,True
67,703213345470550016,703213345470550016,66,False
68,1309546727372922882,1309546727372922882,66,False
69,1425237620049776646,1425237620049776646,66,False


In [58]:
# save to csv, ignore index
communities_df.to_csv("communities_df.csv", index=False)

## TODO:
- Sample by taking random author_id and take all edges connected to it
- Implement from scratch community detection: Louvain (faster than Girvan)
- Look at how to store the communities (edgelist or so)
- Look at the caracteristics of the huge communities: what is the Tweet, who is the author, what are the hashtags etc...
- Make profiles of users: press, random guy, famous guy, etc...
- Look at outliers
- Load communities to Gephi to vizualise them