### Using Louvain algorithem to detect communities in graphs 

In [40]:
# our implementation of louvain method takes alot of time 
# we use the method implemented in python_louvain instead
# if you want to test the method, uncomment the following
#from louvain import *

In [3]:
# reading graph edges from bigQuery tables 
from google.cloud import bigquery
from scipy.sparse import csr_matrix
import numpy as np 
from google.cloud import storage
from google.cloud import bigquery
from pyspark.sql import *
import pandas as pd
from scipy.io import mmwrite
import networkx as nx
from networkx.algorithms import community
import community as community_louvain
import time 
spark = SparkSession.builder \
  .appName('data-preparation')\
  .config('spark.jars', 'gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar') \
  .getOrCreate()

In [4]:
files = ['CA-AstroPh', 'CA-GrQc',  'CA-HepPh', 'CA-HepTh' ]
# the bucket where we store the edges of each dataset in cloud storage 
BUCKET_NAME = 'rplace-bucket'
# project name in GCP 
project_name = "bigdata-project-346922"
dataset = "collaboration_data"

partitions = []
elepses = []
client = bigquery.Client()

for file in files: 
    table_id = project_name + '.' + dataset + '.' + file
    # getting tables from BigQuery 
    dataframe = client.list_rows(table_id).to_dataframe(create_bqstorage_client=True)
    print("-----------------------------------")
    print("##### BigQuery table infos :")
    print(dataframe.info())
    # sparse matrix creation for each graph 
    rows = list(dataframe['fromId'])
    cols = list(dataframe['toId'])
    data = [1 for i in range(len(cols))]
    # we use identfiersList to map all nodes identifiers 
    # to the interval [0 - (nodes -1)]
    # in order to keep the sparse matrix small enough 
    
    identifiersList = [i for i in set(cols + rows)]
    identifiersList.sort()
    dic = dict()
    node = 0
    for i in identifiersList:
        dic[i] = node
        node+=1
    for k in range(len(cols)): 
        cols[k] = dic[cols[k]]
        rows[k] = dic[rows[k]]
    nodes = max(cols + rows)
    print(f"##### number of nodes {nodes} in dataset : {file}")
    m = csr_matrix((data, (rows, cols)), shape=(nodes+1, nodes+1), dtype=np.uint16)
    # m is a sparse matrix for adjecency matrix, we create then a networkx graph g 
    g = nx.from_scipy_sparse_matrix(m)
    start = time.time()
    p = community_louvain.best_partition(g)
    elepse = time.time() - start
    elepses.append(elepse)
    partitions.append(p)
    print("##### time elepsed {:.4f} seconds ".format(elepse))
    print("##### number of partitions is {0} in dataset : {1}".format( len(set(p.values())), file) ) 

-----------------------------------
##### BigQuery table infos :
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 396160 entries, 0 to 396159
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   id      396160 non-null  int64
 1   fromId  396160 non-null  int64
 2   toId    396160 non-null  int64
dtypes: int64(3)
memory usage: 9.1 MB
None
##### number of nodes 18771 in dataset : CA-AstroPh
##### time elepsed 45.3023 seconds 
##### number of partitions is 325 in dataset : CA-AstroPh
-----------------------------------
##### BigQuery table infos :
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28980 entries, 0 to 28979
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   id      28980 non-null  int64
 1   fromId  28980 non-null  int64
 2   toId    28980 non-null  int64
dtypes: int64(3)
memory usage: 679.3 KB
None
##### number of nodes 5241 in dataset : CA-GrQc
##### time elep

In [5]:
# Saving results in files 
i = 0
for file in files : 
    with open("louvain_node_to_community." + file, "w") as f:
        for n,c in partitions[i].items():
            f.write(str(n) + ":" + str(c)+"\n")
        f.write("Time_of_execution:{}".format(elepses[i]))
    i+=1


In [6]:
# copy results files in our working cloud storage bucket 
# this step is optional if we work in local machine
!gsutil -m cp ./louvain_node_to_community.CA* gs://rplace-bucket/results

Copying file://./louvain_node_to_community.CA-AstroPh [Content-Type=application/octet-stream]...
Copying file://./louvain_node_to_community.CA-GrQc [Content-Type=application/octet-stream]...
Copying file://./louvain_node_to_community.CA-HepPh [Content-Type=application/octet-stream]...
Copying file://./louvain_node_to_community.CA-HepTh [Content-Type=application/octet-stream]...
/ [4/4 files][356.6 KiB/356.6 KiB] 100% Done                                    
Operation completed over 4 objects/356.6 KiB.                                    
