# Graph construction


Let's build the graph

### Import data

In [1]:
import os
from google.colab import drive


# graph files folder
drive.mount('/content/drive')
data = '/content/drive/My Drive/covid_project/graph_files'
print(data)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/covid_project/graph_files


### Essential imports

In [2]:
!pip install jsonlines
import glob
import jsonlines
import networkx as nx



### Add nodes (not really needed, in case remember to delete nodes with deg(0))

In [3]:
def add_nodes(G):
  # make one node for each user
  for file in glob.glob(data+"/sampled_accounts.txt"):
      with open(file) as infile:
          for line in infile:
              G.add_node(line.strip())

### Add domains edges

In [4]:
from itertools import combinations

def add_domains_edges(G):
  # make one edge to connect each user in the list of each domains
  for file in glob.glob(data+"/inverted_domains.jsonl"):
      with jsonlines.open(file) as infile:
          for line in infile:
              domain = line['domain']
              users = line['users']

              # connect each user to each user in the 'posting list', avoiding repetitions 
              for comb in combinations(users, 2):
                  u1 = comb[0]
                  u2 = comb[1]
                  if G.has_edge(u1, u2):
                      G[u1][u2]['weight'] += 1
                  else:
                      G.add_edge(u1, u2, weight=1)
                                    

### Add hashtags edges

In [5]:
from itertools import combinations

def add_hashtags_edges(G):
  # make one edge to connect each user in the list of each domains
  for file in glob.glob(data+"/inverted_hashtags.jsonl"):
      with jsonlines.open(file) as infile:
          for line in infile:
              tag = line['hashtag']
              users = line['users']

              # connect each user to each user in the 'posting list', avoiding repetitions 
              for comb in combinations(users, 2):
                  u1 = comb[0]
                  u2 = comb[1]
                  if G.has_edge(u1, u2):
                      G[u1][u2]['weight'] += 1
                  else:
                      G.add_edge(u1, u2, weight=1)

### Add retweet and mentions edges

In [6]:
def add_ret_ment_edges(G):
  for file in glob.glob(data+"/retweet_mentions.jsonl"):
      with jsonlines.open(file) as infile:
          for line in infile:
              u1 = line['user']
              connections = line['connections']

              # connect the user to each connection
              for u2 in connections:
                if G.has_edge(u1, u2):
                      G[u1][u2]['weight'] += 1
                else:
                      G.add_edge(u1, u2, weight=1)
            

### Build the graph

In [8]:
def lets_build():
  G = nx.Graph()
  print(f"Initial \t\t\t # nodes: {G.number_of_nodes()}, \t # edges: {G.number_of_edges()}")
  add_domains_edges(G)
  clustering_coefficient = nx.average_clustering(G)
  print(f"After domains \t\t\t # nodes: {G.number_of_nodes()},  # edges: {G.number_of_edges()},   clustering coefficient: {clustering_coefficient}")
  add_hashtags_edges(G)
  clustering_coefficient = nx.average_clustering(G)
  print(f"After domains \t\t\t # nodes: {G.number_of_nodes()}, # edges: {G.number_of_edges()},  clustering coefficient: {clustering_coefficient}")
  add_ret_ment_edges(G)
  clustering_coefficient = nx.average_clustering(G)
  print(f"After retweets/mentions \t # nodes: {G.number_of_nodes()}, # edges: {G.number_of_edges()},   clustering coefficient: {clustering_coefficient}")
  return G

G = lets_build()

Initial 			 # nodes: 0, 	 # edges: 0
After domains 			 # nodes: 2995,  # edges: 18252,   clustering coefficient: 0.731353829365398
After domains 			 # nodes: 10298, # edges: 441348,  clustering coefficient: 0.779130029889205
After retweets/mentions 	 # nodes: 10938, # edges: 442391,   clustering coefficient: 0.7294592359119989


In [9]:
#remove isolated nodes
G.remove_nodes_from(list(nx.isolates(G)))
clustering_coefficient = nx.average_clustering(G)
print(f"Remove isolated nodes # nodes: {G.number_of_nodes()}, # edges: {G.number_of_edges()},   clustering coefficient: {clustering_coefficient}")

Remove isolated nodes # nodes: 10938, # edges: 442391,   clustering coefficient: 0.7294592359119989


### Extract graph

In [10]:
final = nx.to_dict_of_dicts(G)

# print first n elements in dic
n = 3
first_n = {k: final[k] for k in list(final)[:n]}
#print(first_n)

In [11]:
import pickle

with open(data+'/graph.pickle', 'wb') as handle:
    pickle.dump(final, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Python lib Luvain 

In [12]:
import community as community_louvain

# compute the best partition
partition = community_louvain.best_partition(G)

In [13]:
print(f'Communities found by Luvain: {max(partition.values())}')

Communities found by Luvain: 388


### Size of each community with more than 2 users

In [14]:
from collections import Counter

# size of each community
values = partition.values()

size_community = Counter()
for val in values:
    size_community[val] += 1


# delete communities with only 2 users
final = {}
for value in size_community:
  if size_community[value] > 2:
    final[value] = size_community[value] 

print(len(final))

109


In [15]:
print(final)

{0: 158, 1: 509, 2: 555, 3: 986, 4: 1335, 5: 490, 6: 947, 7: 1247, 8: 779, 9: 857, 10: 437, 11: 6, 12: 504, 16: 157, 17: 370, 18: 504, 21: 44, 23: 3, 26: 7, 27: 4, 30: 7, 37: 3, 38: 3, 39: 3, 45: 3, 48: 4, 50: 46, 51: 3, 52: 21, 57: 10, 59: 3, 67: 4, 70: 109, 72: 8, 73: 4, 77: 3, 81: 4, 84: 3, 96: 3, 97: 7, 103: 4, 104: 3, 108: 3, 109: 15, 112: 3, 113: 8, 114: 4, 116: 3, 127: 3, 129: 11, 139: 3, 141: 3, 143: 4, 144: 3, 148: 5, 150: 3, 157: 3, 158: 4, 164: 6, 166: 5, 173: 3, 178: 3, 179: 4, 181: 3, 182: 3, 183: 4, 184: 3, 186: 3, 187: 5, 188: 4, 190: 3, 195: 3, 196: 3, 197: 11, 199: 3, 202: 3, 208: 5, 211: 7, 213: 3, 219: 4, 220: 3, 221: 8, 224: 4, 228: 3, 232: 3, 235: 4, 240: 4, 246: 3, 249: 5, 252: 3, 253: 3, 257: 3, 265: 3, 274: 3, 275: 5, 276: 3, 278: 3, 279: 3, 281: 3, 282: 3, 283: 5, 288: 3, 292: 4, 293: 8, 297: 3, 316: 3, 322: 6, 326: 9, 331: 3}
