# Graph construction


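Let's build the user graph: each node is an account, and weighted edges link accounts that share a domain or a hashtag, or that retweet/mention one another.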

### Import data

In [1]:
import os
from google.colab import drive


# Mount Google Drive, then point `data` at the folder that holds the graph files.
mount_point = '/content/drive'
drive.mount(mount_point)
data = mount_point + '/My Drive/covid_project/graph_files'
print(data)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/covid_project/graph_files


### Essential imports

In [2]:
!pip install jsonlines
import glob
import jsonlines
import networkx as nx



### Add nodes (not really needed)

In [3]:
def add_nodes(G):
    """Add one node per account listed in ``sampled_accounts.txt``.

    The file is read from the notebook-level ``data`` folder, one account per
    line, with surrounding whitespace stripped. Blank lines are skipped so
    that an empty-string node is never created.

    Args:
        G: graph mutated in place; only ``G.add_node`` is used.
    """
    with open(data + "/sampled_accounts.txt") as infile:
        for line in infile:
            account = line.strip()
            # A blank (or whitespace-only) line is not an account.
            if account:
                G.add_node(account)

### Add domain edges

In [4]:
from itertools import combinations

def add_domains_edges(G):
    """Link every pair of users that appear in the same domain record.

    Reads ``inverted_domains.jsonl`` from the notebook-level ``data`` folder.
    Only the ``'users'`` list of each record is used. Every unordered pair of
    distinct users in a record gets an edge: a new edge starts at
    ``weight=1``, an existing edge has its ``weight`` incremented by 1.

    Args:
        G: graph mutated in place; must provide ``has_edge``, ``add_edge``
           and ``G[u][v]`` edge-attribute access (e.g. ``networkx.Graph``).
    """
    with jsonlines.open(data + "/inverted_domains.jsonl") as infile:
        for line in infile:
            # Drop repeated user ids (first-seen order kept) so a user listed
            # twice neither gets a self-loop nor has its pairs counted twice.
            users = list(dict.fromkeys(line['users']))

            # One pass per unordered pair, so each pair is counted once per record.
            for u1, u2 in combinations(users, 2):
                if G.has_edge(u1, u2):
                    G[u1][u2]['weight'] += 1
                else:
                    G.add_edge(u1, u2, weight=1)
                                    

### Add hashtag edges

In [5]:
from itertools import combinations

def add_hashtags_edges(G):
    """Link every pair of users that appear in the same hashtag record.

    Reads ``inverted_hashtags.jsonl`` from the notebook-level ``data`` folder.
    Only the ``'users'`` list of each record is used. Every unordered pair of
    distinct users in a record gets an edge: a new edge starts at
    ``weight=1``, an existing edge has its ``weight`` incremented by 1.

    Args:
        G: graph mutated in place; must provide ``has_edge``, ``add_edge``
           and ``G[u][v]`` edge-attribute access (e.g. ``networkx.Graph``).
    """
    with jsonlines.open(data + "/inverted_hashtags.jsonl") as infile:
        for line in infile:
            # Drop repeated user ids (first-seen order kept) so a user listed
            # twice neither gets a self-loop nor has its pairs counted twice.
            users = list(dict.fromkeys(line['users']))

            # One pass per unordered pair, so each pair is counted once per record.
            for u1, u2 in combinations(users, 2):
                if G.has_edge(u1, u2):
                    G[u1][u2]['weight'] += 1
                else:
                    G.add_edge(u1, u2, weight=1)

### Add retweet and mention edges

In [6]:
def add_ret_ment_edges(G):
    """Add or reinforce edges from the retweet/mention records.

    Reads ``retweet_mentions.jsonl`` from the notebook-level ``data`` folder.
    For each record, the ``'user'`` value is linked to every entry of its
    ``'connections'`` list: a missing edge is created with ``weight=1``,
    an existing one has its ``weight`` incremented by 1.

    Args:
        G: graph mutated in place; must provide ``has_edge``, ``add_edge``
           and ``G[u][v]`` edge-attribute access.
    """
    source_path = data+"/retweet_mentions.jsonl"
    with jsonlines.open(source_path) as records:
        for record in records:
            author = record['user']
            for peer in record['connections']:
                if not G.has_edge(author, peer):
                    # First time this pair is seen: create the edge.
                    G.add_edge(author, peer, weight=1)
                    continue
                # Pair already linked: count one more interaction.
                G[author][peer]['weight'] += 1
            

### Build the graph

In [7]:
def lets_build():
    """Assemble the graph in three passes, reporting its size after each one.

    Starts from an empty ``nx.Graph`` and applies, in order, the domain,
    hashtag and retweet/mention edge passes. The node and edge counts are
    printed before the first pass and after every pass; the average
    clustering coefficient is printed after every pass.

    Returns:
        The populated graph.
    """
    G = nx.Graph()

    def size():
        # Current (node count, edge count) of the graph being built.
        return G.number_of_nodes(), G.number_of_edges()

    n_nodes, n_edges = size()
    print(f"Initial \t\t\t # nodes: {n_nodes}, \t # edges: {n_edges}")

    # Pass 1: users sharing a domain.
    add_domains_edges(G)
    coeff = nx.average_clustering(G)
    n_nodes, n_edges = size()
    print(f"After domains \t\t\t # nodes: {n_nodes},  # edges: {n_edges},   avg. clustering coeff: {coeff}")

    # Pass 2: users sharing a hashtag.
    add_hashtags_edges(G)
    coeff = nx.average_clustering(G)
    n_nodes, n_edges = size()
    print(f"After hashtags \t\t\t # nodes: {n_nodes}, # edges: {n_edges},  avg. clustering coeff: {coeff}")

    # Pass 3: retweets and mentions.
    add_ret_ment_edges(G)
    coeff = nx.average_clustering(G)
    n_nodes, n_edges = size()
    print(f"After retweets/mentions \t # nodes: {n_nodes}, # edges: {n_edges},  avg. clustering coeff: {coeff}")

    return G

# Run the full build; the resulting graph is kept in the notebook-level name G.
G = lets_build()

Initial 			 # nodes: 0, 	 # edges: 0
After domains 			 # nodes: 2995,  # edges: 18252,   avg. clustering coeff: 0.731353829365398
After hashtags 			 # nodes: 10298, # edges: 441348,  avg. clustering coeff: 0.779130029889205
After retweets/mentions 	 # nodes: 10938, # edges: 442391,  avg. clustering coeff: 0.7294592359119989


In [8]:
# Keep only the giant (largest) connected component, for better community detection.
Gcc = list(nx.connected_components(G))
Gcc.sort(key=len, reverse=True)  # biggest component first
G = G.subgraph(Gcc[0])  # NOTE: rebinds G to the giant component of the full graph

clustering_coefficient = nx.average_clustering(G)
giant_nodes = G.number_of_nodes()
giant_edges = G.number_of_edges()
print(f"Giant component \t\t # nodes: {giant_nodes}, # edges: {giant_edges},  avg. clustering coeff: {clustering_coefficient}")

Giant component 		 # nodes: 10162, # edges: 441707,  avg. clustering coeff: 0.7613322629146598


### Extract graph

In [9]:
# Convert the graph to networkx's dictionary-of-dictionaries representation.
final = nx.to_dict_of_dicts(G)

# Preview: the first n entries of the dict (uncomment the print to inspect them).
n = 3
first_n = dict(list(final.items())[:n])
#print(first_n)

In [10]:
import pickle

# Persist the dict-of-dicts graph inside the graph files folder.
graph_pickle_path = data+'/graph.pickle'
with open(graph_pickle_path, 'wb') as out_file:
    pickle.dump(final, out_file, protocol=pickle.HIGHEST_PROTOCOL)