# Graph construction


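Let's build the user graph: accounts are the nodes, and weighted edges connect users who shared the same domain or who retweeted/mentioned each other.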

### Import data

In [1]:
import os
from google.colab import drive


# Folder on the mounted Google Drive that holds the graph files
# (read by the edge builders below, written to when the graph is pickled).
data = '/content/drive/My Drive/covid_project/graph_files'

# Mount Google Drive so the folder above becomes reachable from Colab,
# then echo the path as a quick sanity check.
drive.mount('/content/drive')
print(data)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/covid_project/graph_files


### Essential imports

In [2]:
!pip install jsonlines
import glob
import jsonlines
import networkx as nx



### Add nodes (optional: `add_edge` already creates missing nodes; if you do add them, remember to remove the nodes with degree 0 afterwards)

In [3]:
def add_nodes(G):
    """Add one node to ``G`` for every account listed in ``sampled_accounts.txt``.

    Every file matching ``data + "/sampled_accounts.txt"`` is read line by
    line and each non-empty line becomes a node.

    Args:
        G: graph object exposing ``add_node``; it is modified in place.

    Returns:
        None.
    """
    for file in glob.glob(data + "/sampled_accounts.txt"):
        with open(file) as infile:
            for line in infile:
                # Iterating a text file keeps the trailing newline on each
                # line, so the raw line would become a node named "user\n".
                account = line.strip()
                # Skip blank lines instead of adding an empty-string node.
                if account:
                    # NOTE(review): assumes the edge files store the same ids
                    # as strings, so these nodes match -- confirm.
                    G.add_node(account)

### Add domain edges

In [4]:
from itertools import combinations

def add_domains_edges(G):
    """Link every pair of users that appear in the same domain's user list.

    Each record of ``inverted_domains.jsonl`` (looked up under ``data``)
    provides a ``'domain'`` and a ``'users'`` list. For every unordered pair
    drawn from that list, the edge between the two users is created with
    ``weight=1`` or, if it already exists, its ``weight`` is incremented.

    Args:
        G: graph object exposing ``has_edge``, ``add_edge`` and item access
            to edge attributes; it is modified in place.

    Returns:
        None.
    """
    pattern = data+"/inverted_domains.jsonl"
    for path in glob.glob(pattern):
        with jsonlines.open(path) as reader:
            for record in reader:
                # The domain name is not used for the graph; it is still read
                # so a record without this key fails exactly as before.
                domain = record['domain']

                # combinations() yields each unordered pair once, so (a, b)
                # and (b, a) are never both processed for the same record.
                for first, second in combinations(record['users'], 2):
                    if not G.has_edge(first, second):
                        G.add_edge(first, second, weight=1)
                    else:
                        G[first][second]['weight'] += 1
                                    

### Add retweet and mention edges

In [5]:
def add_ret_ment_edges(G):
    """Link each user to every entry of their ``'connections'`` list.

    Each record of ``retweet_mentions.jsonl`` (looked up under ``data``)
    provides a ``'user'`` and a ``'connections'`` list. For every entry of
    that list, the edge between the user and the entry is created with
    ``weight=1`` or, if it already exists, its ``weight`` is incremented.
    Repeated entries therefore increase the weight once per occurrence.

    Args:
        G: graph object exposing ``has_edge``, ``add_edge`` and item access
            to edge attributes; it is modified in place.

    Returns:
        None.
    """
    pattern = data+"/retweet_mentions.jsonl"
    for path in glob.glob(pattern):
        with jsonlines.open(path) as reader:
            for record in reader:
                source = record['user']

                # One edge update per listed connection.
                for target in record['connections']:
                    if not G.has_edge(source, target):
                        G.add_edge(source, target, weight=1)
                    else:
                        G[source][target]['weight'] += 1
            

### Build the graph

In [6]:
def lets_build():
  """Build the graph and report its size after every construction stage.

  Starts from an empty ``nx.Graph``, adds the domain edges, then the
  retweet/mention edges, printing the node and edge counts before the first
  stage and after each one. ``add_nodes`` is not called here, so the only
  nodes are those created through the edge builders.

  Returns:
      The populated ``nx.Graph``.
  """
  G = nx.Graph()
  print(f"Initial \t\t\t # nodes: {G.number_of_nodes()}, \t # edges: {G.number_of_edges()}")
  # Stage 1: edges between users listed under the same domain.
  add_domains_edges(G)
  print(f"After domains \t\t\t # nodes: {G.number_of_nodes()}, \t # edges: {G.number_of_edges()}")
  # Stage 2: edges between a user and each of their retweet/mention connections.
  add_ret_ment_edges(G)
  print(f"After retweets/mentions \t # nodes: {G.number_of_nodes()}, # edges: {G.number_of_edges()}")
  return G

# Build once; the cells below work on this notebook-level graph.
G = lets_build()

Initial 			 # nodes: 0, 	 # edges: 0
After domains 			 # nodes: 843, 	 # edges: 2464
After retweets/mentions 	 # nodes: 17903, # edges: 20623


In [7]:
# Drop every node that has no incident edge.
# The isolates are materialised into a list first because they are computed
# from G itself, which must not change size while it is being iterated.
G.remove_nodes_from([node for node in nx.isolates(G)])
print(f"Remove isolated nodes # nodes: {G.number_of_nodes()}, # edges: {G.number_of_edges()}")

Remove isolated nodes # nodes: 17903, # edges: 20623


### Extract graph

In [8]:
# Plain nested-dict form of the graph: {node: {neighbour: edge attributes}}.
final = nx.to_dict_of_dicts(G)

# Preview: keep only the first n entries of the dictionary.
n = 3
first_n = dict(list(final.items())[:n])
# print(first_n)  # uncomment to inspect the preview

In [9]:
import pickle

# Persist the dict-of-dicts graph in the graph files folder, using the most
# recent pickle protocol available to this Python version.
with open(data+'/graph.pickle', 'wb') as graph_file:
    pickle.dump(final, graph_file, protocol=pickle.HIGHEST_PROTOCOL)