In [1]:
# Mount Google Drive at /content/drive (only works inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# networkx is pinned to 2.6.3; always check the current version on GitHub
# before changing this pin.
!pip install networkx==2.6.3

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
# Install the `wikipedia` package, used later to download pages and their links.
# NOTE(review): the version is not pinned - consider pinning it for reproducibility.
!pip install wikipedia

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
from operator import itemgetter
import networkx as nx
import wikipedia
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from collections import deque
import numpy as np
import seaborn as sns
import pandas as pd

In [5]:
# Show the networkx version that was actually imported.
nx.__version__

'2.6.3'

### Building the Pipeline ###

In [21]:
class Pipeline():
    """Ordered chain of single-argument callables.

    Callables are registered through the :meth:`task` decorator and executed
    by :meth:`run`, each one receiving the return value of the one before it.
    """

    def __init__(self):
        # Registered callables, kept in execution order.
        self.tasks = []

    def task(self, depends_on=None):
        """Return a decorator that registers a function in the pipeline.

        Args:
            depends_on: an already registered task; the decorated function is
                placed immediately after it. When omitted (or falsy) the
                function is placed at the very front of the pipeline.

        Returns:
            A decorator that inserts the function and returns it unchanged.

        Raises:
            ValueError: if ``depends_on`` is given but is not registered.
        """
        # The slot is resolved now, when the decorator is created, not later
        # when the function is actually registered.
        position = self.tasks.index(depends_on) + 1 if depends_on else 0

        def register(func):
            self.tasks.insert(position, func)
            return func

        return register

    def run(self, input_):
        """Feed ``input_`` through every task in order and return the last result."""
        result = input_
        for step in self.tasks:
            result = step(result)
        return result

In [22]:
# Shared Pipeline instance: the functions defined below register themselves
# on it through the @pipeline.task(...) decorator.

pipeline = Pipeline()

# data collecting
@pipeline.task()
def datacollect(SEED, max_layer=2):
  """Crawl Wikipedia breadth-first from SEED and return the link graph.

  Pages are downloaded layer by layer: SEED is layer 0, pages linked from it
  are layer 1, and so on. Only pages whose layer is below ``max_layer`` are
  downloaded; the pages they link to still appear in the graph as nodes.

  Args:
    SEED: title of the page the crawl starts from.
    max_layer: first layer that is NOT downloaded (default 2, i.e. the seed
      and the pages it links to are downloaded).

  Returns:
    A ``nx.DiGraph`` with one edge ``page -> link`` for every kept link.
  """
  # Year pages 1930, 1934, ..., 2022 (every fourth year) are not followed.
  STOPS = frozenset(str(year) for year in range(1930, 2023, 4))

  # FIFO queue of (layer, title) pairs still to download.
  todo_lst = deque([(0, SEED)])
  # Titles ever queued. Must be a set holding the title itself: set(SEED)
  # would build a set of the title's individual characters.
  todo_set = {SEED}
  done_set = set()

  g = nx.DiGraph()

  # Stop when the queue is exhausted or its head has reached max_layer
  # (layers are appended in non-decreasing order, so the head is the minimum).
  while todo_lst and todo_lst[0][0] < max_layer:
    # Take the page off the queue and remember it as processed so that it is
    # skipped if it is linked again.
    layer, page = todo_lst.popleft()
    done_set.add(page)

    # Show progress
    print(layer, page)

    # Attempt to download the selected page; a page that cannot be loaded is
    # skipped. `Exception` (not a bare except) keeps Ctrl-C working.
    try:
      wiki = wikipedia.page(page)
    except Exception:
      print("Could not load", page)
      continue

    for link in wiki.links:
      link = link.title()
      if link in STOPS or link.startswith("List Of"):
        continue
      if link not in todo_set and link not in done_set:
        todo_lst.append((layer + 1, link))
        todo_set.add(link)
      g.add_edge(page, link)

  return g

# data cleaning
@pipeline.task(depends_on=datacollect)
def dataclean(g):
  """Remove self-loops and merge near-duplicate nodes of the raw graph.

  Two kinds of duplicates are merged: a node and the same name with a
  trailing "s" ('network' / 'networks'), and a node containing "-" and the
  same name with the hyphens replaced by spaces. In both cases the second
  name is merged into the first one.

  Args:
    g: the raw graph. It is not modified; all work is done on a copy.

  Returns:
    The cleaned graph, with every node/edge "contraction" attribute set to 0.
  """
  # Work on a copy so the caller's raw graph stays intact (the original code
  # made a copy but then mutated the input instead).
  g = g.copy()

  # remove self loops (materialised first so nothing is iterated lazily
  # while edges are being removed)
  g.remove_edges_from(list(nx.selfloop_edges(g)))

  # identify duplicates like that: 'network' and 'networks'
  duplicates = [(node, node + "s")
                for node in g if node + "s" in g
                ]

  for keep, drop in duplicates:
    # An earlier merge may already have removed one side of this pair.
    if keep in g and drop in g:
      g = nx.contracted_nodes(g, keep, drop, self_loops=False)

  print(duplicates)

  # identify duplicates like that: 'foo-bar' and 'foo bar'
  duplicates = [(node, node.replace("-", " "))
                for node in g
                if "-" in node and node.replace("-", " ") in g]
  print(duplicates)

  for keep, drop in duplicates:
    # Two hyphenated names can map to the same spaced name; merge it once.
    if keep in g and drop in g:
      g = nx.contracted_nodes(g, keep, drop, self_loops=False)

  # nx.contracted_nodes creates a node/edge attribute called "contraction"
  # whose value is a dictionary, but GraphML does not support dictionary
  # attributes, so it is overwritten with 0 everywhere.
  nx.set_node_attributes(g, 0,"contraction")
  nx.set_edge_attributes(g, 0,"contraction")

  return g

@pipeline.task(depends_on=dataclean)
def truncadenetwork(g):
  """Keep only nodes of degree >= 2 and store the result in "cmf.graphml".

  The subgraph induced by the kept nodes is written to disk and its size is
  printed. Nothing is returned.
  """
  # collect every node whose degree is at least 2
  kept_nodes = []
  for name, degree in g.degree():
    if degree >= 2:
      kept_nodes.append(name)

  # subgraph induced by the kept nodes
  trimmed = g.subgraph(kept_nodes)

  nx.write_graphml(trimmed, "cmf.graphml")

  summary = "{} nodes, {} edges".format(trimmed.number_of_nodes(),
                                        trimmed.number_of_edges())
  print(summary)

def _plot_centrality(centrality, filename, dpi):
  """Draw the graph stored in 'cmf.graphml' coloured by a centrality measure.

  Args:
    centrality: function taking the graph and returning a {node: value}
      mapping (e.g. ``nx.degree_centrality``).
    filename: path of the PNG file to write.
    dpi: resolution passed to ``plt.savefig``.
  """
  import os

  g = nx.read_graphml('cmf.graphml')

  fig, ax = plt.subplots(1,1,figsize=(50,40))

  # layout position (fixed seed, so every figure uses the same layout)
  pos = nx.spring_layout(g,seed=123456789,k=0.3)
  # one colour value per node, in the graph's node order
  color = list(dict(centrality(g)).values())

  # draw edges
  nx.draw_networkx_edges(g,
                        pos=pos,
                        alpha=0.4, ax=ax)

  # draw nodes
  nodes = nx.draw_networkx_nodes(g,
                  pos=pos,
                  node_color=color,
                  cmap=plt.cm.jet,
                  ax=ax)

  # draw labels
  nx.draw_networkx_labels(g, pos=pos,
                          font_color='white', ax=ax)

  plt.axis("off")
  plt.colorbar(nodes)

  # savefig does not create missing directories
  out_dir = os.path.dirname(filename)
  if out_dir:
    os.makedirs(out_dir, exist_ok=True)
  plt.savefig(filename, transparent=True,dpi=dpi)
  plt.show()

@pipeline.task(depends_on=truncadenetwork)
def degreecentrally(self):
  """Plot the degree centrality of the stored network.

  ``self`` is the previous task's output and is ignored.
  """
  _plot_centrality(nx.degree_centrality,
                   'Imagens/degree_centrality.png', dpi=300)

@pipeline.task(depends_on=degreecentrally)
def closenesscentrally(self):
  """Plot the closeness centrality of the stored network.

  ``self`` is the previous task's output and is ignored.
  """
  _plot_centrality(nx.closeness_centrality,
                   'Imagens/closeness_centrality.png', dpi=600)

@pipeline.task(depends_on=closenesscentrally)
def betweennesscentrally(self):
  """Plot the betweenness centrality of the stored network.

  ``self`` is the previous task's output and is ignored.
  """
  _plot_centrality(nx.betweenness_centrality,
                   'Imagens/betweenness_centrality.png', dpi=600)

@pipeline.task(depends_on=betweennesscentrally)
def eigenvectorcentrally(self):
  """Plot the eigenvector centrality of the stored network.

  ``self`` is the previous task's output and is ignored.
  """
  _plot_centrality(nx.eigenvector_centrality,
                   'Imagens/eigenvector_centrality.png', dpi=600)

def _plot_degree_distribution(label, filename, cumulative):
  """Plot the degree histogram of 'cmf.graphml' with a KDE curve on a twin axis.

  Args:
    label: legend label of the KDE curve.
    filename: path of the PNG file to write.
    cumulative: passed to ``sns.kdeplot``; True draws the cumulative curve.
  """
  import os

  g = nx.read_graphml('cmf.graphml')

  degree_sequence = sorted([d for n, d in g.degree()], reverse=True)

  # NOTE: this switches the global matplotlib style, so it also affects
  # every figure drawn afterwards.
  plt.style.use("fivethirtyeight")

  fig, ax = plt.subplots(1,1,figsize=(10,8))

  sns.histplot(degree_sequence,bins=7,label="Count",ax=ax)
  ax2 = ax.twinx()
  sns.kdeplot(degree_sequence,color='r',label=label,ax=ax2,cumulative=cumulative)

  # ask matplotlib for the plotted objects and their labels, so that both
  # axes share a single legend
  lines, labels = ax.get_legend_handles_labels()
  lines2, labels2 = ax2.get_legend_handles_labels()
  ax2.legend(lines + lines2, labels + labels2, loc=0)

  ax.grid(False)
  ax2.grid(False)
  ax.set_xlabel("Degree")
  ax2.set_ylabel("Probability")

  # savefig does not create missing directories
  out_dir = os.path.dirname(filename)
  if out_dir:
    os.makedirs(out_dir, exist_ok=True)
  plt.savefig(filename, transparent=True,dpi=600,bbox_inches="tight")
  plt.show()

@pipeline.task(depends_on=eigenvectorcentrally)
def CDF(self):
  """Plot the degree histogram with its cumulative density curve.

  ``self`` is the previous task's output and is ignored.
  """
  _plot_degree_distribution("Cumulative Density Function (CDF)",
                            'Imagens/cumulative_density_function.png',
                            cumulative=True)

@pipeline.task(depends_on=CDF)
def PDF(self):
  """Plot the degree histogram with its probability density curve.

  ``self`` is the previous task's output and is ignored.
  """
  _plot_degree_distribution("Probability Density Function (PDF)",
                            'Imagens/probability_density_function.png',
                            cumulative=False)

@pipeline.task(depends_on=PDF)
def comparizonbetweencentrality(self):
  """Draw a pair grid comparing four centrality measures of the stored network.

  Betweenness, degree, eigenvector and closeness centrality are computed for
  the graph in 'cmf.graphml' and plotted against each other: scatter plots
  above the diagonal, KDE contours below it and 1-D KDEs on the diagonal.
  The figure is saved to 'Imagens/all.png'.

  ``self`` is the previous task's output and is ignored.
  """
  import os

  g = nx.read_graphml('cmf.graphml')

  bc = pd.Series(nx.betweenness_centrality(g))
  dc = pd.Series(nx.degree_centrality(g))
  ec = pd.Series(nx.eigenvector_centrality(g))
  cc = pd.Series(nx.closeness_centrality(g))

  # One column per measure; node names are dropped from the index because
  # only the values are plotted.
  df = pd.DataFrame.from_dict({"Betweenness": bc,
                              "Degree": dc,
                              "EigenVector": ec,
                              "Closeness": cc}).reset_index(drop=True)

  grid = sns.PairGrid(df)
  grid.map_upper(sns.scatterplot)
  grid.map_lower(sns.kdeplot, cmap="Reds_r")
  grid.map_diag(sns.kdeplot, lw=2, legend=False)

  # savefig does not create missing directories
  os.makedirs('Imagens', exist_ok=True)
  plt.savefig('Imagens/all.png', transparent=True,dpi=800,bbox_inches="tight")
  plt.show()

@pipeline.task(depends_on=comparizonbetweencentrality)
def getcore_number(self):
  """Return the two highest distinct core numbers of the network.

  Args:
    self: the previous task's output. If it has a ``graph`` attribute that
      graph is used; otherwise (the previous task returns None) the graph
      is read from 'cmf.graphml' like the other tasks do.

  Returns:
    ``(shell, core)``: the second-largest and the largest core number.

  Raises:
    IndexError: if the graph has fewer than two distinct core numbers.
  """
  g = getattr(self, "graph", None)
  if g is None:
    g = nx.read_graphml('cmf.graphml')

  # Sets are unordered, so sort the distinct values before picking the top two.
  levels = sorted(set(nx.core_number(g).values()))
  shell = levels[-2]
  core = levels[-1]

  return shell, core

@pipeline.task(depends_on=getcore_number)
def coredecomposition(shell, core=None):
  """Draw the network highlighting one k-shell (blue) and one k-core (red).

  Args:
    shell: the k of the shell to highlight, or a ``(shell, core)`` pair when
      ``core`` is omitted.
    core: the k of the core to highlight.

  The figure is saved to 'Imagens/k-core_sociopatterns.png'.
  """
  import os

  # The pipeline hands over the previous task's return value as ONE argument,
  # so accept the (shell, core) tuple as well as two separate values.
  if core is None:
    shell, core = shell

  g = nx.read_graphml('cmf.graphml')
  fig, ax = plt.subplots(1,1,figsize=(50,40))

  # Find the k-shell and the k-core
  g_shell = nx.k_shell(g, shell)
  g_core = nx.k_core(g, core)

  # layout position
  pos = nx.spring_layout(g,seed=123456789,k=0.3)

  # draw edges
  nx.draw_networkx_edges(g,
                        pos=pos,
                        alpha=0.4, ax=ax)

  # draw every node in grey first, then paint the shell and the core on top
  nx.draw_networkx_nodes(g,
                  pos=pos,
                  node_color="#333333",
                  ax=ax)

  nx.draw_networkx_nodes(g_shell,
                  pos=pos,
                  node_color="blue",
                  ax=ax)

  nx.draw_networkx_nodes(g_core,
                  pos=pos,
                  node_color="red",
                  ax=ax)

  # legend labels follow the values actually plotted
  red_patch = mpatches.Patch(color='red', label='{}-core'.format(core))
  blue_patch = mpatches.Patch(color='blue', label='{}-shell'.format(shell))
  plt.legend(handles=[red_patch,blue_patch])

  plt.axis("off")
  # savefig does not create missing directories
  os.makedirs('Imagens', exist_ok=True)
  plt.savefig('Imagens/k-core_sociopatterns.png', transparent=True,dpi=600)
  plt.show()

@pipeline.task(depends_on=coredecomposition)
def explorenetwork(self):
  """Print the 100 nodes with the highest in-degree, one "<in-degree> <node>" per line.

  The graph is read from 'cmf.graphml'; ``self`` is the previous task's
  output and is ignored.
  """
  network = nx.read_graphml('cmf.graphml')

  # (node, in-degree) pairs, highest in-degree first
  ranking = list(network.in_degree())
  ranking.sort(key=itemgetter(1), reverse=True)

  rows = []
  for name, indegree in ranking[:100]:
    rows.append("{} {}".format(indegree, name))
  print("\n".join(rows))

In [None]:
# Title of the start page; .title() capitalises every word, matching the
# .title() normalisation applied to link names during the crawl.
SEED = "Copa do Mundo FIFA".title()

# Run every registered task in order. The last task (explorenetwork) has no
# return statement, so `outputs` ends up as None.
outputs = pipeline.run(SEED)