<a href="https://colab.research.google.com/github/AUT-Student/BigData-Project/blob/main/BigData_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Library

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from dataclasses import dataclass

# Load Datasets

## CAIDA

In [2]:
!gdown https://snap.stanford.edu/data/as-caida.tar.gz
!mkdir caida
!tar -C /content/caida -xzf /content/as-caida.tar.gz 

Downloading...
From: https://snap.stanford.edu/data/as-caida.tar.gz
To: /content/as-caida.tar.gz
100% 46.4M/46.4M [00:09<00:00, 4.66MB/s]


In [3]:
!grep as-caida2006 /content/caida/* | wc -l

52


In [4]:
!grep as-caida2007 /content/caida/* | wc -l

46


In [6]:
!head /content/caida/as-caida20060102.txt -n 40

# Directed graph: as-caida20060102.txt
# The CAIDA AS Relationships Dataset, from 01 02 2006
# Relationships:	-1 (<FromNodeId> is a customer of <ToNodeId>)
# 			1 (<FromNodeId> is a provider of <ToNodeId>)
# 			0 (<FromNodeId> and <ToNodeId> are peers)
# 			2 (<FromNodeId> and <ToNodeId> are siblings (the same organization).)
# Nodes:21202	Edges: 85850
# FromNodeId        ToNodeId	Relationship
8563	26716	1
8563	2914	-1
8563	21414	-1
8563	21482	1
8563	3277	0
8563	4323	-1
28801	702	-1
28801	8422	-1
28801	8220	-1
5006	20283	1
5006	21902	1
5006	25637	1
5006	17162	1
5006	18581	1
5006	11401	1
5006	16904	1
5006	21730	1
5006	32461	1
5006	7018	-1
5006	14278	1
5006	1347	2
5006	32664	1
5006	33111	1
5006	26021	1
5006	23359	1
5006	21951	1
5006	7792	1
5006	16595	1
5006	10337	1
5006	18699	1
5006	32110	1
5006	7973	1


## Oregon

In [7]:
!rm oregon -r

rm: cannot remove 'oregon': No such file or directory


In [8]:
!mkdir oregon
%cd oregon

!gdown https://snap.stanford.edu/data/oregon1_010331.txt.gz
!gdown https://snap.stanford.edu/data/oregon1_010407.txt.gz
!gdown https://snap.stanford.edu/data/oregon1_010414.txt.gz
!gdown https://snap.stanford.edu/data/oregon1_010421.txt.gz
!gdown https://snap.stanford.edu/data/oregon1_010428.txt.gz
!gdown https://snap.stanford.edu/data/oregon1_010505.txt.gz
!gdown https://snap.stanford.edu/data/oregon1_010512.txt.gz
!gdown https://snap.stanford.edu/data/oregon1_010519.txt.gz
!gdown https://snap.stanford.edu/data/oregon1_010526.txt.gz


!gunzip oregon1_010331.txt.gz
!gunzip oregon1_010407.txt.gz
!gunzip oregon1_010414.txt.gz
!gunzip oregon1_010421.txt.gz
!gunzip oregon1_010428.txt.gz
!gunzip oregon1_010505.txt.gz
!gunzip oregon1_010512.txt.gz
!gunzip oregon1_010519.txt.gz
!gunzip oregon1_010526.txt.gz

%cd ..

/content/oregon
Downloading...
From: https://snap.stanford.edu/data/oregon1_010331.txt.gz
To: /content/oregon/oregon1_010331.txt.gz
100% 69.1k/69.1k [00:00<00:00, 154kB/s]
Downloading...
From: https://snap.stanford.edu/data/oregon1_010407.txt.gz
To: /content/oregon/oregon1_010407.txt.gz
100% 69.3k/69.3k [00:00<00:00, 154kB/s]
Downloading...
From: https://snap.stanford.edu/data/oregon1_010414.txt.gz
To: /content/oregon/oregon1_010414.txt.gz
100% 69.8k/69.8k [00:00<00:00, 116kB/s]
Downloading...
From: https://snap.stanford.edu/data/oregon1_010421.txt.gz
To: /content/oregon/oregon1_010421.txt.gz
100% 70.7k/70.7k [00:00<00:00, 118kB/s]
Downloading...
From: https://snap.stanford.edu/data/oregon1_010428.txt.gz
To: /content/oregon/oregon1_010428.txt.gz
100% 70.1k/70.1k [00:00<00:00, 157kB/s]
Downloading...
From: https://snap.stanford.edu/data/oregon1_010505.txt.gz
To: /content/oregon/oregon1_010505.txt.gz
100% 70.5k/70.5k [00:00<00:00, 157kB/s]
Downloading...
From: https://snap.stanford.edu/d

In [9]:
!head /content/oregon/oregon1_010331.txt

# Undirected graph: oregon1_010331.txt
# AS peering information inferred from Oregon route-views BGP data, from March 31 2001
# Nodes: 10670 Edges: 22002
# FromNodeId	ToNodeId
10000	4725
4725	6805
4725	7523
4725	7524
4725	7673
4725	9352


# Data Structure Classes

## Edge Class

In [16]:
@dataclass(init=True, eq=True)
class Edge:
  source: int
  destination: int

  def get_reverse(self):
    return Edge(source=self.destination, destination=self.source)

## Graph Class

In [10]:
class Graph():
  def __init__(self, directed):
    self.edges = []
    self.nodes = set()
    self.directed = directed

    self.number_trees = None
    self.node_adjacency_list = None

  def is_directed(self):
    return self.directed

  def add_edge(self, edge:Edge):
    self.nodes.add(edge.source)
    self.nodes.add(edge.destination)

    self.edges.append(edge)
    if self.id_directed is False:
      self.edges.append(edge.get_reverse())

  def number_nodes(self):
    return len(self.nodes)
  
  def number_edges(self):
    return len(self.edges)

  def get_edges(self):
    return self.edges
  
  def get_nodes(self):
    return self.nodes

  def calculate_node_adjacency_list(self):
    for node in self.nodes:
      self.node_adjacency_list[node] = set()
    
    for edge in self.edges:
      self.node_adjacency_list[edge.source].add(edge.destination) 

  def calculate_exact_number_trees(self):
    if self.is_directed():
      raise Exception("This function implemented for undirected graph!")
    
    if self.node_adjacency_list is None:
      self.calculate_node_adjacency_list

    self.number_trees = 0
    for node1 in self.nodes:
      for node2 in self.node_adjacency_list:
        for node3 in self.node_adjacency_list:
          
          if node1 < node2 < node3:
            if node3 in self.node_adjacency_list[node2]:
              self.number_trees += 1

# Create Graphs of Datasets

## Ogegon

In [None]:
graph = Graph()

with open("/content/oregon/oregon1_010331.txt", "r") as file:
  for i, line in enumerate(file.readlines()):
    if line[0] == "#": continue
    
    source, destination = line[:-1].split("\t")
    source = int(source)
    destination = int(destination)

    graph.add_edge(source, destination)

In [None]:
graph.number_nodes()

10670

In [None]:
graph.number_edges()

22003

## CAIDA

In [None]:
graph = Graph()

with open("/content/caida/as-caida20060102.txt", "r") as file:
  for i, line in enumerate(file.readlines()):
    if line[0] == "#": continue
    
    source, destination, _ = line[:-1].split("\t")
    source = int(source)
    destination = int(destination)

    graph.add_edge(source, destination)

In [None]:
graph.number_nodes()

21202

In [None]:
graph.number_edges()

85850

# Oracle

In [None]:
class Oracle():
  def __init__(self, rho):
    self.rho = rho
    pass
  
  def train(self, graph):
    pass

  def is_heavy(source, destination):
    pass