<a href="https://colab.research.google.com/github/AUT-Student/BigData-HW2/blob/main/BigData_HW2_Q1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries

In [1]:
import numpy as np 
from collections import deque

# Dataset

In [2]:
!gdown 1PdXgb4w0gtsocKHmeGQ_zPfb3VbW9YhL
!unzip /content/Bigdata_hw2_datasets.zip

Downloading...
From: https://drive.google.com/uc?id=1PdXgb4w0gtsocKHmeGQ_zPfb3VbW9YhL
To: /content/Bigdata_hw2_datasets.zip
  0% 0.00/6.28M [00:00<?, ?B/s] 75% 4.72M/6.28M [00:00<00:00, 44.6MB/s]100% 6.28M/6.28M [00:00<00:00, 56.6MB/s]
Archive:  /content/Bigdata_hw2_datasets.zip
   creating: Bigdata_hw2_datasets/
   creating: Bigdata_hw2_datasets/q1/
  inflating: Bigdata_hw2_datasets/q1/stream_data_dgim.txt  
   creating: Bigdata_hw2_datasets/q2/
  inflating: Bigdata_hw2_datasets/q2/games.csv  
  inflating: Bigdata_hw2_datasets/q2/ratings.csv  
   creating: Bigdata_hw2_datasets/q3/
  inflating: Bigdata_hw2_datasets/q3/c1.txt  
  inflating: Bigdata_hw2_datasets/q3/c2.txt  
  inflating: Bigdata_hw2_datasets/q3/data.txt  


In [3]:
# Parse the DGIM input stream: the first line of the file holds tab-separated integers.
# The final split field is discarded (presumably the remainder after a trailing
# tab -- verify against the file). `with` guarantees the file handle is closed.
with open("/content/Bigdata_hw2_datasets/q1/stream_data_dgim.txt") as stream_file:
  dataset = [int(x) for x in stream_file.readline().split("\t")[:-1]]

In [4]:
# Sanity check: number of values parsed from the stream file.
len(dataset)

40000

# DGIM

In [49]:
class Bucket():
  """A DGIM bucket: a count (`size`) plus the stream span it covers.

  `start` and `end` are the earliest and latest stream positions summarised
  by the bucket; `size` is the count the bucket represents.
  """

  def __init__(self, size, start, end):
    self.size, self.start, self.end = size, start, end

  @staticmethod
  def combine(bucket1, bucket2):
    """Merge two buckets of equal size into one bucket of twice that size.

    The merged bucket spans from the smaller start to the larger end.
    Raises AssertionError if the two sizes differ.
    """
    assert bucket1.size == bucket2.size

    return Bucket(
      size=bucket1.size * 2,
      start=min(bucket1.start, bucket2.start),
      end=max(bucket1.end, bucket2.end),
    )

  def is_size(self, size):
    """Return True when this bucket's size equals `size`."""
    return size == self.size

  def is_complete_out_window(self, window_size, counter):
    """Return True when even the bucket's end lies outside the window.

    The window is the last `window_size` positions ending at `counter`.
    """
    return counter >= self.end + window_size

  def is_partial_out_window(self, window_size, counter):
    """Return True when the bucket's start lies outside the window."""
    return counter >= self.start + window_size

  def __str__(self):
    return f"start = {self.start}, end = {self.end}, size = {self.size}"

In [52]:
class DGIM():
  """Approximate count of non-zero values in a sliding window (DGIM).

  Buckets live in `self.buckets`, newest on the left and oldest on the
  right. `self.counter` is the position of the most recently read value
  (-1 before anything has been read).
  """

  def __init__(self, window_size):
    """Create an empty estimator for a window of the last `window_size` values."""
    self.window_size = window_size
    # The stream is consumed one value at a time through read(); the estimator
    # holds no reference to the notebook-level `dataset` list.
    self.counter = -1
    self.buckets = deque()

  def read(self, data):
    """Consume the next stream value.

    Any value other than 0 opens a new size-1 bucket at the current position
    and triggers merging. Expired buckets are evicted on every call, including
    for zeros, because the window slides regardless of the value read.
    """
    self.counter += 1

    if data != 0:
      self.buckets.appendleft(
        Bucket(size=1, start=self.counter, end=self.counter))
      self._combine_small_buckets()

    self._remove_out_window_bucket()

  def _combine_small_buckets(self):
    """Restore the invariant of at most two buckets per size.

    Starting at the newest end, whenever three consecutive buckets share the
    current size, the two older ones are merged into one bucket of double the
    size; the check then moves on to that merged bucket and the doubled size.
    """
    size = 1
    check = 0

    while check + 2 < len(self.buckets):
      if not all(self.buckets[check + k].is_size(size) for k in range(3)):
        break

      # Replace the two older buckets (check+1, check+2) with their merge,
      # leaving the newest bucket of this size untouched at `check`.
      self.buckets[check + 1] = Bucket.combine(
        self.buckets[check + 1], self.buckets[check + 2])
      del self.buckets[check + 2]

      check += 1
      size *= 2

  def _remove_out_window_bucket(self):
    """Evict every bucket, oldest first, that lies completely outside the window."""
    while self.buckets and self.buckets[-1].is_complete_out_window(
        self.window_size, self.counter):
      self.buckets.pop()

  def predict(self):
    """Estimate the count over the full window.

    Returns 0 when no bucket is held. The result is a float when a bucket
    was counted at half weight, otherwise an int.
    """
    return self.predict_partial(self.window_size)

  def predict_partial(self, partial_size):
    """Estimate the count over the last `partial_size` values.

    Buckets are visited newest to oldest: those entirely inside the range
    contribute their full size, the first bucket straddling the range's
    older edge contributes half its size, and anything entirely outside
    contributes nothing. Buckets already evicted from the main window are
    gone, so a `partial_size` larger than `window_size` cannot see further
    back than `window_size`.
    """
    output = 0

    for bucket in self.buckets:
      if bucket.is_complete_out_window(partial_size, self.counter):
        break
      if bucket.is_partial_out_window(partial_size, self.counter):
        output += 0.5 * bucket.size
        break
      output += bucket.size

    return output

  def visualize(self):
    """Print the window size, current position and every bucket, newest first."""
    print(f"Window Size = {self.window_size}")
    print(f"Counter = {self.counter}")
    print("Buckets = ")
    for bucket in self.buckets:
      print(bucket)

In [55]:
# Feed the first 100 stream values through a 16-wide DGIM window, printing
# each value followed by the estimator's state after reading it.
dgim = DGIM(window_size=16)

for data in dataset[:100]:
  print("================", data, sep="\n")
  dgim.read(data)
  dgim.visualize()

0
Window Size = 16
Counter = 0
Buckets = 
0
Window Size = 16
Counter = 1
Buckets = 
1
Window Size = 16
Counter = 2
Buckets = 
start = 2, end = 2, size = 1 False False
0
Window Size = 16
Counter = 3
Buckets = 
start = 2, end = 2, size = 1 False False
1
Window Size = 16
Counter = 4
Buckets = 
start = 4, end = 4, size = 1 False False
start = 2, end = 2, size = 1 False False
0
Window Size = 16
Counter = 5
Buckets = 
start = 4, end = 4, size = 1 False False
start = 2, end = 2, size = 1 False False
1
Window Size = 16
Counter = 6
Buckets = 
start = 6, end = 6, size = 1 False False
start = 2, end = 4, size = 2 False False
1
Window Size = 16
Counter = 7
Buckets = 
start = 7, end = 7, size = 1 False False
start = 6, end = 6, size = 1 False False
start = 2, end = 4, size = 2 False False
1
Window Size = 16
Counter = 8
Buckets = 
start = 8, end = 8, size = 1 False False
start = 6, end = 7, size = 2 False False
start = 2, end = 4, size = 2 False False
0
Window Size = 16
Counter = 9
Buckets = 
start 