# Import libraries

In [None]:
from sklearn.datasets import fetch_openml
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Load data

In [None]:
# Fetch the OpenML "mnist_784" dataset as plain NumPy arrays (features X, labels y).
X, y = fetch_openml("mnist_784", version=1, return_X_y=True, as_frame=False)
# One-hot encode the targets: row k of the 10x10 identity matrix encodes class k.
Y = np.identity(10)[y.astype(int)]

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 10,000 samples as the validation set; the fixed seed keeps the split reproducible.
X_train, X_val, Y_train, Y_val = train_test_split(
    X,
    Y,
    random_state=42,
    test_size=10000,
)

# Value

In [None]:
class Value:
  """Scalar node of a dynamically built computation graph (reverse-mode autodiff).

  Every operation returns a new ``Value`` that remembers its operands in
  ``_prev`` and stores a closure in ``_backward`` which adds this node's
  contribution to the operands' ``grad``.  ``backward()`` applies the chain
  rule over the whole graph that produced the node.
  """

  def __init__(self, data, _children=(), _op="", label=""):
    self.data = data               # Numeric payload of the node
    self.grad = 0                  # Accumulated gradient, starts at 0
    self._backward = lambda: None  # Local backward function (no-op for leaves)
    self._prev = set(_children)    # Operand nodes this value was computed from
    self._op = _op                 # Name of the operation that produced the node
    self.label = label             # Human-readable label (e.g. x, net, o, h)

  def __getstate__(self):
    # The backward closures cannot be pickled, so a Value is serialized as a
    # detached leaf: only its number, gradient and descriptive fields survive.
    return {
      "data": self.data,
      "grad": self.grad,
      "_op": self._op,
      "label": self.label,
    }

  def __setstate__(self, state):
    self.__dict__.update(state)
    self._backward = lambda: None
    self._prev = set()

  def __repr__(self):
    """Show the wrapped number when printed."""
    return f"Value({self.data})"

  def __mul__(self, other):
    """Return ``self * other``; a ValueTensor operand is multiplied element-wise."""
    if isinstance(other, ValueTensor):
      # Delegate to the tensor implementation (multiplication, not addition).
      return other * self
    if not isinstance(other, Value):
      other = Value(other)

    out = Value(self.data * other.data, (self, other), "*")

    # Local backpropagation: d(out)/d(self) = other, d(out)/d(other) = self
    def _backward():
      other.grad += self.data * out.grad
      self.grad += other.data * out.grad

    out._backward = _backward
    return out

  def __rmul__(self, other):
    """Return ``other * self``."""
    return self * other

  def __pow__(self, other):
    """Return ``self ** other``.

    The exponent is always treated as a constant: when it is a ``Value`` only
    its number is used and no gradient flows into it.
    """
    if isinstance(other, Value):
      other = float(other.data)
    elif not isinstance(other, (int, float)):
      other = float(other)

    out = Value(self.data**other, (self,), f"**{other}")

    def _backward():
      self.grad += other * (self.data**(other - 1)) * out.grad

    out._backward = _backward
    return out

  def __rpow__(self, other):
    """Return ``other ** self``; the base is treated as a constant."""
    if isinstance(other, Value):
      other = float(other.data)
    elif not isinstance(other, (int, float)):
      other = float(other)

    out = Value(other**self.data, (self,), f"{other}**")

    def _backward():
      # d(b**x)/dx = b**x * ln(b)
      self.grad += out.data * np.log(other) * out.grad

    out._backward = _backward
    return out

  def __add__(self, other):
    """Return ``self + other``; a ValueTensor operand is added element-wise."""
    if isinstance(other, ValueTensor):
      return other + self
    if not isinstance(other, Value):
      other = Value(other)

    out = Value(self.data + other.data, (self, other), "+")

    def _backward():
      other.grad += out.grad
      self.grad += out.grad

    out._backward = _backward
    return out

  def __radd__(self, other):
    """Return ``other + self``."""
    return self + other

  def __neg__(self):
    """Return ``-self``."""
    return self * -1

  def __sub__(self, other):
    """Return ``self - other``."""
    return self + (-other)

  def __rsub__(self, other):
    """Return ``other - self``."""
    return other + (-self)

  def __truediv__(self, other):
    """Return ``self / other``."""
    return self * other**(-1)

  def __rtruediv__(self, other):
    """Return ``other / self``."""
    return other * self**(-1)

  def exp(self):
    """Return ``e ** self``."""
    out = Value(np.exp(self.data), (self,), "e**")

    def _backward():
      self.grad += out.data * out.grad

    out._backward = _backward
    return out

  def log(self):
    """Return the natural logarithm of ``self``."""
    out = Value(np.log(self.data), (self,), "log")

    def _backward():
      self.grad += 1/self.data * out.grad

    out._backward = _backward
    return out

  def clip(self, min_val, max_val):
    """Clamp the value to ``[min_val, max_val]``.

    The gradient passes through only when the input lies strictly inside the
    interval; otherwise nothing is propagated.
    """
    out = Value(np.clip(self.data, min_val, max_val), (self,), "clip")

    def _backward():
      if min_val < self.data < max_val:
        self.grad += out.grad

    out._backward = _backward
    return out

  @staticmethod
  def _topological_order(root):
    """List every node reachable from ``root``, operands before their results.

    Implemented with an explicit stack rather than recursion so that long
    operation chains cannot exceed Python's recursion limit.
    """
    order = []
    seen = {root}
    stack = [(root, iter(root._prev))]
    while stack:
      node, pending = stack[-1]
      for child in pending:
        if child not in seen:
          seen.add(child)
          stack.append((child, iter(child._prev)))
          break
      else:
        # All operands handled: the node itself can be emitted.
        order.append(node)
        stack.pop()
    return order

  def backward(self):
    """Compute d(self)/d(node) for every node in the graph that produced ``self``.

    All gradients in that graph are reset to 0 first, then ``self.grad`` is
    seeded with 1 and the local backward functions run in reverse topological
    order.
    """
    topo = self._topological_order(self)

    for val in topo:
      val.grad = 0

    self.grad = 1
    for val in reversed(topo):
      val._backward()

  # Activation functions
  def lin(self):
    """Linear (identity) activation."""
    out = Value(self.data, (self,), "Linear")

    def _backward():
      self.grad += 1 * out.grad

    out._backward = _backward
    return out

  def relu(self):
    """ReLU activation: ``max(0, x)``; the derivative at 0 is taken as 0."""
    out = Value(max(0, self.data), (self,), "ReLU")

    def _backward():
      self.grad += (0 if self.data <= 0 else 1) * out.grad

    out._backward = _backward
    return out

  def sigmoid(self):
    """Logistic sigmoid activation."""
    out = Value(1/(1 + np.exp(-self.data)), (self,), "Sigmoid")

    def _backward():
      self.grad += out.data * (1 - out.data) * out.grad

    out._backward = _backward
    return out

  def tanh(self):
    """Hyperbolic tangent activation."""
    # np.tanh stays finite for large |x|, where the explicit exp-quotient
    # would evaluate to inf/inf.
    out = Value(np.tanh(self.data), (self,), "tanh")

    def _backward():
      # d tanh(x)/dx = 1 - tanh(x)**2
      self.grad += (1 - out.data**2) * out.grad

    out._backward = _backward
    return out


In [None]:
# N-dimensional container (tensor) of Value objects
class ValueTensor:
  """NumPy object array of ``Value`` nodes with element-wise autograd operations.

  Attributes set by the constructor:
    data:  array holding the ``Value`` objects
    shape: shape of the wrapped array
    dim:   number of dimensions
    label: array with the label of every element
  """

  def __init__(self, data, label="(h)"):
    # Copy-constructor: share the underlying Values of an existing tensor.
    if isinstance(data, ValueTensor):
      self.shape = data.shape
      self.dim = data.dim
      self.data = data.data
      self.label = data.label
      return

    if isinstance(data, (list, int, float, Value)):
      data = np.array(data, dtype=object)

    self.shape = data.shape
    self.dim = len(self.shape)
    # 1-based position along the last axis, used as a suffix for generated labels.
    if self.dim > 0:
      ilabel = np.full(self.shape, np.arange(1, self.shape[-1]+1))
    else:
      ilabel = np.full(self.shape, np.arange(1, 1+1))
    # Wrap raw numbers into labelled Values; existing Values are kept as they are.
    ufunc = np.frompyfunc(lambda val, i: Value(val, label=f"{label}{i}") if not isinstance(val, Value) else val, 2, 1)
    self.data = ufunc(data, ilabel)
    self.label = np.vectorize(lambda x: x.label, otypes=[object])(self.data)

  @staticmethod
  def _wrap(item):
    """Return ``item`` unchanged when it is a Value, otherwise wrap it in one."""
    return item if isinstance(item, Value) else Value(item)

  def _numbers(self):
    """Float array holding the ``data`` of every element."""
    # An explicit output type stops NumPy from inferring an integer dtype from
    # the first element and truncating the floats that follow.
    return np.vectorize(lambda x: x.data, otypes=[float])(self.data)

  @property
  def grad(self):
    """Float array holding the ``grad`` of every element."""
    return np.vectorize(lambda x: x.grad, otypes=[float])(self.data)

  @property
  def T(self):
    """Transposed tensor sharing the same Values."""
    return ValueTensor(self.data.T)

  def __repr__(self):
    if self.dim > 1:
      return f"ValueTensor(\n{self._numbers()})"
    else:
      return f"ValueTensor({self._numbers()})"

  def __getitem__(self, idx):
    """Index like a NumPy array; sub-arrays come back as ValueTensor, single items as Value."""
    item = self.data[idx]
    if isinstance(item, np.ndarray):
      return ValueTensor(item)
    return item

  def __setitem__(self, idx, val):
    """Assign a number, Value, list/array or ValueTensor to ``self[idx]``.

    Raises:
      TypeError: if ``val`` is of an unsupported type.
    """
    if isinstance(val, (int, float, np.number)):
      self.data[idx] = Value(val)

    elif isinstance(val, Value):
      self.data[idx] = val

    elif isinstance(val, (list, np.ndarray)):
      val = np.array(val, dtype=object)
      self.data[idx] = np.vectorize(self._wrap, otypes=[object])(val)

    elif isinstance(val, ValueTensor):
      self.data[idx] = val.data

    else:
      # Fail loudly instead of silently dropping the assignment.
      raise TypeError("Unsupported type for item assignment")

  def append(self, val, axis=0, label="b"):
    """Return a new tensor with ``val`` appended along ``axis`` (see ``np.append``).

    Raises:
      TypeError: if ``val`` is not a number, list, array or ValueTensor.
    """
    if isinstance(val, (int, float)):
      val = Value(val)

    elif isinstance(val, (list, np.ndarray)):
      val = np.array(val, dtype=object)
      val = np.vectorize(self._wrap, otypes=[object])(val)

    elif isinstance(val, ValueTensor):
      val = val.data

    else:
      raise TypeError("Unsupported type for append")

    new_data = np.append(self.data, val, axis=axis)

    return ValueTensor(new_data, label=label)

  def sum(self, axis=0, **kwargs):
    """Sum of the elements along ``axis``; extra keywords go to ``np.sum``."""
    result = np.sum(self.data, axis=axis, **kwargs)
    return ValueTensor(result)

  def mean(self, axis=0, **kwargs):
    """Mean of the elements along ``axis``; extra keywords go to ``np.mean``."""
    result = np.mean(self.data, axis=axis, **kwargs)
    return ValueTensor(result)

  def clip(self, min_val, max_val):
    """Element-wise ``Value.clip``."""
    result = np.vectorize(lambda x: x.clip(min_val, max_val), otypes=[object])(self.data)
    return ValueTensor(result)

  def _elementwise(self, other, op):
    """Apply the binary ``op`` between every element and ``other``.

    ``other`` may be a number or Value (applied to every element), a
    0-dimensional ValueTensor (broadcast), a ValueTensor of identical shape,
    or a list/array that is first converted to a ValueTensor.  Any other type
    yields ``NotImplemented`` so that Python raises a regular TypeError.
    """
    if isinstance(other, (int, float, Value)):
      # A Value operand is used as-is so that it stays part of the graph and
      # receives its gradient.
      result = np.vectorize(lambda x: op(x, other), otypes=[object])(self.data)
      return ValueTensor(result)

    if isinstance(other, (list, np.ndarray)):
      other = ValueTensor(other)

    if isinstance(other, ValueTensor):
      if other.dim != 0 and self.shape != other.shape:
        raise ValueError("Shapes do not match")
      result = np.vectorize(op, otypes=[object])(self.data, other.data)
      return ValueTensor(result)

    return NotImplemented

  def __add__(self, other):
    """Element-wise ``self + other``."""
    return self._elementwise(other, lambda x, y: x + y)

  def __radd__(self, other):
    """Element-wise ``other + self``."""
    return self + other

  def __mul__(self, other):
    """Element-wise ``self * other``."""
    return self._elementwise(other, lambda x, y: x * y)

  def __rmul__(self, other):
    """Element-wise ``other * self``."""
    return self * other

  def __pow__(self, other):
    """Element-wise ``self ** other`` (exponents are treated as constants by Value)."""
    if isinstance(other, (int, float, Value)):
      exponent = other.data if isinstance(other, Value) else other
      result = np.vectorize(lambda x: x ** exponent, otypes=[object])(self.data)
      return ValueTensor(result)

    if isinstance(other, (list, np.ndarray)):
      other = ValueTensor(other)

    if isinstance(other, ValueTensor):
      if self.shape != other.shape:
        raise ValueError("Shapes do not match")
      result = np.vectorize(lambda x, y: x ** y, otypes=[object])(self.data, other.data)
      return ValueTensor(result)

    return NotImplemented

  def __rpow__(self, other):
    """Element-wise ``other ** self``."""
    if isinstance(other, (int, float, Value)):
      base = other.data if isinstance(other, Value) else other
      result = np.vectorize(lambda x: base ** x, otypes=[object])(self.data)
      return ValueTensor(result)

    if isinstance(other, (list, np.ndarray)):
      other = ValueTensor(other)

    if isinstance(other, ValueTensor):
      if self.shape != other.shape:
        raise ValueError("Shapes do not match")
      result = np.vectorize(lambda x, y: x ** y, otypes=[object])(other.data, self.data)
      return ValueTensor(result)

    return NotImplemented

  def exp(self):
    """Element-wise ``e ** self``."""
    result = np.vectorize(lambda x: x.exp(), otypes=[object])(self.data)
    return ValueTensor(result)

  def log(self):
    """Element-wise natural logarithm."""
    result = np.vectorize(lambda x: x.log(), otypes=[object])(self.data)
    return ValueTensor(result)

  @staticmethod
  def _matrix_product(left, right):
    """Matrix product of two 2-D ValueTensors built from Value multiply/add nodes."""
    if left.shape[-1] != right.shape[0]:
      raise ValueError("Shapes do not match for matrix multiplication")

    rows, inner, cols = left.shape[0], left.shape[1], right.shape[1]
    result_data = np.empty((rows, cols), dtype=object)

    for i in range(rows):
      for j in range(cols):
        result_data[i, j] = sum(left.data[i, k] * right.data[k, j] for k in range(inner))

    return ValueTensor(result_data)

  def __matmul__(self, other):
    """Return ``self @ other`` for two 2-D ValueTensors.

    Raises:
      TypeError: if ``other`` is not a ValueTensor.
      ValueError: if the inner dimensions differ.
    """
    if not isinstance(other, ValueTensor):
      raise TypeError(f"Cannot multiply ValueTensor with {type(other)}")
    return self._matrix_product(self, other)

  def __rmatmul__(self, other):
    """Return ``other @ self``; like ``__matmul__`` it only accepts a ValueTensor."""
    if not isinstance(other, ValueTensor):
      raise TypeError(f"Cannot right-multiply ValueTensor with {type(other)}")
    return self._matrix_product(other, self)

  def __neg__(self):
    """Return ``-self``."""
    return self * -1

  def __sub__(self, other):
    """Return ``self - other``."""
    return self + (-other)

  def __rsub__(self, other):
    """Return ``other - self``."""
    return other + (-self)

  def __truediv__(self, other):
    """Return ``self / other``."""
    return self * other**(-1)

  def __rtruediv__(self, other):
    """Return ``other / self``."""
    return other * self**(-1)

  def linear(self):
    """Element-wise linear (identity) activation."""
    result = np.vectorize(lambda x: x.lin(), otypes=[object])(self.data)
    return ValueTensor(result)

  def relu(self):
    """Element-wise ReLU activation."""
    result = np.vectorize(lambda x: x.relu(), otypes=[object])(self.data)
    return ValueTensor(result)

  def sigmoid(self):
    """Element-wise sigmoid activation."""
    result = np.vectorize(lambda x: x.sigmoid(), otypes=[object])(self.data)
    return ValueTensor(result)

  def tanh(self):
    """Element-wise tanh activation."""
    result = np.vectorize(lambda x: x.tanh(), otypes=[object])(self.data)
    return ValueTensor(result)

  def softmax(self, axis=-1):
    """Softmax along ``axis``.

    NOTE(review): the normalising sum is computed from plain numbers and
    wrapped in a fresh ``Value``, so for autograd it is a constant; gradients
    obtained through ``backward()`` therefore contain only the ``exp(x) / sum``
    term, not the full softmax Jacobian.  ``Layer`` currently builds on this
    behaviour, so it is kept as is.
    """
    exp_data = self.exp()
    sum_exp = np.sum(exp_data._numbers(), axis=axis, keepdims=True)

    result = np.vectorize(lambda x, s: x / Value(s), otypes=[object])(exp_data.data, sum_exp)
    out = ValueTensor(result)

    # NOTE(review): this Jacobian-based helper is only attached to the returned
    # tensor; ValueTensor.backward() never calls it, and it accumulates into
    # the gradients of the outputs themselves (it assumes a 2-D tensor).
    def _backward():
      soft_vals = out._numbers()
      grad_output = out.grad

      for i in range(soft_vals.shape[0]):
        s = soft_vals[i].reshape(-1, 1)
        jacobian = np.diagflat(s) - (s @ s.T)

        grad_input = jacobian @ grad_output[i].reshape(-1, 1)

        for j in range(soft_vals.shape[1]):
          out.data[i, j].grad += grad_input[j, 0]

    out._backward = _backward

    return out

  def backward(self):
    """Backpropagate from every element of the tensor at once.

    All gradients in the graph reachable from the tensor are reset to 0, each
    element of the tensor is seeded with gradient 1, and the local backward
    functions run in reverse topological order.
    """
    roots = list(np.ravel(self.data))

    # Post-order depth-first traversal with an explicit stack, so that long
    # operation chains cannot exceed Python's recursion limit.
    topo = []
    seen = set()
    for root in roots:
      if root in seen:
        continue
      seen.add(root)
      stack = [(root, iter(root._prev))]
      while stack:
        node, pending = stack[-1]
        for child in pending:
          if child not in seen:
            seen.add(child)
            stack.append((child, iter(child._prev)))
            break
        else:
          topo.append(node)
          stack.pop()

    for val in topo:  # reset gradients to 0
      val.grad = 0

    for val in roots:
      val.grad = 1

    for val in reversed(topo):
      val._backward()

class criterion:
  """Loss functions and their derivatives with respect to the predictions.

  All functions are static: call them as ``criterion.mse(...)``.
  """

  # Losses
  @staticmethod
  def mse(y_true, y_pred):
    """Mean squared error, averaged over the last axis and then over the rows."""
    y_true = ValueTensor(y_true)
    y_pred = ValueTensor(y_pred)

    mean_ = ((y_true-y_pred)**2).mean(axis=-1)
    mean_ = ValueTensor(np.expand_dims(mean_.data, axis=0))
    return mean_.mean(axis=-1)

  @staticmethod
  def binary_cross_entropy(y_true, y_pred):
    """Binary cross-entropy; predictions are clipped to [1e-10, 1 - 1e-10] before the log."""
    y_true = ValueTensor(y_true)
    y_pred = ValueTensor(y_pred.clip(1e-10, 1 - 1e-10))

    positive_term = y_true * y_pred.log()
    negative_term = (1 - y_true) * (1 - y_pred).log()
    per_sample = (positive_term + negative_term).mean(axis=-1)
    per_sample = ValueTensor(np.expand_dims(per_sample.data, axis=0))
    return -per_sample.mean(axis=-1)

  @staticmethod
  def categorical_cross_entropy(y_true, y_pred):
    """Categorical cross-entropy; predictions are clipped to [1e-10, 1 - 1e-10] before the log."""
    y_true = ValueTensor(y_true)
    y_pred = ValueTensor(y_pred.clip(1e-10, 1 - 1e-10))

    per_sample = (y_true * y_pred.log()).sum(axis=-1)
    per_sample = ValueTensor(np.expand_dims(per_sample.data, axis=0))
    return -per_sample.mean(axis=-1)

  # Derivatives of the losses with respect to the predictions, divided by the
  # number of rows (y_pred.shape[0]).  They are not yet multiplied by the
  # derivative of the activation function.
  @staticmethod
  def mse_errors(y_true, y_pred):
    """d(MSE)/d(y_pred): ``2 * (y_pred - y_true) / rows``."""
    return -2 * (y_true - y_pred) / y_pred.shape[0]

  @staticmethod
  def bce_errors(y_true, y_pred):
    """d(BCE)/d(y_pred): ``(y_pred - y_true) / (y_pred * (1 - y_pred) * rows)``."""
    # -[y/p - (1-y)/(1-p)] simplifies to (p - y) / (p (1 - p)); the sign must
    # be positive for gradient descent to reduce the loss.
    return (y_pred - y_true) / (y_pred * (1 - y_pred) * y_pred.shape[0])

  @staticmethod
  def cce_errors(y_true, y_pred):
    """d(CCE)/d(y_pred): ``-y_true / (y_pred * rows)``."""
    return -1 * y_true / (y_pred * y_pred.shape[0])

# Layer

In [None]:
class initialization:
  """Weight initialisers for one layer.

  Available schemes: zero, uniform and normal distributions, each of the
  random ones optionally with Xavier or He scaling.  ``size`` is the tuple
  (number of input neurons, number of output neurons).  All functions are
  static: call them as ``initialization.uniform(...)``.
  """

  @staticmethod
  def zero(size):
    """Return an all-zero array of shape ``size``."""
    return np.zeros(size)

  @staticmethod
  def uniform(size, lower_bound=-1, upper_bound=1, seed=None, method="random"):
    """Sample weights from a uniform distribution.

    Args:
      size: shape of the weight matrix.
      lower_bound, upper_bound: sampling interval for ``method="random"``.
      seed: when given, NumPy's global random generator is re-seeded with it.
      method: ``"random"`` uses the given bounds; ``"xavier"`` uses
        +/- sqrt(6 / (size[0] + size[1])); ``"he"`` uses +/- sqrt(6 / size[0]).
        Any other value behaves like ``"random"``.
    """
    if seed is not None:
      np.random.seed(seed)

    low, high = lower_bound, upper_bound
    if method == "xavier":
      limit = np.sqrt(6 / (size[0] + size[1]))
      low, high = -limit, limit
    elif method == "he":
      limit = np.sqrt(6 / size[0])
      low, high = -limit, limit

    return np.random.uniform(low=low, high=high, size=size)

  @staticmethod
  def normal(size, mean=0, std=1, seed=None, method="random"):
    """Sample weights from a normal distribution.

    Args:
      size: shape of the weight matrix.
      mean, std: distribution parameters for ``method="random"``.
      seed: when given, NumPy's global random generator is re-seeded with it.
      method: ``"random"`` uses ``mean``/``std``; ``"xavier"`` uses mean 0 and
        std sqrt(2 / (size[0] + size[1])); ``"he"`` uses mean 0 and std
        sqrt(2 / size[0]).  Any other value behaves like ``"random"``.
    """
    if seed is not None:
      np.random.seed(seed)

    loc, scale = mean, std
    if method == "xavier":
      loc, scale = 0, np.sqrt(2 / (size[0] + size[1]))
    elif method == "he":
      loc, scale = 0, np.sqrt(2 / size[0])

    return np.random.normal(loc=loc, scale=scale, size=size)

In [None]:
class Layer:
  """Fully connected layer: the weights between one layer of neurons and the next.

  The weight matrix has shape (input_size + 1, output_size); its last row
  holds the biases, matching the column of ones that ``forward`` appends to
  the inputs.
  """

  # Maps an activation name to the ValueTensor method implementing it;
  # unknown names fall back to the linear activation.
  _ACTIVATION_METHODS = {
    "relu": "relu",
    "sigmoid": "sigmoid",
    "tanh": "tanh",
    "softmax": "softmax",
  }

  def __init__(self, input_size, output_size, activation_function="linear", weight_init="normal", weight_low_or_mean=None, weight_high_or_std=1, weight_seed=None, weight_type="random"):
    """Create the layer and initialise its weights.

    Args:
      input_size: number of neurons in this layer.
      output_size: number of neurons in the next layer.
      activation_function: "linear", "relu", "sigmoid", "tanh" or "softmax".
      weight_init: "zero", "uniform" or "normal" (anything else means zero).
      weight_low_or_mean: lower bound (uniform, default -1) or mean (normal, default 0).
      weight_high_or_std: upper bound (uniform) or standard deviation (normal).
      weight_seed: seed for the weight initialisation, for reproducibility.
      weight_type: "random", "xavier" or "he".
    """
    self.input_size = input_size
    self.output_size = output_size
    self.activation_function = activation_function

    weight_shape = (input_size + 1, output_size)  # +1 row for the bias
    if weight_init == "uniform":
      low = -1 if weight_low_or_mean is None else weight_low_or_mean
      weights_array = initialization.uniform(weight_shape, low, weight_high_or_std, weight_seed, weight_type)
    elif weight_init == "normal":
      mean = 0 if weight_low_or_mean is None else weight_low_or_mean
      weights_array = initialization.normal(weight_shape, mean, weight_high_or_std, weight_seed, weight_type)
    else:  # weight_init == "zero"
      weights_array = initialization.zero(weight_shape)
    self.weights = ValueTensor(weights_array)

    self.neuron_values = None   # inputs of the last forward pass plus the bias column
    self.next_raw = None        # next layer's values before the activation function
    self.next_activated = None  # next layer's values after the activation function
    self.next_error = None      # gradient w.r.t. next_raw, used for the weight update

  def forward(self, inputs):
    """Forward propagation for one batch; returns the activated outputs."""
    if not isinstance(inputs, ValueTensor): inputs = ValueTensor(inputs)

    # Append a column of ones so that the last weight row acts as the bias.
    self.neuron_values = ValueTensor(np.hstack((inputs.data, np.ones((inputs.shape[0], 1)))))
    self.next_raw = self.neuron_values @ self.weights

    method_name = self._ACTIVATION_METHODS.get(self.activation_function, "linear")
    self.next_activated = getattr(self.next_raw, method_name)()

    return self.next_activated

  def backward_and_update_weights(self, next_gradients, learning_rate, is_last):
    """Backpropagate through this layer and apply one gradient-descent step.

    Args:
      next_gradients: gradient of the loss w.r.t. this layer's activated
        outputs (the following layer's error already multiplied by its weights).
      learning_rate: step size of the update.
      is_last: True when this is the last layer handled by the backward pass
        (i.e. the first layer of the network), so nothing has to be passed on.

    Returns:
      The gradient w.r.t. this layer's inputs (bias excluded) as a
      ValueTensor, or None when ``is_last`` is true.

    Raises:
      ValueError: if ``next_gradients`` does not have the shape of the outputs.
    """
    if not isinstance(next_gradients, ValueTensor): next_gradients = ValueTensor(next_gradients)

    # Fills in the derivative of the activation function for every element
    # of next_raw.
    self.next_activated.backward()

    to_numbers = np.vectorize(lambda v: v.data, otypes=[float])
    activation_grad = np.vectorize(lambda v: v.grad, otypes=[float])(self.next_raw.data)
    upstream = to_numbers(next_gradients.data)
    if upstream.ndim != 0 and upstream.shape != activation_grad.shape:
      raise ValueError("Shapes do not match")

    # The update is computed on plain numbers and stored in fresh leaf Values.
    # Running it through the autograd operators would chain the new weights to
    # the graphs of all previous batches, which then grow without bound.
    error = activation_grad * upstream
    self.next_error = ValueTensor(error)

    weights = to_numbers(self.weights.data)
    layer_inputs = to_numbers(self.neuron_values.data)

    propagated = None
    if not is_last:
      # Uses the weights from before the update, without the bias row.
      propagated = ValueTensor(error @ weights[:-1].T)

    self.weights = ValueTensor(weights - learning_rate * (layer_inputs.T @ error))

    return propagated

In [None]:
class OutputLayer:
  """Holds the network's predictions and targets and evaluates the chosen loss."""

  def __init__(self, output_size, loss_function="mse"):
    self.predicted = None          # ValueTensor with the predictions
    self.target = None             # ValueTensor with the target outputs
    self.loss_function = loss_function  # "mse", "bce" or "cce"
    self.loss = None               # value of the loss
    self.loss_derivatives = None   # loss derivative w.r.t. the predictions (not yet multiplied by the activation derivative)

  def setPredictions(self, predicted, target):
    """Store predictions and targets, converting them to ValueTensor when needed."""
    self.predicted = predicted if isinstance(predicted, ValueTensor) else ValueTensor(predicted)
    self.target = target if isinstance(target, ValueTensor) else ValueTensor(target)

  def calculateLoss(self):
    """Compute ``self.loss``; assumes ``setPredictions`` has been called."""
    if self.loss_function == "bce":
      loss_fn = criterion.binary_cross_entropy
    elif self.loss_function == "cce":
      loss_fn = criterion.categorical_cross_entropy
    else:
      # Anything else is treated as "mse".
      loss_fn = criterion.mse
    self.loss = loss_fn(self.target, self.predicted)

  def lossDerivatives(self):
    """Compute ``self.loss_derivatives``; assumes ``setPredictions`` has been called."""
    if self.loss_function == "bce":
      derivative_fn = criterion.bce_errors
    elif self.loss_function == "cce":
      derivative_fn = criterion.cce_errors
    else:
      # Anything else is treated as "mse".
      derivative_fn = criterion.mse_errors
    self.loss_derivatives = derivative_fn(self.target, self.predicted)

# Model

In [None]:
from tqdm import tqdm  # Progress bar
import pickle

class FFNN:
  """Feed-forward neural network built from ``Layer`` objects and one ``OutputLayer``."""

  def __init__(self, input_size, hidden_size_array, output_size, activation_function, loss_function, weight_init):
    """Build the network.

    Args:
      input_size: number of input features.
      hidden_size_array: number of neurons of every hidden layer.
      output_size: number of output neurons.
      activation_function: one activation name per layer (output included).
      loss_function: "mse", "bce" or "cce"; used by the output layer only.
      weight_init: one tuple per layer: (weight_init, weight_low_or_mean,
        weight_high_or_std, weight_seed, weight_type).
    """
    self.input_size = input_size
    self.hidden_size_array = np.array(hidden_size_array).astype(int)
    self.output_size = output_size
    # Neuron count of every layer, input and output layers included.
    self.num_neurons = np.insert(hidden_size_array, 0, input_size)
    self.num_neurons = np.append(self.num_neurons, output_size).astype(int)
    self.activation_function = activation_function
    self.loss_function = loss_function
    self.weight_init = weight_init

    # Assumes the model has at least one hidden layer.
    self.input_and_hidden_layers = [
      Layer(
        self.num_neurons[i],
        self.num_neurons[i+1],
        activation_function[i],
        weight_init[i][0],
        weight_init[i][1],
        weight_init[i][2],
        weight_init[i][3],
        weight_init[i][4],
      )
      for i in range(len(hidden_size_array) + 1)
    ]
    self.output_layer = OutputLayer(output_size, loss_function)

  def forward_propagation(self, data, target):
    """Run one forward pass over a batch and return the loss tensor."""
    values = data
    for layer in self.input_and_hidden_layers:
      values = layer.forward(values)
    self.output_layer.setPredictions(values, target)
    self.output_layer.calculateLoss()
    return self.output_layer.loss

  def back_propagation(self, learning_rate):
    """Run one backward pass and update every layer's weights.

    Assumes ``forward_propagation`` has been called for the same batch.
    """
    self.output_layer.lossDerivatives()
    gradients = self.output_layer.loss_derivatives
    first_layer, *later_layers = self.input_and_hidden_layers
    for layer in reversed(later_layers):
      gradients = layer.backward_and_update_weights(gradients, learning_rate, False)
    # The first layer has nothing before it to pass a gradient to.
    first_layer.backward_and_update_weights(gradients, learning_rate, True)
    return

  @staticmethod
  def _split_batches(samples, batch_size):
    """Split ``samples`` into chunks of at most ``batch_size`` rows ([] when empty)."""
    if len(samples) == 0:
      return []
    return np.array_split(samples, int(np.ceil(len(samples) / batch_size)))

  @staticmethod
  def _loss_to_float(loss):
    """Reduce the loss tensor returned by ``forward_propagation`` to a plain float."""
    return float(np.mean([value.data for value in np.ravel(loss.data)]))

  def train_model(self, batch_size, learning_rate, num_epochs, x_train, y_train, x_val, y_val, verbose=0):
    """Train with mini-batch gradient descent.

    Args:
      batch_size: maximum number of samples per batch (must be positive).
      learning_rate: step size of the weight updates.
      num_epochs: number of passes over the training data.
      x_train, y_train: training inputs and targets.
      x_val, y_val: validation inputs and targets.
      verbose: 0 shows nothing; 1 shows a progress bar and the training and
        validation loss of every epoch.

    Returns:
      Two lists of floats with the mean training loss and the mean validation
      loss of every epoch (NaN when the corresponding set is empty).

    Raises:
      ValueError: if ``batch_size`` is not positive.
    """
    if batch_size <= 0:
      raise ValueError("batch_size must be a positive integer")

    X_batches_train = self._split_batches(x_train, batch_size)
    Y_batches_train = self._split_batches(y_train, batch_size)
    X_batches_val = self._split_batches(x_val, batch_size)
    Y_batches_val = self._split_batches(y_val, batch_size)
    total_batches = len(X_batches_train) + len(X_batches_val)

    training_loss_array = []
    val_loss_array = []
    for i in range(num_epochs):
      progress = None
      if verbose == 1:  # show progress bar
        progress = tqdm(total=total_batches, desc=f"Epoch {i+1}/{num_epochs}", unit="batch")

      train_losses = []
      for x_batch, y_batch in zip(X_batches_train, Y_batches_train):
        batch_loss = self._loss_to_float(self.forward_propagation(x_batch, y_batch))
        train_losses.append(batch_loss)
        if progress is not None:
          progress.set_postfix({"Batch Loss": batch_loss})
          progress.update(1)
        self.back_propagation(learning_rate)
      training_loss_array.append(float(np.mean(train_losses)) if train_losses else float("nan"))

      # Validation only needs the forward pass; the weights are not touched.
      val_losses = []
      for x_batch, y_batch in zip(X_batches_val, Y_batches_val):
        val_losses.append(self._loss_to_float(self.forward_propagation(x_batch, y_batch)))
        if progress is not None:
          progress.update(1)
      val_loss_array.append(float(np.mean(val_losses)) if val_losses else float("nan"))

      if progress is not None:
        progress.close()
        print(f"Epoch {i+1}: Train Loss = {training_loss_array[i]}, Val Loss = {val_loss_array[i]}")
    return training_loss_array, val_loss_array

  # def weight_distribution

  # def gradient_distribution

  def save_model(self, filename):
    """Pickle the whole model to ``filename``."""
    with open(filename, "wb") as f:
      pickle.dump(self, f)
    print(f"Model saved to {filename}")

  @staticmethod
  def load_model(filename):
    """Load a model previously written by ``save_model``.

    WARNING: unpickling can execute arbitrary code; only load files that come
    from a trusted source.
    """
    with open(filename, "rb") as f:
      model = pickle.load(f)
    print(f"Model loaded from {filename}")
    return model