# Questão A

## Instalando as dependências

In [None]:
%pip install -q matplotlib
%pip install -q numpy

## Importando o numpy e o matplotlib

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from matplotlib.axes import Axes
from matplotlib.figure import Figure
from matplotlib.lines import Line2D
from matplotlib.animation import FuncAnimation

from typing import Any, Callable, Iterable, Literal

from random import randint

from abc import ABC, abstractmethod

from IPython.display import HTML

## Criando as funções básicas

### Função linear para calcular os valores dos pontos

`x` será o conjunto de pontos para os quais se deseja calcular $f(x)$. `x` será um numpy array para permitir um 
cálculo vetorial mais eficiente.

A ideia de vetorizar se dá a partir dos seguintes passos:

A função linear se dá pela fórmula $f(x) = ax + b$, substituindo o $f(x)$ por $y$, temos: $y = ax + b$. 
Faremos isso para todo o conjunto de pontos, então podemos escrever:

$\{y_1, y_2, ..., y_n\} = \{ax_1 + b, ax_2 + b, \cdots, ax_n + b\}$

Agora podemos transformar o conjunto de $y$ como um vetor $\vec{y}$ de n dimensões:

$\vec{y} = \{ax_1 + b, ax_2 + b, \cdots, ax_n + b\}$

Outra forma de visualização de $\vec{y}$ é por meio matricial:

$$
\vec{y} = 
\begin{pmatrix}
y_1    \\
y_2    \\
\vdots \\
y_n
\end{pmatrix}
$$

Podemos isolar o $b$, que se repete em todas as linhas, como um vetor $\vec{b}$ de $n$ dimensões:

$\vec{y} = \{ax_1, ax_2, \cdots, ax_n\} + \vec{b}$

Ou, na forma matricial:

$$
\begin{pmatrix}
y_1    \\
y_2    \\
\vdots \\
y_n
\end{pmatrix} =

\begin{pmatrix}
ax_1   \\
ax_2   \\
\vdots \\
ax_n
\end{pmatrix}

+

\begin{pmatrix}
b      \\
b      \\
\vdots \\
b
\end{pmatrix}
$$

Agora, transformamos "$a$" em um escalar por estar multiplicando igualmente todos os $x$ e o 
conjunto $\{x_1, x_2, ..., x_n\}$ 
como um vetor $\vec{x}$. Assim a fórmula pode ser reescrita por:

$\vec{y} = a\vec{x} + \vec{b}$

Ou na forma matricial:
$$
\begin{pmatrix}
y_1    \\
y_2    \\
\vdots \\
y_n
\end{pmatrix} = a \cdot

\begin{pmatrix}
x_1    \\
x_2    \\
\vdots \\
x_n
\end{pmatrix}

+

\begin{pmatrix}
b      \\
b      \\
\vdots \\
b
\end{pmatrix}
$$

Logo, é possível utilizar um array como um vetor para o cálculo inteiro, sendo o $\vec{x}$ o vetor de
entrada, o '$a$' representa o "slope" e o $\vec{b}$ representa o "bias"

In [None]:
def calcule_yPoints_linear_function(x: np.ndarray, bias: float, slope: float) -> np.ndarray:
    """Evaluate the straight line f(x) = slope * x + bias at the given x values.

    Args:
        x (np.ndarray): Points on the x axis where the line is evaluated
        bias (float): Intercept, the 'b' in f(x) = ax + b
        slope (float): Angular coefficient, the 'a' in f(x) = ax + b

    Returns:
        np.ndarray: The value of the line for every entry of `x`
    """
    # a * x, computed element-wise for the whole array at once
    scaled_x = slope * x

    # adding the scalar b broadcasts it over every element
    return bias + scaled_x

### Fórmula do erro quadrado para identificar a distância do previsto para a resposta

Essa função recebe os pontos reais usados para treinar o modelo, calcula os valores que a
regressão linear prevê para os mesmos $x$ e mede a distância entre o valor real e o previsto.
Utilizamos cálculos vetoriais novamente para otimizar o código python. Essa utilização do vetor
surge a partir dos seguintes passos:

$SE = \sum_{i=1}^{n} (y_i - y'_i)^2$

O "$SE$" representa o erro quadrado (_square error_), o $y_i$ representa o conjunto $Y$ de pontos pra 
treino e $y'_i$ é o conjunto $Y$ previsto pela regressão linear utilizando os mesmos valores de $x$ que os 
valores reais. Assim, podemos quebrar o somatório em um cálculo vetorial por $Y$ ser representado 
como um vetor assim como visto anteriormente:

$e = (\vec{y} - \vec{y'})$

$SE = \sum e^2$

**Obs:** Para fins de esclarecimento, informo que a jogada algébrica utilizada foi apenas de considerar o somatório
dos $y_i$ e os $y'_i$ como vetores $\vec{y}$ e $\vec{y'}$ sendo subtraídos:

$$
e =

\begin{pmatrix}
y_1 - y'_1 \\
y_2 - y'_2 \\
\vdots     \\
y_n - y'_n
\end{pmatrix}
$$

e o $e^2$ seria o quadrado de cada subtração:

$$
e^2 =

\begin{pmatrix}
(y_1 - y'_1)^2 \\
(y_2 - y'_2)^2 \\
\vdots         \\
(y_n - y'_n)^2
\end{pmatrix}
$$

E o $\sum e^2$ representa a soma de todos os quadrados da diferença do vetor equivalente ao `np.sum(error_array ** 2)`

In [None]:
def calcule_square_error(real_dots: np.ndarray, bias: float, slope: float) -> float:
    """Sum the squared residuals between the observed points and the line.

    Args:
        real_dots (np.ndarray): Observed points; column 0 is read as x and column 1 as y
        bias (float): The 'b' in f(x) = ax + b
        slope (float): The 'a' in f(x) = ax + b

    Returns:
        float: Sum over all points of (y_real - y_pred) ** 2
    """
    observed_x: np.ndarray = real_dots[:, 0]
    observed_y: np.ndarray = real_dots[:, 1]

    # what the line predicts for exactly the observed x positions
    predicted_y: np.ndarray = calcule_yPoints_linear_function(
        x=observed_x,
        bias=bias,
        slope=slope,
    )

    # element-wise residuals, squared and then accumulated into one number
    residuals: np.ndarray = observed_y - predicted_y
    squared_residuals: np.ndarray = residuals ** 2
    return np.sum(squared_residuals)

## Função para calcular a curva do erro quadrado para os valores $x$ possíveis

In [None]:
def calcule_error_curve_values(x_ticks: np.ndarray, slope: float, real_dots: np.ndarray) -> np.ndarray:
    """Evaluate the square error for every candidate bias in `x_ticks`.

    Args:
        x_ticks (np.ndarray): Candidate bias (intercept) values; they form the
            x axis of the error curve
        slope (float): The 'a' in f(x) = ax + b, kept fixed for the whole curve
        real_dots (np.ndarray): Observed points (x, y)

    Returns:
        np.ndarray: One square error per entry of `x_ticks`, in the same order
    """
    error_curve: list[float] = []

    # every tick on the x axis of the error curve is a candidate intercept
    for candidate_bias in x_ticks:
        error_curve.append(
            calcule_square_error(real_dots=real_dots, bias=candidate_bias, slope=slope)
        )

    return np.array(error_curve)

## Função para calcular a derivada parcial da função do erro quadrado

Essa função busca utilizar o numpy reduzindo o overhead padrão do python, o cálculo
da derivada parcial se dá através da regra da cadeia como será demonstrado adiante.

A função do erro quadrado é definida como:

$SE = \sum (\vec{y} - \vec{y'})^2$

Sendo $\vec{y'}$ uma abreviação que pode ser reescrita como:

$\vec{y'} = (a \cdot \vec{x} + \vec{b})$

Porém, podemos representar a subtração vetorial como:

$e = (\vec{y} - \vec{y'})$

Agora podemos derivar o valor $SE$ em relação a "$b$": 

$\frac{\partial SE}{\partial \vec{b}} = \frac{\partial}{\partial \vec{b}}\sum (\vec{y} - (a \cdot \vec{x} + \vec{b}))^2$

Podemos utilizar "$e$" no lugar de $(\vec{y} - (a \cdot \vec{x} + \vec{b}))$ e aplicar a regra da cadeia:

$\frac{\partial SE}{\partial \vec{b}} = \sum \frac{\partial}{\partial e}e^2 \frac{\partial}{\partial \vec{b}}(\vec{y} - \vec{y'})$

A derivada de $SE$ em relação a "$e$":

1. $\frac{\partial SE}{\partial e} = \frac{\partial}{\partial e}e^2$
2. $\frac{\partial SE}{\partial e} = 2e$

A derivada de $e$ em relação a $\vec{b}$:

1. $\frac{\partial e}{\partial \vec{b}} = \frac{\partial}{\partial \vec{b}}(\vec{y} - (a \cdot \vec{x} + \vec{b}))$
2. $\frac{\partial e}{\partial \vec{b}} = -1$

A derivada final:

$\frac{\partial SE}{\partial \vec{b}} = \sum 2e \times (-1)$

Podemos novamente substituir o valor de "$e$" por $(\vec{y} - \vec{y'})$ e passar as constantes multiplicativas para fora do somatório
e obtemos o valor da derivada parcial:

$\frac{\partial SE}{\partial b} = -2 \sum (\vec{y} - \vec{y'})$

In [None]:
def calcule_partial_derivative_bias(y_real: np.ndarray, y_pred: np.ndarray) -> float:
    """Partial derivative of the square error with respect to the bias.

    Implements dSE/db = -2 * sum(y_real - y_pred).

    Args:
        y_real (np.ndarray): Observed y values
        y_pred (np.ndarray): y values predicted by the linear regression
            (same length as `y_real`)

    Returns:
        float: Value of the partial derivative
    """
    # each residual contributes 2 * e * (-1) by the chain rule;
    # the constant -2 is factored out of the sum
    residual_total = np.sum(y_real - y_pred)
    return -2 * residual_total

## Função para calcular a tangente do erro quadrado

Essa função usará a equação da reta para gerar uma reta tangente a fim de
demonstrar a derivada em relação àquele ponto. Como o "$m$" representa a inclinação
da reta, então ele será equivalente à derivada parcial do _bias_.

A equação da reta possui a seguinte fórmula: $y = m * (x - x_0) + y_0$

Como o "$y$", o "$y_0$", o "$x$" e o "$x_0$" representam um conjunto de pontos, posso novamente representá-los
como vetores, respectivamente: $\vec{y}$, $\vec{y_0}$, $\vec{x}$ e $\vec{x_0}$. Matricialmente essa fórmula
fica:

$$
\begin{pmatrix}
y_1    \\
y_2    \\
\vdots \\
y_n
\end{pmatrix} = m \cdot

\left[
\begin{pmatrix}
x_1    \\
x_2    \\
\vdots \\
x_n
\end{pmatrix} - \vec{x_0}
\right]

+ \vec{y_0}
$$

Simplificando a notação $\vec{x} - \vec{x_0}$:

$$
\vec{x} - \vec{x_0} = 

\begin{pmatrix}
x_1 - x_0 \\
x_2 - x_0 \\
\vdots    \\
x_n - x_0
\end{pmatrix}
$$

Agora podemos representar o cálculo como:

$$
\begin{pmatrix}
y_1    \\
y_2    \\
\vdots \\
y_n
\end{pmatrix} = m \cdot

\begin{pmatrix}
x_1 - x_0 \\
x_2 - x_0 \\
\vdots    \\
x_n - x_0
\end{pmatrix} +

\begin{pmatrix}
y_0    \\
y_0    \\
\vdots \\
y_0
\end{pmatrix}
$$

**Obs:** Lembrando que o $x_0$ e o $y_0$ representam, respectivamente, o $b$ (`bias`) e o erro quadrado (`square_error`) daquele ponto

In [None]:
def calcule_tan_line(square_error: float, y_real: np.ndarray, y_pred: np.ndarray, bias: float, tan_range: float = 1.5) -> tuple[np.ndarray, np.ndarray]:
    """Build the tangent line of the square error curve at the current bias.

    The line follows y = m * (x - x0) + y0, where x0 is `bias`, y0 is
    `square_error` and m is the partial derivative of the square error with
    respect to the bias.

    Args:
        square_error (float): Square error at `bias` (the y0 of the tangent)
        y_real (np.ndarray): Observed y values
        y_pred (np.ndarray): y values predicted by the linear regression
        bias (float): Current bias; it is the x coordinate on the error curve
        tan_range (float, optional): Half-width of the tangent segment along
            the x axis. Defaults to 1.5.

    Returns:
        tuple[np.ndarray, np.ndarray]: x and y coordinates of the tangent
            segment, spanning [bias - tan_range, bias + tan_range]
    """

    # The sample count follows the data set size, but a segment needs both of
    # its endpoints: with a single sample np.linspace would return only the
    # start value and the tangent would degenerate into one point.
    sample_count: int = max(len(y_pred), 2)

    # bias is the x variable of the square error curve
    x_tan: np.ndarray = np.linspace(
        start=bias - tan_range,
        stop=bias + tan_range,
        num=sample_count,
    )

    # slope of the tangent = derivative of the error curve at `bias`
    tangent_slope: float = calcule_partial_derivative_bias(y_real, y_pred)

    # equation of the straight line: y = m * (x - x0) + y0
    y_tan: np.ndarray = tangent_slope * (x_tan - bias) + square_error
    return (x_tan, y_tan)


## Criando uma classe para ser o Animator das animações

In [None]:
class Animator:
    """
    Class to create animations with matplotlib.

    This class abstracts the creation of the Figure, the initial configuration 
        of artists (lines, text, etc.), and the animation lifecycle, simplifying 
        display in interactive environments like Jupyter Notebooks.
    """
    def __init__(
        self, 
        create_plots_function: Callable[[], tuple[Figure, tuple[Axes, ...]]], 
        init_function: Callable[..., tuple],
        update_function: Callable[..., tuple],
        *args,
        **kwargs,
    ) -> None:
        """Constructor from `Animator`

        Args:
            create_plots_function (Callable[[], tuple[Figure, tuple[Axes, ...]]]): Function 
                that creates the Matplotlib Figure and Axes. It must return a pair 
                `(figure, axes)`.

            init_function (Callable[..., tuple]): Function called once as 
                `init_function(axes, *args, **kwargs)` to set up the initial 
                visual state. It must return a tuple with all the artist objects
                (`Line2D`, `Text`, etc.) that need to be updated during the animation.

            update_function (Callable[..., tuple]): Function that updates 
                the graphical data (the animation logic). The first parameter receives 
                the frame data (the value from `frames`), the second receives the 
                tuple returned by `init_function`, and subsequent parameters 
                receive the values passed via `fargs` of the `plot()` method.
                This function MUST return a tuple with all modified artists.

            *args: Extra positional arguments forwarded to `init_function`.
            **kwargs: Extra keyword arguments forwarded to `init_function`.

        Raises:
            RuntimeError: If the `create_plots_function` does not return a `Figure` 
                object as the first element.

        Returns:
            None: The constructor returns nothing.
        """
        self._create_plots: Callable = create_plots_function
        self._init_func: Callable = init_function
        self._update_func: Callable = update_function
        
        self._fig, self._axes = self._create_plots()

        if not isinstance(self._fig, Figure):
            raise RuntimeError(f"The configure function don't return a matplot Figure: {type(self._fig)}")
        
        # artists created by the init function; handed to the update function on every frame
        self._lines: tuple = self._init_func(self._axes, *args, **kwargs)
        
        # the animation only exists after `plot()` is called
        self._ani: FuncAnimation | None = None

    def _create_animation(self, frames: Iterable, fargs: tuple, *args, **kwargs) -> None:
        """Build the `FuncAnimation` and store it in `self._ani`.

        Args:
            frames (Iterable): Values passed one by one as the first argument 
                of the update function.
            fargs (tuple): Extra positional arguments for the update function; 
                the artists tuple is prepended to them.
            *args: Not supported. `FuncAnimation` receives `fig`, `func`, 
                `frames` and `fargs` by keyword, so an extra positional value 
                can only collide with them.
            **kwargs: Extra keyword arguments for `FuncAnimation`. `interval`, 
                `blit` and `repeat` default to 100, True and False.

        Raises:
            TypeError: If extra positional arguments are given.
        """
        if args:
            # Fail early with a clear message instead of the
            # "multiple values for argument 'fig'" error raised by FuncAnimation.
            raise TypeError(
                "Animator does not accept extra positional arguments for "
                "FuncAnimation; pass them as keyword arguments instead."
            )

        # the update function is called as func(frame, lines, *fargs)
        new_fargs: tuple = (self._lines,) + fargs

        animation_options: dict[str, Any] = {
            "interval": 100,  # Interval between frames
            "blit": True,     # Optimization: just draw the changes
            "repeat": False,  # Do not repeat the animation
        }
        # values given by the caller win over the defaults above
        animation_options.update(kwargs)

        self._ani = FuncAnimation(
            fig=self._fig,
            func=self._update_func,
            frames=frames,
            fargs=new_fargs,
            **animation_options,
        )
        
    def plot(self, frames: Iterable, fargs: tuple, *args, **kwargs) -> None:
        """Create the animation and display it as an interactive HTML widget.

        Args:
            frames (Iterable): Values passed one by one to the update function.
            fargs (tuple): Extra positional arguments for the update function.
            *args: Forwarded to `_create_animation`.
            **kwargs: Forwarded to `_create_animation` (options of `FuncAnimation`).

        Raises:
            RuntimeError: If the animation object was not created.
        """
        # `display` is only a builtin inside IPython sessions; importing it
        # explicitly keeps this method usable everywhere IPython is installed.
        from IPython.display import display

        self._create_animation(frames, fargs, *args, **kwargs)

        if self._ani is None:
            raise RuntimeError("Erro to create the matplotlib animation.")
        
        # close the figure so only the HTML animation is rendered, not a static copy of the plot
        plt.close(self._fig)
        display(HTML(self._ani.to_jshtml()))

## Funções para criar os gráficos

In [None]:
def create_subplots() -> tuple[Figure, tuple[Axes, Axes]]:
    """Create the figure used by the animation: one row with two plots.

    Returns:
        tuple[Figure, tuple[Axes, ...]]: The figure and its two axes, exactly
            as returned by `plt.subplots`
    """
    # NOTE(review): `plt.subplots` presumably returns the axes as a NumPy array
    # rather than a tuple — confirm the return annotation.
    return plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

def config_axels(
        axes: tuple[Axes, Axes], 
        real_dots: np.ndarray,
        bias: float,
        slope: float,
        x_ticks_linear_regression: np.ndarray,
        ypoints_linear_regression: np.ndarray,
        x_ticks_error_curve: np.ndarray,
        ypoints_square_error_curve: np.ndarray,
        ) -> tuple:
    """Draw the initial state of both plots and return the artists to animate.

    The first axes shows the linear regression and the real points; the second
    shows the square error curve, its tangent at the current bias and a marker
    on the current error.

    Args:
        axes (tuple[Axes, Axes]): Axes of the linear regression (index 0) and
            of the square error curve (index 1)
        real_dots (np.ndarray): Observed points; column 0 is x and column 1 is y
        bias (float): Initial 'b' in f(x) = ax + b
        slope (float): The 'a' in f(x) = ax + b
        x_ticks_linear_regression (np.ndarray): x values of the regression line
        ypoints_linear_regression (np.ndarray): y values of the regression line
        x_ticks_error_curve (np.ndarray): Bias values of the error curve (x axis)
        ypoints_square_error_curve (np.ndarray): Square error for each bias value

    Returns:
        tuple: (regression line, tangent line, current error marker,
            bias text, error text)
    """

    # linear regression
    ax1: Axes = axes[0]

    # square error
    ax2: Axes = axes[1]

    linear_regression, = ax1.plot(x_ticks_linear_regression, ypoints_linear_regression)

    # add individual real points
    ax1.scatter(x=real_dots[:, 0], y=real_dots[:, 1], label='Pontos Reais', color='green', s=15, marker="o")

    ax1.set_title("Linear Regression")
    ax1.set_xlabel("weight")
    ax1.set_ylabel("height")

    # define origin to (0, 0)
    ax1.set_xlim(left=0, right=x_ticks_linear_regression[-1])
    ax1.set_ylim(bottom=0)

    ax1.grid(visible=True, axis='both', alpha=0.5, color='gray')
    bias_text = ax1.text(
        0.1, # 10% x
        0.9, # 90% y
        f'Bias (intercept): {bias:.4f}',
        transform=ax1.transAxes, 
        fontsize=12, 
        bbox=dict(facecolor='white', alpha=0.7)
    )

    ax2.plot(x_ticks_error_curve, ypoints_square_error_curve, label='Square Error')
    ax2.set_title("Square Error")
    ax2.set_xlabel("Intercept (bias)")
    ax2.set_ylabel("Sum of Square Errors")

    # The error must use the same slope as the tangent and the regression line;
    # a hard-coded value would put the marker off the curve for any other slope.
    square_error: float = calcule_square_error(real_dots, bias, slope)
    x_real: np.ndarray = real_dots[:, 0]
    y_real: np.ndarray = real_dots[:, 1]
    y_pred: np.ndarray = calcule_yPoints_linear_function(x_real, bias, slope)

    x_tan, y_tan = calcule_tan_line(square_error, y_real, y_pred, bias)
    square_error_curve_tangent, = ax2.plot(x_tan, y_tan, label='Derivative')
    
    ax2.grid(visible=True, axis='both', alpha=0.5, color='gray')
    error_text = ax2.text(
        0.1,
        0.9,
        f'Square Error: {square_error:.4f}',
        transform=ax2.transAxes,
        fontsize=12,
        bbox=dict(facecolor='white', alpha=0.7)
    )
    
    current_error_point, = ax2.plot(
        [bias],
        [square_error],
        'ro',
        markersize=8,
        label='Current Error'
    )

    # the labelled artists above live on ax2, so attach the legend to it
    # explicitly instead of relying on pyplot's "current axes"
    ax2.legend()

    # define origin to (0, 0)
    ax2.set_xlim(left=0, right=x_ticks_error_curve[-1])
    ax2.set_ylim(bottom=0)

    return linear_regression, square_error_curve_tangent, current_error_point, bias_text, error_text

## Criando uma função para calcular o gradiente descendente

In [None]:
def gradient_descent_array(
        initial_bias: float,
        slope: float,
        real_dots: np.ndarray,
        learn_rate: float,
        max_iterations: int,
    ) -> np.ndarray:
    """Run gradient descent on the bias and record the bias after every step.

    Args:
        initial_bias (float): Bias the descent starts from (it is not part of
            the returned array)
        slope (float): The 'a' of linear equation: f(x) = ax + b, kept fixed
        real_dots (np.ndarray): Observed points; column 0 is x and column 1 is y
        learn_rate (float): Factor multiplied by the derivative to get the step size
        max_iterations (int): Maximum number of steps

    Returns:
        np.ndarray: Bias after each step, in order; at most `max_iterations`
            entries. The descent stops early when the step size is exactly 0.
    """

    # the data set does not change between iterations, so slice it only once
    x_real: np.ndarray = real_dots[:, 0]
    y_real: np.ndarray = real_dots[:, 1]

    bias_list: list[float] = []
    current_bias: float = initial_bias
    for _ in range(max_iterations):
        y_pred: np.ndarray = calcule_yPoints_linear_function(
            x=x_real,
            bias=current_bias,
            slope=slope,
        )

        partial_derivative: float = calcule_partial_derivative_bias(
            y_real=y_real,
            y_pred=y_pred,
        )

        step_size: float = partial_derivative * learn_rate

        # a zero step means the bias would not move any more: nothing left to record
        if step_size == 0:
            break

        # move against the gradient
        current_bias -= step_size
        bias_list.append(current_bias)
    
    return np.array(bias_list)

## Criando uma função para calcular as variações do gráfico para criar a animação

In [None]:
def animate( 
        current_bias: float,
        lines: tuple[Line2D, ...],
        x_ticks_linear_regression: np.ndarray,
        real_dots: np.ndarray,
        slope: float = 0.0,
        tan_range: float = 1.5,
        ) -> tuple[Line2D, ...]:
    """Redraw both plots for one animation frame, i.e. for one bias value.

    Args:
        current_bias (float): The 'b' (intercept) of f(x) = ax + b for this frame
        lines (tuple[Line2D, ...]): Artists to update, in the order: regression
            line, tangent line, current error marker, bias text, error text
        x_ticks_linear_regression (np.ndarray): x values where the regression line is drawn
        real_dots (np.ndarray): Observed points; column 0 is x and column 1 is y
        slope (float, optional): The 'a' (weight) of f(x) = ax + b. Defaults to 0.0.
        tan_range (float, optional): Half-width of the tangent line. Defaults to 1.5.

    Returns:
        tuple[Line2D, ...]: The same artists received in `lines`, after the update
    """

    regression_line, tangent_line, error_marker, bias_label, error_label = lines

    # --- linear regression plot: line for the current bias ---
    regression_line.set_data(
        x_ticks_linear_regression,
        calcule_yPoints_linear_function(
            x=x_ticks_linear_regression,
            bias=current_bias,
            slope=slope,
        ),
    )
    bias_label.set_text(f'Bias (intercept): {current_bias:.4f}')

    # --- square error plot: tangent, marker and text at the current bias ---
    observed_x: np.ndarray = real_dots[:, 0]
    observed_y: np.ndarray = real_dots[:, 1]
    predicted_y: np.ndarray = calcule_yPoints_linear_function(
        x=observed_x,
        bias=current_bias,
        slope=slope,
    )
    current_error: float = calcule_square_error(real_dots, current_bias, slope)

    tangent_x, tangent_y = calcule_tan_line(
        square_error=current_error,
        y_real=observed_y,
        y_pred=predicted_y,
        bias=current_bias,
        tan_range=tan_range,
    )
    tangent_line.set_data(tangent_x, tangent_y)

    # position on the error curve for this bias
    error_marker.set_data([current_bias], [current_error])
    error_label.set_text(f'Square Error: {current_error:.4f}')

    return regression_line, tangent_line, error_marker, bias_label, error_label


## Constantes para controlar a regressão

In [None]:
# x values where the regression line is drawn (50 evenly spaced samples)
X_TICKS_LINEAR_REGRESSION: np.ndarray = np.linspace(start=0, stop=3.5, num=50)

# candidate bias values: the x axis of the square error curve
X_TICKS_ERROR_CURVE: np.ndarray = np.linspace(start=0, stop=2, num=50)

# observed data, one (x, y) pair per row
REAL_DOTS: np.ndarray = np.array([
    (0.5, 1.4),
    (2.3, 1.9),
    (2.9, 3.2),
])

# gradient descent settings
INITIAL_BIAS: float = 0.0
SLOPE: float = 0.64
LEARN_RATE: float = 0.1
MAX_ITERATIONS: int = 150

# half-width of the tangent segment drawn on the error curve
TAN_RANGE: float = 1.5

## Pegando os valores iniciais

In [None]:
# Regression line for the initial bias: the first state shown in the left plot.
y_points_linear_regression: np.ndarray = calcule_yPoints_linear_function(
    x=X_TICKS_LINEAR_REGRESSION,
    bias=INITIAL_BIAS,
    slope=SLOPE,
)

# Square error for every candidate bias: the static curve of the right plot.
y_points_square_error: np.ndarray = calcule_error_curve_values(
    x_ticks=X_TICKS_ERROR_CURVE,
    slope=SLOPE,
    real_dots=REAL_DOTS,
)

# Bias after each gradient descent step; every entry becomes one animation frame.
bias_frame_array: np.ndarray = gradient_descent_array(
    initial_bias=INITIAL_BIAS,
    slope=SLOPE,
    real_dots=REAL_DOTS,
    learn_rate=LEARN_RATE,
    max_iterations=MAX_ITERATIONS
)

## Iniciando o animador

In [None]:
# Build the figure and draw the initial state of both plots.
animator = Animator(
    # Animator params: figure factory, initial drawing and per-frame update
    create_plots_function=create_subplots,
    init_function=config_axels,
    update_function=animate,

    # Config function params: Animator forwards these keyword arguments to `config_axels`
    bias=INITIAL_BIAS,
    slope=SLOPE,
    real_dots=REAL_DOTS,
    x_ticks_linear_regression=X_TICKS_LINEAR_REGRESSION,
    ypoints_linear_regression=y_points_linear_regression,
    x_ticks_error_curve=X_TICKS_ERROR_CURVE,
    ypoints_square_error_curve=y_points_square_error,
)

## Mostrando a animação

In [None]:
# One frame per bias value produced by the gradient descent.
animator.plot(
    frames=bias_frame_array,
    # Positional arguments of `animate` that come after the frame value and the
    # artists tuple (which Animator prepends); the order must match its signature:
    # x_ticks_linear_regression, real_dots, slope, tan_range.
    fargs=(
        X_TICKS_LINEAR_REGRESSION,
        REAL_DOTS,
        SLOPE,
        TAN_RANGE,
    )
)

# Questão B

## Função para calcular a derivada do slope

Essa função busca utilizar o numpy reduzindo o overhead padrão do python, o cálculo
da derivada parcial se dá através da regra da cadeia como será demonstrado adiante.

A função do erro quadrado é definida como:

$SE = \sum (\vec{y} - \vec{y'})^2$

Sendo $\vec{y'}$ uma abreviação que pode ser reescrita como:

$\vec{y'} = (a \cdot \vec{x} + \vec{b})$

Porém, podemos representar a subtração vetorial como:

$e = (\vec{y} - \vec{y'})$

Agora podemos derivar o valor $SE$ em relação a "$a$": 

$\frac{\partial SE}{\partial a} = \frac{\partial}{\partial a}\sum (\vec{y} - (a \cdot \vec{x} + \vec{b}))^2$

Podemos utilizar "$e$" no lugar de $(\vec{y} - (a \cdot \vec{x} + \vec{b}))$ e aplicar a regra da cadeia:

$\frac{\partial SE}{\partial a} = \sum \frac{\partial}{\partial e}e^2 \frac{\partial}{\partial a}(\vec{y} - \vec{y'})$

A derivada de $SE$ em relação a "$e$":

1. $\frac{\partial SE}{\partial e} = \frac{\partial}{\partial e}e^2$
2. $\frac{\partial SE}{\partial e} = 2e$

A derivada de $e$ em relação a "$a$":

1. $\frac{\partial e}{\partial a} = \frac{\partial}{\partial a}(\vec{y} - (a \cdot \vec{x} + \vec{b}))$
2. $\frac{\partial e}{\partial a} = -1 \cdot \vec{x}$

A derivada final:

$\frac{\partial SE}{\partial a} = \sum 2e \times (-1 \cdot \vec{x})$

Podemos novamente substituir o valor de "$e$" por $(\vec{y} - \vec{y'})$ e passar a constante multiplicativa $-2$ para fora do somatório.
Note que $x_i$ varia a cada termo e, por isso, permanece dentro do somatório. Assim obtemos o valor da derivada parcial:

$\frac{\partial SE}{\partial a} = -2 \sum_{i=1}^{n} x_i (y_i - y'_i)$

In [None]:
def calcule_partial_derivative_slope(x_real: np.ndarray, y_real: np.ndarray, y_pred: np.ndarray) -> float:
    """Partial derivative of the square error with respect to the slope.

    Implements dSE/da = -2 * sum(x * (y_real - y_pred)).

    Args:
        x_real (np.ndarray): Observed x values
        y_real (np.ndarray): Observed y values
        y_pred (np.ndarray): y values predicted by the linear regression for `x_real`

    Returns:
        float: Value of the partial derivative
    """
    # each residual is weighted by its own x before being accumulated
    weighted_residuals = x_real * (y_real - y_pred)
    return -2 * np.sum(weighted_residuals)

## Funções para criar as animações

In [None]:
def create_bias_slope_subplots() -> tuple:
    """Create the figure for the bias/slope animation: one row with two plots.

    Returns:
        tuple: The figure and its axes, exactly as returned by `plt.subplots`
    """
    return plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

def _init_regression_panel(
    ax: Axes,
    title: str,
    real_dots: np.ndarray,
    bias: float,
    slope: float,
    x_ticks_linear_regression: np.ndarray,
    y_linear_values: np.ndarray,
) -> tuple:
    """Draw one regression panel and return its (line, bias text, slope text)."""
    regression_line, = ax.plot(
        x_ticks_linear_regression,
        y_linear_values,
        label="Linear Regression"
    )

    # add individual real points
    ax.scatter(x=real_dots[:, 0], y=real_dots[:, 1], label='Real Points', color='green', s=15, marker="o")

    ax.set_title(title)
    ax.set_xlabel("weight")
    ax.set_ylabel("height")

    # define origin to (0, 0)
    ax.set_xlim(left=0, right=x_ticks_linear_regression[-1])
    ax.set_ylim(bottom=0)

    ax.grid(visible=True, axis='both', alpha=0.5, color='gray')
    bias_text = ax.text(
        0.1, # 10% x
        0.9, # 90% y
        f'Bias (intercept): {bias:.4f}',
        transform=ax.transAxes,
        fontsize=12,
        bbox=dict(facecolor='white', alpha=0.7)
    )

    slope_text = ax.text(
        0.1, # 10% x
        0.8, # 80% y
        f'Slope: {slope:.4f}',
        transform=ax.transAxes,
        fontsize=12,
        bbox=dict(facecolor='white', alpha=0.7)
    )

    ax.legend()

    return regression_line, bias_text, slope_text


def init_bias_slope_axle(
    axes: tuple[Axes, ...],
    real_dots: np.ndarray,
    bias: float,
    slope: float,
    x_ticks_linear_regression: np.ndarray,
    ) -> tuple:
    """Draw the initial state of the stochastic and mini-batch plots.

    Both plots start from the same line (same `bias` and `slope`) and show the
    same real points; they only differ in the title.

    Args:
        axes (tuple[Axes, ...]): Axes of the stochastic plot (index 0) and of
            the mini-batch plot (index 1)
        real_dots (np.ndarray): Observed points; column 0 is x and column 1 is y
        bias (float): Initial 'b' in f(x) = ax + b
        slope (float): Initial 'a' in f(x) = ax + b
        x_ticks_linear_regression (np.ndarray): x values where the line is drawn

    Returns:
        tuple: (stochastic line, stochastic bias text, stochastic slope text,
            mini-batch line, mini-batch bias text, mini-batch slope text)
    """

    ax1: Axes = axes[0]
    ax2: Axes = axes[1]

    # both panels start from the same line, so compute it only once
    y_linear_values: np.ndarray = calcule_yPoints_linear_function(
        x=x_ticks_linear_regression,
        bias=bias,
        slope=slope
    )

    (stochastic_linear_regression,
        stochastic_bias_text,
        stochastic_slope_text) = _init_regression_panel(
        ax1,
        "Stochastic Linear Regression",
        real_dots,
        bias,
        slope,
        x_ticks_linear_regression,
        y_linear_values,
    )

    (minibatch_linear_regression,
        minibatch_bias_text,
        minibatch_slope_text) = _init_regression_panel(
        ax2,
        "Mini-Batch (2) Linear Regression",
        real_dots,
        bias,
        slope,
        x_ticks_linear_regression,
        y_linear_values,
    )

    return (stochastic_linear_regression, 
            stochastic_bias_text, 
            stochastic_slope_text, 
            minibatch_linear_regression, 
            minibatch_bias_text, 
            minibatch_slope_text)

## Criando a função para o cálculo dos gradientes de cada tipo

In [None]:
def _draw_distinct_indices(population_size: int, amount: int) -> list[int]:
    """Draw `amount` distinct random indices in [0, population_size - 1]."""
    if population_size < 1:
        raise ValueError("real_dots must contain at least one point")

    chosen: list[int] = []
    # rejection sampling: redraw until the candidate was not picked before
    while len(chosen) < amount:
        candidate: int = randint(0, population_size - 1)
        if candidate not in chosen:
            chosen.append(candidate)
    return chosen


def _sampled_descent_path(
    x_real: np.ndarray,
    y_real: np.ndarray,
    initial_bias: float,
    initial_slope: float,
    learn_rate: float,
    max_iterations: int,
    batch_size: int,
) -> list[list[float]]:
    """Run gradient descent on random batches and return [bias, slope] per step."""
    # never ask for more distinct points than the data set has, otherwise the
    # rejection sampling above could not finish
    effective_batch_size: int = min(batch_size, len(x_real))

    path: list[list[float]] = []
    current_bias: float = initial_bias
    current_slope: float = initial_slope
    for _ in range(max_iterations):
        picked: list[int] = _draw_distinct_indices(len(x_real), effective_batch_size)
        batch_x: np.ndarray = x_real[picked]
        batch_y: np.ndarray = y_real[picked]
        batch_pred: np.ndarray = calcule_yPoints_linear_function(
            x=batch_x,
            bias=current_bias,
            slope=current_slope,
        )

        bias_step: float = learn_rate * calcule_partial_derivative_bias(
            y_real=batch_y,
            y_pred=batch_pred,
        )
        slope_step: float = learn_rate * calcule_partial_derivative_slope(
            x_real=batch_x,
            y_real=batch_y,
            y_pred=batch_pred,
        )

        # neither parameter would move any more: stop without recording
        if bias_step == 0 and slope_step == 0:
            break

        # move against the gradient
        current_bias -= bias_step
        current_slope -= slope_step
        path.append([current_bias, current_slope])

    return path


def gradiante_descent_array_bias_and_slope(
    initial_bias: float,
    initial_slope: float,
    real_dots: np.ndarray,
    learn_rate: float,
    max_iterations: int,
) -> np.ndarray:
    """Run stochastic and mini-batch gradient descent on bias and slope.

    The stochastic descent uses one random point per step; the mini-batch
    descent uses two distinct random points per step (or the single available
    point when the data set has only one). Both start from the same initial
    values and the stochastic descent runs completely before the mini-batch one.

    Args:
        initial_bias (float): Starting 'b' in f(x) = ax + b
        initial_slope (float): Starting 'a' in f(x) = ax + b
        real_dots (np.ndarray): Observed points; column 0 is x and column 1 is y
        learn_rate (float): Factor multiplied by each derivative to get the step size
        max_iterations (int): Maximum number of steps of each descent

    Raises:
        ValueError: If a step has to be taken and `real_dots` has no points.

    Returns:
        np.ndarray: One row per step with [stochastic bias, stochastic slope,
            mini-batch bias, mini-batch slope]. If one descent stops earlier
            than the other, the rows are truncated to the shorter one.
    """
    # the data set does not change between iterations, so slice it only once
    x_real: np.ndarray = real_dots[:, 0]
    y_real: np.ndarray = real_dots[:, 1]

    stochastic_bias_slope_list: list[list[float]] = _sampled_descent_path(
        x_real=x_real,
        y_real=y_real,
        initial_bias=initial_bias,
        initial_slope=initial_slope,
        learn_rate=learn_rate,
        max_iterations=max_iterations,
        batch_size=1,
    )

    minibatch_bias_slope_list: list[list[float]] = _sampled_descent_path(
        x_real=x_real,
        y_real=y_real,
        initial_bias=initial_bias,
        initial_slope=initial_slope,
        learn_rate=learn_rate,
        max_iterations=max_iterations,
        batch_size=2,
    )

    # join the two descents side by side, one row per animation frame
    aux_list: list = [
        stochastic_sublist + minibatch_sublist
        for stochastic_sublist, minibatch_sublist
        in zip(stochastic_bias_slope_list, minibatch_bias_slope_list)
    ]

    return np.array(aux_list)

## Criando a função de atualização de ambos os parâmetros (bias e slope)

In [None]:
def update_bias_slope(
    bias_slope: np.ndarray,
    lines: tuple[Line2D, ...],
    x_ticks_linear_regression: np.ndarray,
) -> tuple:
    """Redraw the stochastic and minibatch regression panels for one frame.

    Args:
        bias_slope (np.ndarray): Frame values indexed as
            ``[stochastic bias, stochastic slope, minibatch bias, minibatch slope]``.
        lines (tuple[Line2D, ...]): Exactly six artists, in order: the
            stochastic line, its bias text and its slope text, followed by
            the same three artists for the minibatch panel.
        x_ticks_linear_regression (np.ndarray): X values at which both
            regression lines are evaluated.

    Returns:
        tuple: The same six artists, in the order they were received.
    """
    (sgd_line,
        sgd_bias_label,
        sgd_slope_label,
        mb_line,
        mb_bias_label,
        mb_slope_label) = lines

    # Each panel is described by its three artists plus the position of its
    # bias inside `bias_slope`; the slope is stored right after the bias.
    panels = (
        ((sgd_line, sgd_bias_label, sgd_slope_label), 0),
        ((mb_line, mb_bias_label, mb_slope_label), 2),
    )

    for (regression_line, bias_label, slope_label), bias_index in panels:
        bias: float = bias_slope[bias_index]
        slope: float = bias_slope[bias_index + 1]

        fitted_y: np.ndarray = calcule_yPoints_linear_function(
            x=x_ticks_linear_regression,
            bias=bias,
            slope=slope
        )
        regression_line.set_data(x_ticks_linear_regression, fitted_y)
        bias_label.set_text(f'Bias (intercept): {bias:.4f}')
        slope_label.set_text(f'Slope: {slope:.4f}')

    return (sgd_line,
            sgd_bias_label,
            sgd_slope_label,
            mb_line,
            mb_bias_label,
            mb_slope_label)

## Criando constantes para a simulação

In [None]:
# X positions at which the fitted lines are drawn: 50 evenly spaced samples
# (the np.linspace default) covering [0, 3.5].
X_TICKS_LINEAR_REGRESSION: np.ndarray = np.linspace(start=0, stop=3.5, num=50)

# Observed points, one (x, y) pair per row.
REAL_DOTS: np.ndarray = np.array(
    [
        [0.5, 1.4],
        [2.3, 1.9],
        [2.9, 3.2],
    ]
)

# Starting parameters of the regression line.
INITIAL_BIAS: float = 0.0
INITIAL_SLOPE: float = 1.0

# Gradient descent settings: factor applied to each partial derivative and
# the upper bound on the number of iterations.
LEARN_RATE: float = 0.05
MAX_ITERATIONS: int = 150

## Criando o animador

In [None]:
# Build the animator for the stochastic vs. minibatch comparison.
# `Animator`, `create_bias_slope_subplots` and `init_bias_slope_axle` are
# defined earlier in the notebook; `update_bias_slope` is passed as the
# update callback.
# NOTE(review): the remaining keyword arguments are presumably forwarded to
# the create/init callbacks -- confirm against the Animator definition.
animator: Animator = Animator(
    create_plots_function=create_bias_slope_subplots,
    init_function=init_bias_slope_axle,
    update_function=update_bias_slope,
    bias=INITIAL_BIAS,
    slope=INITIAL_SLOPE,
    real_dots=REAL_DOTS,
    x_ticks_linear_regression=X_TICKS_LINEAR_REGRESSION,
)

## Iniciando o bias e slope array

In [None]:
# Precompute the parameter history that drives the animation. Each row is
# read by `update_bias_slope` as
# [stochastic bias, stochastic slope, minibatch bias, minibatch slope].
# The two histories are zipped together, so the array has as many rows as
# the shorter of the two runs.
bias_slope_frame_array: np.ndarray = gradiante_descent_array_bias_and_slope(
    initial_bias=INITIAL_BIAS,
    initial_slope=INITIAL_SLOPE,
    real_dots=REAL_DOTS,
    learn_rate=LEARN_RATE,
    max_iterations=MAX_ITERATIONS,
)

## Exibindo a animação

In [None]:
# Render the animation from the precomputed frame array.
# NOTE(review): `fargs` presumably supplies the extra positional argument
# (`x_ticks_linear_regression`) that `update_bias_slope` receives after the
# frame row and the artists -- confirm against `Animator.plot`.
animator.plot(
    frames=bias_slope_frame_array,
    fargs=(
        X_TICKS_LINEAR_REGRESSION,
    )
)

# Questão D

## Criando as funções de ativação

$\sigma(x) = \frac{1}{
    1 + e^{-x}
}$

In [None]:
def sigmoid(x: np.ndarray) -> np.ndarray:
    """Return the logistic sigmoid ``1 / (1 + e^(-x))`` element-wise.

    The value is computed as ``exp(-log(1 + e^(-x)))`` through
    ``np.logaddexp`` so that large-magnitude negative inputs do not overflow
    the intermediate exponential (the direct formula evaluates ``e^(-x)``,
    which overflows to ``inf`` and emits a floating-point overflow warning).

    Args:
        x (np.ndarray): Input values.

    Returns:
        np.ndarray: Sigmoid of each input, in the closed interval [0, 1].
    """
    # logaddexp(0, -x) == log(e^0 + e^(-x)) == log(1 + e^(-x)), evaluated
    # without ever materialising e^(-x) for large arguments.
    return np.exp(-np.logaddexp(0, -x))

def sigmoid_derivative(x: np.ndarray) -> np.ndarray:
    """Return the derivative of the sigmoid, ``sigmoid(x) * (1 - sigmoid(x))``.

    Args:
        x (np.ndarray): Points at which the derivative is evaluated.

    Returns:
        np.ndarray: Derivative value for each input.
    """
    activation: np.ndarray = sigmoid(x)
    complement: np.ndarray = 1 - activation
    return activation * complement

def softplus(x: np.ndarray) -> np.ndarray:
    """Return the softplus activation ``log(1 + e^x)`` element-wise.

    Uses ``np.logaddexp(0, x)``, which equals ``log(e^0 + e^x)``, instead of
    evaluating ``e^x`` directly. The direct form overflows to ``inf`` for
    large ``x`` (where softplus is approximately ``x``) and rounds to exactly
    zero for very negative ``x``.

    Args:
        x (np.ndarray): Input values.

    Returns:
        np.ndarray: Softplus of each input.
    """
    return np.logaddexp(0, x)

def softplus_derivative(x: np.ndarray) -> np.ndarray:
    """Return the derivative of softplus, which is the sigmoid of ``x``.

    Args:
        x (np.ndarray): Points at which the derivative is evaluated.

    Returns:
        np.ndarray: ``sigmoid(x)`` for each input.
    """
    slope: np.ndarray = sigmoid(x)
    return slope

## Criando uma classe para abstrair a camada de neurônios

In [None]:
class DenseLayer:
    def __init__(
        self,
        input_size: int,
        output_size: int,
        weights: np.ndarray | None = None,
        biases: np.ndarray | None = None,
    ) -> None:
        
        self.input_size: int = input_size
        self.output_size: int = output_size

        # weights to multiply
        if weights is None:
            self._weights: np.ndarray = np.random.randn(input_size, output_size) # 2D
        else:
            self._weights = weights

        # biases to sum
        if biases is None:
            self._biases: np.ndarray = np.random.randn(output_size) # 1D
        else:
            self._biases = biases

    @property
    def weights(self) -> np.ndarray:
        return self._weights
    
    @weights.setter
    def weights(self, new_weights: np.ndarray) -> None:
        self._weights = new_weights

    @property
    def biases(self) -> np.ndarray:
        return self._biases

    @biases.setter
    def biases(self, new_biases: np.ndarray) -> None:
        self._biases = new_biases


## Criando uma base abstrata para redes neurais das questões D e E

In [None]:
class BaseNeuralNetwork(ABC):
    """Feed-forward network skeleton shared by the question D and E networks.

    The class owns the forward pass (``predict``) and the squared-error
    helpers. Subclasses decide which parameters are trained by implementing
    ``fit``.
    """

    def __init__(
        self,
        input_size: int,
        hidden_layers: list[DenseLayer],
        output_layer: DenseLayer,
        activation_function: Literal["softplus", "sigmoid"] = "softplus",
    ) -> None:
        """Store the layers and select the activation function.

        Args:
            input_size (int): Number of features of each input sample.
            hidden_layers (list[DenseLayer]): Hidden layers, in forward order.
            output_layer (DenseLayer): Final layer of the network.
            activation_function (Literal["softplus", "sigmoid"], optional):
                Name of the activation applied after every hidden layer.
                Defaults to "softplus".

        Raises:
            ValueError: If ``input_size`` does not match the input size of the
                first layer the input is fed to.
            KeyError: If ``activation_function`` is not a supported name.
        """
        self._input_size: int = input_size

        # With no hidden layers the input feeds the output layer directly, so
        # that is the layer to validate against (indexing the empty list
        # would otherwise fail with an unrelated IndexError).
        if hidden_layers:
            if input_size != hidden_layers[0].input_size:
                raise ValueError("Input size mismatch in the first hidden layer.")
        elif input_size != output_layer.input_size:
            raise ValueError("Input size mismatch in the output layer.")

        self._hidden_layers: list[DenseLayer] = hidden_layers
        self._output_layer: DenseLayer = output_layer

        functions_dict: dict[str, Callable[[np.ndarray], np.ndarray]] = {
            "softplus": softplus,
            "sigmoid": sigmoid,
        }
        derivatives_dict: dict[str, Callable[[np.ndarray], np.ndarray]] = {
            "softplus": softplus_derivative,
            "sigmoid": sigmoid_derivative,
        }

        self._activation_function: Callable[[np.ndarray], np.ndarray] = functions_dict[activation_function]
        self._derivative_function: Callable[[np.ndarray], np.ndarray] = derivatives_dict[activation_function]

        # Values of the most recent forward pass, one entry per layer
        # (hidden layers first, output layer last):
        # Z = pre-activation values, A = values after the activation.
        self._cache_Z: list[np.ndarray] = []
        self._cache_A: list[np.ndarray] = []

    @property
    def output_layer(self) -> DenseLayer:
        """Final layer of the network."""
        return self._output_layer

    @property
    def hidden_layers(self) -> list[DenseLayer]:
        """Hidden layers, in forward order."""
        return self._hidden_layers

    def predict(self, input_array: np.ndarray, use_activate_func_in_output: bool = False) -> np.ndarray:
        """Run a forward pass and cache every layer's values.

        Args:
            input_array (np.ndarray): Input samples, one per row. A 1-D array
                is treated as a column of single-feature samples.
            use_activate_func_in_output (bool, optional): When True the
                activation function is also applied to the output layer;
                otherwise the output layer is linear. Defaults to False.

        Returns:
            np.ndarray: Values produced by the output layer.

        Raises:
            ValueError: If the last dimension of ``input_array`` differs from
                the network's input size.
        """
        # a 1-D input becomes a 2-D column: one single-feature sample per row
        if input_array.ndim == 1:
            input_array = input_array.reshape(-1, 1)

        if input_array.shape[-1] != self._input_size:
            raise ValueError(f"The input layer must have the size of: {self._input_size}")

        current_output: np.ndarray = input_array

        # the caches always describe the latest forward pass only
        self._cache_Z.clear()
        self._cache_A.clear()

        for layer in self.hidden_layers:
            weights_matrix: np.ndarray = layer.weights
            biases_matrix: np.ndarray = layer.biases

            # pre-activation values of this layer
            Z: np.ndarray = current_output @ weights_matrix + biases_matrix

            current_output = self._activation_function(Z)

            self._cache_Z.append(Z)
            self._cache_A.append(current_output)

        Z = current_output @ self.output_layer.weights + self.output_layer.biases

        if use_activate_func_in_output:
            current_output = self._activation_function(Z)
        else:
            current_output = Z

        self._cache_Z.append(Z)
        self._cache_A.append(current_output)

        return current_output

    @staticmethod
    def _calcule_total_square_error(y_real: np.ndarray, y_pred: np.ndarray) -> float:
        """Return the sum of squared residuals between targets and predictions.

        Args:
            y_real (np.ndarray): Target values.
            y_pred (np.ndarray): Predicted values, broadcastable against
                ``y_real``.

        Returns:
            float: ``sum((y_real - y_pred) ** 2)``.
        """
        return np.sum((y_real - y_pred) ** 2)

    @staticmethod
    def _calcule_square_error_derivative_float(y_real: np.ndarray, y_pred: np.ndarray) -> float:
        """Return the derivative of the total squared error, summed over all samples.

        This is ``d/dy_pred`` of ``sum((y_real - y_pred) ** 2)`` accumulated
        into a single number: ``-2 * sum(y_real - y_pred)``.

        Args:
            y_real (np.ndarray): Target values.
            y_pred (np.ndarray): Values predicted by the network, broadcastable
                against ``y_real``.

        Returns:
            float: Summed derivative.
        """
        return -2 * np.sum(y_real - y_pred)

    @staticmethod
    def _calcule_square_error_derivative_array(y_real: np.ndarray, y_pred: np.ndarray) -> np.ndarray:
        """Return the per-sample derivative of the squared error.

        Each entry is ``-2 * (y_real - y_pred)`` for the corresponding sample,
        i.e. the derivative of that sample's squared error with respect to its
        prediction.

        Args:
            y_real (np.ndarray): Target values.
            y_pred (np.ndarray): Values predicted by the network, broadcastable
                against ``y_real``.

        Returns:
            np.ndarray: Derivative for every sample.
        """
        return -2 * (y_real - y_pred)

    @abstractmethod
    def fit(
        self,
        x_train: np.ndarray,
        y_train: np.ndarray,
        learning_rate: float = 0.01,
        epochs: int = 100,
        tolerance: float = 10e-5,
        ) -> None:
        """Train the network; each subclass chooses which parameters to update.

        Args:
            x_train (np.ndarray): Inputs used to compute the predictions.
            y_train (np.ndarray): Targets the predictions are compared with.
            learning_rate (float, optional): Factor applied to the gradient to
                obtain the step size. Defaults to 0.01.
            epochs (int, optional): Maximum number of training iterations.
                Defaults to 100.
            tolerance (float, optional): Threshold used to stop training
                early. Defaults to 10e-5 (that is, 1e-4).
        """

## Criando uma classe específica para a questão D

In [None]:
class NeuralNetwork_D(BaseNeuralNetwork):
    """Network for question D: training adjusts only the output-layer bias."""

    def __init__(
        self, input_size: int, 
        hidden_layers: list[DenseLayer],
        output_layer: DenseLayer,
        activation_function: Literal['softplus'] | Literal['sigmoid'] = "softplus"
    ) -> None:
        """Forward every argument unchanged to the base network."""
        super().__init__(
            input_size=input_size,
            hidden_layers=hidden_layers,
            output_layer=output_layer,
            activation_function=activation_function,
        )

    def fit(
        self,
        x_train: np.ndarray,
        y_train: np.ndarray,
        learning_rate: float = 0.01,
        epochs: int = 100,
        tolerance: float = 10e-5,
    ) -> None:
        """Fit the output-layer bias with gradient descent, logging each step.

        Every epoch runs a forward pass, takes the summed squared-error
        derivative as the gradient of the bias and subtracts
        ``gradient * learning_rate`` from it. All other parameters are left
        untouched.

        Args:
            x_train (np.ndarray): Training inputs; a 1-D array is reshaped to
                a single column.
            y_train (np.ndarray): Training targets; a 1-D array is reshaped to
                a single column.
            learning_rate (float, optional): Factor applied to the gradient.
                Defaults to 0.01.
            epochs (int, optional): Maximum number of updates. Defaults to 100.
            tolerance (float, optional): Training stops (before updating) once
                the absolute step size is at or below this value. Defaults to
                10e-5.
        """
        if x_train.ndim == 1:
            x_train = x_train.reshape(-1, 1)
        if y_train.ndim == 1:
            y_train = y_train.reshape(-1, 1)

        divider: str = "-"*50
        last_layer: DenseLayer = self.output_layer

        for epoch in range(epochs):
            prediction: np.ndarray = self.predict(x_train)
            gradient: float = self._calcule_square_error_derivative_float(
                y_real=y_train,
                y_pred=prediction,
            )
            step_size: float = gradient * learning_rate

            # converged: the next update would be negligible
            if abs(step_size) <= tolerance:
                break

            print(divider)
            print(f"Old b_3: {last_layer.biases}")

            # rebinding (not in-place) keeps the previous bias array intact
            last_layer.biases = last_layer.biases - step_size

            print(f"Step size: {step_size}")
            print(f"New b_3: {last_layer.biases}")
            print(f"Epoch: {epoch}")
            print(divider)
        

## Criando e mostrando o gráfico gerado pela rede neural

In [None]:
# Single hidden layer: 1 input feature -> 2 neurons, with fixed (not random)
# weights and biases.
hidden_layers = [
    DenseLayer(
        input_size=1,
        output_size=2,
        weights=np.array([ 
            [3.34, -3.53]
        ]),
        biases=np.array(
            [-1.43, 0.57]
        )
    )
]

# Output layer: 2 hidden activations -> 1 output value.
output_layer: DenseLayer = DenseLayer(
    input_size=2,
    output_size=1,
    weights=np.array([ 
        [-1.22], [-2.30]
    ]),
    biases=np.array(
        # the value [2.61] was left commented out here; the bias (b_3)
        # starts at 0.0 instead
        [0.0]
    )
)

# The activation function is left at its default ("softplus").
nn = NeuralNetwork_D(
    input_size=1,
    hidden_layers=hidden_layers,
    output_layer=output_layer
)

# Evaluate the untrained network on evenly spaced inputs in [0, 1] and plot
# the resulting curve. `predict` reshapes the 1-D inputs into a column.
x_ticks: np.ndarray = np.linspace(0, 1)
y_pred: np.ndarray = nn.predict(x_ticks)

plt.plot(x_ticks, y_pred)

## Treinando e mostrando o gráfico com o valor de b_3 atualizado

In [None]:
# Three training samples: inputs and their target outputs.
X_TRAIN: np.ndarray = np.array([0.0, 0.5, 1.0])
Y_TRAIN: np.ndarray = np.array([0.0, 1.0, 0.0])
# Train with the default learning rate, epoch count and tolerance of `fit`.
# NOTE(review): `nn` is whatever network the previously executed cell bound
# to that name -- presumably the NeuralNetwork_D built above.
nn.fit(
    x_train=X_TRAIN,
    y_train=Y_TRAIN
)

# Plot the network's curve again, now with the parameters left by training.
x_ticks: np.ndarray = np.linspace(0, 1)
y_pred: np.ndarray = nn.predict(x_ticks)

plt.plot(x_ticks, y_pred)

# Questão E

## Criando uma rede neural específica para a questão E

In [None]:
class NeuralNetwork_E(BaseNeuralNetwork):
    """Network for question E: training adjusts the output layer's weights and bias."""

    def __init__(
        self, input_size: int, 
        hidden_layers: list[DenseLayer],
        output_layer: DenseLayer,
        activation_function: Literal['softplus'] | Literal['sigmoid'] = "softplus"
    ) -> None:
        """Forward every argument unchanged to the base network."""
        super().__init__(
            input_size,
            hidden_layers,
            output_layer,
            activation_function
        )

    def fit(
        self,
        x_train: np.ndarray,
        y_train: np.ndarray,
        learning_rate: float = 0.01,
        epochs: int = 100,
        tolerance: float = 10e-5,
    ) -> None:
        """Fit the output layer (weights and bias) with gradient descent.

        Hidden layers are never modified. Because the output layer is linear
        (``predict`` is called without an output activation), the error signal
        at the output is simply the squared-error derivative
        ``-2 * (y_train - y_pred)``.

        The progress log labels the parameters b3, w3 and w4, so it assumes an
        output layer with two inputs and one output.

        Args:
            x_train (np.ndarray): Training inputs; a 1-D array is reshaped to
                a single column.
            y_train (np.ndarray): Training targets; a 1-D array is reshaped to
                a single column.
            learning_rate (float, optional): Factor applied to the gradients.
                Defaults to 0.01.
            epochs (int, optional): Maximum number of updates. Defaults to 100.
            tolerance (float, optional): Training stops (before updating) once
                every weight and bias step is, in absolute value, at or below
                this value. Defaults to 10e-5.

        Raises:
            RuntimeError: If the forward pass cached no hidden-layer output.
        """
        if x_train.ndim == 1:
            x_train = x_train.reshape(-1, 1)
        if y_train.ndim == 1:
            y_train = y_train.reshape(-1, 1)

        for epoch in range(epochs):
            y_pred: np.ndarray = self.predict(x_train)

            # dLoss/dZ_output, one row per sample
            output_delta: np.ndarray = self._calcule_square_error_derivative_array(
                y_real=y_train,
                y_pred=y_pred,
            )

            # the output layer's input is the activation of the last hidden
            # layer, i.e. the second-to-last cached activation
            if len(self._cache_A) > 1:
                input_to_output: np.ndarray = self._cache_A[-2]
            else:
                raise RuntimeError("No hidden layers in neural network.")

            # dLoss/dW = A_hidden^T @ delta ; dLoss/db = delta summed over samples
            gradient_w_output: np.ndarray = input_to_output.T @ output_delta
            gradient_b_output: np.ndarray = np.sum(output_delta, axis=0)

            output_layer_weights_step_size: np.ndarray = learning_rate * gradient_w_output
            output_layer_bias_step_size: np.ndarray = learning_rate * gradient_b_output

            # converged only when *every* parameter's step is negligible,
            # whatever the shape of the output layer
            if (
                np.all(np.abs(output_layer_bias_step_size) <= tolerance) and
                np.all(np.abs(output_layer_weights_step_size) <= tolerance)
            ):
                break

            print("="*50)
            print(f"Old b3: {self.output_layer.biases[0]}")
            print(f"b3 step size: {output_layer_bias_step_size[0]}")
            print("-"*25)
            print(f"Old w3: {self.output_layer.weights[0, 0]}")
            print(f"w3 step size: {output_layer_weights_step_size[0, 0]}")
            print("-"*25)
            print(f"Old w4: {self.output_layer.weights[1, 0]}")
            print(f"w4 step size: {output_layer_weights_step_size[1, 0]}")
            print("-"*25)

            # Rebind instead of updating in place: the arrays handed to the
            # layer by the caller are left untouched, and integer-typed
            # parameters do not trigger a casting error.
            self.output_layer.weights = self.output_layer.weights - output_layer_weights_step_size
            self.output_layer.biases = self.output_layer.biases - output_layer_bias_step_size

            print(f"New b_3: {self.output_layer.biases}")
            print(f"New w3: {self.output_layer.weights[0, 0]}")
            print(f"New w4: {self.output_layer.weights[1, 0]}")
            print(f"Epoch: {epoch}")
            print("="*50)


In [None]:
# Single hidden layer: 1 input feature -> 2 neurons, with fixed (not random)
# weights and biases.
hidden_layers = [
    DenseLayer(
        input_size=1,
        output_size=2,
        weights=np.array([ 
            [3.34, -3.53]
        ]),
        biases=np.array(
            [-1.43, 0.57]
        )
    )
]

# Output layer: 2 hidden activations -> 1 output value. These are the
# parameters that NeuralNetwork_E.fit updates.
output_layer: DenseLayer = DenseLayer(
    input_size=2,
    output_size=1,
    weights=np.array([ 
        [1.0], [1.0] # default weights
    ]),        
    biases=np.array(
        [0.0] # default bias
    )
)

# The activation function is left at its default ("softplus").
nn = NeuralNetwork_E(
    input_size=1,
    hidden_layers=hidden_layers,
    output_layer=output_layer
)

# Evaluate the untrained network on evenly spaced inputs in [0, 1] and plot
# the resulting curve. `predict` reshapes the 1-D inputs into a column.
x_ticks: np.ndarray = np.linspace(0, 1)
y_pred: np.ndarray = nn.predict(x_ticks)

plt.plot(x_ticks, y_pred)

## Treinando a rede neural e plotando o resultado

In [None]:
# Three training samples: inputs and their target outputs.
X_TRAIN: np.ndarray = np.array([0.0, 0.5, 1.0])
Y_TRAIN: np.ndarray = np.array([0.0, 1.0, 0.0])
# Train with a larger learning rate and more epochs than the `fit` defaults;
# the tolerance keeps its default value.
# NOTE(review): `nn` is whatever network the previously executed cell bound
# to that name -- presumably the NeuralNetwork_E built above.
nn.fit(
    x_train=X_TRAIN,
    y_train=Y_TRAIN,
    learning_rate=0.125,
    epochs=500
)

# Plot the network's curve again, now with the parameters left by training.
x_ticks: np.ndarray = np.linspace(0, 1)
y_pred: np.ndarray = nn.predict(x_ticks)

plt.plot(x_ticks, y_pred)