In [16]:
import numpy as np
from typing import Literal

In [27]:
class GRU:
    def __init__(self, input_size, hidden_size, output_size, learning_rate: float = 0.01,
                 output_activation: Literal['linear', 'leaky_relu'] = 'linear',
                 leaky_relu_alpha: float = 0.01):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size

        # Инициализация весов для GRU
        self.Wz = np.random.rand(hidden_size, input_size) * 0.01
        self.Uz = np.random.rand(hidden_size, hidden_size) * 0.01
        self.bz = np.zeros((hidden_size, 1))

        self.Wh = np.random.rand(hidden_size, input_size) * 0.01
        self.Uh = np.random.randn(hidden_size, hidden_size) * 0.01
        self.bh = np.zeros((hidden_size, 1))

        self.Wr = np.random.rand(hidden_size, input_size) * 0.01
        self.Ur = np.random.rand(hidden_size, hidden_size) * 0.01
        self.br = np.zeros((hidden_size, 1))

        self.Wo = np.random.rand(output_size, hidden_size) * 0.01
        self.bo = np.zeros((output_size, 1))

        self.learning_rate = learning_rate

        # Output neurons activation func + params
        self.output_activation = output_activation
        self.leaky_relu_alpha = leaky_relu_alpha

    def activation(self, x):
        # Гиперболический арксинус
        return np.arcsinh(x)
    
    def leaky_relu(self, x, k: float = 0.01):
        return np.where(x > 0, x, k * x)
    
    def linear(self, x):
        return x

    def forward(self, X):
        self.h = np.zeros((self.hidden_size, 1))
        self.y = []
        self.cache = []

        for t in range(len(X)):
            x_t = X[t].reshape(-1, 1)

            # Грейдиентное обновление для GRU
            z_t = self.sigmoid(self.Wz @ x_t + self.Uz @ self.h + self.bz)
            r_t = self.sigmoid(self.Wr @ x_t + self.Ur @ self.h + self.br)
            h_tilde = self.activation(self.Wh @ x_t + self.Uh @ (r_t * self.h) + self.bh)
            h_next = (1 - z_t) * h_tilde + z_t * self.h
            self.h = h_next

            # Сохранение активаций для обратного распространения
            self.cache.append((x_t, z_t, r_t, h_tilde, h_next))

            # Выходное значение
            y_t = self.Wo @ h_next + self.bo

            if self.output_activation == 'linear':
                y_t = self.linear(y_t)
            elif self.output_activation == 'leaky_relu':
                y_t = self.leaky_relu(y_t, self.leaky_relu_alpha)

            self.y.append(y_t)

        return np.array(self.y).squeeze(axis=-1)

    def sigmoid(self, x):                
        return 1 / (1 + np.exp(-x))

    def backward(self, X, Y, grad_clip_value=5.0):
        dWz = np.zeros_like(self.Wz)
        dUz = np.zeros_like(self.Uz)
        dbz = np.zeros_like(self.bz)

        dWh = np.zeros_like(self.Wh)
        dUh = np.zeros_like(self.Uh)
        dbh = np.zeros_like(self.bh)

        dWr = np.zeros_like(self.Wr)
        dUr = np.zeros_like(self.Ur)
        dbr = np.zeros_like(self.br)

        dWo = np.zeros_like(self.Wo)
        dbo = np.zeros_like(self.bo)

        dh_next = np.zeros_like(self.h)

        # Обратное распространение ошибки по времени
        for t in reversed(range(len(X))):
            x_t, z_t, r_t, h_tilde, h_next = self.cache[t]
            y_t = self.y[t]

            # Ошибка на выходе
            dy = y_t - Y[t]

            # Градиенты для весов выхода
            dWo += dy @ h_next.T
            dbo += dy

            # Градиенты для скрытого состояния
            dh = self.Wo.T @ dy + dh_next
            dh_tilde = dh * (1 - z_t)
            dz = dh * (h_next - h_tilde)
            dWh += dh_tilde * (1 - h_tilde ** 2) @ x_t.T
            dUh += dh_tilde * (1 - h_tilde ** 2) @ (r_t * self.h).T
            dbh += dh_tilde * (1 - h_tilde ** 2)

            # Градиенты для обновлений
            dUr += dz * r_t * self.h @ self.h.T
            dWr += dz * r_t * self.h @ x_t.T
            dbr += dz * r_t * self.h

            # Градиенты для сброса и обновления
            dUz += dz * (1 - z_t) @ self.h.T
            dWz += dz * (1 - z_t) @ x_t.T
            dbz += dz * (1 - z_t)

            dh_next = (1 - z_t) * dh_tilde + z_t * dh
        
        # Ограничение градиентов, чтобы избежать взрыва
        gradients = [dWz, dUz, dbz, dWh, dUh, dbh, dWr, dUr, dbr, dWo, dbo]
        for grad in gradients:
            np.clip(grad, -grad_clip_value, grad_clip_value, out=grad)

        # Обновление весов
        self.Wz -= self.learning_rate * dWz
        self.Uz -= self.learning_rate * dUz
        self.bz -= self.learning_rate * dbz

        self.Wh -= self.learning_rate * dWh
        self.Uh -= self.learning_rate * dUh
        self.bh -= self.learning_rate * dbh

        self.Wr -= self.learning_rate * dWr
        self.Ur -= self.learning_rate * dUr
        self.br -= self.learning_rate * dbr

        self.Wo -= self.learning_rate * dWo
        self.bo -= self.learning_rate * dbo

    def train(self, X, Y, epochs=100, verbosity: int = 1, grad_clip_value: float = 5.0,
              reset_hidden: bool = False):
        for epoch in range(epochs):
            # Reset hidden state each epoch if needed
            if reset_hidden:
                self.h = np.zeros((self.hidden_size, 1))

            self.forward(X)
            self.backward(X, Y, grad_clip_value)
            if epoch % verbosity == 0:
                loss = np.mean((np.array(self.y) - Y) ** 2)
                print(f'Epoch {epoch}/{epochs}, Loss: {loss:.6f}')

In [3]:
# Prepare data using sliding window
def create_sliding_window_data(sequence, window_size):
    X, y = [], []
    for i in range(len(sequence) - window_size):
        X.append(sequence[i:i + window_size])
        y.append(sequence[i + window_size])
    return np.array(X), np.array(y).reshape(-1, 1)

In [4]:
# Fibonacci sequence generator
def fibonacci_generator(n):
    a, b = 0, 1
    for _ in range(n):
        yield a
        a, b = b, a + b

In [5]:
# Squared num sequence generator
def squared_generator(n, fst: float):
    num = fst
    for _ in range(n):
        yield num
        num = num**2

In [11]:
# Squared num sequence generator
def half_generator(n, fst: float):
    num = fst
    for _ in range(n):
        yield num
        num /= 2

In [20]:
# 1/n sequence generator
def one_by_n_generator(n):    
    for i in range(n):
        yield 1 / (i + 1)

In [41]:
# 1, -1, 1, -1, 1,... sequence generator
def plus_one_minus_one_generator(n):    
    for i in range(n):        
        yield 1 if i % 2 == 0 else -1

In [None]:
# Generate Fibonacci sequence and prepare data for prediction
n = 10
fibonacci_sequence = list(fibonacci_generator(n))
fib_max = max(fibonacci_sequence)
window_size = 3
X_fib, y_fib = create_sliding_window_data(fibonacci_sequence, window_size)
print(X_fib)
print(y_fib)
X_fib, y_fib = X_fib / fib_max, y_fib / fib_max
print(X_fib)
print(y_fib)


# Predict the next number in the Fibonacci sequence
fib_model = GRU(input_size=window_size, hidden_size=5, output_size=1, learning_rate=0.00001)
fib_model.train(X_fib, y_fib, epochs=20000, verbosity=500, grad_clip_value=1)
print(X_fib[-1].reshape(1, window_size))
predicted_fib = fib_model.forward(X_fib[-1].reshape(1, window_size)).squeeze()
print(predicted_fib)
print("Predicted Fibonacci sequence:", predicted_fib * fib_max)

In [56]:
# Generate squared sequence
output_activation = 'linear'
output_activation_alpha = 0.01

n = 15
sequence = list(plus_one_minus_one_generator(n))
window_size = 3
X, y = create_sliding_window_data(sequence, window_size)
X_train, y_train = X[0:int(len(X) * 0.8)], y[0:int(len(X) * 0.8)]
X_test, y_test = X[int(len(X) * 0.8):], y[int(len(X) * 0.8):]

# Predict the next number in the sequence
model = GRU(input_size=window_size, hidden_size=5, output_size=1, learning_rate=0.000001,
            output_activation=output_activation, leaky_relu_alpha=output_activation_alpha)
model.train(X_train, y_train, epochs=400000, verbosity=1000, grad_clip_value=1)
print('Test dataset validation')
mae = np.sum(np.absolute((y_test.squeeze() - model.forward(X_test).squeeze()))) / len(y_test)
print(f'MAE on test: {mae:.8f}')

Epoch 0/400000, Loss: 1.000002
Epoch 1000/400000, Loss: 0.997968
Epoch 2000/400000, Loss: 0.996268
Epoch 3000/400000, Loss: 0.994849
Epoch 4000/400000, Loss: 0.993663
Epoch 5000/400000, Loss: 0.992673
Epoch 6000/400000, Loss: 0.991846
Epoch 7000/400000, Loss: 0.991155
Epoch 8000/400000, Loss: 0.990579
Epoch 9000/400000, Loss: 0.990097
Epoch 10000/400000, Loss: 0.989694
Epoch 11000/400000, Loss: 0.989358
Epoch 12000/400000, Loss: 0.989077
Epoch 13000/400000, Loss: 0.988843
Epoch 14000/400000, Loss: 0.988647
Epoch 15000/400000, Loss: 0.988483
Epoch 16000/400000, Loss: 0.988347
Epoch 17000/400000, Loss: 0.988233
Epoch 18000/400000, Loss: 0.988137
Epoch 19000/400000, Loss: 0.988058
Epoch 20000/400000, Loss: 0.987991
Epoch 21000/400000, Loss: 0.987936
Epoch 22000/400000, Loss: 0.987889
Epoch 23000/400000, Loss: 0.987850
Epoch 24000/400000, Loss: 0.987818
Epoch 25000/400000, Loss: 0.987791
Epoch 26000/400000, Loss: 0.987769
Epoch 27000/400000, Loss: 0.987750
Epoch 28000/400000, Loss: 0.98773