# **Implementation of MinGRU**

# **Sequential Mode**

In [1]:
import numpy as np

class MinGRUinSequentialMode:
    """Single-step gated recurrent cell with an update gate and no reset gate.

    One call to :meth:`forward` computes::

        z_t     = sigmoid(x_t @ Wz + h_prev @ Uz + bz)
        h_tilde = tanh(x_t @ Wh + h_prev @ Uh + bh)
        h_t     = (1 - z_t) * h_prev + z_t * h_tilde

    NOTE: both the gate and the candidate read ``h_prev`` here. In the
    minGRU formulation of Feng et al. (2024) they depend on ``x_t`` only,
    which is what makes that model trainable with a parallel scan.
    """

    def __init__(self, input_size, hidden_size):
        """Create a cell with small random weights and zero biases.

        Args:
            input_size: Length of each input vector.
            hidden_size: Length of the hidden state vector.
        """
        self.hidden_size = hidden_size

        # Weights are drawn in the order Wz, Wh, Uz, Uh so that results stay
        # reproducible under a fixed ``np.random.seed``.
        self.Wz = np.random.randn(input_size, hidden_size) * 0.01  # Update gate, input weights
        self.Wh = np.random.randn(input_size, hidden_size) * 0.01  # Candidate state, input weights
        self.Uz = np.random.randn(hidden_size, hidden_size) * 0.01  # Update gate, recurrent weights
        self.Uh = np.random.randn(hidden_size, hidden_size) * 0.01  # Candidate state, recurrent weights

        # Biases are stored as (1, hidden_size) rows so they broadcast over
        # any leading batch axis.
        self.bz = np.zeros((1, hidden_size))
        self.bh = np.zeros((1, hidden_size))

    def forward(self, x_t, h_prev):
        """Advance the hidden state by one step.

        Args:
            x_t: Input array whose last axis has length ``input_size``.
            h_prev: Previous hidden state whose last axis has length
                ``hidden_size``; it is broadcast against the gate values.

        Returns:
            The new hidden state ``(1 - z_t) * h_prev + z_t * h_tilde``.
        """
        # Update gate: how much of the candidate replaces the old state.
        z_t = self.sigmoid(np.dot(x_t, self.Wz) + np.dot(h_prev, self.Uz) + self.bz)

        # Candidate hidden state
        h_tilde = np.tanh(np.dot(x_t, self.Wh) + np.dot(h_prev, self.Uh) + self.bh)

        # Convex combination of the old state and the candidate.
        h_t = (1 - z_t) * h_prev + z_t * h_tilde

        return h_t

    def sigmoid(self, x):
        """Return the logistic function ``1 / (1 + exp(-x))`` element-wise.

        Only exponentials of non-positive values are evaluated, so large
        negative inputs do not overflow ``np.exp`` the way the direct
        formula does.
        """
        # x >= 0: exp(0) / (1 + exp(-x));  x < 0: exp(x) / (1 + exp(x)).
        return np.exp(np.minimum(x, 0)) / (1 + np.exp(-np.abs(x)))

# Demo: one step of the sequential cell on a small random batch.
input_size, hidden_size = 3, 2  # input vector length, hidden state length

min_gru = MinGRUinSequentialMode(input_size=input_size, hidden_size=hidden_size)

# Five input rows are processed at once; the single zero row used as the
# initial hidden state broadcasts across all of them.
x_t = np.random.randn(5, input_size)
h_prev = np.zeros(shape=(1, hidden_size))

# Run the step and show the resulting (5, hidden_size) state.
h_t = min_gru.forward(x_t=x_t, h_prev=h_prev)
print("New hidden state:", h_t)

New hidden state: [[-0.00063065  0.00440903]
 [ 0.01602061 -0.00444691]
 [-0.00231247 -0.00320737]
 [-0.0114797  -0.00222481]
 [ 0.01844214  0.00330333]]


# **Parallel Mode**

In [3]:
import numpy as np

class MinGRUinParallelMode:
    """Gated recurrent layer that unrolls a whole input sequence.

    For every time step ``t`` the layer computes::

        z_t     = sigmoid(x_t @ Wz + h_{t-1} @ Uz + bz)
        h_tilde = tanh(x_t @ Wh + h_{t-1} @ Uh + bh)
        h_t     = (1 - z_t) * h_{t-1} + z_t * h_tilde

    starting from an all-zero hidden state.

    NOTE: because the gate and the candidate both read ``h_{t-1}``, the
    steps must be evaluated one after another. ``parallel_scan`` is kept
    for interface compatibility, but it runs the same sequential loop as
    ``forward``. The minGRU of Feng et al. (2024) removes that dependency,
    which is what permits a true parallel scan.
    """

    def __init__(self, input_size, hidden_size):
        """Create a layer with small random weights and zero biases.

        Args:
            input_size: Length of each input vector.
            hidden_size: Length of the hidden state vector.
        """
        self.hidden_size = hidden_size

        # Weights are drawn in the order Wz, Wh, Uz, Uh so that results stay
        # reproducible under a fixed ``np.random.seed``.
        self.Wz = np.random.randn(input_size, hidden_size) * 0.01  # Update gate, input weights
        self.Wh = np.random.randn(input_size, hidden_size) * 0.01  # Candidate state, input weights
        self.Uz = np.random.randn(hidden_size, hidden_size) * 0.01  # Update gate, recurrent weights
        self.Uh = np.random.randn(hidden_size, hidden_size) * 0.01  # Candidate state, recurrent weights

        # Biases are (1, hidden_size) rows.
        self.bz = np.zeros((1, hidden_size))
        self.bh = np.zeros((1, hidden_size))

    def forward(self, x_seq):
        """Return the hidden state after every step of ``x_seq``.

        Args:
            x_seq: Array of shape ``(n_steps, input_size)``.

        Returns:
            Array of shape ``(n_steps, hidden_size)``; row ``t`` is ``h_t``.
            An empty sequence yields an empty ``(0, hidden_size)`` array.
        """
        n_steps = x_seq.shape[0]
        h_states = np.zeros((n_steps, self.hidden_size))

        # Explicit zero initial state, carried forward as a (1, hidden) row.
        h_prev = np.zeros((1, self.hidden_size))

        for t in range(n_steps):
            # Update gate
            z_t = self.sigmoid(np.dot(x_seq[t], self.Wz) + np.dot(h_prev, self.Uz) + self.bz)

            # Candidate hidden state
            h_tilde = np.tanh(np.dot(x_seq[t], self.Wh) + np.dot(h_prev, self.Uh) + self.bh)

            # New hidden state, stored and carried to the next step
            h_prev = (1 - z_t) * h_prev + z_t * h_tilde
            h_states[t] = h_prev

        return h_states

    def parallel_scan(self, x_seq):
        """Return the hidden states for ``x_seq``; same result as ``forward``.

        The recurrence depends on the previous hidden state, so it is
        evaluated sequentially. Delegating to ``forward`` keeps a single
        implementation and makes the zero initial state explicit instead
        of reading ``h_states[-1]`` at the first step.
        """
        return self.forward(x_seq)

    def sigmoid(self, x):
        """Return the logistic function ``1 / (1 + exp(-x))`` element-wise.

        Only exponentials of non-positive values are evaluated, so large
        negative inputs do not overflow ``np.exp``.
        """
        # x >= 0: exp(0) / (1 + exp(-x));  x < 0: exp(x) / (1 + exp(x)).
        return np.exp(np.minimum(x, 0)) / (1 + np.exp(-np.abs(x)))

# Demo: unroll the layer over a short random sequence.
input_size, hidden_size = 3, 2  # input vector length, hidden state length

min_gru = MinGRUinParallelMode(input_size=input_size, hidden_size=hidden_size)

# Five time steps, one input vector per step.
x_seq = np.random.randn(5, input_size)

# One hidden-state row per time step.
h_states = min_gru.parallel_scan(x_seq=x_seq)
print("Hidden states:\n", h_states)

Hidden states:
 [[-0.00541459  0.0037567 ]
 [-0.00131788  0.0039851 ]
 [ 0.00120119  0.00659064]
 [ 0.00329749  0.00253187]
 [-0.00310563  0.00456795]]


# **BabyLM from MinGRU**

In [5]:
import numpy as np

class BabyLMwithMinGRU:
    """Tiny next-token model: embedding lookup -> recurrent layer -> linear output.

    All parameters are randomly initialized; this class contains no
    training code, so predictions from a fresh instance are arbitrary.
    """

    def __init__(self, vocab_size, embedding_size, hidden_size):
        """Allocate the embedding table, the recurrent layer and the output layer.

        Args:
            vocab_size: Number of distinct token indices.
            embedding_size: Length of each token embedding.
            hidden_size: Length of the recurrent hidden state.
        """
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size

        # Embedding layer: one row per token index.
        self.embedding = np.random.randn(vocab_size, embedding_size) * 0.01

        # Recurrent layer that consumes the embedded sequence.
        self.min_gru = MinGRUinParallelMode(embedding_size, hidden_size)

        # Output layer projecting hidden states onto the vocabulary.
        self.Wo = np.random.randn(hidden_size, vocab_size) * 0.01  # Output weights
        self.bo = np.zeros((1, vocab_size))  # Output biases

    def forward(self, input_seq):
        """Return one row of logits per position of ``input_seq``.

        Args:
            input_seq: Integer array of token indices used to index the
                embedding table.

        Returns:
            Array of shape ``(len(input_seq), vocab_size)`` with the
            unnormalized scores for every position.
        """
        # Step 1: look up the embedding of every token.
        embedded_seq = self.embedding[input_seq]

        # Step 2: run the recurrent layer over the embedded sequence.
        h_states = self.min_gru.parallel_scan(embedded_seq)

        # Step 3: project the hidden states to logits (pre-softmax output).
        logits = np.dot(h_states, self.Wo) + self.bo

        return logits

    def predict(self, input_seq):
        """Return the index of the most probable next token.

        Only the logits of the last position are used; an empty
        ``input_seq`` therefore raises ``IndexError``.
        """
        # Forward pass
        logits = self.forward(input_seq)

        # Probabilities of the next token, taken from the final position.
        probabilities = self.softmax(logits[-1])

        # Index of the highest-probability token.
        return np.argmax(probabilities)

    def softmax(self, x, axis=-1):
        """Return the softmax of ``x`` along ``axis``.

        Args:
            x: Array of scores.
            axis: Axis along which probabilities sum to one. The default
                (last axis) gives the usual result for a 1-D vector and
                normalizes each row independently for 2-D input.
        """
        # Subtract the per-slice maximum for numerical stability.
        exp_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
        return exp_x / np.sum(exp_x, axis=axis, keepdims=True)

class MinGRUinParallelMode:
    """Gated recurrent layer that maps an input sequence to hidden states.

    Starting from a zero hidden state, each step ``t`` applies::

        z_t     = sigmoid(x_t @ Wz + h_{t-1} @ Uz + bz)
        h_tilde = tanh(x_t @ Wh + h_{t-1} @ Uh + bh)
        h_t     = (1 - z_t) * h_{t-1} + z_t * h_tilde

    NOTE: the gate and candidate depend on ``h_{t-1}``, so the steps are
    inherently sequential. ``parallel_scan`` exists for interface
    compatibility and returns exactly what ``forward`` returns. In the
    minGRU of Feng et al. (2024) the gate and candidate depend on ``x_t``
    only, which is what allows a genuine parallel scan.
    """

    def __init__(self, input_size, hidden_size):
        """Create a layer with small random weights and zero biases.

        Args:
            input_size: Length of each input vector.
            hidden_size: Length of the hidden state vector.
        """
        self.hidden_size = hidden_size

        # Draw order (Wz, Wh, Uz, Uh) is kept fixed so seeded runs are
        # reproducible.
        self.Wz = np.random.randn(input_size, hidden_size) * 0.01  # Update gate, input weights
        self.Wh = np.random.randn(input_size, hidden_size) * 0.01  # Candidate state, input weights
        self.Uz = np.random.randn(hidden_size, hidden_size) * 0.01  # Update gate, recurrent weights
        self.Uh = np.random.randn(hidden_size, hidden_size) * 0.01  # Candidate state, recurrent weights

        # Biases are (1, hidden_size) rows.
        self.bz = np.zeros((1, hidden_size))
        self.bh = np.zeros((1, hidden_size))

    def forward(self, x_seq):
        """Return the hidden state after every step of ``x_seq``.

        Args:
            x_seq: Array of shape ``(n_steps, input_size)``.

        Returns:
            Array of shape ``(n_steps, hidden_size)``; row ``t`` is ``h_t``.
            An empty sequence yields an empty ``(0, hidden_size)`` array.
        """
        n = x_seq.shape[0]
        h_states = np.zeros((n, self.hidden_size))

        # Explicit zero initial state, carried forward as a (1, hidden) row.
        h_prev = np.zeros((1, self.hidden_size))

        for t in range(n):
            z_t = self.sigmoid(np.dot(x_seq[t], self.Wz) + np.dot(h_prev, self.Uz) + self.bz)
            h_tilde = np.tanh(np.dot(x_seq[t], self.Wh) + np.dot(h_prev, self.Uh) + self.bh)

            # Blend old state and candidate, then store and carry forward.
            h_prev = (1 - z_t) * h_prev + z_t * h_tilde
            h_states[t] = h_prev

        return h_states

    def parallel_scan(self, x_seq):
        """Return the hidden states for ``x_seq``; same result as ``forward``.

        Delegates to ``forward`` so there is a single implementation of
        the recurrence and the initial state is an explicit zero vector
        rather than whatever ``h_states[-1]`` holds at the first step.
        """
        return self.forward(x_seq)

    def sigmoid(self, x):
        """Return the logistic function ``1 / (1 + exp(-x))`` element-wise.

        Only exponentials of non-positive values are evaluated, so large
        negative inputs do not overflow ``np.exp``.
        """
        # x >= 0: exp(0) / (1 + exp(-x));  x < 0: exp(x) / (1 + exp(x)).
        return np.exp(np.minimum(x, 0)) / (1 + np.exp(-np.abs(x)))

# Example usage with text
def create_vocabulary(texts):
    """Build word<->index lookup tables from an iterable of strings.

    The texts are joined with spaces, lower-cased and split on whitespace.
    The distinct words are numbered from 0 in sorted order.

    Returns:
        A ``(word_to_idx, idx_to_word)`` pair of dictionaries that are
        inverses of each other.
    """
    corpus = ' '.join(texts)
    vocabulary = sorted(set(corpus.lower().split()))

    # Number the sorted words, then invert that table for the reverse lookup.
    idx_to_word = dict(enumerate(vocabulary))
    word_to_idx = {token: position for position, token in idx_to_word.items()}

    return word_to_idx, idx_to_word

# Toy corpus used only to derive the vocabulary.
texts = [
    "the quick brown fox jumps over the lazy dog",
    "a journey of a thousand miles begins with a single step",
    "to be or not to be that is the question"
]

# Word <-> index tables; both have one entry per distinct word.
word_to_idx, idx_to_word = create_vocabulary(texts)
vocab_size = len(idx_to_word)

# Hyperparameters
embedding_size, hidden_size = 10, 20

# The model is randomly initialized and never trained in this script,
# so the word it predicts is arbitrary.
baby_lm = BabyLMwithMinGRU(
    vocab_size=vocab_size,
    embedding_size=embedding_size,
    hidden_size=hidden_size,
)

# Encode the prompt as an array of vocabulary indices.
example_text = "the quick brown fox"
input_words = example_text.lower().split()
input_seq = np.array(list(map(word_to_idx.__getitem__, input_words)))

# Ask the model for the most likely next token and map it back to a word.
next_word_idx = baby_lm.predict(input_seq)
next_word = idx_to_word[next_word_idx]

print("Input sequence:", example_text)
print("Vocabulary size:", vocab_size)
print("Predicted next word:", next_word)
print("Predicted next word index:", next_word_idx)

Input sequence: the quick brown fox
Vocabulary size: 24
Predicted next word: jumps
Predicted next word index: 8
