# **Implementation with MinLSTM**

## **Sequential Mode**

In [12]:
import numpy as np

class MinLSTMinSequentialMode:
    """LSTM-style recurrent cell evaluated one time step at a time.

    Every gate (forget, input, cell candidate, output) owns an input weight
    matrix ``W*`` of shape (input_size, hidden_size), a recurrent weight
    matrix ``U*`` of shape (hidden_size, hidden_size) and a bias ``b*`` of
    shape (1, hidden_size).
    """

    def __init__(self, input_size, hidden_size):
        """Create the gate parameters.

        Weights are drawn from a standard normal scaled by 0.01; biases
        start at zero.

        Args:
        input_size (int): Number of features of each input vector
        hidden_size (int): Number of units in the hidden and cell states
        """
        self.hidden_size = hidden_size

        # Input-to-hidden weights, one matrix per gate
        self.Wf = np.random.randn(input_size, hidden_size) * 0.01  # Forget gate weights
        self.Wi = np.random.randn(input_size, hidden_size) * 0.01  # Input gate weights
        self.Wc = np.random.randn(input_size, hidden_size) * 0.01  # Cell candidate weights
        self.Wo = np.random.randn(input_size, hidden_size) * 0.01  # Output gate weights

        # Hidden-to-hidden (recurrent) weights, one matrix per gate
        self.Uf = np.random.randn(hidden_size, hidden_size) * 0.01  # Forget gate recurrent weights
        self.Ui = np.random.randn(hidden_size, hidden_size) * 0.01  # Input gate recurrent weights
        self.Uc = np.random.randn(hidden_size, hidden_size) * 0.01  # Cell candidate recurrent weights
        self.Uo = np.random.randn(hidden_size, hidden_size) * 0.01  # Output gate recurrent weights

        # Biases
        self.bf = np.zeros((1, hidden_size))
        self.bi = np.zeros((1, hidden_size))
        self.bc = np.zeros((1, hidden_size))
        self.bo = np.zeros((1, hidden_size))

    def forward_sequence(self, x_sequence):
        """
        Process an entire sequence in sequential mode

        Both states start at zero and every step is delegated to
        ``forward_single_step`` so the two entry points cannot diverge.

        Args:
        x_sequence (numpy.ndarray): Input sequence of shape (seq_length, input_size)

        Returns:
        tuple: Hidden states and cell states for the entire sequence,
        each of shape (seq_length, hidden_size)
        """
        seq_length = x_sequence.shape[0]
        h_states = np.zeros((seq_length, self.hidden_size))
        c_states = np.zeros((seq_length, self.hidden_size))

        # Initial hidden and cell states
        h_prev = np.zeros((1, self.hidden_size))
        c_prev = np.zeros((1, self.hidden_size))

        for t in range(seq_length):
            # Slice (not index) so the input keeps its leading axis: (1, input_size)
            h_prev, c_prev = self.forward_single_step(
                x_sequence[t:t + 1], h_prev, c_prev
            )

            # The (1, hidden_size) states are stored into row t
            h_states[t] = h_prev
            c_states[t] = c_prev

        return h_states, c_states

    def forward_single_step(self, x_t, h_prev, c_prev):
        """
        Process a single time step

        Args:
        x_t (numpy.ndarray): Input at current time step
        h_prev (numpy.ndarray): Previous hidden state
        c_prev (numpy.ndarray): Previous cell state

        Returns:
        tuple: Current hidden state and cell state
        """
        # Forget gate: how much of the previous cell state is kept
        f_t = self.sigmoid(self._affine(x_t, h_prev, self.Wf, self.Uf, self.bf))

        # Input gate: how much of the candidate is written
        i_t = self.sigmoid(self._affine(x_t, h_prev, self.Wi, self.Ui, self.bi))

        # Cell candidate
        c_tilde = np.tanh(self._affine(x_t, h_prev, self.Wc, self.Uc, self.bc))

        # New cell state
        c_t = f_t * c_prev + i_t * c_tilde

        # Output gate
        o_t = self.sigmoid(self._affine(x_t, h_prev, self.Wo, self.Uo, self.bo))

        # New hidden state
        h_t = o_t * np.tanh(c_t)

        return h_t, c_t

    @staticmethod
    def _affine(x_t, h_prev, W, U, b):
        """Return the gate pre-activation ``x_t @ W + h_prev @ U + b``."""
        return np.dot(x_t, W) + np.dot(h_prev, U) + b

    def sigmoid(self, x):
        """Return the logistic function 1 / (1 + exp(-x)) element-wise.

        Computed as exp(-log(1 + exp(-x))) through ``np.logaddexp`` so that
        large negative inputs do not overflow ``exp``.
        """
        return np.exp(-np.logaddexp(0.0, -x))


## **Parallel Mode** 

In [11]:
import numpy as np

class MinLSTMinParallelMode:
    """LSTM-style recurrent layer that batches the input projections.

    Every gate (forget, input, cell candidate, output) owns an input weight
    matrix ``W*`` of shape (input_size, hidden_size), a recurrent weight
    matrix ``U*`` of shape (hidden_size, hidden_size) and a bias ``b*`` of
    shape (1, hidden_size).

    Note: every gate reads the previous hidden state, so the recurrence in
    ``parallel_scan`` is still evaluated one step after another; only the
    input-dependent part is computed for all time steps at once.
    """

    def __init__(self, input_size, hidden_size):
        """Create the gate parameters.

        Weights are drawn from a standard normal scaled by 0.01; biases
        start at zero.

        Args:
        input_size (int): Number of features of each input vector
        hidden_size (int): Number of units in the hidden and cell states
        """
        self.hidden_size = hidden_size

        # Input-to-hidden weights
        self.Wf = np.random.randn(input_size, hidden_size) * 0.01  # Forget gate weights
        self.Wi = np.random.randn(input_size, hidden_size) * 0.01  # Input gate weights
        self.Wc = np.random.randn(input_size, hidden_size) * 0.01  # Cell state weights
        self.Wo = np.random.randn(input_size, hidden_size) * 0.01  # Output gate weights

        # Hidden-to-hidden (recurrent) weights
        self.Uf = np.random.randn(hidden_size, hidden_size) * 0.01  # Forget gate recurrent weights
        self.Ui = np.random.randn(hidden_size, hidden_size) * 0.01  # Input gate recurrent weights
        self.Uc = np.random.randn(hidden_size, hidden_size) * 0.01  # Cell state recurrent weights
        self.Uo = np.random.randn(hidden_size, hidden_size) * 0.01  # Output gate recurrent weights

        # Biases
        self.bf = np.zeros((1, hidden_size))
        self.bi = np.zeros((1, hidden_size))
        self.bc = np.zeros((1, hidden_size))
        self.bo = np.zeros((1, hidden_size))

    def parallel_scan(self, x_seq):
        """Run the layer over a whole sequence starting from zero states.

        Args:
        x_seq (numpy.ndarray): Input sequence of shape (seq_length, input_size)

        Returns:
        tuple: Hidden states and cell states, each of shape
        (seq_length, hidden_size)
        """
        n = x_seq.shape[0]
        h_states = np.zeros((n, self.hidden_size))
        c_states = np.zeros((n, self.hidden_size))
        if n == 0:
            # Nothing to scan; hand back the empty state arrays
            return h_states, c_states

        # The input projections do not depend on the recurrent state, so
        # they are computed for every time step with one matmul per gate.
        x_f = np.dot(x_seq, self.Wf) + self.bf
        x_i = np.dot(x_seq, self.Wi) + self.bi
        x_c = np.dot(x_seq, self.Wc) + self.bc
        x_o = np.dot(x_seq, self.Wo) + self.bo

        h_prev = np.zeros((self.hidden_size,))  # Initial hidden state
        c_prev = np.zeros((self.hidden_size,))  # Initial cell state

        for t in range(n):
            f_t = self.sigmoid(x_f[t] + np.dot(h_prev, self.Uf))
            i_t = self.sigmoid(x_i[t] + np.dot(h_prev, self.Ui))
            c_tilde = np.tanh(x_c[t] + np.dot(h_prev, self.Uc))
            o_t = self.sigmoid(x_o[t] + np.dot(h_prev, self.Uo))

            c_prev = f_t * c_prev + i_t * c_tilde
            h_prev = o_t * np.tanh(c_prev)

            h_states[t] = h_prev
            c_states[t] = c_prev

        return h_states, c_states

    def sigmoid(self, x):
        """Return the logistic function 1 / (1 + exp(-x)) element-wise.

        Computed as exp(-log(1 + exp(-x))) through ``np.logaddexp`` so that
        large negative inputs do not overflow ``exp``.
        """
        return np.exp(-np.logaddexp(0.0, -x))


## **BabyLM with MinLSTM**

In [14]:
class BabyLMwithMinLSTM:
    """Tiny next-token model: embedding lookup -> recurrent layer -> linear read-out."""

    def __init__(self, vocab_size, embedding_size, hidden_size):
        """Build the embedding table, the recurrent layer and the read-out.

        Args:
        vocab_size (int): Number of distinct tokens
        embedding_size (int): Width of each token embedding
        hidden_size (int): Width of the recurrent hidden state
        """
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size

        # One embedding row per token, small random initialisation
        self.embedding = 0.01 * np.random.randn(vocab_size, embedding_size)

        # Recurrent layer consuming the embedded tokens
        self.min_lstm = MinLSTMinParallelMode(embedding_size, hidden_size)

        # Linear read-out from hidden state to vocabulary scores
        self.Wo = 0.01 * np.random.randn(hidden_size, vocab_size)
        self.bo = np.zeros((1, vocab_size))

    def forward(self, input_seq):
        """Return one row of vocabulary logits per position of ``input_seq``.

        Args:
        input_seq (numpy.ndarray): Token indices used to index the embedding table

        Returns:
        numpy.ndarray: Pre-softmax scores, hidden states projected by ``Wo`` plus ``bo``
        """
        token_vectors = self.embedding[input_seq]

        # Only the hidden states feed the read-out; the cell states are dropped
        hidden, _cell = self.min_lstm.parallel_scan(token_vectors)

        return np.dot(hidden, self.Wo) + self.bo

    def predict(self, input_seq):
        """Return the index of the most probable token after ``input_seq``."""
        # The last row of logits scores the continuation of the sequence
        final_scores = self.forward(input_seq)[-1]
        next_token_probs = self.softmax(final_scores)
        return np.argmax(next_token_probs)

    def softmax(self, x):
        """Return ``exp(x)`` normalised by its total sum."""
        # Subtracting the maximum keeps exp() from overflowing
        shifted = x - np.max(x)
        weights = np.exp(shifted)
        return weights / np.sum(weights)

# Demo: ask the model for the word that follows a short prompt.
# Word -> index table, numbered in listing order, plus the reverse table
# used to turn a predicted index back into a word.
vocab = {
    word: idx
    for idx, word in enumerate(("hello", "world", "this", "is", "test"))
}
inv_vocab = dict(zip(vocab.values(), vocab.keys()))
vocab_size = len(vocab)

# Model dimensions
hidden_size = 4  # Width of the recurrent hidden state
embedding_size = 5  # Width of each token embedding

# Build the language model (weights are randomly initialised, not trained)
baby_lm_lstm = BabyLMwithMinLSTM(
    vocab_size=vocab_size,
    embedding_size=embedding_size,
    hidden_size=hidden_size,
)

# Prompt, as words and as the matching index array
input_words = ["hello", "world", "this", "is"]
input_seq = np.array([vocab[token] for token in input_words])

# Most probable next token, decoded back to its word
next_word_idx = baby_lm_lstm.predict(input_seq)
next_word = inv_vocab[next_word_idx]

print("Input words:", input_words)
print("Predicted next word:", next_word)

Input words: ['hello', 'world', 'this', 'is']
Predicted next word: hello
