# **LMM (Long-Term Memory Module)**
Let's build a tiny LMM (Long-Term Memory Module).

In [None]:
import torch
import torch.nn as nn

In [None]:
# Vocabulary: every token the toy model can read or emit, grouped by role.
vocab = (
    # Attribute keys
    ["user_name", "pet_name", "favorite_color", "city", "job",
     "age", "hobby", "car", "food", "sport"]
    # Attribute values for person 1 (Fahad)
    + ["Fahad", "Max", "blue", "Toronto", "engineer",
       "23", "coding", "Tesla", "biryani", "MMA"]
    # Attribute values for person 2 (Alice)
    + ["Alice", "Luna", "green", "Berlin", "designer",
       "28", "painting", "BMW", "pasta", "tennis"]
    # Relation words that join a key to its value
    + ["is", "has", "likes"]
)

# Two-way lookup between a word and its position in `vocab`.
word2idx = {word: index for index, word in enumerate(vocab)}
idx2word = dict(enumerate(vocab))


In [None]:
# Token embeddings: one 16-dimensional vector per vocabulary entry.
# Gradients are switched off, so these vectors stay fixed during training.
embed = nn.Embedding(num_embeddings=len(vocab), embedding_dim=16)
embed.weight.requires_grad_(False)

# LMM: the "long-term memory" -- a small MLP that maps an embedding-sized
# vector to another embedding-sized vector.
class LMM(nn.Module):
    """Two-layer MLP memory module: ``dim -> hidden_dim -> dim`` with a ReLU.

    Args:
        dim: Size of the input and output vectors. Defaults to 16, the
            embedding width used by the toy example.
        hidden_dim: Width of the hidden layer. Defaults to 128.
    """

    def __init__(self, dim=16, hidden_dim=128):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim, hidden_dim),  # capacity
            nn.ReLU(),
            nn.Linear(hidden_dim, dim),
        )

    def forward(self, x):
        """Apply the MLP to ``x`` (last dimension ``dim``); output has the same shape."""
        return self.net(x)


# Stand-alone memory module and an SGD optimizer over its parameters.
# NOTE(review): the later training cell trains `MAC_Layer` (which builds its
# own LMM) and rebinds `optimizer`, so this pair looks unused -- confirm
# before removing.
lmm = LMM()
optimizer = torch.optim.SGD(lmm.parameters(), lr=0.2)

# --- HELPERS ---
def word_to_vec(w):
    """Look up the embedding of word ``w``.

    Returns a tensor of shape (1, embedding_dim). Raises ``KeyError`` when
    ``w`` is not a key of ``word2idx``.
    """
    index = torch.tensor([word2idx[w]])
    return embed(index)

def vec_to_word(v):
    """Return the vocabulary word whose embedding is nearest to ``v``.

    "Nearest" is the smallest Euclidean distance between ``v`` and each row
    of ``embed.weight``. The search runs without gradient tracking.
    """
    with torch.no_grad():
        offsets = embed.weight - v
        nearest = torch.norm(offsets, dim=1).argmin().item()
    return idx2word[nearest]

In [None]:
# Attention
class SimpleAttention(nn.Module):
    """Single-head scaled dot-product self-attention.

    Every position attends to every position of the input; no mask is applied.

    Args:
        dim: Feature size of the input, and of the query/key/value projections.
    """

    def __init__(self, dim=16):
        super().__init__()
        self.dim = dim
        self.query = nn.Linear(dim, dim)
        self.key = nn.Linear(dim, dim)
        self.value = nn.Linear(dim, dim)

    def forward(self, x):
        """Attend over the rows of ``x``.

        Args:
            x: Tensor of shape (seq_len, dim).

        Returns:
            ``(output, attn_weights)`` where ``output`` is (seq_len, dim) and
            ``attn_weights`` is (seq_len, seq_len) with each row summing to 1.
        """
        q = self.query(x)  # (seq_len, dim)
        k = self.key(x)    # (seq_len, dim)
        v = self.value(x)  # (seq_len, dim)

        # Scale by sqrt(dim) of *this* module rather than a hard-coded 16, so
        # the scores stay correctly scaled when dim != 16.
        scores = torch.matmul(q, k.transpose(-2, -1)) / (self.dim ** 0.5)

        attn_weights = torch.softmax(scores, dim=-1)
        output = torch.matmul(attn_weights, v)

        return output, attn_weights

In [None]:
# MAC Layer (combines LMM + Attention)
class MAC_Layer(nn.Module):
    """Memory-as-context layer: LMM read-out followed by self-attention.

    Each input token is passed through the LMM to get a "memory" vector. The
    original tokens and their memory vectors are stacked into one sequence of
    length ``2 * seq_len`` and attended over jointly.
    """

    def __init__(self):
        super().__init__()
        self.lmm = LMM()
        self.attention = SimpleAttention(16)

    def forward(self, tokens):
        """Run the layer on ``tokens`` of shape (seq_len, 16).

        Returns:
            ``(output, weights)`` where ``output`` holds the attended vectors
            for the original token positions only, shape (seq_len, 16), and
            ``weights`` is the full attention matrix over tokens + memory,
            shape (2 * seq_len, 2 * seq_len).
        """
        # One batched LMM call instead of a Python loop over tokens: the LMM
        # is built from nn.Linear/ReLU, which act row-wise on a
        # (seq_len, dim) input.
        memory_vecs = self.lmm(tokens)

        # Original tokens first, then their memory vectors.
        combined = torch.cat([tokens, memory_vecs], dim=0)

        # Attention over both halves.
        output, weights = self.attention(combined)

        # Keep only the rows that correspond to the original tokens.
        return output[:len(tokens)], weights

In [None]:

# Training data: ten (key, relation, value) facts about one person.
base_facts = [
    [key, relation, value]
    for key, relation, value in (
        ("user_name", "is", "Fahad"),
        ("pet_name", "is", "Max"),
        ("city", "is", "Toronto"),
        ("job", "is", "engineer"),
        ("favorite_color", "is", "blue"),
        ("age", "is", "23"),
        ("hobby", "is", "coding"),
        ("car", "is", "Tesla"),
        ("food", "likes", "biryani"),
        ("sport", "likes", "MMA"),
    )
]

# Cycle through the ten facts 30 times -> 300 training sequences per epoch,
# with the facts evenly interleaved.
sequence = base_facts * 30

In [None]:
# Sanity check: every token used by the training facts must have a vocab index.
print("🔍 Checking vocabulary coverage...\n")

# Distinct tokens across all training sequences, then those without an index.
all_words = {word for seq in sequence for word in seq}
missing_words = [word for word in all_words if word not in word2idx]

if not missing_words:
    print("✅ All words found in vocabulary!")
else:
    print(f"❌ Missing words: {missing_words}")
    print(f"\nCurrent vocab: {vocab}")

print()

🔍 Checking vocabulary coverage...

✅ All words found in vocabulary!



In [None]:
print("\nTraining MAC on sequences...\n")

mac = MAC_Layer()
# Plain SGD with a small learning rate and light weight decay.
optimizer = torch.optim.SGD(mac.parameters(), lr=0.05, weight_decay=0.0001)

for epoch in range(200):
    total_loss = 0

    for seq in sequence:
        # One row of (frozen) embedding per token in the fact.
        seq_vecs = torch.stack([word_to_vec(w).squeeze(0) for w in seq])
        output, attn_weights = mac(seq_vecs)

        # Next-token regression: the output at position `pos` should match the
        # embedding of the token at `pos + 1` (mean squared error per position).
        # NOTE(review): SimpleAttention applies no causal mask, so the output at
        # `pos` is computed with token `pos + 1` visible in the input.
        loss = sum(
            (output[pos] - seq_vecs[pos + 1]).pow(2).mean()
            for pos in range(len(seq) - 1)
        )

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Report the summed loss of the epoch every 40 epochs.
    if epoch % 40 == 0:
        print(f"Epoch {epoch}: Loss = {total_loss:.4f}")

print("\n✅ Training complete!")


Training MAC on sequences...

Epoch 0: Loss = 301.3813
Epoch 40: Loss = 0.0076
Epoch 80: Loss = 0.0032
Epoch 120: Loss = 0.0022
Epoch 160: Loss = 0.0019

✅ Training complete!


In [None]:
print("\n-----------TOKEN-BY-TOKEN GENERATION TEST:----------------\n")

# (prompt tokens, expected next token) pairs -- one per trained fact.
tests = [
    (["user_name", "is"], "Fahad"),
    (["pet_name", "is"], "Max"),
    (["city", "is"], "Toronto"),
    (["job", "is"], "engineer"),
    (["favorite_color", "is"], "blue"),
    (["age", "is"], "23"),
    (["hobby", "is"], "coding"),
    (["car", "is"], "Tesla"),
    (["food", "likes"], "biryani"),
    (["sport", "likes"], "MMA")
]

correct = 0
for query, expected in tests:
    query_vecs = torch.stack([word_to_vec(w).squeeze(0) for w in query])
    # Inference only: no autograd graph is needed here.
    with torch.no_grad():
        output, _ = mac(query_vecs)
    # The output at the last prompt position is the prediction for the next token.
    pred_word = vec_to_word(output[-1])

    is_correct = pred_word == expected
    status = "✅" if is_correct else "❌"
    print(f"{status} '{' '.join(query)}' → Expected: '{expected}' | Got: '{pred_word}'")
    if is_correct:
        correct += 1

# Derive the denominator from the test list so the score stays right when
# tests are added or removed.
print(f"\nScore: {correct}/{len(tests)}")


-----------TOKEN-BY-TOKEN GENERATION TEST:----------------

✅ 'user_name is' → Expected: 'Fahad' | Got: 'Fahad'
✅ 'pet_name is' → Expected: 'Max' | Got: 'Max'
✅ 'city is' → Expected: 'Toronto' | Got: 'Toronto'
❌ 'job is' → Expected: 'engineer' | Got: 'BMW'
✅ 'favorite_color is' → Expected: 'blue' | Got: 'blue'
✅ 'age is' → Expected: '23' | Got: '23'
❌ 'hobby is' → Expected: 'coding' | Got: 'likes'
✅ 'car is' → Expected: 'Tesla' | Got: 'Tesla'
✅ 'food likes' → Expected: 'biryani' | Got: 'biryani'
❌ 'sport likes' → Expected: 'MMA' | Got: 'car'

Score: 7/10


#### **How It All Works**


---

## **What Each Part Does:**

### **1. Vocabulary**
- List of all words the model knows
- Creates mappings: word ↔ number

### **2. Embeddings (Frozen)**
- Converts each word into a vector of 16 numbers
- "Frozen" = these vectors don't change during training
- Like a dictionary: "Fahad" → [0.2, -0.1, 0.8, ...]

### **3. LMM (Long-Term Memory)**
- A small neural network (2 layers)
- Takes a 16-number vector IN
- Outputs a 16-number vector OUT
- **Its job:** Learn to map questions → answers
- Example: "pet_name" → "Max"

### **4. Attention**
- Compares tokens to each other
- Decides which tokens are important
- Creates weighted combinations
- **Its job:** "Which past tokens should I focus on?"

### **5. MAC Layer**
- **Combines LMM + Attention**
- For each token:
  - Get memory summary from LMM
  - Combine original token + memory summary
  - Use attention to decide: use memory or not?

### **6. Training Loop**
- Processes sequences: ["user_name", "is", "Fahad"]
- At each position, tries to predict the NEXT word
- Position 0: See "user_name" → predict "is"
- Position 1: See "user_name is" → predict "Fahad"
- Calculates error (loss)
- Updates LMM and Attention weights to reduce error

### **7. Testing**
- After training, give the model a prompt such as "pet_name is" and check which word it predicts next
- MAC uses both memory and attention to respond

---


In [14]:
# Mount Google Drive at /content/drive (Colab-only: needs the google.colab package).
# NOTE(review): no visible cell reads from or writes to /content/drive --
# confirm this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Training Titan MAC on Tiny Stories**
As we saw, the results on the tiny toy dataset were not good enough, so let's train on real data.

In [1]:
!pip install datasets transformers



In [2]:
from datasets import load_dataset
from transformers import GPT2Tokenizer

In [3]:
# Load only the first 1000 examples of the TinyStories training split.
dataset = load_dataset(
    "roneneldan/TinyStories",
    split="train[:1000]",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00004-2d5a1467fff108(…):   0%|          | 0.00/249M [00:00<?, ?B/s]

data/train-00001-of-00004-5852b56a2bd28f(…):   0%|          | 0.00/248M [00:00<?, ?B/s]

data/train-00002-of-00004-a26307300439e9(…):   0%|          | 0.00/246M [00:00<?, ?B/s]

data/train-00003-of-00004-d243063613e5a0(…):   0%|          | 0.00/248M [00:00<?, ?B/s]

data/validation-00000-of-00001-869c898b5(…):   0%|          | 0.00/9.99M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2119719 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/21990 [00:00<?, ? examples/s]

In [4]:
# Pretrained GPT-2 tokenizer; later cells use it to encode story text into
# token ids, to decode predicted ids, and to read the vocabulary size.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [5]:
import torch
import torch.nn as nn

In [22]:
# Model dimensions: embedding width, and vocabulary size read from the tokenizer.
embed_dim = 512
vocab_size = tokenizer.vocab_size

In [23]:
# Token-embedding table with one `embed_dim`-sized row per vocabulary entry.
# Unlike the toy example, these embeddings are marked trainable.
embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
embed.weight.requires_grad = True

In [32]:
# LMM & Attention
class LMM(nn.Module):
    """Four-layer MLP memory: ``dim -> 1024 -> 1024 -> 1024 -> dim``.

    A ReLU follows every linear layer except the last one.
    """

    def __init__(self, dim=512):
        super().__init__()
        hidden = 1024
        widths = [dim, hidden, hidden, hidden, dim]

        layers = []
        for in_features, out_features in zip(widths[:-1], widths[1:]):
            layers += [nn.Linear(in_features, out_features), nn.ReLU()]

        # Drop the trailing ReLU: the output layer has no activation.
        self.net = nn.Sequential(*layers[:-1])

    def forward(self, x):
        """Apply the MLP to ``x`` (last dimension ``dim``); output has the same shape."""
        return self.net(x)

In [33]:

class Attention(nn.Module):
    """Single-head scaled dot-product self-attention (no mask applied)."""

    def __init__(self, dim=512):
        super().__init__()
        self.dim = dim
        self.query = nn.Linear(dim, dim)
        self.key = nn.Linear(dim, dim)
        self.value = nn.Linear(dim, dim)

    def forward(self, x):
        """Return ``(output, attn_weights)`` for ``x`` with last dimension ``dim``."""
        queries, keys, values = self.query(x), self.key(x), self.value(x)

        # Pairwise similarities, scaled by sqrt(dim) before the softmax.
        scale = self.dim ** 0.5
        similarity = torch.matmul(queries, keys.transpose(-2, -1)) / scale
        attn_weights = torch.softmax(similarity, dim=-1)

        return torch.matmul(attn_weights, values), attn_weights

In [35]:
class _ProjectedMAC(nn.Module):
    """One MAC block: LMM read-out, joint self-attention, then a linear projection.

    NOTE(review): the only ``MAC_Layer`` defined in this notebook takes no
    constructor arguments and has no output projection, so the original
    ``MAC_Layer(dim, vocab_size=...)`` calls raise ``TypeError`` on a fresh
    kernel. This block is a reconstruction built from the ``LMM`` and
    ``Attention`` classes defined above -- confirm it matches the intended layer.

    Args:
        dim: Feature size of the incoming token vectors.
        out_dim: Feature size of the projected output.
    """

    def __init__(self, dim, out_dim):
        super().__init__()
        self.lmm = LMM(dim)
        self.attention = Attention(dim)
        self.out = nn.Linear(dim, out_dim)

    def forward(self, tokens):
        # Memory vector for every token, computed in one batched call.
        memory_vecs = self.lmm(tokens)
        # Attend jointly over the tokens and their memory vectors.
        combined = torch.cat([tokens, memory_vecs], dim=0)
        attended, weights = self.attention(combined)
        # Keep the rows of the original tokens and project them to out_dim.
        return self.out(attended[:tokens.shape[0]]), weights


class DeepMAC(nn.Module):
    """Stack of MAC blocks ending in a projection to vocabulary logits.

    Args:
        num_layers: Total number of MAC blocks; the first ``num_layers - 1``
            map ``dim -> dim`` and the last maps ``dim -> vocab_size``.
        dim: Feature size of the token vectors.
        vocab_size: Number of output logits per position.
    """

    def __init__(self, num_layers=3, dim=512, vocab_size=50257):
        super().__init__()

        # Intermediate blocks keep the feature size so they can be chained.
        self.mac_layers = nn.ModuleList([
            _ProjectedMAC(dim, out_dim=dim) for _ in range(num_layers - 1)
        ])

        # Final block outputs one logit per vocabulary entry.
        self.final_mac = _ProjectedMAC(dim, out_dim=vocab_size)

    def forward(self, tokens):
        """Map ``tokens`` of shape (seq_len, dim) to ``(logits, weights)``.

        ``logits`` has shape (seq_len, vocab_size); ``weights`` is the
        attention matrix of the final block, shape (2 * seq_len, 2 * seq_len).
        """
        x = tokens

        # Each intermediate block's output is the next block's input.
        for mac in self.mac_layers:
            x, _ = mac(x)

        logits, weights = self.final_mac(x)

        return logits, weights

In [36]:
# LMM = Your brain remembering facts
# Attention = Your focus/concentration
# MAC = Using memory + focus together to understand

In [37]:
def prepare_data(dataset, max_len=50, max_stories=100):
    """Tokenize stories and cut them into non-overlapping fixed-length chunks.

    Args:
        dataset: Indexable collection of records with a ``'text'`` field.
        max_len: Number of token ids per chunk; must be positive.
        max_stories: Upper bound on how many records (from the start of
            ``dataset``) are processed.

    Returns:
        A list of chunks, each a list of exactly ``max_len`` token ids. The
        tail of a story that does not fill a whole chunk is discarded.

    Raises:
        ValueError: If ``max_len`` is not positive.
    """
    if max_len <= 0:
        raise ValueError("max_len must be a positive integer")

    sequences = []

    for i in range(min(max_stories, len(dataset))):
        text = dataset[i]['text']
        tokens = tokenizer.encode(text, add_special_tokens=True)

        # `+ 1` keeps the last full window: without it a story of exactly
        # max_len tokens yields no chunk, and any story whose length is a
        # multiple of max_len loses its final chunk.
        for start in range(0, len(tokens) - max_len + 1, max_len):
            sequences.append(tokens[start:start + max_len])

    return sequences

# Build the training chunks, then report how many there are and their length.
sequences = prepare_data(dataset)
print(f"Prepared {len(sequences)} sequences")
first_sequence = sequences[0]
print(f"First sequence length: {len(first_sequence)}")

Prepared 324 sequences
First sequence length: 50


In [None]:
# Pick the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# The model AND the embedding table must live on the same device as the token
# ids, otherwise the embedding lookup fails with a device mismatch on CUDA.
mac = DeepMAC(num_layers=3, dim=512, vocab_size=tokenizer.vocab_size).to(device)
embed.to(device)

# `embed` is marked trainable, so its weights are optimized together with the model.
trainable_params = list(mac.parameters()) + list(embed.parameters())
optimizer = torch.optim.Adam(trainable_params, lr=0.0001)
criterion = nn.CrossEntropyLoss()

print("\n training Memory Augumented Context (MAC) on Tiny Stories (GPU)...\n")

# NOTE(review): the Attention class defined above applies no causal mask. If
# the MAC layers attend over the whole chunk, the logits at position i can see
# token i + 1 (the target) during training but not at generation time --
# confirm whether a causal mask is intended.
for epoch in range(50):
    total_loss = 0

    for token_ids in sequences:
        ids = torch.tensor(token_ids, device=device)
        seq_vecs = embed(ids)

        # Forward
        logits, _ = mac(seq_vecs)

        # Next-token loss: logits at position i are scored against token i + 1.
        # CrossEntropyLoss averages over the positions by default, which equals
        # summing the per-position losses and dividing by len(token_ids) - 1.
        loss = criterion(logits[:-1], ids[1:])

        # Backprop
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(trainable_params, 1.0)
        optimizer.step()

        total_loss += loss.item()

    # Print EVERY epoch
    print(f"Epoch {epoch}: Loss = {total_loss:.2f}")

print("\n Training complete!")

Using device: cuda

 training Memory Augumented Context (MAC) on Tiny Stories (GPU)...



In [31]:
print("\n testing MAC Generation:\n")

def generate_next_word(prompt):
    """Greedily predict the single token that follows ``prompt``.

    Args:
        prompt: Text to continue; it must encode to at least one token.

    Returns:
        The decoded text of the highest-scoring next token.

    Raises:
        ValueError: If ``prompt`` encodes to no tokens.
    """
    token_ids = tokenizer.encode(prompt)
    if not token_ids:
        raise ValueError("prompt must contain at least one token")

    with torch.no_grad():
        # Create the ids on the embedding table's own device so the lookup
        # works wherever `embed` lives, then move the vectors to the model's
        # device.
        ids = torch.tensor(token_ids, device=embed.weight.device)
        seq_vecs = embed(ids).to(device)

        logits, _ = mac(seq_vecs)

        # Logits at the last prompt position score the next token.
        next_token_id = torch.argmax(logits[-1]).item()

    return tokenizer.decode([next_token_id])

# Story openings used to spot-check the model's next-token prediction.
test_prompts = [
    "Once upon a time",
    "The little girl",
    "One day there was",
    "A cat and a dog",
    "The boy wanted to",
]

# Print each prompt next to the single token the model predicts after it.
for prompt in test_prompts:
    print(f"'{prompt}' → '{generate_next_word(prompt)}'")


 testing MAC Generation:

'Once upon a time' → ' there'
'The little girl' → ' alert'
'One day there was' → '.'
'A cat and a dog' → ''s'
'The boy wanted to' → ' away'
