# **LMM (Long-Term Memory Module)**
Let's build a tiny LMM (Long-Term Memory Module).

In [None]:
import torch
import torch.nn as nn

In [None]:
# Vocabulary: every token the toy model can read or emit, grouped by role.
vocab = (
    # Attribute keys
    [
        "user_name", "pet_name", "favorite_color", "city", "job",
        "age", "hobby", "car", "food", "sport",
    ]
    # Values - Person 1 (Fahad)
    + [
        "Fahad", "Max", "blue", "Toronto", "engineer",
        "23", "coding", "Tesla", "biryani", "MMA",
    ]
    # Values - Person 2 (Alice)
    + [
        "Alice", "Luna", "green", "Berlin", "designer",
        "28", "painting", "BMW", "pasta", "tennis",
    ]
    # Relation words that join a key to its value
    + ["is", "has", "likes"]
)

# Build index -> word from the list position, then invert it for word -> index.
idx2word = dict(enumerate(vocab))
word2idx = {word: idx for idx, word in idx2word.items()}


In [None]:
# Frozen word embeddings: one random 16-dimensional vector per vocabulary entry.
# Seed the RNG before the first random initialisation so the embedding table
# (and every layer created after it) is the same on each Restart & Run All.
torch.manual_seed(0)
embed = nn.Embedding(len(vocab), 16)
embed.weight.requires_grad_(False)  # frozen: these weights receive no gradient

# LMM: a small MLP used as the long-term memory.
class LMM(nn.Module):
    """Two-layer MLP that maps a vector to a memory vector of the same size.

    Args:
        dim: Size of the input and output vectors. Defaults to 16.
        hidden_dim: Width of the hidden layer (the memory's capacity).
            Defaults to 128.
    """

    def __init__(self, dim=16, hidden_dim=128):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim, hidden_dim),  # capacity
            nn.ReLU(),
            nn.Linear(hidden_dim, dim),
        )

    def forward(self, x):
        """Return the memory vector(s) for ``x``; the last dimension must be ``dim``."""
        return self.net(x)

# Stand-alone memory module with its own SGD optimizer.
# NOTE(review): MAC_Layer constructs a separate LMM and the training cell
# rebinds `optimizer`, so neither object is used again in the visible cells -
# confirm whether they are still needed.
lmm = LMM()
optimizer = torch.optim.SGD(params=lmm.parameters(), lr=0.2)

# --- HELPERS ---
def word_to_vec(w):
    """Look up the embedding of the word ``w``.

    Returns a tensor with a leading dimension of 1 (one row of the embedding
    table). Raises ``KeyError`` if ``w`` is not a key of ``word2idx``.
    """
    token_id = torch.tensor([word2idx[w]])
    return embed(token_id)

def vec_to_word(v):
    """Return the vocabulary word whose embedding is closest to ``v``.

    Closeness is the Euclidean distance between ``v`` and every row of the
    embedding table; the word at the row with the smallest distance wins.
    Runs under ``torch.no_grad()`` so the lookup never joins an autograd graph.
    """
    with torch.no_grad():
        # torch.norm is deprecated; vector_norm computes the same per-row 2-norm.
        dists = torch.linalg.vector_norm(embed.weight - v, dim=1)
        return idx2word[dists.argmin().item()]

In [None]:
# Attention
class SimpleAttention(nn.Module):
    """Single-head scaled dot-product self-attention.

    Args:
        dim: Size of the input vectors and of the query/key/value projections.
            Defaults to 16.
    """

    def __init__(self, dim=16):
        super().__init__()
        self.dim = dim  # kept so forward() can scale by sqrt(dim)
        self.query = nn.Linear(dim, dim)
        self.key = nn.Linear(dim, dim)
        self.value = nn.Linear(dim, dim)

    def forward(self, x):
        """Let every row of ``x`` attend to every row of ``x``.

        Args:
            x: Tensor of shape (seq_len, dim).

        Returns:
            Tuple ``(output, attn_weights)``: ``output`` has shape
            (seq_len, dim); ``attn_weights`` has shape (seq_len, seq_len) and
            each row sums to 1.
        """
        q = self.query(x)  # (seq_len, dim)
        k = self.key(x)    # (seq_len, dim)
        v = self.value(x)  # (seq_len, dim)

        # Swap the last two axes explicitly (rather than .T), and scale by the
        # layer's own dimension instead of a hard-coded 16 so that dim != 16
        # is scaled correctly.
        scores = torch.matmul(q, k.transpose(-2, -1)) / (self.dim ** 0.5)

        attn_weights = torch.softmax(scores, dim=-1)
        output = torch.matmul(attn_weights, v)

        return output, attn_weights

In [None]:
# MAC Layer (combines LMM + Attention)
class MAC_Layer(nn.Module):
    """Combines the long-term memory (LMM) with self-attention.

    Each token is passed through the LMM to get a memory vector; the original
    tokens and their memory vectors are concatenated along the sequence axis
    and attended over jointly.
    """

    def __init__(self):
        super().__init__()
        self.lmm = LMM()
        self.attention = SimpleAttention(16)

    def forward(self, tokens):
        """Run memory lookup plus attention over ``tokens``.

        Args:
            tokens: Tensor of shape (seq_len, 16).

        Returns:
            Tuple ``(output, weights)``: ``output`` holds the attention outputs
            for the original token positions only, shape (seq_len, 16);
            ``weights`` is the full attention matrix over tokens and memory
            vectors, shape (2 * seq_len, 2 * seq_len).
        """
        # One batched call instead of a per-token Python loop + torch.stack;
        # this also works for an empty sequence, where stacking an empty list fails.
        memory_vecs = self.lmm(tokens)

        # Original tokens first, then their memory vectors.
        combined = torch.cat([tokens, memory_vecs], dim=0)

        # Attention over both halves.
        output, weights = self.attention(combined)

        # Keep only the rows that correspond to the original tokens.
        return output[:tokens.shape[0]], weights

In [None]:

# Training data: one [key, relation, value] fact per attribute of person 1.
base_facts = [
    [key, relation, value]
    for key, relation, value in (
        ("user_name", "is", "Fahad"),
        ("pet_name", "is", "Max"),
        ("city", "is", "Toronto"),
        ("job", "is", "engineer"),
        ("favorite_color", "is", "blue"),
        ("age", "is", "23"),
        ("hobby", "is", "coding"),
        ("car", "is", "Tesla"),
        ("food", "likes", "biryani"),
        ("sport", "likes", "MMA"),
    )
]
# Cycle through the 10 facts 30 times -> 300 training sequences per epoch,
# with the facts evenly interleaved.
sequence = [fact for _ in range(30) for fact in base_facts]

In [None]:
# Sanity check: every word used in the training sequences must be in the vocabulary.
print("🔍 Checking vocabulary coverage...\n")

all_words = {word for seq in sequence for word in seq}

# Sorted so the report is identical from run to run (set order is arbitrary).
missing_words = sorted(word for word in all_words if word not in word2idx)

if missing_words:
    print(f"❌ Missing words: {missing_words}")
    print(f"\nCurrent vocab: {vocab}")
else:
    print("✅ All words found in vocabulary!")

print()

üîç Checking vocabulary coverage...

‚úÖ All words found in vocabulary!



In [None]:
print("\nTraining MAC on sequences...\n")

mac = MAC_Layer()
# Lower learning rate plus a little weight decay.
optimizer = torch.optim.SGD(mac.parameters(), lr=0.05, weight_decay=0.0001)

# The embeddings are frozen, so each sequence is encoded once up front rather
# than on every epoch. Each entry has one row per word of the sequence.
encoded_sequences = [
    torch.cat([word_to_vec(w) for w in seq], dim=0)
    for seq in sequence
]

for epoch in range(200):  # More epochs with lower LR
    total_loss = 0.0

    for seq_vecs in encoded_sequences:
        # Next-token prediction: feed only the prefix and predict the sequence
        # shifted by one. Feeding the full sequence would let attention read
        # the final (answer) token, which is not available when the model is
        # queried with just the prefix at test time.
        inputs, targets = seq_vecs[:-1], seq_vecs[1:]
        output, attn_weights = mac(inputs)

        # Mean squared error per position, summed over positions.
        loss = (output - targets).pow(2).mean(dim=-1).sum()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    if epoch % 40 == 0:
        print(f"Epoch {epoch}: Loss = {total_loss:.4f}")

print("\n✅ Training complete!")


Training MAC on sequences...

Epoch 0: Loss = 301.3813
Epoch 40: Loss = 0.0076
Epoch 80: Loss = 0.0032
Epoch 120: Loss = 0.0022
Epoch 160: Loss = 0.0019

‚úÖ Training complete!


In [None]:
print("\n-----------TOKEN-BY-TOKEN GENERATION TEST:----------------\n")

# Each test is (query prefix, expected next word).
tests = [
    (["user_name", "is"], "Fahad"),
    (["pet_name", "is"], "Max"),
    (["city", "is"], "Toronto"),
    (["job", "is"], "engineer"),
    (["favorite_color", "is"], "blue"),
    (["age", "is"], "23"),
    (["hobby", "is"], "coding"),
    (["car", "is"], "Tesla"),
    (["food", "likes"], "biryani"),
    (["sport", "likes"], "MMA")
]

correct = 0
with torch.no_grad():  # inference only: no autograd graph needed
    for query, expected in tests:
        query_vecs = torch.stack([word_to_vec(w).squeeze(0) for w in query])
        output, _ = mac(query_vecs)
        # The output at the last query position is the prediction for the next word.
        pred_word = vec_to_word(output[-1])

        status = "✅" if pred_word == expected else "❌"
        print(f"{status} '{' '.join(query)}' → Expected: '{expected}' | Got: '{pred_word}'")
        if pred_word == expected:
            correct += 1

# Denominator follows the test list instead of a hard-coded 10.
print(f"\nScore: {correct}/{len(tests)}")


-----------TOKEN-BY-TOKEN GENERATION TEST:----------------

‚úÖ 'user_name is' ‚Üí Expected: 'Fahad' | Got: 'Fahad'
‚úÖ 'pet_name is' ‚Üí Expected: 'Max' | Got: 'Max'
‚úÖ 'city is' ‚Üí Expected: 'Toronto' | Got: 'Toronto'
‚ùå 'job is' ‚Üí Expected: 'engineer' | Got: 'BMW'
‚úÖ 'favorite_color is' ‚Üí Expected: 'blue' | Got: 'blue'
‚úÖ 'age is' ‚Üí Expected: '23' | Got: '23'
‚ùå 'hobby is' ‚Üí Expected: 'coding' | Got: 'likes'
‚úÖ 'car is' ‚Üí Expected: 'Tesla' | Got: 'Tesla'
‚úÖ 'food likes' ‚Üí Expected: 'biryani' | Got: 'biryani'
‚ùå 'sport likes' ‚Üí Expected: 'MMA' | Got: 'car'

Score: 7/10


#### **How It All Works**


---

## **What Each Part Does:**

### **1. Vocabulary**
- List of all words the model knows
- Creates mappings: word ↔ number

### **2. Embeddings (Frozen)**
- Converts each word into a vector of 16 numbers
- "Frozen" = these vectors don't change during training
- Like a dictionary: "Fahad" → [0.2, -0.1, 0.8, ...]

### **3. LMM (Long-Term Memory)**
- A small neural network (2 layers)
- Takes a 16-number vector IN
- Outputs a 16-number vector OUT
- **Its job:** Learn to map questions → answers
- Example: "pet_name" → "Max"

### **4. Attention**
- Compares tokens to each other
- Decides which tokens are important
- Creates weighted combinations
- **Its job:** "Which past tokens should I focus on?"

### **5. MAC Layer**
- **Combines LMM + Attention**
- For each token:
  - Get memory summary from LMM
  - Combine original token + memory summary
  - Use attention to decide: use memory or not?

### **6. Training Loop**
- Processes sequences: ["user_name", "is", "Fahad"]
- At each position, tries to predict the NEXT word
- Position 0: See "user_name" → predict "is"
- Position 1: See "user_name is" → predict "Fahad"
- Calculates error (loss)
- Updates LMM and Attention weights to reduce error

### **7. Testing**
- After training, ask: "pet_name is" → what's the next word?
- MAC uses both memory and attention to respond

---


In [None]:
# Optional: mount Google Drive when the notebook runs in Colab.
try:
    from google.colab import drive
except ImportError:
    # Not running in Colab: there is nothing to mount, and Run All should
    # still complete in other environments.
    print("google.colab not available; skipping Drive mount.")
else:
    drive.mount('/content/drive')

Mounted at /content/drive
