In [1]:

import os
import random
import math
import re
from pathlib import Path
from typing import List, Tuple

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, processors
from tqdm.auto import tqdm

# ========== CONFIG ==========
DATA_PATH = "v1trainingdataset40mb.txt"   # your input txt file (USER/COMPATIKA pairs)
OUTPUT_DIR = "compatika_v1_lstm_checkpoints"
os.makedirs(OUTPUT_DIR, exist_ok=True)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ========== CONFIG ==========
DATA_PATH = "v1trainingdataset40mb.txt"   # your input txt file (USER/COMPATIKA pairs)
OUTPUT_DIR = "compatika_v1_lstm_checkpoints"
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [3]:

# Tokenizer
VOCAB_SIZE = 10000          # choose 8000-12000 as desired
MIN_FREQ = 2
TOKENIZER_FILE = os.path.join(OUTPUT_DIR, "bpe_tokenizer.json")


In [13]:
# Dataset / length
MAX_INPUT_LEN = 160         # context length for prompt (128-256)
MAX_TARGET_LEN = 128        # max reply length
BLOCK_INPUT = MAX_INPUT_LEN
BLOCK_TARGET = MAX_TARGET_LEN

# Model size => aim for 5-10M params (adjust to fit)
EMBED_DIM = 256             # token embedding dim
HIDDEN_SIZE = 256        # LSTM hidden size
NUM_LAYERS = 2              # encoder & decoder layers

# Training
BATCH_SIZE = 64             # reduce if OOM (try 32/16)
EPOCHS = 6
LR = 3e-4
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
SEED = 42
torch.manual_seed(SEED)
random.seed(SEED)

# Misc
PAD_TOKEN = "[PAD]"
UNK_TOKEN = "[UNK]"
BOS_TOKEN = "[BOS]"
EOS_TOKEN = "[EOS]"

In [14]:
# ========== HELPERS: read prompt/reply from file ==========
def read_pairs_from_txt(path: str) -> List[Tuple[str,str]]:
    raw = Path(path).read_text(encoding="utf-8")
    blocks = re.split(r'\n\s*\n', raw.strip())
    pairs = []
    for b in blocks:
        # match USER: ... and COMPATIKA: ...
        user_m = re.search(r'USER:\s*(.+?)(?:\n|$)', b, flags=re.IGNORECASE)
        comp_m = re.search(r'COMPATIKA:\s*(.+)', b, flags=re.IGNORECASE | re.DOTALL)
        if user_m and comp_m:
            user = user_m.group(1).strip()
            comp = comp_m.group(1).strip()
            if user and comp:
                prompt = f"USER: {user}\nCOMPATIKA:"
                reply = comp
                pairs.append((prompt, reply))
    return pairs

if not Path(DATA_PATH).exists():
    raise FileNotFoundError(f"Data file not found: {DATA_PATH}")

pairs = read_pairs_from_txt(DATA_PATH)
print(f"Loaded {len(pairs)} prompt-reply pairs.")

Loaded 422648 prompt-reply pairs.


In [15]:
# ========== TRAIN OR LOAD TOKENIZER ==========
if not os.path.exists(TOKENIZER_FILE):
    print("Training BPE tokenizer...")
    tok = Tokenizer(models.BPE(unk_token=UNK_TOKEN))
    tok.pre_tokenizer = pre_tokenizers.ByteLevel()
    trainer = trainers.BpeTrainer(vocab_size=VOCAB_SIZE, min_frequency=MIN_FREQ,
                                  special_tokens=[PAD_TOKEN, UNK_TOKEN, BOS_TOKEN, EOS_TOKEN])
    tok.train([DATA_PATH], trainer)
    tok.post_processor = processors.ByteLevel(trim_offsets=True)
    tok.enable_truncation(max_length=MAX_INPUT_LEN + MAX_TARGET_LEN)  # safe limit
    tok.save(TOKENIZER_FILE)
    print("Tokenizer saved ->", TOKENIZER_FILE)
else:
    print("Loading tokenizer from", TOKENIZER_FILE)
    tok = Tokenizer.from_file(TOKENIZER_FILE)

vocab_size = tok.get_vocab_size()
pad_id = tok.token_to_id(PAD_TOKEN)
unk_id = tok.token_to_id(UNK_TOKEN)
bos_id = tok.token_to_id(BOS_TOKEN)
eos_id = tok.token_to_id(EOS_TOKEN)
print("Vocab size:", vocab_size, "pad:", pad_id, "unk:", unk_id)

def encode(text: str) -> List[int]:
    return tok.encode(text).ids

def decode(ids: List[int]) -> str:
    return tok.decode(ids)

Loading tokenizer from compatika_v1_lstm_checkpoints\bpe_tokenizer.json
Vocab size: 10000 pad: 0 unk: 1


In [16]:
# ========== DATASET ==========
class Seq2SeqDataset(Dataset):
    def __init__(self, pairs, tokenizer, max_input, max_target, pad_id, bos_id, eos_id):
        self.examples = []
        for prompt, reply in pairs:
            p_ids = tokenizer.encode(prompt).ids
            r_ids = tokenizer.encode(" " + reply).ids  # leading space helps BPE
            # ensure we have EOS for target
            if len(r_ids) == 0:
                continue
            if r_ids[-1] != eos_id:
                r_ids = r_ids + [eos_id]
            # truncate if necessary
            if len(p_ids) > max_input:
                p_ids = p_ids[-max_input:]  # keep most recent tokens if too long
            if len(r_ids) > max_target - 1:
                r_ids = r_ids[:max_target - 1] + [eos_id]
            self.examples.append((p_ids, r_ids))
        if len(self.examples) == 0:
            raise ValueError("No examples after tokenization/truncation.")
    def __len__(self): return len(self.examples)
    def __getitem__(self, idx):
        p_ids, r_ids = self.examples[idx]
        # build decoder input: BOS + r_ids[:-1]
        dec_in = [bos_id] + r_ids[:-1]
        # pad sequences to fixed lengths
        enc = p_ids[:] + [pad_id] * (MAX_INPUT_LEN - len(p_ids))
        dec_in_padded = dec_in + [pad_id] * (MAX_TARGET_LEN - len(dec_in))
        dec_out = r_ids + [pad_id] * (MAX_TARGET_LEN - len(r_ids))
        # convert to tensors
        return (torch.tensor(enc, dtype=torch.long),
                torch.tensor(dec_in_padded, dtype=torch.long),
                torch.tensor(dec_out, dtype=torch.long))

dataset = Seq2SeqDataset(pairs, tok, MAX_INPUT_LEN, MAX_TARGET_LEN, pad_id, bos_id, eos_id)
print("Prepared dataset with", len(dataset),"examples")
loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)


Prepared dataset with 422648 examples


In [17]:
# ========== MODEL: Encoder-Decoder LSTM with tied embeddings ==========
class LSTMSeq2Seq(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_size, num_layers, pad_id):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_id)
        self.encoder = nn.LSTM(embed_dim, hidden_size, num_layers=num_layers,
                               batch_first=True, bidirectional=False)
        self.decoder = nn.LSTM(embed_dim, hidden_size, num_layers=num_layers,
                               batch_first=True, bidirectional=False)
        # output projection uses tied embedding weights to save parameters
        self.output_proj = nn.Linear(hidden_size, vocab_size, bias=False)
        # tie weights
        self.output_proj.weight = self.embed.weight
        self.hidden_size = hidden_size
        self.num_layers = num_layers
    def forward(self, enc_input, dec_input, enc_lengths=None):
        # enc_input: B x L_enc
        # dec_input: B x L_dec
        emb_enc = self.embed(enc_input)   # B, L_enc, E
        # optionally pack padded sequence if you track lengths (we used fixed length)
        enc_outputs, (h_n, c_n) = self.encoder(emb_enc)  # h_n: num_layers x B x H
        emb_dec = self.embed(dec_input)   # B, L_dec, E
        dec_outputs, _ = self.decoder(emb_dec, (h_n, c_n))
        logits = self.output_proj(dec_outputs)  # B, L_dec, V
        return logits

model = LSTMSeq2Seq(vocab_size, EMBED_DIM, HIDDEN_SIZE, NUM_LAYERS, pad_id).to(DEVICE)
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Model parameters:", total_params)

Model parameters: 4665344


In [18]:

# Quick sanity: expected near 5-10M
# If too big/small, adjust EMBED_DIM, HIDDEN_SIZE, NUM_LAYERS, or VOCAB_SIZE.

# ========== TRAIN LOOP ==========
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss(ignore_index=pad_id)
model.train()

global_step = 0
for epoch in range(1, EPOCHS+1):
    pbar = tqdm(loader, desc=f"Epoch {epoch}/{EPOCHS}")
    running_loss = 0.0
    for enc_in, dec_in, dec_out in pbar:
        enc_in = enc_in.to(DEVICE)
        dec_in = dec_in.to(DEVICE)
        dec_out = dec_out.to(DEVICE)
        logits = model(enc_in, dec_in)  # B, L_dec, V
        B, L, V = logits.shape
        loss = criterion(logits.view(B*L, V), dec_out.view(B*L))
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        running_loss += loss.item()
        global_step += 1
        if global_step % 100 == 0:
            pbar.set_postfix({"loss": running_loss / 100})
            running_loss = 0.0
        if global_step % 2000 == 0:
            ck = os.path.join(OUTPUT_DIR, f"step_{global_step}.pt")
            torch.save({"model": model.state_dict(), "tokenizer": TOKENIZER_FILE}, ck)
            print("Saved checkpoint:", ck)


Epoch 1/6:  30%|███       | 2001/6603 [02:08<05:04, 15.13it/s, loss=1.86]

Saved checkpoint: compatika_v1_lstm_checkpoints\step_2000.pt


Epoch 1/6:  61%|██████    | 4003/6603 [04:17<02:52, 15.08it/s, loss=1.68]

Saved checkpoint: compatika_v1_lstm_checkpoints\step_4000.pt


Epoch 1/6:  91%|█████████ | 6001/6603 [06:21<00:38, 15.81it/s, loss=1.61]

Saved checkpoint: compatika_v1_lstm_checkpoints\step_6000.pt


Epoch 1/6: 100%|██████████| 6603/6603 [06:58<00:00, 15.77it/s, loss=1.61]
Epoch 2/6:  21%|██        | 1398/6603 [01:25<06:30, 13.33it/s, loss=1.55]

Saved checkpoint: compatika_v1_lstm_checkpoints\step_8000.pt


Epoch 2/6:  51%|█████▏    | 3400/6603 [03:27<03:39, 14.62it/s, loss=1.59]

Saved checkpoint: compatika_v1_lstm_checkpoints\step_10000.pt


Epoch 2/6:  82%|████████▏ | 5398/6603 [05:34<01:18, 15.44it/s, loss=1.55]

Saved checkpoint: compatika_v1_lstm_checkpoints\step_12000.pt


Epoch 2/6: 100%|██████████| 6603/6603 [06:51<00:00, 16.06it/s, loss=1.52]
Epoch 3/6:  12%|█▏        | 796/6603 [00:49<06:12, 15.59it/s, loss=1.52]

Saved checkpoint: compatika_v1_lstm_checkpoints\step_14000.pt


Epoch 3/6:  42%|████▏     | 2796/6603 [02:54<03:57, 16.05it/s, loss=1.45]

Saved checkpoint: compatika_v1_lstm_checkpoints\step_16000.pt


Epoch 3/6:  73%|███████▎  | 4796/6603 [04:53<01:53, 15.99it/s, loss=1.45]

Saved checkpoint: compatika_v1_lstm_checkpoints\step_18000.pt


Epoch 3/6: 100%|██████████| 6603/6603 [06:46<00:00, 16.23it/s, loss=1.47]
Epoch 4/6:   3%|▎         | 194/6603 [00:12<06:52, 15.55it/s, loss=1.46]

Saved checkpoint: compatika_v1_lstm_checkpoints\step_20000.pt


Epoch 4/6:  33%|███▎      | 2192/6603 [02:19<04:52, 15.09it/s, loss=1.48]

Saved checkpoint: compatika_v1_lstm_checkpoints\step_22000.pt


Epoch 4/6:  63%|██████▎   | 4192/6603 [04:25<02:38, 15.20it/s, loss=1.37]

Saved checkpoint: compatika_v1_lstm_checkpoints\step_24000.pt


Epoch 4/6:  94%|█████████▍| 6192/6603 [06:32<00:27, 15.20it/s, loss=1.45]

Saved checkpoint: compatika_v1_lstm_checkpoints\step_26000.pt


Epoch 4/6: 100%|██████████| 6603/6603 [06:58<00:00, 15.79it/s, loss=1.49]
Epoch 5/6:  24%|██▍       | 1590/6603 [01:40<05:28, 15.28it/s, loss=1.4] 

Saved checkpoint: compatika_v1_lstm_checkpoints\step_28000.pt


Epoch 5/6:  54%|█████▍    | 3590/6603 [03:46<03:15, 15.45it/s, loss=1.43]

Saved checkpoint: compatika_v1_lstm_checkpoints\step_30000.pt


Epoch 5/6:  85%|████████▍ | 5590/6603 [05:50<01:01, 16.36it/s, loss=1.46]

Saved checkpoint: compatika_v1_lstm_checkpoints\step_32000.pt


Epoch 5/6: 100%|██████████| 6603/6603 [06:51<00:00, 16.04it/s, loss=1.39]
Epoch 6/6:  15%|█▍        | 988/6603 [00:59<05:47, 16.14it/s, loss=1.42]

Saved checkpoint: compatika_v1_lstm_checkpoints\step_34000.pt


Epoch 6/6:  45%|████▌     | 2988/6603 [03:00<03:43, 16.19it/s, loss=1.36]

Saved checkpoint: compatika_v1_lstm_checkpoints\step_36000.pt


Epoch 6/6:  76%|███████▌  | 4988/6603 [05:01<01:40, 16.06it/s, loss=1.41]

Saved checkpoint: compatika_v1_lstm_checkpoints\step_38000.pt


Epoch 6/6: 100%|██████████| 6603/6603 [06:38<00:00, 16.58it/s, loss=1.39]


In [19]:
# final save
torch.save({"model": model.state_dict(), "tokenizer": TOKENIZER_FILE}, os.path.join(OUTPUT_DIR, "final.pt"))
print("Training finished. Saved final checkpoint.")

Training finished. Saved final checkpoint.


In [20]:
# ========== GENERATION: greedy decoding ==========
model.eval()
@torch.no_grad()
def generate_reply(user_text: str, max_len: int = 120):
    prompt = f"USER: {user_text}\nCOMPATIKA:"
    enc_ids = tok.encode(prompt).ids
    if len(enc_ids) > MAX_INPUT_LEN:
        enc_ids = enc_ids[-MAX_INPUT_LEN:]
    enc_tensor = torch.tensor(enc_ids + [pad_id]*(MAX_INPUT_LEN - len(enc_ids)), dtype=torch.long).unsqueeze(0).to(DEVICE)
    # initialize decoder input with BOS
    dec_input = [bos_id] + [pad_id]*(MAX_TARGET_LEN-1)
    dec_t = torch.tensor(dec_input, dtype=torch.long).unsqueeze(0).to(DEVICE)
    # run encoder to get hidden states
    with torch.no_grad():
        emb_enc = model.embed(enc_tensor)
        enc_out, (h_n, c_n) = model.encoder(emb_enc)
        # decode step by step (greedy)
        generated = []
        h, c = h_n, c_n
        prev_token = torch.tensor([[bos_id]], dtype=torch.long).to(DEVICE)
        for step in range(max_len):
            emb = model.embed(prev_token)  # 1,1,E
            out, (h, c) = model.decoder(emb, (h, c))
            logits = model.output_proj(out[:, -1, :])  # 1,V
            next_id = torch.argmax(logits, dim=-1).item()
            if next_id == eos_id or next_id == pad_id:
                break
            generated.append(next_id)
            prev_token = torch.tensor([[next_id]], dtype=torch.long).to(DEVICE)
        return tok.decode(generated)

# quick test
print("Sample generation (likely gibberish before training):")
print(generate_reply("I feel nervous about tomorrow."))



Sample generation (likely gibberish before training):
ĠI Ġhear Ġwhat Ġyou âĢĻ re Ġsaying .


In [10]:
pip list

Package           Version
----------------- ------------
anyio             4.12.0
asttokens         3.0.1
certifi           2025.11.12
click             8.3.1
colorama          0.4.6
comm              0.2.3
debugpy           1.8.17
decorator         5.2.1
einops            0.8.1
exceptiongroup    1.3.1
executing         2.2.1
filelock          3.20.0
fsspec            2025.12.0
h11               0.16.0
hf-xet            1.2.0
httpcore          1.0.9
httpx             0.28.1
huggingface_hub   1.2.1
idna              3.11
ipykernel         7.1.0
ipython           8.37.0
jedi              0.19.2
Jinja2            3.1.6
jupyter_client    8.6.3
jupyter_core      5.9.1
MarkupSafe        2.1.5
matplotlib-inline 0.2.1
mpmath            1.3.0
nest-asyncio      1.6.0
networkx          3.3
numpy             2.1.2
packaging         25.0
parso             0.8.5
pillow            11.3.0
pip               21.2.3
platformdirs      4.5.1
prompt_toolkit    3.0.52
psutil            7.1.3
pure_eval       

You should consider upgrading via the 'c:\Users\aman\Desktop\Compatika-v1\venv\Scripts\python.exe -m pip install --upgrade pip' command.


Name: torch
Version: 2.5.1+cu121
Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration
Home-page: https://pytorch.org/
Author: PyTorch Team
Author-email: packages@pytorch.org
License: BSD-3-Clause
Location: c:\users\aman\desktop\compatika-v1\venv\lib\site-packages
Requires: typing-extensions, filelock, jinja2, fsspec, sympy, networkx
Required-by: torchvision
Note: you may need to restart the kernel to use updated packages.
