In [61]:
# ============================================================
# COMPATIKA V1-ALPHA TRAINING + TESTING (PURE PYTORCH)
# ============================================================

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import json
import sentencepiece as spm
from tqdm import tqdm
import torch.nn.functional as F

In [62]:

# ============================================================
# 1Ô∏è‚É£ Load tokenizer
# ============================================================
sp = spm.SentencePieceProcessor(model_file="tok/compatika_sp.model")
vocab_size = sp.get_piece_size()
print("‚úÖ Tokenizer loaded. Vocab size:", vocab_size)


‚úÖ Tokenizer loaded. Vocab size: 10000


In [63]:
# 2Ô∏è‚É£ Model definition
# ============================================================
class CompatikaModel(nn.Module):
    def __init__(self, vocab_size, embed_dim=192, n_heads=4, n_layers=3, hidden_dim=256):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=n_heads,
            dim_feedforward=hidden_dim,
            activation="gelu"
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)
        self.fc_out = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        x = self.embed(x)
        x = x.permute(1, 0, 2)   # (seq, batch, embed)
        x = self.transformer(x)
        x = self.fc_out(x)
        return x.permute(1, 0, 2)  # (batch, seq, vocab)

In [68]:
# ============================================================
# 3Ô∏è‚É£ Dataset loading
# ============================================================
class ChatDataset(Dataset):
    def __init__(self, path):
        with open(path, "r", encoding="utf-8") as f:
            self.samples = [json.loads(line) for line in f]

    def __len__(self): return len(self.samples)

    def __getitem__(self, idx):
        s = self.samples[idx]
        # Input: full conversation up to COMPATIKA
        inp_text  = f"USER: {s['user']}\nCOMPATIKA:"
        # Target: only the reply
        out_text  = s['compatika']

        x = torch.tensor(sp.encode(inp_text, out_type=int), dtype=torch.long)
        y = torch.tensor(sp.encode(out_text, out_type=int), dtype=torch.long)
        return x, y


def collate(batch):
    xs, ys = zip(*batch)
    x = nn.utils.rnn.pad_sequence(xs, batch_first=True, padding_value=0)
    y = nn.utils.rnn.pad_sequence(ys, batch_first=True, padding_value=0)
    min_len = min(x.size(1), y.size(1))
    x = x[:, :min_len]
    y = y[:, :min_len]
    return x, y
train_data = ChatDataset("data/train.jsonl")
val_data = ChatDataset("data/val.jsonl")
train_loader = DataLoader(train_data, batch_size=8, shuffle=True, collate_fn=collate)
val_loader = DataLoader(val_data, batch_size=8, collate_fn=collate)

print(f"Loaded {len(train_data)} train samples and {len(val_data)} val samples.")

Loaded 68940 train samples and 7660 val samples.


In [69]:
# 4Ô∏è‚É£ Training setup
# ============================================================
device = "cuda" if torch.cuda.is_available() else "cpu"
model = CompatikaModel(vocab_size).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.AdamW(model.parameters(), lr=2e-4)


In [70]:
# 5Ô∏è‚É£ Training loop
# ============================================================
epochs = 7
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for x, y in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        out = model(x)
        loss = criterion(out.reshape(-1, vocab_size), y.reshape(-1))

        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Train loss: {total_loss / len(train_loader):.4f}")

# Validation
model.eval()
val_loss = 0
with torch.no_grad():
    for x, y in val_loader:
        x, y = x.to(device), y.to(device)
        out = model(x)
        loss = criterion(out.reshape(-1, vocab_size), y.reshape(-1))
        val_loss += loss.item()
print(f"Val loss: {val_loss / len(val_loader):.4f}\n")

Epoch 1/7: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 8618/8618 [01:33<00:00, 92.12it/s]


Train loss: 5.9870


Epoch 2/7: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 8618/8618 [01:33<00:00, 91.98it/s]


Train loss: 5.8091


Epoch 3/7: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 8618/8618 [01:33<00:00, 91.97it/s]


Train loss: 5.7066


Epoch 4/7: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 8618/8618 [01:41<00:00, 85.04it/s]


Train loss: 5.6269


Epoch 5/7: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 8618/8618 [01:44<00:00, 82.50it/s]


Train loss: 5.5538


Epoch 6/7: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 8618/8618 [01:33<00:00, 91.83it/s]


Train loss: 5.4896


Epoch 7/7: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 8618/8618 [01:34<00:00, 91.51it/s]


Train loss: 5.4331
Val loss: 5.7200



In [71]:

# ============================================================
# 6Ô∏è‚É£ Save model







# ============================================================
torch.save(model.state_dict(), "compatika_v1alpha_scratch.pt")
print("‚úÖ Model saved as compatika_v1alpha_scratch.pt")

‚úÖ Model saved as compatika_v1alpha_scratch.pt


In [72]:
# ============================================================
# 7Ô∏è‚É£ Reload model for testing
# ============================================================
model = CompatikaModel(vocab_size)
model.load_state_dict(torch.load("compatika_v1alpha_scratch.pt", map_location="cpu"))
model.eval()

print("\n‚úÖ Model and tokenizer loaded successfully for testing!")


‚úÖ Model and tokenizer loaded successfully for testing!


  model.load_state_dict(torch.load("compatika_v1alpha_scratch.pt", map_location="cpu"))


In [73]:
def generate_response(prompt, max_len=60, temperature=0.8, top_p=0.9, repetition_penalty=1.3):
    # ensure clean input
    prompt = prompt.strip().replace("\n", " ")
    tokens = sp.encode(prompt, out_type=int)
    x = torch.tensor(tokens, dtype=torch.long).unsqueeze(0)
    generated = set(tokens)

    for _ in range(max_len):
        with torch.no_grad():
            out = model(x)

        logits = out[0, -1] / temperature
        logits = torch.clamp(logits, -20, 20)

        for t in generated:
            logits[t] /= repetition_penalty

        probs = F.softmax(logits, dim=-1)
        sorted_probs, sorted_indices = torch.sort(probs, descending=True)
        cumulative_probs = torch.cumsum(sorted_probs, dim=-1)
        sorted_indices_to_remove = cumulative_probs > top_p
        sorted_probs[sorted_indices_to_remove] = 0
        sorted_probs = sorted_probs / sorted_probs.sum()

        next_id = torch.multinomial(sorted_probs, 1).item()
        tokens.append(next_id)
        generated.add(next_id)
        if next_id == 3:
            break

        x = torch.tensor(tokens, dtype=torch.long).unsqueeze(0)

    text = sp.decode(tokens)

    # remove USER/COMPATIKA echoes and stray symbols
    text = text.replace("USER:", "").replace("COMPATIKA:", "").replace("_comma", ",")
    text = text.replace("‚Åá", "").strip()
    return text


In [74]:
# ============================================================
# üß™ TEST MODE (for Jupyter)
# ============================================================

test_inputs = [
   
  "I'm really upset. My friend ignored my message."
 
]


for user_input in test_inputs:
    prompt = f"USER: {user_input}\nCOMPATIKA:"
    reply = generate_response(prompt)
    print(f"USER: {user_input}")
    print(f"Compatika: {reply}\n{'-'*70}")


USER: I'm really upset. My friend ignored my message.
Compatika: I'm really upset. My friend ignored my message.  theyed they It   being get much my.' my
----------------------------------------------------------------------


RuntimeError: probability tensor contains either `inf`, `nan` or element < 0

In [19]:
print(sp.encode("I feel anxious today.", out_type=int))
print(sp.decode(sp.encode("I feel anxious today.", out_type=int)))


[5, 87, 112, 204, 4]
I feel anxious today.


In [26]:
# quick example cleaning step
import re, json
def clean_text(t):
    t = t.replace("_comma", ",")
    t = re.sub(r"\s+", " ", t)
    t = re.sub(r"[0-9]+\|[0-9]+\|[0-9]+[_0-9|]*", "", t)  # remove rating patterns
    t = re.sub(r":\d+", "", t)  # remove stray numeric tokens
    return t.strip()

with open("data/train.jsonl","r",encoding="utf-8") as f, open("data/train_clean.jsonl","w",encoding="utf-8") as out:
    for line in f:
        s = json.loads(line)
        s["user"] = clean_text(s["user"])
        s["compatika"] = clean_text(s["compatika"])
        out.write(json.dumps(s, ensure_ascii=False) + "\n")



In [27]:
input_text = f"USER: {s['user']}\nCOMPATIKA:"
target_text = s['compatika']


In [29]:
state_dict = torch.load("compatika_v1alpha_scratch.pt")
model.load_state_dict(state_dict)



  state_dict = torch.load("compatika_v1alpha_scratch.pt")


FileNotFoundError: [Errno 2] No such file or directory: 'compatika_v1alpha_scratch.pt'

In [25]:
txt = "I feel anxious today."
print(sp.decode(sp.encode(txt, out_type=int)))


I feel anxious today.
