## TEST MODEL INITIALISATION

In [9]:
import sys
sys.path.append('../')

import time
import copy
import random

import torch
import torch.nn as nn
import torch.nn.functional as F

from dataclasses import dataclass

import matplotlib.pyplot as plt

import models
import importlib
importlib.reload(models)

<module 'models' from '/home/cagnetta/shakespeare/models/../models/__init__.py'>

## BUILD A SIMPLE TEXT DATASET TO TEST A SMALL LANGUAGE MODEL

In [18]:
# Vocabulary: the 26 lowercase ASCII letters, i.e. code points 97 ('a') to 122 ('z').
vocab = [chr(code_point) for code_point in range(97, 123)]
vocab_size = len(vocab)
print(vocab)

# Toy corpus: 64 characters sampled uniformly (with replacement) from the vocabulary.
text = ''.join(random.choices(vocab, weights=None, k=64))
print(text)

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
kroziwwhcimuzelmadqkacdlabuyoagzvbzagwnoftoblehbayjacqaatvqlbcoa


In [19]:
class CharacterLevelTokenizer:
    """Character-level tokenizer over the lowercase ASCII alphabet.

    The vocabulary is fixed to the 26 letters 'a'..'z' (indices 0..25). It is
    not derived from ``data``, which is only stored on the instance.
    """

    def __init__(self, data):
        """Store ``data`` and build the character <-> index lookup tables.

        Args:
            data: text associated with this tokenizer. It is kept as
                ``self.data`` but does not influence the vocabulary.
        """
        self.data = data
        self.vocab = [chr(i) for i in range(97, 123)]
        # Computed once, after the vocabulary is complete.
        self.vocab_size = len(self.vocab)

        self.i_to_s = dict(enumerate(self.vocab))
        self.s_to_i = {ch: i for i, ch in self.i_to_s.items()}

    def encode(self, s):
        """Map a string to a 1-D ``torch.long`` tensor of token indices.

        Args:
            s: iterable of single characters (typically a ``str``).

        Returns:
            A ``torch.long`` tensor with one index per character of ``s``.

        Raises:
            KeyError: if ``s`` contains a character outside the vocabulary.
        """
        return torch.tensor([self.s_to_i[c] for c in s], dtype=torch.long)

    def decode(self, s):
        """Map a sequence of token indices back to a string.

        Args:
            s: a 1-D tensor (as returned by ``encode``) or any iterable of
                plain integers.

        Returns:
            The string obtained by concatenating the decoded characters.

        Raises:
            KeyError: if an index is outside the vocabulary range.
        """
        # int() accepts both 0-dim tensors and plain Python ints, whereas
        # .item() would only work on tensors.
        return ''.join(self.i_to_s[int(i)] for i in s)

# Build the tokenizer for the toy corpus. `text` is only stored on the
# instance; the tokenizer's vocabulary is always the fixed a-z alphabet.
tokenizer = CharacterLevelTokenizer(text)

In [20]:
@dataclass
class config:
    """Hyper-parameters for the small language model built in this notebook.

    Every attribute carries a type annotation so that ``@dataclass`` actually
    registers it as a field (the decorator ignores un-annotated assignments).
    The defaults stay readable as class attributes, e.g. ``config.device``.
    """

    block_size: int = 64 # context length
    vocab_size: int = vocab_size # taken from the module-level `vocab_size`

    num_heads: int = 4
    head_size: int = 32
    # Evaluated once in the class body from the defaults above; it is not
    # recomputed if an instance overrides num_heads or head_size.
    embedding_dim: int = num_heads*head_size
    num_layers: int = 3

    dropout: float = 0.1

    # Use the GPU when one is available, otherwise fall back to the CPU.
    device: str = 'cuda' if torch.cuda.is_available() else 'cpu'

config.device

'cuda'

In [21]:
# Build the model from the hyper-parameters in `config` and move it to the
# configured device.
model = models.SLM(
    vocab_size=config.vocab_size,
    block_size=config.block_size,
    embedding_dim=config.embedding_dim,
    num_heads=config.num_heads,
    num_layers=config.num_layers,
    dropout=config.dropout
)
model = model.to(device=config.device)
param_count = sum(p.numel() for p in model.parameters())

# Sample from the freshly initialised model, starting from a single token
# with index 0.
num_tokens = 64
context = torch.zeros((1,1), dtype=torch.long, device=config.device)
# Sampling needs no gradients; disabling autograd avoids building a graph at
# every generation step.
# NOTE(review): the model is never switched to eval mode here, so if SLM is an
# nn.Module its dropout is presumably active while sampling - confirm intent.
with torch.no_grad():
    generated = model.generate(context, num_tokens=num_tokens)
# NOTE(review): the recorded output is 65 characters long, so `generate`
# appears to return the context token followed by the new tokens - confirm.
print(f'Num. parameters: {param_count}, generated ({num_tokens} tokens):\n{tokenizer.decode(generated[0])}')

Num. parameters: 608410, generated (64 tokens):
aevbtbvmosiyjlahyvijjphcfquxkqcslfwaxedianvzyizyfqkmsrrppphpmusru
