In [65]:
from dataclasses import dataclass
from pathlib import Path
import torch
import torch.nn as nn
from jaxtyping import Float, Int
import requests
import unicodedata
from collections import Counter


# Training Data

In [66]:
def get_gutenberg_book(
	id: int|None = 84,
	data_temp: Path|str = "../../../../data/gutenberg_data",
	remove_gutenberg_meta: bool = True,
) -> str:
	
	data_temp = Path(data_temp)
	data_temp.mkdir(parents=True, exist_ok=True)
	
	url: str = f"https://www.gutenberg.org/cache/epub/{id}/pg{id}.txt"
	data_path: Path = Path(data_temp) / f"{id}.txt"
	data: str
	# read from cache if it exists
	if data_path.exists():
		with open(data_path, 'r', encoding='utf-8') as file:
			data = file.read()
	else:
		# download if it doesn't exist
		response = requests.get(url)
		response.raise_for_status()  # Ensure that the download was successful
		data = response.text

		# save to cache
		with open(data_path, 'w', encoding='utf-8') as file:
			file.write(data)

	# remove header/footer
	if remove_gutenberg_meta:
		data = '***'.join(data.split('***')[2:])
		data = '***'.join(data.split('***')[:-1])
	
	return data

def get_many_books(
		ids: list[int],
		data_temp: Path|str = "../data/gutenberg_data",
	) -> list[list[str]]:
	
	data: list[str] = []
	for id in ids:
		print(f"Getting book {id}...")
		item: str = get_gutenberg_book(id, data_temp)
		print(f"\t{len(item)} characters read")
		data.append(item)
	
	return data

# Model Definition

In [67]:
class Tokenizer():
    def __init__(self, raw_data: str):
        self.raw_data = raw_data 
        self.vocab_size = None
        self.vocab = None
        self.vocab_inverse = None
        
    def process_raw_data(self, text: str, 
                        allowed_punctuation: str = "-.,;:!?()\"" + "".join(str(x) for x in range(10)),
                        punctuation_convert: dict[str,str] = {'â€”': '-'}
                        ) -> str:
        for char, replacement in punctuation_convert.items():
            text = text.replace(char, replacement)
              
        text = '\n'.join(
                    line 
                    for line in text.split('\n')
                    if '.jpg' not in line
                )
        
        text = unicodedata.normalize('NFKD', text)

        # Encode to ASCII bytes, then decode back to string, ignoring errors
        text = text.encode('ascii', 'ignore').decode('ascii')

        # remove newlines and tabs
        text = text.replace('\n', ' ').replace('\t', ' ')

        for char in allowed_punctuation:
            text = text.replace(char, f' {char} ')
              
        text = text.strip()

        # remove multiple spaces
        while '  ' in text:
            text = text.replace('  ', ' ')

        text = ''.join((char if (char.isalnum() or char in allowed_punctuation or char == ' ') else ' ') for char in text)
        
        text = text.lower()

        text = text.strip()

        return text
    
    def tokenize(self, 
        text: str,
        process: bool = False,
    ) -> str:
        if process:
            text = self.process_raw_data(text)
        tokenized_text = text.split(' ')

        vocab_counts: Counter[str] = Counter(tokenized_text).most_common()
        self.vocab_size: int = len(vocab_counts)
        self.vocab: list[str] = [token for token, count in vocab_counts]
        self.vocab_inverse: dict[int: str] = {key: value for key, value in (self.vocab, range(0, self.vocab_size))}

        return tokenized_text
        
    def encode(self, data: str) -> Int[torch.Tensor, "seq_len"]:
        return torch.tensor([self.vocab_inverse[word] for word in data], dtype=torch.int)
    
    def decode(self, tokens: Int[torch.Tensor, "seq_len"]) -> str:
        return ''.join(self.vocab[token] for token in tokens)

In [68]:
@dataclass
class Config():
    d_model: int
    d_vocab: int
    d_hidden: int
    tokenizer: Tokenizer

In [69]:
class MLP(nn.Module):
    def __init__(self, config: Config):
        super().__init__()
        self.linear1 = nn.Linear(config.d_model, config.d_hidden)
        self.linear2 = nn.Linear(config.d_hidden, config.d_model)
        

    def forward(self, x: Float[torch.Tensor, "seq_len d_model"]) -> Float[torch.Tensor, "seq_len d_model"]:
        x = self.linear1(x)
        x = torch.relu(x)
        x = self.linear2(x)
        return x

In [70]:
class AttentionHead(nn.Module):
    def __init__(self, config: Config):
        super().__init__()
        self.config = config
        self.W_qk = nn.Linear(config.d_model, config.d_model)
        self.W_vo = nn.Linear(config.d_model, config.d_model)
        self.softmax = nn.Softmax(dim=-1)
        

    def create_mask(self, n_c: int) -> torch.Tensor:
        mask: Float[torch.Tensor, "seq_len seq_len"] = torch.triu(-1 * torch.inf * torch.ones(n_c, n_c), diagonal=1)
        return mask

    def forward(self, x: Float[torch.Tensor, "seq_len d_model"]) -> Float[torch.Tensor, "seq_len d_model"]:
        #create mask, with size n_c x n_c
        mask = self.create_mask(x.shape[0])

        #compute attention scores
        # A = softmax((X @ W_qk @ X^T) + M) @ X @ W_vo
        A = self.softmax((self.W_qk(x)) @ x.transpose(0, -1) + mask) @ self.W_vo(x)
        return A

In [71]:
class TransformerBlock(torch.nn.Module):
    def __init__(self, config: Config):
        super().__init__()
        self.attention_head = AttentionHead(config)
        self.mlp = MLP(config)

    def forward(self, x: Float[torch.Tensor, "seq_len d_model"]) -> Float[torch.Tensor, "seq_len d_model"]:
        return x + self.attention_head(x) + self.mlp(x)

In [72]:
class Transformer(torch.nn.Module):
    def __init__(self, num_blocks: int, config: Config):
        super().__init__()
        self.config = config
        self.embedding = nn.Linear(config.d_vocab, config.d_model)
        self.blocks = nn.ModuleList([TransformerBlock(config) for _ in range(num_blocks)])
        

    def forward(self, x: Int[torch.Tensor, "seq_len"]) -> Float[torch.Tensor, "vocab seq_len"]:
        # eventually take in x as a string and tokenize in this function
        x_onehot = torch.zeros(x.shape[0], self.config.d_vocab)
        for i, token in enumerate(x):
            x_onehot[i, token] = 1.0
        x = self.embedding(x_onehot)
        # print(x.shape)
        for block in self.blocks:
            x = block.forward(x)
        x = (x @ self.embedding.weight)
        return x
    
    def generate_output(self, x:str) -> str:
        x = self.config.tokenizer.tokenize(x,process=True)
        self.forward(self.config.tokenizer.encode(x))
        return ""
    

# Tests

In [None]:
# Attention head test
x: Float[torch.Tensor, "seq_len d_model"] = torch.ones(5, 16)
config = Config(d_model=16, d_vocab=1000, d_hidden=64)
attention_head: AttentionHead = AttentionHead(config)
output: Float[torch.Tensor, "seq_len d_model"] = attention_head.forward(x)
print(output.shape)

TypeError: Config.__init__() missing 1 required positional argument: 'tokenizer'

In [None]:
# Test the whole thing
config = Config(d_model=16, d_vocab=1000, d_hidden=64)
transformer = Transformer(num_blocks=2, config=config)
x = torch.tensor([5, 3, 7, 1, 10], dtype=torch.int)
y: Float[torch.Tensor, "vocab seq_len"] = transformer(x)
print(y.shape)
print(y)
print(x)

torch.Size([5, 16])
torch.Size([5, 1000])
tensor([[ 0.0088,  0.0272,  0.0065,  ...,  0.0366, -0.0292,  0.0105],
        [ 0.0100,  0.0279,  0.0051,  ...,  0.0369, -0.0305,  0.0089],
        [ 0.0099,  0.0291,  0.0090,  ...,  0.0321, -0.0307,  0.0103],
        [ 0.0120,  0.0348,  0.0129,  ...,  0.0316, -0.0317,  0.0108],
        [ 0.0106,  0.0297,  0.0104,  ...,  0.0314, -0.0304,  0.0106]],
       grad_fn=<MmBackward0>)
tensor([ 5,  3,  7,  1, 10], dtype=torch.int32)


# Training Loop

In [None]:
def train_model(
    model: Transformer,
    loss: torch.nn.CrossEntropyLoss = nn.CrossEntropyLoss(),
    lr: Float = 1e-3,
    epochs: Int = 1
    ):
    optimizer: torch.optim.SGD = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    string_data = get_many_books(ids=range(10,15), data_temp="./data/gutenberg_data")
    tokenized_string_data = [x for i in string_data[]]
    labels = {}
    int_labels = []
    for i in range(len(string_data)):
        labels[i] = string_data[i]
        int_labels.append(i)
    
    training_data = torch.tensor(int_labels)
    
    for epoch in range(epochs):
        
        optimizer.zero_grad()
        outputs = model(training_data)
    



SyntaxError: invalid syntax (341824798.py, line 9)

In [None]:
config: Config = Config(d_model=16, d_vocab=1000, d_hidden=64)
model = Transformer(num_blocks=2, config=config)
train_model(model)

Getting book 10...
	4432261 characters read
Getting book 11...
	148062 characters read
Getting book 12...
	168390 characters read
Getting book 13...
	34579 characters read
Getting book 14...
	1951150 characters read


RuntimeError: mat1 and mat2 must have the same dtype, but got Long and Float