# Financial Explainer GPT Model

## Model

In [1]:
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.utils.data import DataLoader
import torch.nn.init as init
import pandas as pd
from tqdm import tqdm

In [2]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a batch of embeddings.

    A ``(1, max_seq_length, d_model)`` table is precomputed once and stored as
    the non-trainable buffer ``pe``. Even feature columns hold sines and odd
    feature columns hold cosines of ``position * 10000 ** (-i / d_model)``,
    where ``i`` is the even column index of the pair.

    Args:
        d_model: Size of the embedding (last) dimension. Both even and odd
            values are supported.
        max_seq_length: Number of positions stored in the table.
    """

    def __init__(self, d_model, max_seq_length):
        super().__init__()
        pe = torch.zeros((max_seq_length, d_model))
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        angles = position * div_term  # (max_seq_length, ceil(d_model / 2))

        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns,
        # so only the first d_model // 2 angle columns are used for the cosine.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Registered as a buffer: follows .to(device) and is saved in the
        # state dict, but is not a trainable parameter.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        Args:
            x: Tensor whose dimension 1 is the sequence axis and whose last
                dimension is ``d_model``. The sequence length must not exceed
                ``max_seq_length``; longer inputs fail on the shape mismatch
                of the addition.
        """
        return x + self.pe[:, :x.size(1)]
    

class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    Queries, keys and values are all projected from the same input. The
    ``d_model`` features are split evenly into ``num_heads`` heads of
    ``head_dim = d_model // num_heads`` features each.

    Args:
        d_model: Size of the embedding dimension; must be divisible by
            ``num_heads``.
        num_heads: Number of attention heads.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0, "Vetor de embedding precisa ser divisivel pelo número de cabeças da camada de atenção!"
        self.head_dim = d_model // num_heads
        self.d_model, self.num_heads = d_model, num_heads
        self.q = nn.Linear(d_model, d_model)
        self.k = nn.Linear(d_model, d_model)
        self.v = nn.Linear(d_model, d_model)
        self.output_linear = nn.Linear(d_model, d_model)

    def split_heads(self, x, encoder_output=None):
        """Split the feature dimension of ``x`` into separate heads.

        Args:
            x: Tensor of shape ``(batch_size, sequence_length, d_model)``.
            encoder_output: Reserved for encoder/decoder attention; any value
                other than ``None`` raises ``NotImplementedError``.

        Returns:
            Tensor of shape ``(batch_size, num_heads, sequence_length, head_dim)``.
        """
        if encoder_output is not None:
            raise NotImplementedError("Modelo ainda não compatível com Encoder.")
        batch_size, sequence_length = x.shape[0], x.shape[1]
        # (batch, seq, d_model) -> (batch, seq, heads, head_dim)
        x = torch.reshape(x, shape=(batch_size, sequence_length, self.num_heads, self.head_dim))
        # -> (batch, heads, seq, head_dim)
        return x.permute(0, 2, 1, 3)

    def compute_attention_scores(self, q_linear_out, k_linear_out, v_linear_out, mask=None):
        """Return the attention-weighted values for already-split heads.

        Args:
            q_linear_out, k_linear_out, v_linear_out: Tensors of shape
                ``(batch_size, num_heads, sequence_length, head_dim)``.
            mask: Optional tensor broadcastable to the
                ``(batch_size, num_heads, sequence_length, sequence_length)``
                score matrix. Positions where ``mask == 0`` are excluded from
                attention. A query row whose positions are all masked yields
                NaN, because softmax then sees only ``-inf``.

        Returns:
            Tensor with the same shape as ``v_linear_out``.
        """
        scale = self.head_dim ** 0.5
        qk_dot_product = torch.matmul(q_linear_out, k_linear_out.transpose(2, 3)) / scale

        if mask is not None:
            qk_dot_product = qk_dot_product.masked_fill(mask == 0, float('-inf'))

        attn_scores = nn.functional.softmax(qk_dot_product, dim=-1)
        return torch.matmul(attn_scores, v_linear_out)

    def combine_heads(self, x):
        """Inverse of :meth:`split_heads`.

        Turns ``(batch_size, num_heads, sequence_length, head_dim)`` back into
        ``(batch_size, sequence_length, num_heads * head_dim)``.
        """
        x = x.permute(0, 2, 1, 3).contiguous()
        return torch.reshape(x, shape=(x.shape[0], x.shape[1], int(x.shape[2] * x.shape[3])))

    def forward(self, x, mask=None):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor of shape ``(batch_size, sequence_length, d_model)``.
            mask: Optional attention mask, see
                :meth:`compute_attention_scores`. ``None`` means every
                position may attend to every other position.

        Returns:
            Tensor of shape ``(batch_size, sequence_length, d_model)``.
        """
        q_linear_out = self.split_heads(self.q(x))
        k_linear_out = self.split_heads(self.k(x))
        v_linear_out = self.split_heads(self.v(x))

        attn_weighted_v = self.compute_attention_scores(q_linear_out, k_linear_out, v_linear_out, mask=mask)
        attn_weighted_v = self.combine_heads(attn_weighted_v)
        return self.output_linear(attn_weighted_v)


class FeedForwardSubLayer(nn.Module):
    """Two-layer feed-forward network: Linear -> ReLU -> Linear.

    Args:
        d_model: Size of the input and output feature dimension.
        hidden_size: Size of the intermediate (expanded) feature dimension.
    """

    def __init__(self, d_model, hidden_size):
        super().__init__()
        self.ff_1 = nn.Linear(in_features=d_model, out_features=hidden_size)
        self.ff_2 = nn.Linear(in_features=hidden_size, out_features=d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Expand to ``hidden_size``, apply ReLU, project back to ``d_model``."""
        expanded = self.ff_1(x)
        activated = self.relu(expanded)
        return self.ff_2(activated)
    

class DecoderBlock(nn.Module):
    """One decoder layer: masked self-attention followed by a feed-forward net.

    Each sub-layer output goes through dropout, is added to its input
    (residual connection) and the sum is layer-normalised.

    Args:
        d_model: Size of the feature dimension.
        hidden_size: Inner size of the feed-forward sub-layer.
        num_heads: Number of attention heads.
        dropout: Dropout probability applied to each sub-layer output.
    """

    def __init__(self, d_model, hidden_size, num_heads, dropout=0.1):
        super().__init__()
        # Sub-modules are registered in this order on purpose: it fixes the
        # parameter order and the order in which random initial weights are drawn.
        self.feed_forward = FeedForwardSubLayer(d_model, hidden_size)
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.norm_1 = nn.LayerNorm(d_model)
        self.norm_2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, tgt_mask):
        """Run the block on ``x`` using ``tgt_mask`` as the attention mask."""
        # Self-attention sub-layer with residual connection.
        attended = self.mha(x, mask=tgt_mask)
        x = self.norm_1(x + self.dropout(attended))

        # Feed-forward sub-layer with residual connection.
        transformed = self.feed_forward(x)
        return self.norm_2(x + self.dropout(transformed))
    

class TransformerDecoder(nn.Module):
    """Decoder-only transformer that maps token ids to next-token logits.

    Pipeline: token embedding -> positional encoding -> ``n_layers`` decoder
    blocks -> linear projection onto the vocabulary.

    Args:
        vocab_size: Number of distinct token ids. Id 0 is the padding index
            of the embedding table.
        d_model: Size of the embedding / hidden feature dimension.
        max_sequence_length: Number of positions the positional encoding covers.
        n_layers: Number of stacked decoder blocks.
        hidden_size: Inner size of each block's feed-forward sub-layer.
        num_heads: Number of attention heads per block.
        dropout: Dropout probability used inside every block.
    """

    def __init__(self, vocab_size, d_model, max_sequence_length, n_layers, hidden_size, num_heads, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model, padding_idx=0)
        self.pe = PositionalEncoding(d_model, max_sequence_length)

        blocks = []
        for _ in range(n_layers):
            blocks.append(DecoderBlock(d_model, hidden_size, num_heads, dropout))
        self.layers = nn.ModuleList(blocks)

        self.output_layer = nn.Linear(d_model, vocab_size)

    def forward(self, x, tgt_mask):
        """Return logits with a trailing dimension of size ``vocab_size``.

        Args:
            x: Integer tensor of token ids.
            tgt_mask: Attention mask handed unchanged to every decoder block.
        """
        hidden = self.pe(self.embedding(x))
        for block in self.layers:
            hidden = block(hidden, tgt_mask)
        return self.output_layer(hidden)


## Dataset Prep

### Tokenizer

In [3]:
class TokenizerChar:
    """Character-level tokenizer with a fixed vocabulary.

    Characters with code points 1..256 map to their own code point. Five
    special tokens are added: ``<PAD>`` = 0, ``<SOS>`` = 257, ``<EOS>`` = 258,
    ``<UNK>`` = 259 and ``<EOP>`` = 260, for a vocabulary of 261 entries.
    """

    def __init__(self):
        vocab = {}
        for code_point in range(1, 257):
            vocab[chr(code_point)] = code_point
        # Special tokens, added in the same order as before so that dict
        # iteration order is unchanged.
        vocab.update({
            '<SOS>': 257,
            '<EOS>': 258,
            '<PAD>': 0,
            '<UNK>': 259,
            '<EOP>': 260,
        })

        self.chr_to_idx = vocab
        self.idx_to_chr = dict(zip(vocab.values(), vocab.keys()))
        self.vocab_size = len(vocab)

    def encode(self, char):
        """Return the id of ``char``, or 259 (``<UNK>``) when it is not in the vocabulary."""
        return self.chr_to_idx.get(char, 259)

    def decode(self, token_idx):
        """Return the token for ``token_idx``; raises ``KeyError`` for an unknown id."""
        return self.idx_to_chr[token_idx]

    # --- special tokens: literal text and matching id ---

    def sos_token(self):
        """Start-of-sequence token text."""
        return '<SOS>'

    def sos_token_idx(self):
        """Start-of-sequence token id."""
        return self.chr_to_idx[self.sos_token()]

    def eos_token(self):
        """End-of-sequence token text."""
        return '<EOS>'

    def eos_token_idx(self):
        """End-of-sequence token id."""
        return self.chr_to_idx[self.eos_token()]

    def pad_token(self):
        """Padding token text."""
        return '<PAD>'

    def pad_token_idx(self):
        """Padding token id."""
        return self.chr_to_idx[self.pad_token()]

    def eop_token(self):
        """``<EOP>`` token text."""
        return '<EOP>'

    def eop_token_idx(self):
        """``<EOP>`` token id."""
        return self.chr_to_idx[self.eop_token()]

    def get_vocab_size(self):
        """Number of entries in the vocabulary."""
        return self.vocab_size

### Dataset

In [4]:
import json
from torch.utils.data import Dataset

class DatasetFinancial(Dataset):
    """Character-level next-token dataset read from a JSON-lines file.

    Every usable line is a JSON object with the string fields ``system``,
    ``user`` and ``assistant``. A sample is the token sequence

        <SOS> system ' ' user <EOP> assistant <EOS>

    padded with ``<PAD>`` or truncated to ``sequence_length + 1`` tokens and
    then split into an input ``x`` (all but the last token) and a target ``y``
    (all but the first token).

    Lines containing non-ASCII characters are dropped at load time. Lines that
    cannot be parsed as a JSON object produce an all-``<PAD>`` sample instead
    of raising, so a single bad line does not abort an epoch.

    Args:
        data_path: Path to the JSON-lines file.
        sequence_length: Length of the returned ``x`` and ``y`` tensors.
    """

    def __init__(self, data_path='dataset_text/train.json', sequence_length=512):
        self.data_path = data_path
        self.sequence_length = sequence_length
        self.tokenizer = TokenizerChar()
        with open(self.data_path, encoding='utf-8') as f:
            lines = f.readlines()
        # Keep only pure-ASCII lines; tqdm shows progress over the raw lines.
        self.lines = [line for line in tqdm(lines) if line.isascii()]

    def __len__(self):
        """Number of lines kept after the ASCII filter."""
        return len(self.lines)

    def readline_as_dict(self, line_number):
        """Parse line ``line_number`` and return it as a dict.

        Returns an empty dict when the line is not valid JSON or when it is
        valid JSON that is not an object (for example a list, number, string
        or ``null``), so callers can always treat the result as a mapping.
        """
        try:
            record = json.loads(self.lines[line_number])
        except json.JSONDecodeError:
            return dict()
        if not isinstance(record, dict):
            return dict()
        return record

    def __getitem__(self, line_idx):
        """Return the ``(x, y)`` tensor pair for line ``line_idx``.

        Both tensors have ``sequence_length`` elements. For an unusable line
        both are filled with the ``<PAD>`` id.

        Raises:
            KeyError: If the line is a JSON object that lacks one of the
                ``system``, ``user`` or ``assistant`` keys.
        """
        pad_idx = self.tokenizer.pad_token_idx()
        line_dict = self.readline_as_dict(line_idx)
        if not line_dict:
            x = torch.tensor([pad_idx] * self.sequence_length)
            y = torch.tensor([pad_idx] * self.sequence_length)
            return x, y

        encode = self.tokenizer.encode
        current_sequence = (
            [self.tokenizer.sos_token_idx()]
            + [encode(c) for c in line_dict['system']]
            + [encode(' ')]
            + [encode(c) for c in line_dict['user']]
            + [self.tokenizer.eop_token_idx()]
            + [encode(c) for c in line_dict['assistant']]
            + [self.tokenizer.eos_token_idx()]
        )

        # One extra token is needed because x and y are the same sequence
        # shifted by one position. Truncate first, then right-pad; a sequence
        # that was truncated gets no padding (the multiplier is zero).
        target_length = self.sequence_length + 1
        current_sequence = current_sequence[:target_length]
        current_sequence += [pad_idx] * (target_length - len(current_sequence))

        x = torch.tensor(current_sequence[:-1])
        y = torch.tensor(current_sequence[1:])
        return x, y

In [5]:
# Load the dataset with its default arguments ('dataset_text/train.json', sequence_length=512).
dataset = DatasetFinancial()

100%|██████████| 518182/518182 [00:00<00:00, 3699577.23it/s]


In [6]:
# Inspect the first sample: an (x, y) pair of token-id tensors where y is x shifted by one position.
dataset[0]

(tensor([257,  10,  32,  69, 120, 112, 108,  97, 105, 110,  32, 116, 104, 101,
          32, 100, 105, 102, 102, 101, 114, 101, 110,  99, 101,  32,  98, 101,
         116, 119, 101, 101, 110,  32, 102, 105, 115,  99,  97, 108,  32,  97,
         110, 100,  32, 109, 111, 110, 101, 116,  97, 114, 121,  32, 112, 111,
         108, 105,  99, 121,  32, 116, 111, 111, 108, 115,  32, 117, 115, 101,
         100,  32, 105, 110,  32, 101,  99, 111, 110, 111, 109, 105,  99, 115,
          46, 260,  70, 105, 115,  99,  97, 108,  32, 112, 111, 108, 105,  99,
         121,  32,  97, 110, 100,  32, 109, 111, 110, 101, 116,  97, 114, 121,
          32, 112, 111, 108, 105,  99, 121,  32,  97, 114, 101,  32, 116, 104,
         101,  32, 116, 119, 111,  32, 109,  97, 105, 110,  32, 116, 111, 111,
         108, 115,  32, 116, 104,  97, 116,  32, 103, 111, 118, 101, 114, 110,
         109, 101, 110, 116, 115,  32, 117, 115, 101,  32, 116, 111,  32, 109,
          97, 110,  97, 103, 101,  32, 116, 104, 101

In [7]:
# Number of samples, i.e. lines of the file that passed the ASCII-only filter.
len(dataset)

442677

## Model Training

### Training Loop

In [None]:
import os

# --- Data -------------------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
sequence_length = 512
batch_size = 32
dataset_train = DatasetFinancial('dataset_text/train.json', sequence_length)
dataset_test = DatasetFinancial('dataset_text/valid.json', sequence_length)
# Training batches are reshuffled every epoch; validation needs no shuffling.
dataloader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)
dataloader_test = DataLoader(dataset_test, batch_size=batch_size, shuffle=False)
vocab_size = dataset_train.tokenizer.get_vocab_size()

# --- Model ------------------------------------------------------------------
d_model = 512
num_layers = 6
num_heads = 8
d_ff = 2048
dropout = 0.1
max_seq_length = sequence_length
model = TransformerDecoder(vocab_size, d_model, max_seq_length, num_layers, d_ff, num_heads, dropout=dropout)
model.to(device)

# Causal mask: True on and below the diagonal, so position i can attend only
# to positions <= i. The leading 1 broadcasts over batch and heads.
tgt_mask = (1 - torch.triu(
  torch.ones(1, sequence_length, sequence_length), diagonal=1)
).bool()
# Copy the mask to the device once instead of once per batch.
tgt_mask_device = tgt_mask.to(device)


def init_weights(module):
    """Kaiming-normal weights and zero biases for every ``nn.Linear`` layer."""
    if isinstance(module, nn.Linear):
        init.kaiming_normal_(module.weight, mode='fan_in', nonlinearity='relu')
        if module.bias is not None:
            init.zeros_(module.bias)


model.apply(init_weights)

# --- Optimisation -----------------------------------------------------------
optimizer = Adam(model.parameters(), lr=1e-4)
# Padding targets (id 0) do not contribute to the loss.
loss_fn = nn.CrossEntropyLoss(ignore_index=dataset_train.tokenizer.pad_token_idx())
n_epochs = 1000
# len(DataLoader) counts the final partial batch as well, so the progress-bar
# total matches the number of iterations actually performed.
n_batches = len(dataloader_train)

# Create the checkpoint directory up front; otherwise the first torch.save
# would fail only after a complete epoch of training.
checkpoint_dir = 'model_checkpoints'
os.makedirs(checkpoint_dir, exist_ok=True)

print("Starting model training...")
for epoch in range(n_epochs):
    print(f"Epoch: {epoch + 1}")

    # ---- training pass ----
    avg_loss = 0
    last_batch_loss = float('nan')
    model.train()
    for batch_idx, batch in enumerate(tqdm(dataloader_train, total=n_batches)):
        x, y = batch
        x = x.to(device)
        y = y.to(device)
        outputs = model(x, tgt_mask_device)
        # Flatten (batch, seq, vocab) logits and (batch, seq) targets for the loss.
        loss = loss_fn(outputs.view(-1, vocab_size), y.view(-1))
        last_batch_loss = loss.item()
        avg_loss += last_batch_loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    torch.save(model.state_dict(), f'{checkpoint_dir}/model_checkpoint_{epoch+1}.pth')

    avg_loss /= max(n_batches, 1)
    print(f"Average epoch training loss: {avg_loss}")
    print(f"Last batch training loss: {last_batch_loss}")

    # ---- validation pass ----
    model.eval()
    avg_loss = 0
    # No gradients are needed for evaluation; skipping them avoids building
    # the autograd graph for every validation batch.
    with torch.no_grad():
        for batch in dataloader_test:
            x, y = batch
            x = x.to(device)
            y = y.to(device)
            outputs = model(x, tgt_mask_device)
            loss = loss_fn(outputs.view(-1, vocab_size), y.view(-1))
            avg_loss += loss.item()

    # Divide by the real number of validation batches (guarding against an
    # empty validation set) rather than by a leftover loop index.
    avg_loss /= max(len(dataloader_test), 1)
    print(f"Epoch validation loss: {avg_loss}")
    

cuda


100%|██████████| 518182/518182 [00:00<00:00, 3701202.68it/s]
100%|██████████| 3/3 [00:00<?, ?it/s]


Starting model training...
Epoch: 1


  5%|▍         | 677/13833 [05:21<1:47:07,  2.05it/s]