In [2]:
from model import ModelFactory
from torch import nn
from torch import Tensor
import pickle
import torch

model_factory = ModelFactory(
    coordinates = 6*8,
    words = 70,
    tokens=50258 + 1,
    number_of_blocks = 1,
    number_of_heads = 6,
    bias = 0,
    attention = "metric"# "scaled_dot_product", # or "metric"
)

class SentimentModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = model_factory.create_model(kind="encoder")
        del self.model[-1]
        del self.model[-1]

 
        self.layer_norm = nn.LayerNorm(model_factory.coordinates)
        self.projection = nn.Sequential(
          nn.LayerNorm(model_factory.coordinates),
          nn.GELU(),
          nn.Linear(model_factory.coordinates, model_factory.coordinates // 2),
          nn.GELU(),
          nn.Linear(model_factory.coordinates // 2, model_factory.coordinates // 2),
          nn.GELU(),
          nn.Linear(model_factory.coordinates // 2, 1),
        )
          
      
        self.classification_head = nn.Sequential(
          nn.Linear(model_factory.words, model_factory.words),
          nn.GELU(),
          nn.Linear(model_factory.words, 5)
        )

    def forward(self, sequence_bw: Tensor) -> Tensor:
        sequence_bwc = self.model(sequence_bw)
        sequence_bwc = self.layer_norm(sequence_bwc)
        sequence_bw1 = self.projection(sequence_bwc)
        sequence_bw = sequence_bw1.squeeze(-1)
        sequence_bv = self.classification_head(sequence_bw)
        return sequence_bv

with open("stanfordSentimentTreebank.pickle", "rb") as f:
    dataset = pickle.load(f)




In [51]:
print("here")
DEVICE = "cpu" # "cuda"
from tqdm import tqdm
import tiktoken
import numpy as np

gpt2_encoder = tiktoken.get_encoding("gpt2")





model = SentimentModel().to(DEVICE)
optimizer = torch.optim.Adam(model.parameters())
class_weights = torch.tensor([1.1, 1, 0.95, 1, 1.1], device=DEVICE)
loss_function = nn.CrossEntropyLoss() # weight=class_weights
batch_size = 200
sentences, sentiments = dataset['train']
size = len(sentences)

warmup_steps = 8
get_lr = lambda step: min(step ** -0.5, step * warmup_steps ** -1.5) * model_factory.coordinates ** -0.5
EPOCHS = 100
import numpy as np 




for epoch in range(1, EPOCHS + 1):

    
    training_loss_cumul = 0
    pb = tqdm(
          range(0, size, batch_size),
          desc=f"({epoch})",
          leave=True,
    )

    # Randomly shuffle the indices
    indices = np.arange(len(sentences))
    np.random.shuffle(indices)

    # Use the shuffled indices to randomize sentences and sentiments
    sentences = sentences[indices]
    sentiments = sentiments[indices]

    for i in pb:
        lr = get_lr(i + 1)

        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
        sentence_bw = torch.tensor(sentences[i:i+batch_size]).to(DEVICE)
        sentiment_b = torch.tensor(sentiments[i:i+batch_size]).to(DEVICE)
  
        optimizer.zero_grad()
        pred_logits_bv = model(sentence_bw)
        loss_train = loss_function(pred_logits_bv, sentiment_b)
        loss_train.backward()
        optimizer.step()

        training_loss_cumul += loss_train.item()
        pb.set_postfix({"avg loss": training_loss_cumul / (i + 1), "lr": lr})




here


(1):   0%|          | 0/41 [00:00<?, ?it/s]

(1): 100%|██████████| 41/41 [00:07<00:00,  5.80it/s, avg loss=0.00806, lr=0.00161]
(2): 100%|██████████| 41/41 [00:06<00:00,  6.02it/s, avg loss=0.00804, lr=0.00161]
(3): 100%|██████████| 41/41 [00:07<00:00,  5.82it/s, avg loss=0.00799, lr=0.00161]
(4): 100%|██████████| 41/41 [00:06<00:00,  6.07it/s, avg loss=0.00773, lr=0.00161]
(5): 100%|██████████| 41/41 [00:07<00:00,  5.55it/s, avg loss=0.00728, lr=0.00161]
(6): 100%|██████████| 41/41 [00:06<00:00,  5.94it/s, avg loss=0.00676, lr=0.00161]
(7): 100%|██████████| 41/41 [00:06<00:00,  6.08it/s, avg loss=0.006, lr=0.00161]  
(8): 100%|██████████| 41/41 [00:06<00:00,  6.13it/s, avg loss=0.00532, lr=0.00161]
(9): 100%|██████████| 41/41 [00:06<00:00,  5.99it/s, avg loss=0.00475, lr=0.00161]
(10): 100%|██████████| 41/41 [00:06<00:00,  6.12it/s, avg loss=0.00424, lr=0.00161]
(11): 100%|██████████| 41/41 [00:06<00:00,  6.14it/s, avg loss=0.00371, lr=0.00161]
(12): 100%|██████████| 41/41 [00:06<00:00,  6.07it/s, avg loss=0.00339, lr=0.00161]
(

In [None]:

import tiktoken
# special_tokens
gpt2_encoder = tiktoken

In [4]:
PADDING_VALUE = gpt2_encoder.max_token_value + 1

def to_tensor(sentence):
    
    sentence = gpt2_encoder.encode(sentence)
    if len(sentence) < 70:
        sentence = sentence + [PADDING_VALUE] * (70 - len(sentence))

    
    return torch.Tensor(sentence).unsqueeze(0).to(DEVICE).long()



phrases present in dataset:

In [58]:
classifications = nn.Softmax(dim=1)(model(to_tensor("Offers a guilt-free trip into feel-good territory.")))
classifications.argmax(), classifications

pred = classifications.argmax().item()

print(f"PREDICTED_RATING={pred}/4 ---- Offers a guilt-free trip into feel-good territory.")

PREDICTED_RATING=4/4 ---- Offers a guilt-free trip into feel-good territory.


In [59]:
classifications = nn.Softmax(dim=1)(model(to_tensor("Offers absolutely nothing I hadn't already seen.")))
pred = classifications.argmax().item()

print(f"PREDICTED_RATING={pred}/4 ---- Offers absolutely nothing I hadn't already seen.")

PREDICTED_RATING=1/4 ---- Offers absolutely nothing I hadn't already seen.


phrases I just came up with:

In [60]:
classifications = nn.Softmax(dim=1)(model(to_tensor("Amazing stuff, love it.")))
pred = classifications.argmax().item()

print(f"PREDICTED_RATING={pred}/4 ---- Amazing stuff, love it.")

PREDICTED_RATING=3/4 ---- Amazing stuff, love it.


In [62]:
classifications = nn.Softmax(dim=1)(model(to_tensor("Pffft terrible, how could this be made.")))
pred = classifications.argmax().item()

print(f"PREDICTED_RATING={pred}/4 ---- Pffft terrible, how could this be made.")

PREDICTED_RATING=1/4 ---- Pffft terrible, how could this be made.


In [48]:
classifications = nn.Softmax(dim=1)(model(

    torch.tensor(sentences[1]).unsqueeze(0)
))


classifications.argmax(), classifications

(tensor(1),
 tensor([[0.2414, 0.4820, 0.2110, 0.0403, 0.0254]], grad_fn=<SoftmaxBackward0>))

In [49]:
sentiments[1]

1