# Sentiment Analysis

## Imports

In [1]:
import pickle
from torch import nn
from model import ModelFactory
from model.sequence_encoder import SequenceEncoder as SequenceEmbedder
from torch import Tensor
import pickle
import torch
from tqdm import tqdm
import tiktoken
import numpy as np
DEVICE = "cpu" # "cuda"


## Loading data

In [2]:
with open("stanfordSentimentTreebank.pickle", "rb") as f:
    dataset = pickle.load(f)

print(dataset.keys())

dict_keys(['train', 'test', 'dev'])


In [10]:
dataset['test'][0].shape

(2125, 70)

## Training on simple Embedder + Feedforward

In [228]:


model_factory = ModelFactory(
    coordinates = 200,
    number_of_heads = 1, # ignroe this 
    words = 70,
    tokens=50258 + 1,
)

p = 0.1

class EmbedderPlusFeedForwar(nn.Module):
    def __init__(self):
        super().__init__()

        self.embedder = SequenceEmbedder(model_factory
                                         )
        self.projection = nn.Sequential(
            nn.LayerNorm(model_factory.coordinates),
           nn.Linear(model_factory.coordinates , model_factory.coordinates),
           nn.Dropout(p),
            nn.GELU(),
           nn.Linear(model_factory.coordinates , model_factory.coordinates  // 2 ),
           nn.Dropout(p),

            nn.GELU(),
           nn.Linear(model_factory.coordinates // 2  , model_factory.coordinates // 4 ),
           nn.Dropout(p),
            nn.GELU(),
            nn.Linear(model_factory.coordinates  // 4   , 1),
        )

        self.classification_head = nn.Sequential(
            nn.LayerNorm(model_factory.words),

            nn.Linear(model_factory.words , model_factory.words),
           nn.Dropout(p),
            nn.GELU(),
            nn.Linear(model_factory.words, 5),
        )

    def forward(self, sequence_bw: Tensor) -> Tensor:
        sequence_bwc = self.embedder(sequence_bw)
        sequence_bw1 = self.projection(sequence_bwc)
        sequence_bw = sequence_bw1.squeeze(-1)
        sequence_bv = self.classification_head(sequence_bw)
        return sequence_bv


gpt2_encoder = tiktoken.get_encoding("gpt2")
model = EmbedderPlusFeedForwar().to(DEVICE)
optimizer = torch.optim.Adam(model.parameters())
loss_function = nn.CrossEntropyLoss() # weight=torch.tensor([1.1, 1, 0.95, 1, 1.1], device=DEVICE)
batch_size = 200
sentences, sentiments = dataset['train']
size = len(sentences)
warmup_steps = 8
get_lr = lambda step: min(step ** -0.5, step * warmup_steps ** -1.5) * model_factory.coordinates ** -0.5
EPOCHS = 150
test_sentences, test_sentiments = dataset['test']

def l1_regularization(model, lambda_l1):
    l1_penalty = 0
    for param in model.parameters():
        l1_penalty += torch.abs(param).sum()
    return lambda_l1 * l1_penalty





for epoch in range(1, EPOCHS + 1):
    test_loss_cumul = 0
    training_loss_cumul = 0
    pb = tqdm(
          range(0, size, batch_size),
          desc=f"({epoch})",
          leave=True,
    )

    # Randomly shuffle the indices
    indices = np.arange(len(sentences))
    np.random.shuffle(indices)
    # Use the shuffled indices to randomize sentences and sentiments
    sentences = sentences[indices]
    sentiments = sentiments[indices]
    lr = get_lr(epoch)
    lr = 1e-2
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    model.train()
    for i in pb:
        sentence_bw = torch.tensor(sentences[i:i+batch_size]).to(DEVICE)
        sentiment_b = torch.tensor(sentiments[i:i+batch_size]).to(DEVICE)
        optimizer.zero_grad()
        pred_logits_bv = model(sentence_bw)
        loss_train = loss_function(pred_logits_bv, sentiment_b) + l1_regularization(model, 0.00001)
        loss_train.backward()
        optimizer.step()

        training_loss_cumul += loss_train.item()
        # pb.set_postfix({"avg loss": training_loss_cumul / (i + 1), "lr": lr})
        
        tes_size = len(test_sentences)
        random_index = np.random.randint(0, tes_size - batch_size)
        sentence_bw = torch.tensor(test_sentences[random_index:random_index+batch_size]).to(DEVICE)
        sentiment_b = torch.tensor(test_sentiments[random_index:random_index+batch_size]).to(DEVICE)

        with torch.no_grad():
            
            correct = 0
            pred_logits_bv = model(sentence_bw)
            test_loss_cumul += loss_function(pred_logits_bv, sentiment_b).item()
            pb.set_postfix({
                "avg loss": training_loss_cumul / (i + 1),
                "lr": lr,
                "test loss": test_loss_cumul / (i + 1),
            }
            )
            







(1): 100%|██████████| 41/41 [00:12<00:00,  3.23it/s, avg loss=0.329, lr=0.01, test loss=0.00815]
(2): 100%|██████████| 41/41 [00:12<00:00,  3.20it/s, avg loss=0.184, lr=0.01, test loss=0.0081] 
(3): 100%|██████████| 41/41 [00:12<00:00,  3.22it/s, avg loss=0.0947, lr=0.01, test loss=0.0081] 
(4): 100%|██████████| 41/41 [00:12<00:00,  3.21it/s, avg loss=0.0463, lr=0.01, test loss=0.00809]
(5):  49%|████▉     | 20/41 [00:06<00:06,  3.07it/s, avg loss=0.0277, lr=0.01, test loss=0.00833]


KeyboardInterrupt: 

In [184]:

PADDING_VALUE = gpt2_encoder.max_token_value + 1

def to_tensor(sentence):
    
    sentence = gpt2_encoder.encode(sentence)
    if len(sentence) < 70:
        sentence = sentence + [PADDING_VALUE] * (70 - len(sentence))

    
    return torch.Tensor(sentence).unsqueeze(0).to(DEVICE).long()

def classify(text):

    classifications = nn.Softmax(dim=1)(model(to_tensor(text)))
    return classifications.argmax().item() / 4



This phrase is present in the dataset with about 0.8 classification (which would indeed round to 1 due to labeling)

In [198]:
classify("bad")

0.5

however, the model does not understand when I modify it like so:

In [202]:
classify("very bad")

0.75

In [60]:

model_factory = ModelFactory(
    coordinates = 9,
    words = 70,
    tokens=50258 + 1,
    number_of_blocks = 1,
    number_of_heads = 3,
    bias = 0,
    attention = "metric"# "scaled_dot_product", # or "metric"
)

class SentimentModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = model_factory.create_model(kind="encoder")
        del self.model[-1]
        del self.model[-1]

 
        self.projection = nn.Sequential(
        nn.LayerNorm(model_factory.coordinates),
          nn.Linear(model_factory.coordinates, 1),
        )
          
      
        self.classification_head = nn.Linear(model_factory.words, 5)

    def forward(self, sequence_bw: Tensor) -> Tensor:
        sequence_bwc = self.model(sequence_bw)
        sequence_bw1 = self.projection(sequence_bwc)
        sequence_bw = sequence_bw1.squeeze(-1)
        sequence_bv = self.classification_head(sequence_bw)
        return sequence_bv





In [229]:
print("here")
DEVICE = "cpu" # "cuda"
from tqdm import tqdm
import tiktoken
import numpy as np

gpt2_encoder = tiktoken.get_encoding("gpt2")





model = SentimentModel().to(DEVICE)
optimizer = torch.optim.Adam(model.parameters())
class_weights = torch.tensor([1.1, 1, 0.95, 1, 1.1], device=DEVICE)
loss_function = nn.CrossEntropyLoss() # weight=class_weights
batch_size = 200
sentences, sentiments = dataset['train']
size = len(sentences)

warmup_steps = 8
get_lr = lambda step: min(step ** -0.5, step * warmup_steps ** -1.5) * model_factory.coordinates ** -0.5
EPOCHS = 100




for epoch in range(1, EPOCHS + 1):

    
    training_loss_cumul = 0
    pb = tqdm(
          range(0, size, batch_size),
          desc=f"({epoch})",
          leave=True,
    )

    # Randomly shuffle the indices
    indices = np.arange(len(sentences))
    np.random.shuffle(indices)

    # Use the shuffled indices to randomize sentences and sentiments
    sentences = sentences[indices]
    sentiments = sentiments[indices]
    lr = get_lr(epoch)
    for i in pb:
        

        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
        sentence_bw = torch.tensor(sentences[i:i+batch_size]).to(DEVICE)
        sentiment_b = torch.tensor(sentiments[i:i+batch_size]).to(DEVICE)
  
        optimizer.zero_grad()
        pred_logits_bv = model(sentence_bw)
        loss_train = loss_function(pred_logits_bv, sentiment_b)
        loss_train.backward()
        optimizer.step()

        training_loss_cumul += loss_train.item()
        # pb.set_postfix({"avg loss": training_loss_cumul / (i + 1), "lr": lr})

        with torch.no_grad():
            model.eval()
            test_sentences, test_sentiments = dataset['test']
            size = len(test_sentences)
            test_loss_cumul = 0
            correct = 0

            random_index = np.random.randint(0, size - batch_size)
            sentence_bw = torch.tensor(test_sentences[random_index:random_index+batch_size]).to(DEVICE)
            sentiment_b = torch.tensor(test_sentiments[random_index:random_index+batch_size]).to(DEVICE)
            pred_logits_bv = model(sentence_bw)
            correct += (pred_logits_bv.argmax(dim=-1) == sentiment_b).sum().item()

            accuracy = correct / size
            pb.set_postfix({
                "avg loss": training_loss_cumul / (i + 1),
                "lr": lr,
                "test loss": loss_function(pred_logits_bv, sentiment_b).item(),
                "accuracy": accuracy
            }
            )
            model.train()




here


TypeError: ScaledDotProductAttention.__init__() got an unexpected keyword argument 'is_causal'

In [49]:

PADDING_VALUE = gpt2_encoder.max_token_value + 1

def to_tensor(sentence):
    
    sentence = gpt2_encoder.encode(sentence)
    if len(sentence) < 70:
        sentence = sentence + [PADDING_VALUE] * (70 - len(sentence))

    
    return torch.Tensor(sentence).unsqueeze(0).to(DEVICE).long()

def classify(text):

    classifications = nn.Softmax(dim=1)(model(to_tensor(text)))
    return classifications.argmax().item() / 4



In [50]:
classify("works, it's thanks to Huston's revelatory performance")

0.75

In [53]:
classify("amazing movie")

0.0

In [4]:
PADDING_VALUE = gpt2_encoder.max_token_value + 1

def to_tensor(sentence):
    
    sentence = gpt2_encoder.encode(sentence)
    if len(sentence) < 70:
        sentence = sentence + [PADDING_VALUE] * (70 - len(sentence))

    
    return torch.Tensor(sentence).unsqueeze(0).to(DEVICE).long()



phrases present in dataset:

In [58]:
classifications = nn.Softmax(dim=1)(model(to_tensor("Offers a guilt-free trip into feel-good territory.")))
classifications.argmax(), classifications

pred = classifications.argmax().item()

print(f"PREDICTED_RATING={pred}/4 ---- Offers a guilt-free trip into feel-good territory.")

PREDICTED_RATING=4/4 ---- Offers a guilt-free trip into feel-good territory.


In [59]:
classifications = nn.Softmax(dim=1)(model(to_tensor("Offers absolutely nothing I hadn't already seen.")))
pred = classifications.argmax().item()

print(f"PREDICTED_RATING={pred}/4 ---- Offers absolutely nothing I hadn't already seen.")

PREDICTED_RATING=1/4 ---- Offers absolutely nothing I hadn't already seen.


phrases I just came up with:

In [60]:
classifications = nn.Softmax(dim=1)(model(to_tensor("Amazing stuff, love it.")))
pred = classifications.argmax().item()

print(f"PREDICTED_RATING={pred}/4 ---- Amazing stuff, love it.")

PREDICTED_RATING=3/4 ---- Amazing stuff, love it.


In [62]:
classifications = nn.Softmax(dim=1)(model(to_tensor("Pffft terrible, how could this be made.")))
pred = classifications.argmax().item()

print(f"PREDICTED_RATING={pred}/4 ---- Pffft terrible, how could this be made.")

PREDICTED_RATING=1/4 ---- Pffft terrible, how could this be made.


In [48]:
classifications = nn.Softmax(dim=1)(model(

    torch.tensor(sentences[1]).unsqueeze(0)
))


classifications.argmax(), classifications

(tensor(1),
 tensor([[0.2414, 0.4820, 0.2110, 0.0403, 0.0254]], grad_fn=<SoftmaxBackward0>))

In [49]:
sentiments[1]

1

In [69]:
model_factory = ModelFactory(
    coordinates = 50,
    words = 70,
    tokens=50258 + 1,
    number_of_blocks = 2,
    number_of_heads = 25,
    bias = 0,
    attention = "metric"# "scaled_dot_product", # or "metric"
)


p = 0
class SentimentModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = model_factory.create_model(kind="encoder")[0]
 

 
        self.layer_norm = nn.LayerNorm(model_factory.coordinates)
        self.projection = nn.Sequential(
        nn.LayerNorm(model_factory.coordinates),
          nn.Linear(model_factory.coordinates, 1),
          nn.Dropout(p),
        )
          
      
        self.classification_head = nn.Sequential(
          nn.Linear(model_factory.words, 5)
        )

    def forward(self, sequence_bw: Tensor) -> Tensor:
        sequence_bwc = self.model(sequence_bw)
        sequence_bwc = self.layer_norm(sequence_bwc)
        sequence_bw1 = self.projection(sequence_bwc)
        sequence_bw = sequence_bw1.squeeze(-1)
        sequence_bv = self.classification_head(sequence_bw)
        return sequence_bv


print("here")
DEVICE = "cpu" # "cuda"
from tqdm import tqdm
import tiktoken
import numpy as np

gpt2_encoder = tiktoken.get_encoding("gpt2")



test_sentences, test_sentiments = dataset['test']


model = SentimentModel().to(DEVICE)
optimizer = torch.optim.Adam(model.parameters())
class_weights = torch.tensor([1.1, 1, 0.95, 1, 1.1], device=DEVICE)
loss_function = nn.CrossEntropyLoss() # weight=class_weights
batch_size = 250
sentences, sentiments = dataset['train']
size = len(sentences)

warmup_steps = 8
get_lr = lambda step: min(step ** -0.5, step * warmup_steps ** -1.5) * model_factory.coordinates ** -0.5
EPOCHS = 100
import numpy as np 


def l1_regularization(model, lambda_l1):
    l1_penalty = 0
    for param in model.parameters():
        l1_penalty += torch.abs(param).sum()
    return lambda_l1 * l1_penalty

for param in model.model.parameters():
    param.requires_grad = True


for epoch in range(1, EPOCHS + 1):

    if epoch == 10:
        for param in model.model.parameters():
            param.requires_grad = False
        optimizer = torch.optim.Adam(model.parameters())

    test_loss_cumul = 0
    training_loss_cumul = 0
    pb = tqdm(
          range(0, size, batch_size),
          desc=f"({epoch})",
          leave=True,
    )

    # Randomly shuffle the indices
    indices = np.arange(len(sentences))
    np.random.shuffle(indices)

    # Use the shuffled indices to randomize sentences and sentiments
    sentences = sentences[indices]
    sentiments = sentiments[indices]

    for i in pb:
        lr = 1e-3

        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
        sentence_bw = torch.tensor(sentences[i:i+batch_size]).to(DEVICE)
        sentiment_b = torch.tensor(sentiments[i:i+batch_size]).to(DEVICE)
  
        optimizer.zero_grad()
        pred_logits_bv = model(sentence_bw)
        loss_train = loss_function(pred_logits_bv, sentiment_b) # + l1_regularization(model, 0.0001)
        loss_train.backward()
        optimizer.step()

        training_loss_cumul += loss_train.item()
        # pb.set_postfix({"avg loss": training_loss_cumul / (i + 1), "lr": lr})


        tes_size = len(test_sentences)
        random_index = i if i < tes_size - batch_size else tes_size - batch_size - 1
        sentence_bw = torch.tensor(test_sentences[random_index:random_index+batch_size]).to(DEVICE)
        sentiment_b = torch.tensor(test_sentiments[random_index:random_index+batch_size]).to(DEVICE)

        with torch.no_grad():
            
            correct = 0
            pred_logits_bv = model(sentence_bw)
            test_loss_cumul += loss_function(pred_logits_bv, sentiment_b).item()
            pb.set_postfix({
                "avg loss": training_loss_cumul / (i + 1),
                "lr": lr,
                "test loss": test_loss_cumul / (i + 1),
            })
            





here


(1):   0%|          | 0/33 [00:00<?, ?it/s, avg loss=1.69, lr=0.001, test loss=1.64]

(1): 100%|██████████| 33/33 [00:01<00:00, 28.96it/s, avg loss=0.00656, lr=0.001, test loss=0.00657]
(2): 100%|██████████| 33/33 [00:01<00:00, 28.05it/s, avg loss=0.00647, lr=0.001, test loss=0.00652]
(3): 100%|██████████| 33/33 [00:01<00:00, 29.23it/s, avg loss=0.00645, lr=0.001, test loss=0.00651]
(4): 100%|██████████| 33/33 [00:01<00:00, 30.63it/s, avg loss=0.00643, lr=0.001, test loss=0.00651]
(5): 100%|██████████| 33/33 [00:01<00:00, 27.68it/s, avg loss=0.00641, lr=0.001, test loss=0.00651]
(6): 100%|██████████| 33/33 [00:01<00:00, 29.13it/s, avg loss=0.00639, lr=0.001, test loss=0.00652]
(7): 100%|██████████| 33/33 [00:01<00:00, 30.60it/s, avg loss=0.00635, lr=0.001, test loss=0.00651]
(8): 100%|██████████| 33/33 [00:01<00:00, 29.24it/s, avg loss=0.0063, lr=0.001, test loss=0.00652] 
(9): 100%|██████████| 33/33 [00:01<00:00, 30.89it/s, avg loss=0.00622, lr=0.001, test loss=0.00653]
(10): 100%|██████████| 33/33 [00:00<00:00, 62.85it/s, avg loss=0.00614, lr=0.001, test loss=0.00654]

KeyboardInterrupt: 

In [6]:

model_factory = ModelFactory(
    coordinates = 9,
    words = 70,
    tokens=50258 + 1,
    number_of_blocks = 2,
    number_of_heads = 3,
    bias = 0,
    attention = "metric"# "scaled_dot_product", # or "metric"
)


model = model_factory.create_model(kind="encoder")
lr = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
loss_function = nn.CrossEntropyLoss() 
EPOCHS = 1


gpt2_encoder = tiktoken.get_encoding("gpt2")
loss_function = nn.CrossEntropyLoss() # weight=torch.tensor([1.1, 1, 0.95, 1, 1.1], device=DEVICE)
batch_size = 16
EPOCHS = 1

test_sentences, test_sentiments = dataset['test']
sentences, sentiments = dataset['train']

sentences = torch.tensor(sentences).to(DEVICE)
sentiments = torch.tensor(sentiments).to(DEVICE)
test_sentences = torch.tensor(test_sentences).to(DEVICE)
test_sentiments = torch.tensor(test_sentiments).to(DEVICE)
size = len(sentences)

for epoch in range(1, EPOCHS + 1):

    test_loss_cumul = 0
    training_loss_cumul = 0

    # Randomly shuffle the indices
    indices = np.arange(len(sentences))
    np.random.shuffle(indices)
    sentences = sentences[indices]
    sentiments = sentiments[indices]

    for i in (pb := tqdm(range(0, size, batch_size), desc=f"({epoch})", leave=True)):
     
        sentence_bw = sentences[i:i+batch_size]
        gt_sentence_bw = sentences[i:i+batch_size]
        sentiment_b = sentiments[i:i+batch_size]
        gt_sentence_bw[:, -1] = sentiment_b


        optimizer.zero_grad()
        pred_logits_bwt = model(sentence_bw)
        pred_logits_btw = pred_logits_bwt.transpose(-1, -2)
        loss_train = loss_function(pred_logits_btw, gt_sentence_bw) # + l1_regularization(model, 0.0001)
        loss_train.backward()
        optimizer.step()

        training_loss_cumul += loss_train.item()
        tes_size = len(test_sentences)
        random_index = i if i < tes_size - batch_size else tes_size - batch_size - 1
        sentence_bw = test_sentences[random_index:random_index+batch_size]
        gt_sentence_bw = test_sentences[random_index:random_index+batch_size]
        sentiment_b = test_sentiments[random_index:random_index+batch_size]
        gt_sentence_bw[:, -1] = sentiment_b
    
        with torch.no_grad():
            pred_logits_bwt = model(sentence_bw)
            pred_logits_btw = pred_logits_bwt.transpose(-1, -2)
            test_loss_cumul += loss_function(pred_logits_btw, gt_sentence_bw).item()
            pb.set_postfix({
                "avg loss": training_loss_cumul / (i + 1),
                "lr": lr,
                "test loss": test_loss_cumul / (i + 1),
            })
            



(1):   0%|          | 0/508 [00:00<?, ?it/s]

(1):  85%|████████▌ | 433/508 [15:32<02:37,  2.10s/it, avg loss=0.363, lr=0.001, test loss=0.359]