# Install required packages

In [1]:
!pip install datasets
!pip install loralib
!pip install tiktoken
!pip install rotary-embedding-torch



In [2]:
from datasets import load_dataset
import tiktoken
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from tiktoken import get_encoding
import torch
import torch.nn as nn
from torch.nn import functional as F
import math
from typing import Tuple
import torch
import torch.nn.functional as F
from torch import nn
import inspect
from rotary_embedding_torch import RotaryEmbedding
import loralib as lora

In [3]:
import tqdm
import gc
from tqdm import tqdm

# Load the Pretrained Model

In [4]:
enc = tiktoken.get_encoding('gpt2')
vocab_size = enc.n_vocab

# hyperparameters
batch_size = 32 # how many independent sequences will we process in parallel? #16
max_iters = 5000
eval_interval = 100
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 1024
n_head = 8
n_layer = 8
dropout = 0.0
# ------------
class ModelConfig:
    block_size: int = 1024
    vocab_size: int = 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
    dim: int = 512
    n_layers: int = 8
    n_heads: int = 8
    max_seq_len: int = 512
    layer_norm_eps: float = 1e-6
    dropout: float = 0.0
    hidden_dim: int = None
    n_embd: int = 1024
    multiple_of: int = 32
    rope_dim: int = 64
    bias: bool = True
    weight_decay = 1e-1
    betas = (0.9, 0.99)
    lora_rank: int = 4

# Root Mean Square Layer Normalization (https://arxiv.org/abs/1910.07467)
# borrowed from the official Llama implementation:
# https://github.com/facebookresearch/llama/blob/main/llama/model.py
class RMSNorm(torch.nn.Module):
    def __init__(self, dim: int, eps: float = 1e-6):
        """
        Initialize the RMSNorm normalization layer.

        Args:
            dim (int): The dimension of the input tensor.
            eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6.

        Attributes:
            eps (float): A small value added to the denominator for numerical stability.
            weight (nn.Parameter): Learnable scaling parameter.

        """
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def _norm(self, x):
        """
        Apply the RMSNorm normalization to the input tensor.

        Args:
            x (torch.Tensor): The input tensor.

        Returns:
            torch.Tensor: The normalized tensor.

        """
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

    def forward(self, x):
        """
        Forward pass through the RMSNorm layer.

        Args:
            x (torch.Tensor): The input tensor.

        Returns:
            torch.Tensor: The output tensor after applying RMSNorm.

        """
        output = self._norm(x.float()).type_as(x)
        return output * self.weight

class FeedForward(nn.Module):
    def __init__(self, dim: int, hidden_dim: int, multiple_of: int, dropout: float):
        super().__init__()
        if hidden_dim is None:
            hidden_dim = 4 * dim
            hidden_dim = int(2 * hidden_dim / 3)
            hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
        self.w1 = nn.Linear(dim, hidden_dim, bias=False)
        self.w2 = nn.Linear(hidden_dim, dim, bias=False)
        self.w3 = nn.Linear(dim, hidden_dim, bias=False)
        self.dropout = nn.Dropout(dropout)

    def SwiGLU(self, x: torch.Tensor) -> torch.Tensor:
        '''
        Compute the SwiGLU activation function (see Section 2 in
        https://arxiv.org/abs/2204.02311
        '''
        return F.silu(self.w1(x)) * self.w3(x)

    def forward(self, x):
        return self.dropout(self.w2(self.SwiGLU(x)))


class CausalSelfAttention(nn.Module):
    def __init__(self, config: ModelConfig):
        super().__init__()
        assert config.n_embd % config.n_heads == 0
        # key, query, value projections for all heads, but in a batch
        # self.c_attn = lora.MergedLinear(config.n_embd, 3 * config.n_embd, r=config.lora_rank, enable_lora=[True, False, True])
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
        # output projection
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
        # regularization
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)
        self.n_head = config.n_heads
        self.n_embd = config.n_embd
        self.dropout = config.dropout
        self.rotary = RotaryEmbedding(config.rope_dim)
        # flash attention make GPU go brrrrr but support is only in PyTorch >= 2.0
        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
        if not self.flash:
            print("WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0")
            # causal mask to ensure that attention is only applied to the left in the input sequence
            self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size))
                                        .view(1, 1, config.block_size, config.block_size))

    def forward(self, x):
        B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)

        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        q, k, v  = self.c_attn(x).split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)

        # apply RoPE, see https://arxiv.org/abs/2104.09864
        k = self.rotary.rotate_queries_or_keys(k)
        q = self.rotary.rotate_queries_or_keys(q)


        # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        if self.flash:
            # efficient attention using Flash Attention CUDA kernels
            y = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=self.dropout if self.training else 0, is_causal=True)
        else:
            # manual implementation of attention
            att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
            att = att.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))
            att = F.softmax(att, dim=-1)
            att = self.attn_dropout(att)
            y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side

        # output projection
        y = self.resid_dropout(self.c_proj(y))
        return y


class Block(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.rn_1 = RMSNorm(config.n_embd, eps=config.layer_norm_eps)
        self.attn = CausalSelfAttention(config)
        self.rn_2 = RMSNorm(config.n_embd, eps=config.layer_norm_eps)
        self.mlp = FeedForward(config.n_embd, config.hidden_dim, config.multiple_of, config.dropout)

    def forward(self, x):
        x = x + self.attn(self.rn_1(x))
        x = x + self.mlp(self.rn_2(x))
        return x

class GPT(nn.Module):

    def __init__(self, config: ModelConfig):
        super().__init__()
        assert config.vocab_size is not None
        assert config.block_size is not None
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            drop = nn.Dropout(config.dropout),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layers)]),
            ln_f = RMSNorm(config.n_embd, eps=config.layer_norm_eps)
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.transformer.wte.weight = self.lm_head.weight

        # init all weights
        self.apply(self._init_weights)
        # apply special scaled init to the residual projections, per GPT-2 paper
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layers))

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        device = idx.device
        b, t = idx.size()
        assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
        pos = torch.arange(0, t, dtype=torch.long, device=device) # shape (t)

        # forward the GPT model itself
        tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
        x = self.transformer.drop(tok_emb)
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)

        if targets is not None:
            # if we are given some desired targets also calculate the loss
            logits = self.lm_head(x)
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        else:
            # inference-time mini-optimization: only forward the lm_head on the very last position
            logits = self.lm_head(x[:, [-1], :]) # note: using list [-1] to preserve the time dim
            loss = None

        return logits, loss

    def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
        # start with all of the candidate parameters
        param_dict = {pn: p for pn, p in self.named_parameters()}
        # filter out those that do not require grad
        param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}
        # create optim groups. Any parameters that is 2D will be weight decayed, otherwise no.
        # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
        decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
        nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
        optim_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0}
        ]
        num_decay_params = sum(p.numel() for p in decay_params)
        num_nodecay_params = sum(p.numel() for p in nodecay_params)
        print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
        print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
        # Create AdamW optimizer and use the fused version if it is available
        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
        use_fused = fused_available and device_type == 'cuda'
        extra_args = dict(fused=True) if use_fused else dict()
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **extra_args)
        print(f"using fused AdamW: {use_fused}")

        return optimizer

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        """
        Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
        the sequence max_new_tokens times, feeding the predictions back into the model each time.
        Most likely you'll want to make sure to be in model.eval() mode of operation for this.
        """
        for _ in range(max_new_tokens):
            # if the sequence context is growing too long we must crop it at block_size
            idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size:]
            # forward the model to get the logits for the index in the sequence
            logits, _ = self(idx_cond)
            # pluck the logits at the final step and scale by desired temperature
            logits = logits[:, -1, :] / temperature
            # optionally crop the logits to only the top k options
            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('Inf')
            # apply softmax to convert logits to (normalized) probabilities
            probs = F.softmax(logits, dim=-1)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)
            # append sampled index to the running sequence and continue
            idx = torch.cat((idx, idx_next), dim=1)

        return idx

config = ModelConfig()
model = GPT(config)
m = model.to(device)

# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
#optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
optimizer = model.configure_optimizers(weight_decay=1e-1, learning_rate=0.001, betas= (0.9, 0.99), device_type=device)


152.749312 M parameters
num decayed parameter tensors: 41, with 152,698,880 parameters
num non-decayed parameter tensors: 33, with 50,176 parameters
using fused AdamW: True


## Load pretrained weights here

In [5]:
checkpoint = torch.load("/content/checkpoint_iter_3600_3_99_v3.pth")
model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

# Dataset
Dataset: https://huggingface.co/datasets/financial_phrasebank

labels:
*   0 for negative
*   1 for neutral
*   2 for positive

In [6]:
dataset = load_dataset("financial_phrasebank", "sentences_50agree")
dataset = dataset['train'].train_test_split(test_size=0.1)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/392k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4846 [00:00<?, ? examples/s]

In [7]:
print("train set length:", len(dataset['train']))
print("train set sample:", dataset['train'][0])
print("test set length:", len(dataset['test']))
print("test set sample:", dataset['test'][0])

train set length: 4361
train set sample: {'sentence': 'The price for logs has clearly improved from 2009 and also the price of pulpwood has gone up .', 'label': 2}
test set length: 485
test set sample: {'sentence': 'The sellers were the founders of the company .', 'label': 1}


In [8]:
class SentimentAnalysisTrainDataset(torch.utils.data.Dataset):
    def __init__(self, dataset, encoder, model):
        self.dataset = []
        self.encoder = encoder
        self.length = len(dataset)
        self.model = model
        self.model.eval()
        for data in dataset:
            text = self.encoder.encode(data['sentence'])
            text = torch.tensor(text, dtype=torch.long)
            embed = torch.stack([text]).to(device)
            with torch.no_grad():
                data['sentence'] = torch.squeeze(self.model(embed)[0]).to('cpu')
                torch.cuda.empty_cache()

            self.dataset.append(data)

    def __len__(self):

        return self.length

    def __getitem__(self, ind):
        sentence = self.dataset[ind]['sentence']

        label = self.dataset[ind]['label']
        label = torch.tensor(label, dtype=torch.long)

        return sentence, label

train_dataset = SentimentAnalysisTrainDataset(dataset['train'], enc, m)
val_dataset = SentimentAnalysisTrainDataset(dataset['test'], enc, m)
sample_sentence, sample_label = train_dataset[0]

print("Train data sample: ")
print("Encoded Sentence:", sample_sentence.shape)
print("Label:", sample_label)
sample_sentence, sample_label = val_dataset[0]
print("Val data sample: ")
print("Encoded Sentence:", sample_sentence.shape)
print("Label:", sample_label)

Train data sample: 
Encoded Sentence: torch.Size([50304])
Label: tensor(2)
Val data sample: 
Encoded Sentence: torch.Size([50304])
Label: tensor(1)


# Dataloader

In [9]:
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=256, shuffle=True)
for batch in train_loader:
    inputs, labels = batch
    print(inputs.shape)
    print(labels.shape)
    break

torch.Size([256, 50304])
torch.Size([256])


# Added Classification Layer

In [10]:
class SimpleClassifier(nn.Module):
    def __init__(self, input_dim=50304, output_dim=3):
        super(SimpleClassifier, self).__init__()
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        logits = self.linear(x)
        return logits
sentiment_model = SimpleClassifier()
sentiment_model.to(device)

SimpleClassifier(
  (linear): Linear(in_features=50304, out_features=3, bias=True)
)

# Finetuning

In [11]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(sentiment_model.parameters(), lr=0.000001)

In [12]:
# logging
import wandb
wandb.login(key="197d96ebfe1ad37dfd2180d901ca0f779e76bdfe")
wandb.init(project='HW5', name='sentiment-fineTune_v1')

[34m[1mwandb[0m: Currently logged in as: [33mabasrith[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
num_epochs = 1000

for epoch in range(num_epochs):
    sentiment_model.train()
    total_loss = 0
    train_loader_tqdm = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs} Training", leave=False)

    train_correct = 0
    total_length = 0
    for inputs, labels in train_loader_tqdm:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = sentiment_model(inputs)

        loss = criterion(outputs, labels)

        loss.backward()

        optimizer.step()

        total_loss += loss.item()
        train_loader_tqdm.set_postfix({'Train Loss': f'{loss.item():.4f}'})
        _, train_predicted = torch.max(outputs, 1)
        total_length += labels.size(0)
        train_correct += (train_predicted == labels).sum().item()

    avg_train_loss = total_loss / len(train_loader)
    train_accuracy = 100 * train_correct / total_length
    tqdm.write(f'Epoch {epoch + 1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}, train_accuracy:{train_accuracy:.2f}')

    wandb.log({'Epoch': (epoch + 1),'train_acc': train_accuracy, 'Train Loss': avg_train_loss})

    sentiment_model.eval()
    total_val_loss = 0
    correct = 0
    total = 0
    val_loader_tqdm = tqdm(val_loader, desc=f"Epoch {epoch + 1}/{num_epochs} Validation", leave=True)

    with torch.no_grad():
        for inputs, labels in val_loader_tqdm:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = sentiment_model(inputs)
            loss = criterion(outputs, labels)
            total_val_loss += loss.item()

            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()


    avg_val_loss = total_val_loss / len(val_loader)
    accuracy = 100 * correct / total
    wandb.log({'val_acc': accuracy, 'Val Loss': avg_val_loss})

    if epoch == num_epochs - 1:
      pass
    tqdm.write(f'Validation Loss: {avg_val_loss:.4f}, Accuracy: {accuracy:.2f}')


In [17]:
sentiment_model.eval()
total_val_loss = 0
correct = 0
total = 0
val_loader_tqdm = tqdm(val_loader, desc=f"Epoch {epoch + 1}/{num_epochs} Validation", leave=True)

with torch.no_grad():
    for inputs, labels in val_loader_tqdm:
        inputs, labels = inputs.to(device), labels.to(device)
        print(inputs)
        outputs = sentiment_model(inputs)
        loss = criterion(outputs, labels)
        total_val_loss += loss.item()

        _, predicted = torch.max(outputs, 1)
        print("Pred Sentiment:\n %s \n" % enc.decode(predicted.tolist()))
        print("True Sentiment:\n %s \n\n" % enc.decode(labels.tolist()))
        total += labels.size(0)
        correct += (predicted == labels).sum().item()


avg_val_loss = total_val_loss / len(val_loader)
accuracy = 100 * correct / total
wandb.log({'val_acc': accuracy, 'Val Loss': avg_val_loss})

if epoch == num_epochs - 1:
  pass
tqdm.write(f'Validation Loss: {avg_val_loss:.4f}, Accuracy: {accuracy:.2f}')

Epoch 1000/1000 Validation: 100%|██████████| 2/2 [00:00<00:00, 21.48it/s]

tensor([[-0.6307,  2.4028,  1.6233,  ..., -4.4695, -4.3534, -4.2683],
        [-1.7271,  0.2669,  1.9578,  ..., -5.5495, -5.0852, -5.3667],
        [-0.9770,  2.7099,  1.4513,  ..., -4.2283, -4.0324, -4.2043],
        ...,
        [-1.3192,  0.2989,  0.9295,  ..., -4.7991, -3.8555, -4.1432],
        [-0.8778,  4.2319,  2.3042,  ..., -6.0256, -5.4981, -5.5739],
        [-1.2731,  2.5529,  1.6683,  ..., -4.7275, -4.3610, -4.4150]],
       device='cuda:0')
Pred Sentiment:
 #!""""##"#""!"""#"#"#""""#"#"##"!""""""""#"###"#!"#""#"""#"""""""""#!""""""!"""###""""#!#"""#"""#"""""""""""#"""""""#"""""#"""!"!""""#""#""!"""#""""""""""""#"#""""""#"""##""""#""##"#"#"""#""""#"""""""###"!"!#!##"#"""""#"""!##""#"""""""""!""""###""#"!"""### 

True Sentiment:
 ##"""##""###!""##"#"#"""!""!##!"#""!"#"#"!"##!""!"#"#!"""""""#"""""!!""""""!""##"""#""!##"""#"#"#"#!"!"!"!!"#""""####""#"#!"""!"!!"!#"!"!""#""##""""""""!#"##"#""#"#""""###"#!"!#"##"""#"#"#"###!""""!"##"#"!"!"!##""#""""#"""###""!""""""""##"""###"""#"




In [59]:
#1,4, 5, 15
# idx = 15
for idx in range(50):
  sample = dataset['test'][idx]
  def int_to_label(num):
      if num == 0:
          return "Negative"
      if num == 1:
          return "Neutral"
      else:
          return "Positive"
  # print("Sample sentence:", dataset['test'][idx]['sentence'])
  # print("Sample label:", int_to_label(dataset['test'][idx]['label']))
  sample_set = SentimentAnalysisTrainDataset([sample], enc, m)
  sample_loader = DataLoader(sample_set, batch_size=1, shuffle=False)
  sentiment_model.eval()
  with torch.no_grad():
      for input, label in sample_loader:
          input, label = input.to(device), label.to(device)
          output = sentiment_model(input)
          _, prediction = torch.max(output, 1)
          break
  sentiment_model.eval()
  with torch.no_grad():
      for input, label in sample_loader:
          input, label = input.to(device), label.to(device)
          output = sentiment_model(input)
          _, prediction = torch.max(output, 1)
          break
  print("Input:", dataset['test'][idx]['sentence'])
  print("Actual:", int_to_label(dataset['test'][idx]['label']))
  print("Predicted:", int_to_label(prediction))

Input: The sellers were the founders of the company .
Actual: Neutral
Predicted: Neutral
Input: More staff has been recruited in Japan to further complement its network of close to 50 service locations in more than 20 countries worldwide .
Actual: Positive
Predicted: Positive
Input: Regulatory News : The Nomination Committee of Cybercom ( STO : CYBE ) , which is unanimous in its proposal , proposes the election of Jon Risfelt as the new Chairman of the Board .
Actual: Neutral
Predicted: Neutral
Input: Operating profit was EUR 1.6 mn in 2005 compared to EUR 5.9 mn in 2004 .
Actual: Negative
Predicted: Positive
Input: Based on strong customer interest and feedback , using RPM mobile phone manufacturers , operators , enterprises and developer forums can significantly increase their product and service quality , user-experience and cost-efficiency '' , Tulonen continues .
Actual: Positive
Predicted: Positive
Input: GET MIDNIGHTTRADER IN REALTIME : This report is delayed .
Actual: Neutral
P

In [54]:
sample_set = SentimentAnalysisTrainDataset([sample], enc, m)
sample_loader = DataLoader(sample_set, batch_size=1, shuffle=False)

In [55]:
sentiment_model.eval()
with torch.no_grad():
    for input, label in sample_loader:
        input, label = input.to(device), label.to(device)
        output = sentiment_model(input)
        _, prediction = torch.max(output, 1)
        break

In [57]:
print("Input:", dataset['test'][idx]['sentence'])
print("Actual:", int_to_label(dataset['test'][idx]['label']))
print("Predicted:", int_to_label(prediction))

Input: `` Several growth initiatives in the chosen geographic areas are already ongoing , '' it continued , noting Lindex opened its first store in the Czech Republic this autumn in Brno .
Actual: Positive
Predicted: Positive
