# Alunos Regulares IA-024-2024S1 FEEC-UNICAMP - Elton Cardoso do Nascimento
# Notebook 2
versão 26 de fevereiro de 2024, 19h

RA 233840

## Instalação e importação de pacotes

In [6]:
!pip install torchtext
!pip install 'portalocker>=2.0.0'



In [7]:
import torch
from torch.utils.data import Dataset, DataLoader
from torchtext.datasets import IMDB
from collections import Counter
import torch.nn as nn
import torch.optim as optim

#Added
import time
import string

## I - Vocabulário e Tokenização

In [8]:
def clean_text(text:str) -> str:
  text = text.lower()

  for punctuation in string.punctuation:
    text = text.replace(punctuation, " ")

  return text

In [9]:
# limit the vocabulary size to 20000 most frequent tokens
vocab_size = 20000

counter = Counter()
for (target, line) in list(IMDB(split='train')):
    line = clean_text(line)

    counter.update(line.split())

# create a vocabulary of the 20000 most frequent tokens
most_frequent_words = sorted(counter, key=counter.get, reverse=True)[:vocab_size]
vocab = {word: i for i, word in enumerate(most_frequent_words, 1)}
vocab_size = len(vocab)

In [10]:
def encode_sentence(sentence, vocab):
  sentence = clean_text(sentence)
  return [vocab.get(word, 0) for word in sentence.split()] # 0 for OOV

encode_sentence("I like Pizza.", vocab)

[10, 39, 7762]

## II - Dataset

In [11]:
from torch.nn.functional import one_hot
# Dataset Class with One-hot Encoding
class IMDBDataset(Dataset):
    def __init__(self, split, vocab):
        self.data = list(IMDB(split=split))
        self.vocab = vocab

        # Cache data encoding and target
        self.itens = []
        for idx in range(len(self.data)):
          target, line = self.data[idx]

          target = 1 if target == 1 else 0

          # one-hot encoding
          X = torch.zeros(len(self.vocab) + 1)
          for word in encode_sentence(line, self.vocab):
              X[word] = 1

          self.itens.append((X, torch.tensor(target)))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.itens[idx]

# Load Data with One-hot Encoding
train_data = IMDBDataset('train', vocab)
test_data = IMDBDataset('test', vocab)


## III - Data Loader

In [12]:
batch_size = 128
# define dataloaders
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_loader  = DataLoader(test_data,  batch_size=batch_size, shuffle=False)


## IV - Modelo

In [13]:
class OneHotMLP(nn.Module):
    def __init__(self, vocab_size):
        super(OneHotMLP, self).__init__()

        self.fc1 = nn.Linear(vocab_size+1, 200)
        self.fc2 = nn.Linear(200, 1)

        self.relu = nn.ReLU()

    def forward(self, x):
        o = self.fc1(x.float())
        o = self.relu(o)
        return self.fc2(o)

# Model instantiation
model = OneHotMLP(vocab_size)

## V - Laço de Treinamento - Otimização da função de Perda pelo Gradiente descendente

In [14]:
# Verifica se há uma GPU disponível e define o dispositivo para GPU se possível, caso contrário, usa a CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    print('GPU:', torch.cuda.get_device_name(torch.cuda.current_device()))
else:
    print('using CPU')

GPU: Tesla T4


In [15]:
def evaluate(model):
  ## evaluation
  model.eval()

  with torch.no_grad():
      correct = 0
      total = 0
      total_loss = 0

      for inputs, targets in test_loader:
          inputs = inputs.to(device)
          targets = targets.to(device)
          logits = model(inputs)
          predicted = torch.round(torch.sigmoid(logits.squeeze()))
          total += targets.size(0)
          correct += (predicted == targets).sum().item()

          total_loss += criterion(logits.squeeze(), targets.float())*targets.size(0)

      total_loss /= total
      PPL = torch.exp(total_loss)

      print(f'TEST Accuracy {100 * correct / total}%, \
            Total loss: {total_loss.item():.4f}, \
            Perplexity: {PPL.item():.4f}')

In [16]:
lr_opt = 0.05

In [18]:
import time

model = model.to(device)
# Define loss and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.SGD(model.parameters(), lr=lr_opt)

# Training loop
num_epochs = 5
for epoch in range(num_epochs):
    start_time = time.time()  # Start time of the epoch
    model.train()

    total = 0
    total_loss = 0
    for inputs, targets in train_loader:
        inputs = inputs.to(device)
        targets = targets.to(device)
        # Forward pass
        logits = model(inputs)
        loss = criterion(logits.squeeze(), targets.float())

        total += targets.size(0)
        total_loss += loss*targets.size(0)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    total_loss /= total
    PPL = torch.exp(total_loss)


    end_time = time.time()  # End time of the epoch
    epoch_duration = end_time - start_time  # Duration of epoch

    print(f'Epoch [{epoch+1}/{num_epochs}], \
            Loss: {loss.item():.4f}, \
            Elapsed Time: {epoch_duration:.2f} sec, \
            Perplexity: {PPL.item():.4f}')

    evaluate(model)

Epoch [1/5],             Loss: 0.3232,             Elapsed Time: 1.28 sec,             Perplexity: 1.4948
TEST Accuracy 85.288%,             Total loss: 0.3622,             Perplexity: 1.4365
Epoch [2/5],             Loss: 0.2621,             Elapsed Time: 1.94 sec,             Perplexity: 1.3774
TEST Accuracy 86.96%,             Total loss: 0.3206,             Perplexity: 1.3780
Epoch [3/5],             Loss: 0.3153,             Elapsed Time: 1.08 sec,             Perplexity: 1.3261
TEST Accuracy 86.028%,             Total loss: 0.3285,             Perplexity: 1.3888
Epoch [4/5],             Loss: 0.1741,             Elapsed Time: 1.21 sec,             Perplexity: 1.2925
TEST Accuracy 88.004%,             Total loss: 0.2924,             Perplexity: 1.3396
Epoch [5/5],             Loss: 0.1996,             Elapsed Time: 1.93 sec,             Perplexity: 1.2684
TEST Accuracy 88.256%,             Total loss: 0.2894,             Perplexity: 1.3356
