# Laboratorio 4

Autores:

* Kuntur Muenala
* Diego Villacreses

In [2]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import spacy

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
#from skopt import BayesSearchCV
#from skopt.space import Real, Integer
  # Intento fallido de optimización bayesiana sobre hyperparámetros de Naive Bayes

import time
import seaborn as sns

# Directory that holds the dataset files (e.g. train.tsv). The original
# author's path is kept as the default; set NLP_DATA_DIR to run the notebook
# on another machine without editing this cell.
DATA_DIR = os.environ.get("NLP_DATA_DIR", "/home/kmuenala/nlp/data")
os.chdir(DATA_DIR)
print(f"{os.getcwd()=}")

os.getcwd()='/home/kmuenala/nlp/data'


## Load Data

Dataset:
* https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews/data

In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator, GloVe
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

from google.colab import userdata
from transformers import AutoTokenizer

import re
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Text normalizers; in the visible cells they are only referenced by the
# preprocessing options that are currently commented out.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Fetch the NLTK resources, in the same order as before.
for resource in ('stopwords', 'wordnet', 'omw-1.4', 'punkt'):
    nltk.download(resource)

# Run on the GPU when PyTorch can see one, otherwise on the CPU.
# To force CPU execution, replace the expression below with torch.device('cpu').
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Tokenizer
# tokenizer = get_tokenizer("spacy", language="en_core_web_sm")
# hf_token= userdata.get('HF_TOKEN')

# reader_model_name = "meta-llama/Meta-Llama-3.1-8B"
# tokenizer = AutoTokenizer.from_pretrained(reader_model_name)

# Function to yield tokens from phrases for vocabulary building
def yield_tokens(phrases):
    """Lazily yield the `word_tokenize` output of every phrase, one list per phrase.

    Intended as the token iterator consumed when building the vocabulary.
    """
    yield from map(word_tokenize, phrases)

# Custom dataset class
class MovieReviewsDataset(Dataset):
    """Dataset that converts raw phrases into fixed-length token-id tensors.

    Args:
        phrases: Indexable collection of raw text phrases.
        sentiments: Indexable collection of integer labels, aligned with `phrases`.
        vocab: Token-to-index mapping supporting `in` and `[]`; it must contain
            the "<unk>" and "<pad>" entries whenever they are needed.
        max_len: Length every encoded phrase is truncated or padded to.
    """

    def __init__(self, phrases, sentiments, vocab, max_len=100):
        self.phrases, self.sentiments = phrases, sentiments
        self.vocab, self.max_len = vocab, max_len

    def __len__(self):
        """Number of phrases in the dataset."""
        return len(self.phrases)

    def __getitem__(self, idx):
        """Return `(token_ids, label)` for the phrase at position `idx`.

        `token_ids` is a tensor of exactly `max_len` vocabulary indices and
        `label` is a scalar `torch.long` tensor.
        """
        text, label = self.phrases[idx], self.sentiments[idx]

        # Tokens are used as produced by word_tokenize; stop-word removal,
        # stemming and lemmatization were tried at this point and are disabled.
        encoded = []
        for tok in word_tokenize(text):
            key = tok if tok in self.vocab else "<unk>"
            encoded.append(self.vocab[key])

        # Truncate long phrases, right-pad short ones.
        overflow = len(encoded) - self.max_len
        if overflow > 0:
            encoded = encoded[:self.max_len]
        else:
            encoded.extend([self.vocab["<pad>"]] * (-overflow))

        return torch.tensor(encoded), torch.tensor(label, dtype=torch.long)

# Preprocessing the data
def preprocess_data(file_path):
    """Read a tab-separated reviews file and return `(phrases, sentiments)`.

    Only the 'Phrase' and 'Sentiment' columns are kept. Sentiment codes are
    collapsed as 0,1 -> 1; 2 -> 2; 3,4 -> 3. No text cleaning (character
    filtering, lower-casing) is applied. Both results are NumPy arrays.
    """
    collapse = {0: 1, 1: 1, 2: 2, 3: 3, 4: 3}

    frame = pd.read_csv(file_path, delimiter='\t').loc[:, ['Phrase', 'Sentiment']]
    frame = frame.replace({'Sentiment': collapse})

    return frame['Phrase'].values, frame['Sentiment'].values

# Read the training TSV into parallel arrays of phrases and recoded labels.
phrases, sentiments = preprocess_data('train.tsv')

# Stratified 80/20 train/validation split with a fixed seed.
(train_phrases, valid_phrases,
 train_sentiments, valid_sentiments) = train_test_split(
    phrases,
    sentiments,
    test_size=0.2,
    random_state=13,
    stratify=sentiments,
)

# The vocabulary is fitted on the training phrases only; lookups of tokens it
# does not contain resolve to the <unk> index.
vocab = build_vocab_from_iterator(yield_tokens(train_phrases), specials=["<unk>", "<pad>"])
vocab.set_default_index(vocab["<unk>"])

# Pretrained 100-dimensional GloVe 6B vectors.
glove_vectors = GloVe(name="6B", dim=100)

# Both splits share the same vocabulary.
train_dataset = MovieReviewsDataset(train_phrases, train_sentiments, vocab)
valid_dataset = MovieReviewsDataset(valid_phrases, valid_sentiments, vocab)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [16]:
# Display the vocabulary object built from the training phrases.
vocab

Vocab()

In [10]:
# Relative frequency of each (recoded) sentiment label in the training split.
pd.Series(train_dataset.sentiments).value_counts(normalize=True)

2    0.509948
3    0.269976
1    0.220076
Name: proportion, dtype: float64

In [11]:
# Relative frequency of each (recoded) sentiment label in the validation split.
pd.Series(valid_dataset.sentiments).value_counts(normalize=True)

2    0.509932
3    0.269992
1    0.220076
Name: proportion, dtype: float64

## GRU

In [12]:
class SentimentGRU(nn.Module):
    """Sentence classifier: embedding -> (bi)directional GRU -> dropout -> linear.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Hidden size of the GRU (per direction).
        output_dim: Number of output logits.
        n_layers: Number of stacked GRU layers.
        bidirectional: Whether the GRU reads the sequence in both directions.
        dropout: Dropout probability applied to the embeddings, between
            stacked GRU layers, and to the final hidden state.
        pad_idx: Index of the padding token passed to `nn.Embedding` as
            `padding_idx`.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        # nn.GRU only applies dropout between stacked layers; with a single
        # layer a non-zero value has no effect and makes PyTorch emit a
        # warning, so pass 0 in that case.
        self.gru = nn.GRU(embedding_dim, hidden_dim, num_layers=n_layers,
                          bidirectional=bidirectional,
                          dropout=dropout if n_layers > 1 else 0.0,
                          batch_first=True)
        # A bidirectional GRU yields one final state per direction.
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Return logits of shape (batch, output_dim) for a (batch, seq_len) batch of token ids.

        NOTE(review): sequences are not packed, so padded positions are also
        fed through the GRU before the final hidden state is read.
        """
        embedded = self.dropout(self.embedding(text))
        _, hidden = self.gru(embedded)
        if self.gru.bidirectional:
            # The last two entries are the top layer's forward and backward states.
            hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            hidden = hidden[-1, :, :]

        hidden = self.dropout(hidden)
        return self.fc(hidden)

In [13]:
# Accuracy calculation function
def multi_class_accuracy(preds, y):
    """Fraction of rows in the batch whose highest-scoring class equals the label.

    `preds` holds one row of class scores per example and `y` the true class
    indices. The result is a one-element float tensor on `y`'s device.
    """
    predicted_class = torch.argmax(preds, dim=1)
    n_correct = (predicted_class == y).sum().float()
    batch_size = torch.FloatTensor([y.shape[0]]).to(y.device)
    return n_correct / batch_size

# Training function
def train(model, loader, optimizer, criterion):
    """Run one optimisation pass over `loader`.

    Each batch is moved to the notebook-level `device`, and the model is
    updated once per batch. Returns `(mean_loss, mean_accuracy)`, where both
    values are plain averages of the per-batch figures.
    """
    model.train()
    batch_losses, batch_accs = [], []

    for batch_text, batch_labels in loader:
        batch_text = batch_text.to(device)
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()
        logits = model(batch_text)
        loss = criterion(logits, batch_labels)
        batch_acc = multi_class_accuracy(logits, batch_labels)
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())
        batch_accs.append(batch_acc.item())

    n_batches = len(loader)
    return sum(batch_losses) / n_batches, sum(batch_accs) / n_batches

# Evaluation function
def evaluate(model, loader, criterion):
    """Score `model` on `loader` without updating it.

    Runs in eval mode with gradients disabled. Returns
    `(mean_loss, mean_accuracy, all_preds)`: the two means are plain averages
    of the per-batch figures, and `all_preds` is a list with one
    `(batch, 1)` tensor of predicted class indices per batch.
    """
    model.eval()
    batch_losses, batch_accs, all_preds = [], [], []

    with torch.no_grad():
        for batch_text, batch_labels in loader:
            batch_text = batch_text.to(device)
            batch_labels = batch_labels.to(device)

            logits = model(batch_text)
            batch_losses.append(criterion(logits, batch_labels).item())
            batch_accs.append(multi_class_accuracy(logits, batch_labels).item())
            all_preds.append(logits.argmax(dim=1, keepdim=True))

    n_batches = len(loader)
    return sum(batch_losses) / n_batches, sum(batch_accs) / n_batches, all_preds

In [17]:
# Baseline run: raw tokens (no stop-word removal, no stemming, no lemmatization).

# Model hyperparameters
INPUT_DIM = len(vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
# NOTE(review): preprocess_data recodes the labels to {1, 2, 3}, so logits 0
# and 4 are never the target — confirm that 5 outputs are intended.
OUTPUT_DIM = 5
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = vocab["<pad>"]
BATCH_SIZE = 64
N_EPOCHS = 10

# Model
model = SentimentGRU(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT, PAD_IDX).to(device)

glove_vectors = GloVe(name="6B", dim=EMBEDDING_DIM)

# Dataloaders
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE)

# Initialise the embedding table from GloVe. The 6B vectors are uncased while
# the vocabulary keeps the original casing, so fall back to the lower-cased
# token before leaving a row at zero.
embedding_matrix = torch.zeros(INPUT_DIM, EMBEDDING_DIM)
for i, token in enumerate(vocab.get_itos()):
    if token in glove_vectors.stoi:
        embedding_matrix[i] = glove_vectors[token]
    elif token.lower() in glove_vectors.stoi:
        embedding_matrix[i] = glove_vectors[token.lower()]
model.embedding.weight.data.copy_(embedding_matrix)

# Define optimizer and loss function
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss().to(device)

# Training loop: one row of metrics per epoch is collected in nn_res.
nn_res = []
for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_loader, optimizer, criterion)
    valid_loss, valid_acc, all_preds = evaluate(model, valid_loader, criterion)

    # Flatten the per-batch (batch, 1) prediction tensors into one 1-D array;
    # valid_loader is not shuffled, so the order matches valid_dataset.
    y_hat_val = torch.cat(all_preds).squeeze(1).cpu().numpy()

    f1_w = f1_score(y_true=valid_dataset.sentiments,
                    y_pred=y_hat_val,
                    average='weighted')

    print(f'Epoch {epoch+1}')
    print(f'Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'Valid Loss: {valid_loss:.3f} | Valid Acc: {valid_acc*100:.2f}% | Valid F1-Weighted: {f1_w*100:.2f}%')

    nn_res.append([epoch, train_loss, train_acc, valid_loss, valid_acc, f1_w])

Epoch 1
Train Loss: 0.764 | Train Acc: 66.69%
Valid Loss: 0.644 | Valid Acc: 72.53% | Valid F1-Weighted: 71.76%
Epoch 2
Train Loss: 0.650 | Train Acc: 72.57%
Valid Loss: 0.599 | Valid Acc: 75.45% | Valid F1-Weighted: 75.23%
Epoch 3
Train Loss: 0.618 | Train Acc: 74.05%
Valid Loss: 0.586 | Valid Acc: 75.86% | Valid F1-Weighted: 75.85%
Epoch 4
Train Loss: 0.574 | Train Acc: 76.13%
Valid Loss: 0.575 | Valid Acc: 76.40% | Valid F1-Weighted: 76.37%
Epoch 5
Train Loss: 0.546 | Train Acc: 77.45%
Valid Loss: 0.582 | Valid Acc: 76.55% | Valid F1-Weighted: 76.52%
Epoch 6
Train Loss: 0.525 | Train Acc: 78.24%
Valid Loss: 0.576 | Valid Acc: 76.56% | Valid F1-Weighted: 76.55%
Epoch 7
Train Loss: 0.505 | Train Acc: 79.15%
Valid Loss: 0.582 | Valid Acc: 76.83% | Valid F1-Weighted: 76.79%
Epoch 8
Train Loss: 0.489 | Train Acc: 79.79%
Valid Loss: 0.583 | Valid Acc: 76.97% | Valid F1-Weighted: 76.92%
Epoch 9
Train Loss: 0.474 | Train Acc: 80.28%
Valid Loss: 0.599 | Valid Acc: 76.70% | Valid F1-Weighted:

In [None]:
# Run labelled by the authors as WITH stop-word removal, stemming and lemmatization.
# NOTE(review): those steps are commented out in the MovieReviewsDataset code
# visible in this notebook — confirm which preprocessing produced the saved output.

# Model hyperparameters
INPUT_DIM = len(vocab)
EMBEDDING_DIM = 200
HIDDEN_DIM = 256
OUTPUT_DIM = 5
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = vocab["<pad>"]
BATCH_SIZE = 32
N_EPOCHS = 10

# Model
model = SentimentGRU(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT, PAD_IDX).to(device)

glove_vectors = GloVe(name="6B", dim=EMBEDDING_DIM)

# Dataloaders
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE)

# Initialise the embedding table from GloVe. The 6B vectors are uncased while
# the vocabulary keeps the original casing, so fall back to the lower-cased
# token before leaving a row at zero.
embedding_matrix = torch.zeros(INPUT_DIM, EMBEDDING_DIM)
for i, token in enumerate(vocab.get_itos()):
    if token in glove_vectors.stoi:
        embedding_matrix[i] = glove_vectors[token]
    elif token.lower() in glove_vectors.stoi:
        embedding_matrix[i] = glove_vectors[token.lower()]
model.embedding.weight.data.copy_(embedding_matrix)

# Define optimizer and loss function
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss().to(device)


# Training loop: one row of metrics per epoch is collected in nn_res.
nn_res = []
for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_loader, optimizer, criterion)
    valid_loss, valid_acc, all_preds = evaluate(model, valid_loader, criterion)

    # Flatten the per-batch (batch, 1) prediction tensors into one 1-D array;
    # valid_loader is not shuffled, so the order matches valid_dataset.
    y_hat_val = torch.cat(all_preds).squeeze(1).cpu().numpy()

    f1_w = f1_score(y_true=valid_dataset.sentiments,
                    y_pred=y_hat_val,
                    average='weighted')

    print(f'Epoch {epoch+1}')
    print(f'Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'Valid Loss: {valid_loss:.3f} | Valid Acc: {valid_acc*100:.2f}% | Valid F1-Weighted: {f1_w*100:.2f}%')

    nn_res.append([epoch, train_loss, train_acc, valid_loss, valid_acc, f1_w])

Epoch 1
Train Loss: 1.092 | Train Acc: 56.13%
Valid Loss: 1.009 | Valid Acc: 59.48% | Valid F1-Weighted: 55.64%
Epoch 2
Train Loss: 1.021 | Train Acc: 58.88%
Valid Loss: 0.981 | Valid Acc: 60.71% | Valid F1-Weighted: 57.73%
Epoch 3
Train Loss: 0.986 | Train Acc: 60.31%
Valid Loss: 0.968 | Valid Acc: 61.27% | Valid F1-Weighted: 57.87%
Epoch 4
Train Loss: 0.962 | Train Acc: 61.46%
Valid Loss: 0.957 | Valid Acc: 61.76% | Valid F1-Weighted: 59.10%
Epoch 5
Train Loss: 0.982 | Train Acc: 60.83%
Valid Loss: 0.975 | Valid Acc: 61.00% | Valid F1-Weighted: 59.50%
Epoch 6
Train Loss: 0.969 | Train Acc: 61.36%
Valid Loss: 0.969 | Valid Acc: 61.24% | Valid F1-Weighted: 59.23%
Epoch 7
Train Loss: 0.957 | Train Acc: 61.99%
Valid Loss: 0.964 | Valid Acc: 61.60% | Valid F1-Weighted: 59.72%
Epoch 8
Train Loss: 0.950 | Train Acc: 62.20%
Valid Loss: 0.962 | Valid Acc: 61.92% | Valid F1-Weighted: 59.98%
Epoch 9
Train Loss: 0.941 | Train Acc: 62.54%
Valid Loss: 0.961 | Valid Acc: 61.71% | Valid F1-Weighted:

In [None]:
# Variant: 200-d GloVe, hidden size 300, dropout 0.6, batch size 64.

# Model hyperparameters
INPUT_DIM = len(vocab)
EMBEDDING_DIM = 200
HIDDEN_DIM = 300
OUTPUT_DIM = 5
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.6
PAD_IDX = vocab["<pad>"]
BATCH_SIZE = 64
N_EPOCHS = 10

# Model
model = SentimentGRU(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT, PAD_IDX).to(device)

glove_vectors = GloVe(name="6B", dim=EMBEDDING_DIM)

# Dataloaders
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE)

# Initialise the embedding table from GloVe. The 6B vectors are uncased while
# the vocabulary keeps the original casing, so fall back to the lower-cased
# token before leaving a row at zero.
embedding_matrix = torch.zeros(INPUT_DIM, EMBEDDING_DIM)
for i, token in enumerate(vocab.get_itos()):
    if token in glove_vectors.stoi:
        embedding_matrix[i] = glove_vectors[token]
    elif token.lower() in glove_vectors.stoi:
        embedding_matrix[i] = glove_vectors[token.lower()]
model.embedding.weight.data.copy_(embedding_matrix)

# Define optimizer and loss function
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss().to(device)

# Training loop: one row of metrics per epoch is collected in nn_res.
nn_res = []
for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_loader, optimizer, criterion)
    valid_loss, valid_acc, all_preds = evaluate(model, valid_loader, criterion)

    # Flatten the per-batch (batch, 1) prediction tensors into one 1-D array;
    # valid_loader is not shuffled, so the order matches valid_dataset.
    y_hat_val = torch.cat(all_preds).squeeze(1).cpu().numpy()

    f1_w = f1_score(y_true=valid_dataset.sentiments,
                    y_pred=y_hat_val,
                    average='weighted')

    print(f'Epoch {epoch+1}')
    print(f'Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'Valid Loss: {valid_loss:.3f} | Valid Acc: {valid_acc*100:.2f}% | Valid F1-Weighted: {f1_w*100:.2f}%')

    nn_res.append([epoch, train_loss, train_acc, valid_loss, valid_acc, f1_w])

Epoch 1
Train Loss: 1.115 | Train Acc: 55.42%
Valid Loss: 1.028 | Valid Acc: 58.97% | Valid F1-Weighted: 56.62%
Epoch 2
Train Loss: 1.047 | Train Acc: 57.69%
Valid Loss: 0.996 | Valid Acc: 60.06% | Valid F1-Weighted: 56.79%
Epoch 3
Train Loss: 1.017 | Train Acc: 59.23%
Valid Loss: 0.984 | Valid Acc: 60.80% | Valid F1-Weighted: 59.05%
Epoch 4
Train Loss: 0.993 | Train Acc: 60.10%
Valid Loss: 0.992 | Valid Acc: 60.80% | Valid F1-Weighted: 57.26%
Epoch 5
Train Loss: 0.978 | Train Acc: 60.83%
Valid Loss: 0.966 | Valid Acc: 61.54% | Valid F1-Weighted: 58.37%
Epoch 6
Train Loss: 0.962 | Train Acc: 61.43%
Valid Loss: 0.964 | Valid Acc: 61.46% | Valid F1-Weighted: 59.09%
Epoch 7
Train Loss: 0.953 | Train Acc: 61.75%
Valid Loss: 0.974 | Valid Acc: 61.45% | Valid F1-Weighted: 58.17%
Epoch 8
Train Loss: 0.938 | Train Acc: 62.42%
Valid Loss: 0.958 | Valid Acc: 61.84% | Valid F1-Weighted: 59.17%
Epoch 9
Train Loss: 0.931 | Train Acc: 62.67%
Valid Loss: 0.953 | Valid Acc: 61.73% | Valid F1-Weighted:

In [None]:
# Variant: unidirectional GRU, 200-d GloVe, hidden size 300, batch size 128.

# Model hyperparameters
INPUT_DIM = len(vocab)
EMBEDDING_DIM = 200
HIDDEN_DIM = 300
OUTPUT_DIM = 5
N_LAYERS = 2
BIDIRECTIONAL = False
DROPOUT = 0.6
PAD_IDX = vocab["<pad>"]
BATCH_SIZE = 128
N_EPOCHS = 10

# Model
model = SentimentGRU(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT, PAD_IDX).to(device)

glove_vectors = GloVe(name="6B", dim=EMBEDDING_DIM)

# Dataloaders
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE)

# Initialise the embedding table from GloVe. The 6B vectors are uncased while
# the vocabulary keeps the original casing, so fall back to the lower-cased
# token before leaving a row at zero.
embedding_matrix = torch.zeros(INPUT_DIM, EMBEDDING_DIM)
for i, token in enumerate(vocab.get_itos()):
    if token in glove_vectors.stoi:
        embedding_matrix[i] = glove_vectors[token]
    elif token.lower() in glove_vectors.stoi:
        embedding_matrix[i] = glove_vectors[token.lower()]
model.embedding.weight.data.copy_(embedding_matrix)

# Define optimizer and loss function
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss().to(device)


# Training loop: one row of metrics per epoch is collected in nn_res.

nn_res = []
for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_loader, optimizer, criterion)
    valid_loss, valid_acc, all_preds = evaluate(model, valid_loader, criterion)

    # Flatten the per-batch (batch, 1) prediction tensors into one 1-D array;
    # valid_loader is not shuffled, so the order matches valid_dataset.
    y_hat_val = torch.cat(all_preds).squeeze(1).cpu().numpy()

    f1_w = f1_score(y_true=valid_dataset.sentiments,
                    y_pred=y_hat_val,
                    average='weighted')

    print(f'Epoch {epoch+1}')
    print(f'Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'Valid Loss: {valid_loss:.3f} | Valid Acc: {valid_acc*100:.2f}% | Valid F1-Weighted: {f1_w*100:.2f}%')

    nn_res.append([epoch, train_loss, train_acc, valid_loss, valid_acc, f1_w])

Epoch 1
Train Loss: 1.267 | Train Acc: 51.11%
Valid Loss: 1.168 | Valid Acc: 53.22% | Valid F1-Weighted: 44.52%
Epoch 2
Train Loss: 1.143 | Train Acc: 54.63%
Valid Loss: 1.045 | Valid Acc: 58.43% | Valid F1-Weighted: 53.02%
Epoch 3
Train Loss: 1.053 | Train Acc: 57.98%
Valid Loss: 1.008 | Valid Acc: 59.64% | Valid F1-Weighted: 55.61%
Epoch 4
Train Loss: 1.019 | Train Acc: 59.09%
Valid Loss: 0.992 | Valid Acc: 60.06% | Valid F1-Weighted: 55.84%
Epoch 5
Train Loss: 0.996 | Train Acc: 59.99%
Valid Loss: 0.985 | Valid Acc: 60.55% | Valid F1-Weighted: 58.17%
Epoch 6
Train Loss: 0.979 | Train Acc: 60.55%
Valid Loss: 0.972 | Valid Acc: 61.10% | Valid F1-Weighted: 58.66%
Epoch 7
Train Loss: 0.964 | Train Acc: 61.23%
Valid Loss: 0.969 | Valid Acc: 61.31% | Valid F1-Weighted: 59.52%
Epoch 8
Train Loss: 0.952 | Train Acc: 61.78%
Valid Loss: 0.966 | Valid Acc: 61.48% | Valid F1-Weighted: 58.71%
Epoch 9
Train Loss: 0.940 | Train Acc: 62.46%
Valid Loss: 0.959 | Valid Acc: 61.03% | Valid F1-Weighted:

In [None]:
# Variant: dropout 0.7, batch size 16, Adam with explicit learning rate and weight decay.

# Model hyperparameters
INPUT_DIM = len(vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 5
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.7
PAD_IDX = vocab["<pad>"]
BATCH_SIZE = 16
LR = 0.001
WD = 1e-5
N_EPOCHS = 20
# Model
model = SentimentGRU(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT, PAD_IDX).to(device)

glove_vectors = GloVe(name="6B", dim=EMBEDDING_DIM)

# Dataloaders
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE)

# Initialise the embedding table from GloVe. The 6B vectors are uncased while
# the vocabulary keeps the original casing, so fall back to the lower-cased
# token before leaving a row at zero.
embedding_matrix = torch.zeros(INPUT_DIM, EMBEDDING_DIM)
for i, token in enumerate(vocab.get_itos()):
    if token in glove_vectors.stoi:
        embedding_matrix[i] = glove_vectors[token]
    elif token.lower() in glove_vectors.stoi:
        embedding_matrix[i] = glove_vectors[token.lower()]
model.embedding.weight.data.copy_(embedding_matrix)

# Define optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=WD)
criterion = nn.CrossEntropyLoss().to(device)

# Training loop: one row of metrics per epoch is collected in nn_res.
nn_res = []
for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_loader, optimizer, criterion)
    valid_loss, valid_acc, all_preds = evaluate(model, valid_loader, criterion)

    # Flatten the per-batch (batch, 1) prediction tensors into one 1-D array;
    # valid_loader is not shuffled, so the order matches valid_dataset.
    y_hat_val = torch.cat(all_preds).squeeze(1).cpu().numpy()

    f1_w = f1_score(y_true=valid_dataset.sentiments,
                    y_pred=y_hat_val,
                    average='weighted')

    print(f'Epoch {epoch+1}')
    print(f'Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'Valid Loss: {valid_loss:.3f} | Valid Acc: {valid_acc*100:.2f}% | Valid F1-Weighted: {f1_w*100:.2f}%')

    nn_res.append([epoch, train_loss, train_acc, valid_loss, valid_acc, f1_w])

Epoch 1
Train Loss: 1.154 | Train Acc: 53.91%
Valid Loss: 1.059 | Valid Acc: 57.58% | Valid F1-Weighted: 51.43%
Epoch 2
Train Loss: 1.082 | Train Acc: 56.78%
Valid Loss: 1.018 | Valid Acc: 59.49% | Valid F1-Weighted: 55.07%
Epoch 3
Train Loss: 1.057 | Train Acc: 57.81%
Valid Loss: 1.012 | Valid Acc: 59.51% | Valid F1-Weighted: 55.95%
Epoch 4
Train Loss: 1.042 | Train Acc: 58.36%
Valid Loss: 1.000 | Valid Acc: 59.99% | Valid F1-Weighted: 55.39%
Epoch 5
Train Loss: 1.033 | Train Acc: 58.70%
Valid Loss: 1.010 | Valid Acc: 58.88% | Valid F1-Weighted: 53.17%
Epoch 6
Train Loss: 1.023 | Train Acc: 59.18%
Valid Loss: 0.991 | Valid Acc: 60.10% | Valid F1-Weighted: 55.85%
Epoch 7
Train Loss: 1.017 | Train Acc: 59.36%
Valid Loss: 0.992 | Valid Acc: 60.14% | Valid F1-Weighted: 55.32%
Epoch 8
Train Loss: 1.011 | Train Acc: 59.55%
Valid Loss: 0.984 | Valid Acc: 60.42% | Valid F1-Weighted: 56.12%
Epoch 9
Train Loss: 1.007 | Train Acc: 59.87%
Valid Loss: 0.987 | Valid Acc: 60.12% | Valid F1-Weighted:

In [None]:
# Variant: embeddings trained from their random initialisation (the GloVe
# loading and copy steps are intentionally disabled in this run), Adam lr=1e-4.

# Model hyperparameters
INPUT_DIM = len(vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 5
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.7
PAD_IDX = vocab["<pad>"]
BATCH_SIZE = 32
N_EPOCHS = 20

# Model
model = SentimentGRU(
    INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM,
    N_LAYERS, BIDIRECTIONAL, DROPOUT, PAD_IDX,
).to(device)

# Dataloaders
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE)

# Optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss().to(device)

# Training loop: one row of metrics per epoch is collected in nn_res.
nn_res = []
for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_loader, optimizer, criterion)
    valid_loss, valid_acc, all_preds = evaluate(model, valid_loader, criterion)

    # Flatten the per-batch (batch, 1) predictions into one flat list.
    y_hat_val = [row[0] for batch in all_preds for row in batch.cpu().numpy()]

    f1_w = f1_score(
        y_true=valid_dataset.sentiments,
        y_pred=y_hat_val,
        average='weighted',
    )

    print(f'Epoch {epoch+1}')
    print(f'Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'Valid Loss: {valid_loss:.3f} | Valid Acc: {valid_acc*100:.2f}% | Valid F1-Weighted: {f1_w*100:.2f}%')

    nn_res.append([epoch, train_loss, train_acc, valid_loss, valid_acc, f1_w])

Epoch 1
Train Loss: 1.241 | Train Acc: 51.02%
Valid Loss: 1.215 | Valid Acc: 51.83% | Valid F1-Weighted: 38.25%
Epoch 2
Train Loss: 1.215 | Train Acc: 51.54%
Valid Loss: 1.225 | Valid Acc: 51.70% | Valid F1-Weighted: 37.67%
Epoch 3
Train Loss: 1.205 | Train Acc: 52.04%
Valid Loss: 1.207 | Valid Acc: 52.70% | Valid F1-Weighted: 40.80%
Epoch 4
Train Loss: 1.193 | Train Acc: 52.51%
Valid Loss: 1.164 | Valid Acc: 53.82% | Valid F1-Weighted: 43.99%
Epoch 5
Train Loss: 1.182 | Train Acc: 52.96%
Valid Loss: 1.161 | Valid Acc: 54.02% | Valid F1-Weighted: 44.10%
Epoch 6
Train Loss: 1.173 | Train Acc: 53.29%
Valid Loss: 1.145 | Valid Acc: 54.33% | Valid F1-Weighted: 45.03%
Epoch 7
Train Loss: 1.166 | Train Acc: 53.63%
Valid Loss: 1.136 | Valid Acc: 54.88% | Valid F1-Weighted: 46.83%
Epoch 8
Train Loss: 1.159 | Train Acc: 53.72%
Valid Loss: 1.133 | Valid Acc: 54.68% | Valid F1-Weighted: 45.66%
Epoch 9
Train Loss: 1.153 | Train Acc: 53.99%
Valid Loss: 1.120 | Valid Acc: 55.30% | Valid F1-Weighted:

In [None]:
# Variant: 3-layer bidirectional GRU, hidden size 512, batch size 128, 100 epochs.

# Model hyperparameters
INPUT_DIM = len(vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 512
OUTPUT_DIM = 5
N_LAYERS = 3
BIDIRECTIONAL = True
DROPOUT = 0.6
PAD_IDX = vocab["<pad>"]
BATCH_SIZE = 128
N_EPOCHS = 100

# Model
model = SentimentGRU(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT, PAD_IDX).to(device)

# Dataloaders
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE)

glove_vectors = GloVe(name="6B", dim=EMBEDDING_DIM)

# Initialise the embedding table from GloVe. The 6B vectors are uncased while
# the vocabulary keeps the original casing, so fall back to the lower-cased
# token before leaving a row at zero.
embedding_matrix = torch.zeros(INPUT_DIM, EMBEDDING_DIM)
for i, token in enumerate(vocab.get_itos()):
    if token in glove_vectors.stoi:
        embedding_matrix[i] = glove_vectors[token]
    elif token.lower() in glove_vectors.stoi:
        embedding_matrix[i] = glove_vectors[token.lower()]
model.embedding.weight.data.copy_(embedding_matrix)

# Define optimizer and loss function
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss().to(device)

# Training loop: one row of metrics per epoch is collected in nn_res.
nn_res = []
for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_loader, optimizer, criterion)
    valid_loss, valid_acc, all_preds = evaluate(model, valid_loader, criterion)

    # Flatten the per-batch (batch, 1) prediction tensors into one 1-D array;
    # valid_loader is not shuffled, so the order matches valid_dataset.
    y_hat_val = torch.cat(all_preds).squeeze(1).cpu().numpy()

    f1_w = f1_score(y_true=valid_dataset.sentiments,
                    y_pred=y_hat_val,
                    average='weighted')

    print(f'Epoch {epoch+1}')
    print(f'Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'Valid Loss: {valid_loss:.3f} | Valid Acc: {valid_acc*100:.2f}% | Valid F1-Weighted: {f1_w*100:.2f}%')

    nn_res.append([epoch, train_loss, train_acc, valid_loss, valid_acc, f1_w])


Epoch 1
Train Loss: 1.153 | Train Acc: 53.73%
Valid Loss: 1.064 | Valid Acc: 56.97% | Valid F1-Weighted: 50.48%
Epoch 2
Train Loss: 1.091 | Train Acc: 56.01%
Valid Loss: 1.034 | Valid Acc: 58.11% | Valid F1-Weighted: 52.35%
Epoch 3
Train Loss: 1.061 | Train Acc: 57.27%
Valid Loss: 1.008 | Valid Acc: 59.63% | Valid F1-Weighted: 56.88%
Epoch 4
Train Loss: 1.079 | Train Acc: 56.98%
Valid Loss: 1.069 | Valid Acc: 57.83% | Valid F1-Weighted: 51.13%
Epoch 5
Train Loss: 1.056 | Train Acc: 57.70%
Valid Loss: 0.993 | Valid Acc: 60.02% | Valid F1-Weighted: 57.30%
Epoch 6
Train Loss: 1.032 | Train Acc: 58.36%
Valid Loss: 0.993 | Valid Acc: 60.33% | Valid F1-Weighted: 58.48%
Epoch 7
Train Loss: 1.022 | Train Acc: 58.75%
Valid Loss: 0.994 | Valid Acc: 60.36% | Valid F1-Weighted: 58.92%
Epoch 8
Train Loss: 1.014 | Train Acc: 59.48%
Valid Loss: 0.988 | Valid Acc: 60.48% | Valid F1-Weighted: 57.09%
Epoch 9
Train Loss: 1.003 | Train Acc: 59.56%
Valid Loss: 0.972 | Valid Acc: 61.07% | Valid F1-Weighted:

In [None]:
# NOTE(review): `results` is not assigned in any cell visible in this notebook;
# this cell presumably relies on state from another session — confirm that it
# survives Restart & Run All.
results

Unnamed: 0,total_tokens,vocab_sizes,training_time,accuracy,f1_score
caso_0,676776.0,18087.0,16.007943,0.594236,0.585323
caso_1,677364.0,16465.0,17.848892,0.594236,0.584347
caso_2,428043.0,16334.0,18.545607,0.591563,0.579889
caso_3,428043.0,11973.0,13.133028,0.594824,0.578652
caso_4,428043.0,11953.0,11.305312,0.594717,0.578476
caso_5,367998.0,11622.0,10.973437,0.593488,0.576263
caso_6,367998.0,11622.0,13.390586,0.593006,0.575775


Se aplican acumulativamente las técnicas de preprocesamiento para texto:

- Case folding
- remove stop words
- stemming
- lemmatization
- remove special characters and numbers
- instead of using a 0/1 (binary) vector, use the number of times that a word occurs in the text

Cada preprocesamiento reduce la cantidad de variables predictoras de nuestro modelo, pasando de un BoW completo de 18087 a 16465 gracias a Case Folding (caso 1), y a 11622 en el caso 5 (case folding, eliminación de stop words, stemming, lemmatization y eliminación de números y caracteres especiales). El caso 6 (igual al caso 5, pero usando el conteo de ocurrencias en lugar de variables 1/0) presenta la misma cantidad de variables predictoras que el caso 5. Gracias a esta reducción en la dimensionalidad del problema de clasificación se observa, en general, una tendencia a la reducción del tiempo de entrenamiento, aunque no es estrictamente monótona (por ejemplo, los casos 1, 2 y 6).

Se utilizan las metodologías de preprocesamiento de texto por defecto de la conocida librería `NLTK`; para futuras investigaciones se recomienda un análisis más exhaustivo de diferentes versiones de este preprocesamiento y la calibración de sus hiperparámetros.

Con respecto a las métricas de accuracy y f1-score, se observa una ligera reducción a medida que se implementa cada regla de procesamiento de texto. Estas diferencias podrían no ser estadísticamente significativas al aplicar k-fold cross-validation. En caso de ser diferencias significativas, se debería analizar la posibilidad de cambiar la versión o los hiperparámetros de la respectiva técnica.

Obtener una métrica alta del modelo en el análisis de sentimientos no era el objetivo de este estudio; sin embargo, mediante un modelo simple se puede obtener un accuracy de 0.59, un valor considerablemente alto al compararse con el campeón de Kaggle (0.76). Se considera valiosa la visualización del efecto del preprocesamiento de texto sobre la eficiencia y las métricas del modelo.