In [None]:
# download files for sentiment classification
from requests import get

def download(url, filename):
    """Download ``url`` and write the response body to ``filename``.

    The request is performed *before* the destination file is opened, and
    HTTP error statuses are turned into exceptions, so a failed download
    never leaves behind an empty file or a file containing an error page.

    Args:
        url: Address of the resource to fetch.
        filename: Path of the local file to (over)write in binary mode.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    # The timeout applies to connecting and to each read, not to the whole transfer.
    response = get(url, timeout=30)
    response.raise_for_status()
    with open(filename, "wb") as file:
        file.write(response.content)

download("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", "ratings_train.txt")
download("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", "ratings_test.txt")

# Preview the first 5 lines of the training file.
# Decode as UTF-8 explicitly (the encoding used when the file is parsed later in
# this notebook) instead of relying on the platform's default text encoding.
with open("ratings_train.txt", "r", encoding="utf-8") as file:
    for _ in range(5):
        print(file.readline(), end="")

In [None]:
# Load both splits and build a character-level vocabulary from the training data.
def load_ratings(path):
    """Read a tab-separated ratings file into a list of field lists.

    The first line of the file is skipped (presumably the column header --
    confirm against the file) and empty lines are ignored. Every remaining
    line is split on tab characters.

    Args:
        path: Path of the UTF-8 encoded text file to read.

    Returns:
        A list with one entry per non-empty line; each entry is the list of
        tab-separated fields of that line.
    """
    with open(path, "r", encoding="utf-8") as file:
        lines = file.read().split("\n")[1:]
    return [line.split("\t") for line in lines if line]


train_data = load_ratings("ratings_train.txt")
test_data = load_ratings("ratings_test.txt")

# Indices 0 and 1 are reserved for the padding and unknown-character tokens.
vocab = {"[PAD]": 0, "[UNK]": 1}
for row in train_data:
    # Field 1 of each row is the text whose characters populate the vocabulary.
    for char in row[1]:
        # len(vocab) is evaluated before insertion, so each unseen character
        # receives the next free index; known characters are left untouched.
        vocab.setdefault(char, len(vocab))

In [None]:
# build a pytorch dataset
import torch
from torch.utils.data import Dataset

class SentimentDataset(Dataset):
    """Character-level dataset yielding fixed-length index tensors and labels.

    Each row of ``data`` is indexed positionally: ``row[1]`` is the text and
    ``row[2]`` is the label (converted with ``int``). The text is mapped to
    vocabulary indices character by character, truncated to ``max_len`` and
    right-padded with ``PAD_IDX`` so every sample has the same length.

    Args:
        data: Sequence of rows as described above.
        vocab: Mapping from character to integer index.
        max_len: Length every encoded text is truncated/padded to.
    """

    # Index used to pad short texts up to ``max_len``.
    PAD_IDX = 0
    # Index used for characters that are missing from ``vocab``.
    UNK_IDX = 1

    def __init__(self, data, vocab, max_len=100):
        self.data = data
        self.vocab = vocab
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(indices, label)`` for the row at ``index``.

        Returns:
            A pair of int64 tensors: ``indices`` with shape ``(max_len,)`` and
            a 0-dim ``label``.
        """
        row = self.data[index]
        label = int(row[2])

        # Truncate first so only the characters that are kept get looked up.
        ids = [self.vocab.get(char, self.UNK_IDX) for char in row[1][: self.max_len]]
        # Multiplying a list by a non-positive count yields [], so full-length
        # texts are left unpadded.
        ids += [self.PAD_IDX] * (self.max_len - len(ids))

        # An explicit dtype keeps the result int64 even if the list is empty.
        return torch.tensor(ids, dtype=torch.long), torch.tensor(label, dtype=torch.long)

In [None]:
# build a pytorch sentiment classification model using MLP
import torch.nn as nn
import torch.nn.functional as F

class SentimentClassifier(nn.Module):
    """MLP classifier over flattened character embeddings.

    Every input index is embedded, the embeddings of one sequence are
    concatenated into a single vector, and that vector goes through a hidden
    linear layer with ReLU followed by an output linear layer that produces
    unnormalised class scores.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each character embedding.
        seq_len: Exact length of every input sequence.
        hidden_dim: Width of the hidden layer.
        num_classes: Number of output scores per sequence.
    """

    def __init__(self, vocab_size, embed_dim=32, seq_len=100, hidden_dim=100, num_classes=2):
        super().__init__()
        self.seq_len = seq_len
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.fc1 = nn.Linear(embed_dim * seq_len, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Compute class scores for a batch of index sequences.

        Args:
            x: Integer tensor whose last dimension has size ``seq_len``.

        Returns:
            Tensor of shape ``(N, num_classes)`` where ``N`` is the number of
            sequences in ``x``.

        Raises:
            ValueError: If the last dimension of ``x`` is not ``seq_len``.
        """
        # Without this check, view(-1, ...) below could silently regroup the
        # embeddings of several wrongly-sized samples into one row.
        if x.size(-1) != self.seq_len:
            raise ValueError(
                f"expected sequences of length {self.seq_len}, got {x.size(-1)}"
            )
        x = self.embedding(x)
        x = x.view(-1, self.fc1.in_features)
        x = F.relu(self.fc1(x))
        return self.fc2(x)

In [None]:
# Select the compute device: CUDA when PyTorch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Build the classifier with one embedding row per vocabulary entry and move it to the device.
model = SentimentClassifier(vocab_size=len(vocab))
model = model.to(device)

# Cross-entropy loss on the model outputs, optimised with Adam at a learning rate of 1e-3.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)

# Training data: reshuffled on every pass, batches of 64, loaded by 4 worker processes.
train_dataset = SentimentDataset(data=train_data, vocab=vocab)
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=64,
    shuffle=True,
    num_workers=4,
)

# Validation data: built from test_data and iterated in a fixed order.
val_dataset = SentimentDataset(data=test_data, vocab=vocab)
val_loader = torch.utils.data.DataLoader(
    dataset=val_dataset,
    batch_size=64,
    shuffle=False,
    num_workers=4,
)

In [None]:
import wandb

# Start a Weights & Biases run that receives the training and validation metrics.
logger = wandb.init(project="NLP", name="Lec01_sentiment_classification_wo_pl")

num_epochs = 3

# Early stopping: give up after this many consecutive epochs in which the
# epoch-level validation loss did not reach a new minimum.
max_patience = 2
patience = max_patience

# Plain Python floats (not tensors), so no autograd graph is kept alive by them.
best_train_loss = float("inf")
best_val_loss = float("inf")

for epoch in range(num_epochs):
    # ---- Training pass over the whole training set ----
    model.train()
    train_loss_sum = 0.0
    train_count = 0
    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        logger.log({"train_loss": batch_loss})
        # Weight by batch size so a smaller final batch does not skew the mean.
        train_loss_sum += batch_loss * labels.size(0)
        train_count += labels.size(0)

    train_loss = train_loss_sum / max(train_count, 1)
    best_train_loss = min(best_train_loss, train_loss)

    # ---- Validation pass: mean loss and accuracy for this epoch ----
    model.eval()
    val_loss_sum = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            val_loss_sum += criterion(outputs, labels).item() * labels.size(0)

            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    val_loss = val_loss_sum / max(total, 1)
    val_accuracy = 100 * correct / max(total, 1)
    logger.log({"val_loss": val_loss, "val_accuracy": val_accuracy})

    print(f"Epoch {epoch+1}, Train Loss: {train_loss}")
    print(f"Epoch {epoch+1}, Validation Accuracy: {val_accuracy:.2f}%")

    # The weights only change between epochs, so the checkpoint decision is
    # made once per epoch on the mean validation loss.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience = max_patience
        torch.save(model.state_dict(), "best_sentiment_model.pth")
    else:
        patience -= 1
        if patience == 0:
            print(f"Early stopping after epoch {epoch+1}")
            break

# Save the weights as they are at the end of training (not necessarily the best ones).
torch.save(model.state_dict(), "end_sentiment_model.pth")

In [None]:
# Rebuild the architecture and restore the weights from "best_sentiment_model.pth".
# map_location=device remaps the stored tensors onto the current device, so a
# checkpoint saved from a CUDA model can also be loaded on a CPU-only machine.
model = SentimentClassifier(len(vocab))
model.load_state_dict(torch.load("best_sentiment_model.pth", map_location=device))
model.to(device)
model.eval()

# Evaluate on the test data in a fixed order.
test_dataset = SentimentDataset(test_data, vocab)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=4)

correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        labels = labels.to(device)
        outputs = model(inputs.to(device))
        # The predicted class is the index of the largest output score.
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f"Accuracy: {100 * correct / total}%")