In [None]:
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torchtext.data import get_tokenizer
from torchtext.vocab import GloVe
import torch
from torch.nn.utils.rnn import pad_sequence
import pandas as pd
import os

class IMDBDataset(Dataset):
    """Map-style dataset that turns review texts into GloVe vector sequences.

    Each item is a pair ``(x, label)``: ``x`` is whatever
    ``GloVe.get_vecs_by_tokens`` returns for the tokenized review, and
    ``label`` is the matching entry of ``data["label"]``.

    Args:
        data: Table-like object exposing ``"text"`` and ``"label"`` columns
            (the notebook passes a ``pandas.DataFrame``).
        tokenizer: Callable that maps a review string to a list of tokens.
    """

    # Loading a GloVe table is slow, so one table per embedding size is
    # shared by every dataset instance instead of being reloaded each time.
    _GLOVE_CACHE = {}

    def __init__(self, data, tokenizer):
        super().__init__()
        self.tokenizer = tokenizer
        self.data = data
        self.texts = data["text"]
        self.labels = data["label"]
        self.GLOVE_DIM = 100
        if self.GLOVE_DIM not in self._GLOVE_CACHE:
            self._GLOVE_CACHE[self.GLOVE_DIM] = GloVe(name='6B', dim=self.GLOVE_DIM)
        self.GLOVE = self._GLOVE_CACHE[self.GLOVE_DIM]

    def __len__(self):
        """Return the number of reviews."""
        return len(self.texts)

    @staticmethod
    def _row(column, index):
        """Return the ``index``-th entry of ``column`` by position.

        pandas ``Series.__getitem__`` looks up by *label*, which fails (or
        silently picks the wrong row) when the frame was filtered, shuffled
        or split and no longer has a 0..n-1 index. ``.iloc`` is positional;
        plain sequences without ``.iloc`` are indexed directly.
        """
        positional = getattr(column, "iloc", None)
        return column[index] if positional is None else positional[index]

    def __getitem__(self, index):
        """Return ``(embedded_tokens, label)`` for the review at ``index``."""
        sentence = self.tokenizer(self._row(self.texts, index))
        x = self.GLOVE.get_vecs_by_tokens(sentence)
        label = self._row(self.labels, index)
        return x, label


def get_dataloader(data, tokenizer, batch_size=32, shuffle=True):
    """Build a ``DataLoader`` over ``IMDBDataset(data, tokenizer)``.

    Args:
        data: Table with ``"text"`` and ``"label"`` columns, forwarded to
            ``IMDBDataset``.
        tokenizer: Callable that splits a review string into tokens.
        batch_size: Number of reviews per batch (default 32).
        shuffle: Whether to reshuffle the data every epoch (default True).

    Returns:
        A ``DataLoader`` yielding ``(x_pad, y)``. ``x_pad`` is the batch of
        per-review embedding sequences padded to a common length with
        ``batch_first=True``; ``y`` is a float32 tensor of the labels.
    """
    def collate_fn(batch):
        # Reviews differ in length, so pad them to the longest in the batch.
        x, y = zip(*batch)
        x_pad = pad_sequence(x, batch_first=True)
        # BCELoss needs float targets; torch.tensor replaces the legacy
        # torch.Tensor constructor and makes the dtype explicit.
        y = torch.tensor(y, dtype=torch.float32)
        return x_pad, y

    return DataLoader(
        IMDBDataset(data, tokenizer),
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=collate_fn,
    )


class RNN(torch.nn.Module):
    """Single-layer LSTM binary classifier over pre-embedded token sequences.

    Args:
        hidden_units: Size of the LSTM hidden state.
        dropout_rate: Dropout probability applied to the input embeddings.
        input_dim: Size of each token embedding. Defaults to 100, the GloVe
            dimension used elsewhere in this notebook.
    """

    def __init__(self, hidden_units=64, dropout_rate=0.5, input_dim=100):
        super().__init__()
        self.drop = nn.Dropout(dropout_rate)
        self.GLOVE_DIM = input_dim
        self.rnn = nn.LSTM(self.GLOVE_DIM, hidden_units, 1, batch_first=True)
        self.linear = nn.Linear(hidden_units, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x: torch.Tensor, lengths=None):
        """Return one sigmoid probability per sequence.

        Args:
            x: Embedded batch, shape ``[batch, max_word_length, embedding_length]``.
            lengths: Optional true (unpadded) length of every sequence, as a
                sequence of ints or an integer tensor. When given, the
                prediction is read from each sequence's last real timestep.
                When omitted, the final timestep of the padded batch is used,
                which for shorter sequences is the LSTM state after it has
                also consumed the padding.

        Returns:
            Tensor of shape ``[batch, 1]`` with values in (0, 1).
        """
        emb = self.drop(x)
        output, _ = self.rnn(emb)
        if lengths is None:
            last = output[:, -1]
        else:
            # The LSTM is unidirectional, so its output at step t depends
            # only on inputs up to t; indexing at length-1 skips the padding.
            last_idx = torch.as_tensor(
                lengths, dtype=torch.long, device=output.device
            ).clamp(min=1) - 1
            rows = torch.arange(output.size(0), device=output.device)
            last = output[rows, last_idx]
        output = self.linear(last)
        output = self.sigmoid(output)

        return output

# --- configuration ---------------------------------------------------------
# Fall back to the CPU so the cell still runs on a machine without a GPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
DATA_DIR = "/content/drive/MyDrive/DDA4210project/IMDB_dataset"
NUM_EPOCHS = 10
SEED = 42
# Fix the seed so weight init, dropout and shuffling repeat between runs.
torch.manual_seed(SEED)

# --- data ------------------------------------------------------------------
tokenizer = get_tokenizer('basic_english')
data_train = pd.read_csv(os.path.join(DATA_DIR, "Train.csv"))
data_val = pd.read_csv(os.path.join(DATA_DIR, "Test.csv"))
train_dataloader = get_dataloader(data_train,tokenizer)
test_dataloader = get_dataloader(data_val,tokenizer)

# --- train -----------------------------------------------------------------
model = RNN().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = torch.nn.BCELoss()
citerion = criterion  # original (misspelled) name kept as an alias
dataset_len = len(train_dataloader.dataset)
for epoch in range(NUM_EPOCHS):
    model.train()  # make sure dropout is active while training
    loss_sum = 0.0
    for x, y in train_dataloader:
        batchsize = y.shape[0]
        x = x.to(device)
        y = y.to(device)
        hat_y = model(x).squeeze(-1)
        loss = criterion(hat_y, y)

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        # .item() yields a plain float, so the running sum carries no
        # autograd history; weighting by batch size gives a per-sample mean
        # even when the last batch is smaller.
        loss_sum += loss.item() * batchsize

    print(f'Epoch {epoch}. loss: {loss_sum / dataset_len}')

torch.save(model.state_dict(), 'rnn.pth')

# --- val -------------------------------------------------------------------
# To score a previously saved checkpoint instead of the model trained above,
# load it first: model.load_state_dict(torch.load('rnn.pth', map_location=device))

accuracy = 0

dataset_len = len(test_dataloader.dataset)
model.eval()  # disable dropout for evaluation
with torch.no_grad():
    for x, y in test_dataloader:
        x = x.to(device)
        y = y.to(device)
        hat_y = model(x).squeeze(1)
        # Threshold the sigmoid output at 0.5 to get hard 0/1 predictions.
        predictions = (hat_y > 0.5).long()
        accuracy += (predictions == y).sum().item()
accuracy /= dataset_len

print(f'Accuracy: {accuracy}')

.vector_cache/glove.6B.zip: 862MB [02:40, 5.37MB/s]                           
100%|█████████▉| 399999/400000 [00:19<00:00, 20994.12it/s]


Epoch 0. loss: 0.6931819319725037
Epoch 1. loss: 0.6926447153091431
Epoch 2. loss: 0.691540539264679
Epoch 3. loss: 0.689765453338623
Epoch 4. loss: 0.6411824822425842
Epoch 5. loss: 0.6022836565971375
Epoch 6. loss: 0.593013346195221
Epoch 7. loss: 0.5925948619842529
Epoch 8. loss: 0.5849927067756653
Epoch 9. loss: 0.5718404054641724
Accuracy: 0.7588


In [None]:
# val
def get_dataloader(data, tokenizer, batch_size=1, shuffle=True):
    """Build a ``DataLoader`` over ``IMDBDataset(data, tokenizer)``.

    This redefines (and from here on shadows) the ``get_dataloader`` from the
    training cell; the only difference is the default ``batch_size`` of 1.
    A one-review batch is never padded by ``pad_sequence``, so the model sees
    each review at its true length.

    Args:
        data: Table with ``"text"`` and ``"label"`` columns, forwarded to
            ``IMDBDataset``.
        tokenizer: Callable that splits a review string into tokens.
        batch_size: Number of reviews per batch (default 1).
        shuffle: Whether to reshuffle the data every epoch (default True).

    Returns:
        A ``DataLoader`` yielding ``(x_pad, y)`` with ``y`` as float32.
    """
    def collate_fn(batch):
        x, y = zip(*batch)
        x_pad = pad_sequence(x, batch_first=True)
        # Explicit float32 via torch.tensor instead of the legacy
        # torch.Tensor constructor.
        y = torch.tensor(y, dtype=torch.float32)
        return x_pad, y

    return DataLoader(
        IMDBDataset(data, tokenizer),
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=collate_fn,
    )

# Restore the saved weights onto whichever device this session uses, rather
# than a hard-coded GPU id.
model.load_state_dict(
     torch.load('rnn.pth', map_location=device))

accuracy = 0
n = data_val.shape[0]
test_dataloader = get_dataloader(data_val,tokenizer)
dataset_len = len(test_dataloader.dataset)
pre_label = []  # predicted class per review, in loader order
labels = []     # ground-truth label per review, same order
model.eval()  # disable dropout for evaluation
with torch.no_grad():
    for x, y in test_dataloader:
        x = x.to(device)
        y = y.to(device)
        hat_y = model(x).squeeze(1)
        # Threshold the sigmoid output at 0.5 to get hard 0/1 predictions.
        predictions = (hat_y > 0.5).long()
        accuracy += (predictions == y).sum().item()
        # extend() keeps every sample of the batch; indexing [0] would
        # silently drop all but the first if batch_size were ever > 1.
        pre_label.extend(predictions.cpu().numpy())
        labels.extend(y.cpu().numpy())
accuracy /= dataset_len

print(f'Accuracy: {accuracy}')

Accuracy: 0.7824


In [None]:
from sklearn.metrics import classification_report, confusion_matrix
# Summarise precision / recall / F1 per class from the label lists collected
# during evaluation, formatted with four decimal places.
report = classification_report(
    y_true=labels,
    y_pred=pre_label,
    digits=4,
)

In [None]:
# Print the classification report so its line breaks render as a table.
print(report)

              precision    recall  f1-score   support

         0.0     0.7394    0.8709    0.7998      2495
         1.0     0.8438    0.6942    0.7617      2505

    accuracy                         0.7824      5000
   macro avg     0.7916    0.7826    0.7807      5000
weighted avg     0.7917    0.7824    0.7807      5000

