In [1]:
import matplotlib.pyplot as plt
import numpy as np
import torch
import torchinfo
from sklearn.model_selection import train_test_split
from torch import nn
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
from tqdm import tqdm
from torch.nn import functional as F
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from loadData import MovieReviews
import gensim.downloader


In [None]:
# setting hyperparameters
batch_size = 32
valid_size = 0.3  # fraction of all rows held out for validation + testing
test_size = 0.5   # fraction of the held-out rows used for testing: testing / (testing + validation)
learning_rate = 0.001
weight_decay = 0.0001
epoch = 20
seed = 42

# Apply the seed before anything random is created: `unknown_vector` below,
# the model's initial weights, dropout masks and DataLoader shuffling all draw
# from torch's global RNG. Without this only the train/test split was seeded.
torch.manual_seed(seed)
np.random.seed(seed)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = get_tokenizer("basic_english")
glove_vectors = gensim.downloader.load('glove-twitter-25')
# Shared embedding used for every token that is missing from the GloVe vocabulary.
unknown_vector = torch.randn(glove_vectors.vector_size)
max_length = 40  # number of token rows every embedded sentence is padded/cropped to
# Derived from the loaded vectors so it cannot drift from `unknown_vector`'s size
# (previously hard-coded to 25).
embed_dim = glove_vectors.vector_size
label_size = 2

print(torch.__version__)
print(device)

In [3]:
# data loading
df = MovieReviews()

# Stage 1: hold out `valid_size` of the rows (validation + test together).
train_data, temp_data = train_test_split(
    df,
    test_size=valid_size,
    random_state=seed,
)
# Stage 2: divide the held-out rows; `test_size` of them become the test set.
val_data, test_data = train_test_split(
    temp_data,
    test_size=test_size,
    random_state=seed,
)

In [4]:
def text_transform(sentence, vectors=None, unknown=None, max_len=None):
    """Embed a tokenised sentence as a fixed-size ``[max_len, embed_dim]`` tensor.

    Args:
        sentence: iterable of tokens (strings).
        vectors: mapping ``token -> vector`` that raises ``KeyError`` for
            unknown tokens. Defaults to the notebook-level ``glove_vectors``.
        unknown: 1-D tensor used for tokens missing from ``vectors``.
            Defaults to the notebook-level ``unknown_vector``.
        max_len: number of rows in the result. Defaults to the notebook-level
            ``max_length``.

    Returns:
        A tensor with ``max_len`` rows. Sentences shorter than ``max_len`` are
        zero-padded at the end, longer ones are cut off after ``max_len``
        tokens, and an empty sentence yields an all-zero tensor.
    """
    # Fall back to the notebook configuration when no override is given.
    if vectors is None:
        vectors = glove_vectors
    if unknown is None:
        unknown = unknown_vector
    if max_len is None:
        max_len = max_length

    # Tokens beyond max_len would be cropped anyway, so do not look them up.
    tokens = list(sentence)[:max_len]

    rows = []
    for word in tokens:
        try:
            rows.append(torch.tensor(vectors[word]))
        except KeyError:
            # `unknown` is already a tensor; torch.stack copies it into the
            # result, so re-wrapping it with torch.tensor() (which emits a
            # copy-construct warning) is unnecessary.
            rows.append(unknown)

    if not rows:
        # torch.stack raises on an empty list (e.g. an empty review), so
        # return the fully padded result directly.
        return torch.zeros(max_len, unknown.size(0), dtype=unknown.dtype)

    sentence_embedding = torch.stack(rows)
    missing_rows = max_len - sentence_embedding.size(0)
    # Pad only the row dimension (last pair of the pad tuple), at the end.
    return torch.nn.functional.pad(
        sentence_embedding, (0, 0, 0, missing_rows), mode='constant', value=0
    )

def collate_fn(batch: np.ndarray):
    """Turn a batch of ``(text, label)`` pairs into model-ready tensors.

    Each text is tokenised with ``tokenizer`` and embedded with
    ``text_transform``; the embedded sentences are stacked along a new first
    dimension. Labels are collected into a ``torch.Tensor`` (default float
    dtype), so callers cast them as needed.
    """
    # One embedded sentence per sample, each shaped [max_len, embed_dim].
    embedded_texts = [text_transform(tokenizer(raw_text)) for raw_text, _ in batch]
    targets = [target for _, target in batch]

    return torch.stack(embedded_texts), torch.Tensor(targets)

In [5]:
# DataLoader
# Options every loader has in common.
loader_options = {"batch_size": batch_size, "collate_fn": collate_fn}

# Training: reshuffled on each pass, incomplete final batch dropped.
train_iter = DataLoader(train_data.to_numpy(), shuffle=True, drop_last=True, **loader_options)

# Test: kept in the original order so predictions can be checked against the source rows.
test_iter = DataLoader(test_data.to_numpy(), shuffle=False, **loader_options)

# Validation: shuffled, all batches kept.
val_iter = DataLoader(val_data.to_numpy(), shuffle=True, **loader_options)

In [6]:
# model
class TextCNN(nn.Module):
    """Convolutional text classifier over pre-embedded sentences.

    One ``Conv2d`` branch is built per kernel height. Every kernel spans the
    full embedding width, so each branch slides along the sequence dimension
    only. Each branch is max-pooled over the sequence, the pooled features of
    all branches are concatenated, passed through dropout and mapped to
    ``label_size`` logits by a linear layer.

    Args:
        embed_dim: size of each token embedding (last dimension of the input).
        label_size: number of output classes.
        out_chaennels: number of filters per convolution branch (the spelling
            is kept because callers pass it by keyword).
        kernel_size: kernel heights, i.e. how many consecutive tokens each
            branch looks at. Defaults to ``[2, 3, 4]``.

    Raises:
        ValueError: if ``kernel_size`` is empty.
    """

    def __init__(
        self,
        embed_dim: int,
        label_size: int,
        out_chaennels: int,
        kernel_size: "list[int] | None" = None,
    ):
        super().__init__()

        # A `None` sentinel avoids a mutable list shared between all instances.
        if kernel_size is None:
            kernel_size = [2, 3, 4]
        kernel_size = list(kernel_size)
        if not kernel_size:
            # Without branches there is nothing to concatenate in forward().
            raise ValueError("kernel_size must contain at least one kernel height")

        self.convs = nn.ModuleList(
            [
                nn.Sequential(nn.Conv2d(1, out_chaennels, (kernel, embed_dim)), nn.LeakyReLU())
                for kernel in kernel_size
            ]
        )
        self.fc = nn.Linear(out_chaennels * len(kernel_size), label_size)
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Compute class logits for a batch shaped ``[batch, seq_len, embed_dim]``.

        Returns:
            Tensor of shape ``[batch, label_size]`` with unnormalised scores.
        """
        # Add the single input channel Conv2d expects: [batch, 1, seq_len, embed_dim].
        channel_input = x.unsqueeze(1)
        # The kernel covers the whole embedding width, so the last dimension
        # collapses to 1 and is squeezed away: [batch, out_channels, positions].
        feature_maps = [conv(channel_input).squeeze(3) for conv in self.convs]
        # Max over all positions of each filter: [batch, out_channels].
        pooled = [F.max_pool1d(fmap, fmap.size(2)).squeeze(2) for fmap in feature_maps]

        features = torch.cat(pooled, 1)  # combine outputs from the different kernels
        features = self.dropout(features)
        return self.fc(features)

In [8]:
# Build the classifier with 32 filters per kernel size, then move it to the selected device.
model = TextCNN(
    embed_dim=embed_dim,
    label_size=label_size,
    out_chaennels=32,
)
model = model.to(device)

# Print the layer / parameter-count overview.
print(torchinfo.summary(model))

Layer (type:depth-idx)                   Param #
TextCNN                                  --
├─ModuleList: 1-1                        --
│    └─Sequential: 2-1                   --
│    │    └─Conv2d: 3-1                  1,632
│    │    └─LeakyReLU: 3-2               --
│    └─Sequential: 2-2                   --
│    │    └─Conv2d: 3-3                  2,432
│    │    └─LeakyReLU: 3-4               --
│    └─Sequential: 2-3                   --
│    │    └─Conv2d: 3-5                  3,232
│    │    └─LeakyReLU: 3-6               --
├─Linear: 1-2                            194
├─Dropout: 1-3                           --
Total params: 7,490
Trainable params: 7,490
Non-trainable params: 0

train

In [9]:
# Cross-entropy over the raw logits produced by the model.
criterion = nn.CrossEntropyLoss()

# Adam over all model parameters, using the configured learning rate and weight decay.
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=learning_rate,
    weight_decay=weight_decay,
)

In [None]:
# Per-epoch history, consumed by the plotting cells.
training_loss = []
valid_loss = []
training_accuracy = []
valid_accuracy = []

for e in range(epoch):
    # ---- training ----
    # Re-enable training mode every epoch: the validation pass below switches
    # the model to eval mode, which would otherwise leave dropout disabled for
    # all epochs after the first.
    model.train()
    correct = 0         # correctly classified training samples this epoch
    seen = 0            # samples actually processed (drop_last skips the remainder)
    running_loss = 0.0  # sum of per-sample losses this epoch

    for text, label in tqdm(train_iter):
        text = text.to(device)
        label = label.to(device).long()  # CrossEntropyLoss expects integer class indices

        optimizer.zero_grad()
        output = model(text)
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()

        batch_count = label.size(0)
        running_loss += loss.item() * batch_count
        correct += (output.argmax(dim=1) == label).sum().item()
        seen += batch_count

    # Average over the samples that were really seen, not len(dataset), and log
    # the epoch mean (not just the last batch) so it is comparable with the
    # validation loss.
    epoch_train_loss = running_loss / max(seen, 1)
    training_accuracy.append(correct / max(seen, 1))
    training_loss.append(epoch_train_loss)
    print(f'Epoch [{e+1}/{epoch}], Training_Loss: {epoch_train_loss:.4f}')

    # ---- validation ----
    model.eval()
    num_correct = 0
    val_seen = 0
    val_loss_sum = 0.0
    with torch.no_grad():  # no gradients needed for evaluation
        for x, y in val_iter:
            # Batches must be on the same device as the model.
            x = x.to(device)
            y = y.to(device).long()

            logits = model(x)
            val_loss_sum += criterion(logits, y).item() * y.size(0)
            num_correct += (logits.argmax(dim=1) == y).sum().item()
            val_seen += y.size(0)

    val_loss = val_loss_sum / max(val_seen, 1)
    valid_accuracy.append(num_correct / max(val_seen, 1))
    valid_loss.append(val_loss)
    print(f'Validation loss: {val_loss:.4f}')

In [None]:
# Loss curves per epoch; labelled so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(training_loss, 'b', label='training')
ax.plot(valid_loss, 'r', label='validation')
ax.set(title='Loss per epoch', xlabel='epoch', ylabel='loss')
ax.legend();

In [None]:
# Accuracy curves per epoch; labelled so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(training_accuracy, 'b', label='training')
ax.plot(valid_accuracy, 'r', label='validation')
ax.set(title='Accuracy per epoch', xlabel='epoch', ylabel='accuracy')
ax.legend();

In [None]:
# Evaluate the trained model on the held-out test set.
model.eval()
preds = []
labels = []
with torch.no_grad():
    for text, label in tqdm(test_iter):
        output = model(text.to(device))
        # Collect results on the CPU: scikit-learn cannot consume CUDA tensors.
        preds.append(output.argmax(dim=1).cpu())
        labels.append(label.long().cpu())

preds_cat = torch.cat(preds)
labels_cat = torch.cat(labels)

# scikit-learn metrics take (y_true, y_pred). With the arguments the other way
# round, precision and recall are reported swapped.
y_true = labels_cat.numpy()
y_pred = preds_cat.numpy()

accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
print(f'Test Accuracy: {accuracy * 100:.2f}%')
print(f'Test Precision: {precision * 100:.2f}%')
print(f'Test Recall: {recall * 100:.2f}%')
print(f'Test F1 Score: {f1 * 100:.2f}%')

In [22]:
# Save the model if needed
# torch.save(model.state_dict(), 'TextCNN.pth')

In [23]:
# Instantiate the model
# model = TextCNN(embed_dim=embed_dim, label_size=label_size, out_chaennels=32).to(device)

# Load the saved state dictionary
# model_path = 'TextCNN.pth'
# model_state_dict = torch.load(model_path)

# Load the state dictionary into the model
# model.load_state_dict(model_state_dict)