# Training a TextCNN classifier for sentiment analysis of Reddit data
This notebook trains a deep learning model (TextCNN) that is based on 1D convolutional layers. Pre-trained [GloVe embeddings](https://nlp.stanford.edu/projects/glove/) are used when setting up the model. TextCNN was proposed in [this paper](https://arxiv.org/abs/1408.5882); the implementation is taken from [Dive into Deep Learning](https://d2l.ai/chapter_natural-language-processing-applications/sentiment-analysis-cnn.html).

Results:
- f1 score   macro avg: 0.69
- f1 score  weighted avg: 0.75

Note (from [scikit-learn](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html)):
- F1 = 2 * (precision * recall) / (precision + recall)
- 'micro': Calculate metrics globally by counting the total true positives, false negatives and false positives.
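- 'macro': Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account.
- 'weighted': Calculate metrics for each label, and find their average weighted by support (the number of true instances for each label).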

Model weights:\
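No pretrained model weights are required. The pre-trained GloVe embeddings (`glove.6B.100d.txt`), however, must be available locally at the path used in the embedding-loading cell.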

Data:\
The data is downloaded from Hugging Face when the notebook is executed.

In [None]:
# Load the CUDA 11.3 toolchain and the matching cuDNN build via environment modules.
# NOTE(review): every `!` line runs in its own subshell, so these loads presumably do
# not persist into the running kernel — confirm that the modules are loaded before
# the notebook server is started.
!module load CUDA/11.3
!module load cuDNN/8.2.1.32-CUDA-11.3

In [None]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from tqdm import tqdm

import utils

import datasets

import skmultilearn.model_selection.iterative_stratification

import sklearn.metrics
import keras.preprocessing.sequence
import keras.preprocessing.text

import torch
import torch.utils.data

## Check if GPUs are available

In [None]:
# Print PyTorch's environment report.
!python -m torch.utils.collect_env
# CUDA version reported by the installed PyTorch build.
print(torch.version.cuda)
# Cell output: whether PyTorch can currently use a CUDA device.
torch.cuda.is_available()

## Load dataset

In [None]:
# Fetch the "simplified" configuration of the GoEmotions dataset from Hugging Face.
emotions = datasets.load_dataset("go_emotions", "simplified")
# 27 emotion categories + neutral.
num_labels = 28

In [None]:
# Merge the train, validation and test splits into a single DataFrame; a new
# stratified train/test split is drawn from it further down.
df = pd.concat(
    [emotions.data[split].table.to_pandas() for split in ("train", "validation", "test")]
)

### Reduce labels from **27 categories of emotions + neutral** to **emotional + neutral** 

In [None]:
# Build the label matrix, drop the ambiguous rows, then rebuild the labels so
# that `y` lines up row-by-row with the filtered `df`.
# NOTE(review): the label format and the definition of "ambiguous" live in
# `utils` and are not visible here — confirm against that module.
y = utils.convert_df_labels(df, num_labels)
df = utils.remove_ambiguous_data(df, y)
y = utils.convert_df_labels(df, num_labels)

## Split data into training and test set

In [None]:
# explanation for iterative stratification of labels http://videolectures.net/ecmlpkdd2011_tsoumakas_stratification/?q=stratification%20multi%20label
# Hold out a 0.1 fraction of the samples as the test set. The texts are passed
# as an (n_samples, 1) column, hence the reshape.
(
    X_train,
    y_train,
    X_test,
    y_test,
) = skmultilearn.model_selection.iterative_stratification.iterative_train_test_split(
    df["text"].values.reshape(-1, 1), y, 0.1
)
# Drop the helper column axis again so both sets are 1-D arrays of texts.
X_train = X_train[:, 0]
X_test = X_test[:, 0]

In [None]:
# Sanity check: display the shapes of the four split arrays.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

## Preprocess text

In [None]:
def preprocess_text(sen):
    """Reduce a raw text to ASCII letters separated by single spaces.

    Steps, in order:
      1. every character that is not an ASCII letter becomes a space,
      2. single letters that stand between whitespace are removed,
      3. runs of whitespace are collapsed to one space.

    Leading/trailing whitespace is collapsed but not stripped, and a single
    letter at the very start or end of the text is kept.

    Args:
        sen: The raw input string.

    Returns:
        The cleaned string.
    """
    # Remove punctuation and numbers
    sentence = re.sub("[^a-zA-Z]", " ", sen)

    # Single character removal. The whitespace after the letter is only checked
    # with a lookahead (not consumed), so that it can also serve as the leading
    # whitespace of the next match. Otherwise only every other letter of a run
    # such as "x a b y" would be removed.
    sentence = re.sub(r"\s+[a-zA-Z](?=\s)", " ", sentence)

    # Removing multiple spaces
    sentence = re.sub(r"\s+", " ", sentence)

    return sentence

In [None]:
# Wrap the per-string cleaner so it can be applied element-wise to the text arrays.
vectorized_preprocess_text = np.vectorize(preprocess_text)
# Clean both splits; the names are rebound to the cleaned arrays.
X_train = vectorized_preprocess_text(X_train)
X_test = vectorized_preprocess_text(X_test)

## Tokenize text

In [None]:
# Vocabulary cap handed to the Keras tokenizer.
num_words = 5000
tokenizer = keras.preprocessing.text.Tokenizer(num_words=num_words)
# The vocabulary is fitted on the training texts only.
tokenizer.fit_on_texts(X_train)

# Convert every text into a list of integer word indices.
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

# Number of distinct words the tokenizer has seen, plus one.
# NOTE(review): not used in the visible cells — the model and the embedding
# matrix are sized with `num_words` instead.
vocab_size = len(tokenizer.word_index) + 1

# Fixed sequence length (in tokens) every text is brought to.
maxlen = 200

# padding="post": shorter sequences are filled up at the end.
X_train = keras.preprocessing.sequence.pad_sequences(X_train, padding="post", maxlen=maxlen)
X_test = keras.preprocessing.sequence.pad_sequences(X_test, padding="post", maxlen=maxlen)

## Load [glove text embeddings](https://nlp.stanford.edu/projects/glove/) 

In [None]:
# Parse the GloVe text file into a {word: vector} lookup. Every line is split
# on whitespace: the first field is the word, the remaining fields are the
# vector components.
embeddings_dictionary = {}

# `with` guarantees that the file is closed even if parsing a line raises.
with open(
    "/p/project/deepacf/maelstrom/ehlert1/embeddings/glove.6B/glove.6B.100d.txt",
    encoding="utf8",
) as glove_file:
    for line in glove_file:
        records = line.split()
        word = records[0]
        vector_dimensions = np.asarray(records[1:], dtype="float32")
        embeddings_dictionary[word] = vector_dimensions

# Row `index` holds the GloVe vector of the word with that tokenizer index.
# Rows of words without a GloVe vector stay all-zero.
embedding_matrix = np.zeros((num_words, 100))
for word, index in tokenizer.word_index.items():
    # Indices beyond the vocabulary cap have no row in the matrix.
    if index > num_words - 1:
        continue
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

## Define functions to binarize labels (class indices for PyTorch, one-hot rows for scikit-learn)

In [None]:
def binarize_labels_torch(labels):
    """Collapse a multi-label matrix to one binary class value per row.

    A row becomes 1.0 when its last column equals 1, and 0.0 otherwise.
    (Presumably the last column is the "neutral" label — confirm against the
    label conversion in `utils`.)

    Args:
        labels: 2-D array, one row per sample.

    Returns:
        1-D float array in the format [0, 1, 1, 0, ...].
    """
    last_column_set = labels[:, -1] == 1
    return np.where(last_column_set, 1.0, 0.0)


def binarize_labels_scikitlearn(labels):
    """Collapse a multi-label matrix to one-hot rows over two classes.

    Column 1 is set when the last column of the input row equals 1, column 0
    is set otherwise, so every row contains exactly one 1.

    Args:
        labels: 2-D array, one row per sample.

    Returns:
        Float array of shape (n_samples, 2) in the format
        [[1, 0], [0, 1], [0, 1], [1, 0], ...].
    """
    last_column_set = labels[:, -1] == 1
    one_hot = np.column_stack((np.logical_not(last_column_set), last_column_set))
    return one_hot.astype(np.float64)

## Define model

In [None]:
def accuracy(y_hat, y):
    """Count how many predictions match the targets.

    Args:
        y_hat: Either predicted class indices, or a 2-D score matrix with more
            than one column, in which case the arg-max of each row is used.
        y: Target class indices.

    Returns:
        The number (not the fraction) of correct predictions, as a float.
    """
    has_class_scores = len(y_hat.shape) > 1 and y_hat.shape[1] > 1
    predicted = y_hat.argmax(axis=1) if has_class_scores else y_hat
    return float((predicted == y).sum())


def evaluate_accuracy_gpu(net, data_iter, device=None):
    """Return the fraction of correctly classified examples in `data_iter`.

    Args:
        net: The model. A `torch.nn.Module` is switched to evaluation mode
            (and is left in that mode).
        data_iter: Iterable yielding `(X, y)` batches; `X` may be a tensor or
            a list of tensors.
        device: Device the batches are moved to. When falsy and `net` is a
            module, the device of the module's first parameter is used.

    Returns:
        Correct predictions divided by the number of label elements.
    """
    if isinstance(net, torch.nn.Module):
        net.eval()
        if not device:
            device = next(iter(net.parameters())).device

    num_correct = 0.0
    num_predictions = 0.0
    with torch.no_grad():
        for X, y in data_iter:
            # A list of input tensors is moved element by element.
            X = [x.to(device) for x in X] if isinstance(X, list) else X.to(device)
            y = y.to(device)
            num_correct += float(accuracy(net(X), y))
            num_predictions += float(y.numel())
    return num_correct / num_predictions


class Accumulator:
    """Keep `n` running float sums that are updated together."""

    def __init__(self, n):
        """Start with `n` sums, all at 0.0."""
        self.data = [0.0 for _ in range(n)]

    def add(self, *args):
        """Add one increment to each sum, position by position.

        Sums and increments are paired with `zip`, so the result is as long as
        the shorter of the two.
        """
        updated = []
        for running_total, increment in zip(self.data, args):
            updated.append(running_total + float(increment))
        self.data = updated

    def reset(self):
        """Set every sum back to 0.0, keeping their number."""
        self.data = [0.0 for _ in self.data]

    def __getitem__(self, idx):
        """Return the sum stored at position `idx`."""
        return self.data[idx]


class EarlyStopping:
    """Signal a stop once validation loss has exceeded training loss too often.

    On every call the relative gap `(validation_loss - train_loss) /
    validation_loss` is compared with `min_delta`. Each call where the gap is
    larger increments `counter`; the counter is cumulative and is never reset
    by a call with a smaller gap. Once it reaches `tolerance`, `early_stop`
    becomes True and stays True.

    Args:
        tolerance: Number of offending calls after which `early_stop` is set.
        min_delta: Relative gap that has to be exceeded for a call to count.
    """

    def __init__(self, tolerance=5, min_delta=0):
        self.tolerance = tolerance
        self.min_delta = min_delta  # relative
        self.counter = 0
        self.early_stop = False

    def __call__(self, train_loss, validation_loss):
        """Record one pair of losses and update `counter` / `early_stop`."""
        if validation_loss == 0:
            # The relative gap is undefined for a zero validation loss; skip
            # the check instead of dividing by zero.
            return
        if (validation_loss - train_loss) / validation_loss > self.min_delta:
            self.counter += 1
            if self.counter >= self.tolerance:
                self.early_stop = True

In [None]:
# code taken from https://d2l.ai/chapter_computer-vision/image-augmentation.html


def train_batch(net, X, y, loss, trainer, devices):
    """Run one optimization step on a single minibatch.

    Args:
        net: The model; it is put into training mode.
        X: Input tensor, or a list of input tensors.
        y: Target tensor.
        loss: Callable `loss(pred, y)`; its result is summed before backprop.
        trainer: Optimizer exposing `zero_grad()` and `step()`.
        devices: Sequence of devices; the batch is moved to `devices[0]`.

    Returns:
        Tuple `(summed batch loss, number of correct predictions)`.
    """
    target_device = devices[0]
    X = [part.to(target_device) for part in X] if isinstance(X, list) else X.to(target_device)
    y = y.to(target_device)

    net.train()
    trainer.zero_grad()
    pred = net(X)
    batch_loss_sum = loss(pred, y).sum()
    batch_loss_sum.backward()
    trainer.step()
    return batch_loss_sum, accuracy(pred, y)


def _mean_loss(net, data_iter, loss, device):
    """Return the summed loss over `data_iter` divided by the number of examples."""
    net.eval()
    loss_sum = 0.0
    num_examples = 0
    with torch.no_grad():
        for X, y in data_iter:
            if isinstance(X, list):
                X = [x.to(device) for x in X]
            else:
                X = X.to(device)
            y = y.to(device)
            loss_sum += float(loss(net(X), y).sum())
            num_examples += y.shape[0]
    return loss_sum / num_examples


def train(
    net,
    train_iter,
    test_iter,
    loss,
    trainer,
    num_epochs,
    devices=None,
    early_stopping=False,
):
    """Train a model, optionally on multiple GPUs, and report progress per epoch.

    Args:
        net: The model; it is wrapped in `torch.nn.DataParallel`.
        train_iter: Iterable of `(features, labels)` training batches.
        test_iter: Iterable of evaluation batches. It is used for the reported
            test accuracy and, with early stopping, for the validation loss.
        loss: Loss callable; its output is summed per batch.
        trainer: Optimizer for the parameters of `net`.
        num_epochs: Maximum number of epochs.
        devices: Devices to train on. Defaults to all visible CUDA devices,
            or the CPU when there are none.
        early_stopping: When True, an `EarlyStopping(tolerance=5,
            min_delta=0.01)` check runs after every epoch and training ends
            as soon as it signals a stop.
    """
    if devices is None:
        # Resolved at call time (not at definition time); falls back to the
        # CPU so that `devices[0]` is always valid.
        devices = list(range(torch.cuda.device_count())) or [torch.device("cpu")]
    net = torch.nn.DataParallel(net, device_ids=devices).to(devices[0])
    _early_stopping = EarlyStopping(tolerance=5, min_delta=0.01) if early_stopping else None
    metric = None
    test_acc = None
    for epoch in range(num_epochs):
        print(f"starting epoch: {epoch}")
        # Sum of training loss, sum of training accuracy, no. of examples,
        # no. of predictions
        metric = Accumulator(4)
        for features, labels in tqdm(train_iter):
            batch_loss, batch_acc = train_batch(net, features, labels, loss, trainer, devices)
            metric.add(batch_loss, batch_acc, labels.shape[0], labels.numel())
        test_acc = evaluate_accuracy_gpu(net, test_iter)
        train_loss = metric[0] / metric[2]
        print("train loss: %g, train acc: %g, test_acc: %g" % (train_loss, metric[1] / metric[3], test_acc))
        if _early_stopping is not None:
            # The stopper only updates its state when it is called with the
            # current losses, so feed it once per epoch before checking it.
            _early_stopping(train_loss, _mean_loss(net, test_iter, loss, devices[0]))
            if _early_stopping.early_stop:
                print("Early stopping at epoch:", epoch)
                break
    # Nothing to summarize when no epoch ran.
    if metric is not None:
        print(f"loss {metric[0] / metric[2]:.3f}, train acc " f"{metric[1] / metric[3]:.3f}, test acc {test_acc:.3f}")

In [None]:
class TextCNN(torch.nn.Module):
    """Text classifier built from parallel 1-D convolutions over word embeddings.

    Every token is embedded twice (two separate embedding tables whose outputs
    are concatenated). Each convolution is followed by average pooling over
    the token axis and a ReLU; the pooled features of all convolutions are
    concatenated, passed through dropout and mapped to class scores.

    Args:
        vocab_size: Number of rows in both embedding tables.
        embed_size: Embedding dimension of each table.
        kernel_sizes: Kernel width of each convolution.
        num_channels: Output channels of each convolution (paired with
            `kernel_sizes` by position).
        num_classes: Number of output scores per example (default 2).
        **kwargs: Forwarded to `torch.nn.Module.__init__`.
    """

    def __init__(self, vocab_size, embed_size, kernel_sizes, num_channels, num_classes=2, **kwargs):
        super().__init__(**kwargs)
        self.embedding = torch.nn.Embedding(vocab_size, embed_size)
        # Second embedding table; the caller is expected to freeze it
        # (nothing in this class disables its gradient).
        self.constant_embedding = torch.nn.Embedding(vocab_size, embed_size)
        self.dropout = torch.nn.Dropout(0.5)
        self.decoder = torch.nn.Linear(sum(num_channels), num_classes)
        # Average pooling over the token axis. The layer has no parameters,
        # so one instance is shared by all convolutions.
        self.pool = torch.nn.AdaptiveAvgPool1d(1)
        self.relu = torch.nn.ReLU()
        # Create multiple one-dimensional convolutional layers
        self.convs = torch.nn.ModuleList()
        for c, k in zip(num_channels, kernel_sizes):
            # Input channels = 2 * embed_size because both embeddings are
            # concatenated along the vector dimension.
            self.convs.append(torch.nn.Conv1d(2 * embed_size, c, k))

    def forward(self, inputs):
        """Map a batch of token-index sequences to class scores.

        Args:
            inputs: Integer tensor of token indices, (batch size, no. of tokens).

        Returns:
            Tensor of shape (batch size, num_classes) with unnormalized scores.
        """
        # Concatenate two embedding layer outputs with shape (batch size, no.
        # of tokens, token vector dimension) along vectors
        embeddings = torch.cat((self.embedding(inputs), self.constant_embedding(inputs)), dim=2)
        # Per the input format of one-dimensional convolutional layers,
        # rearrange the tensor so that the second dimension stores channels
        embeddings = embeddings.permute(0, 2, 1)
        # For each one-dimensional convolutional layer, after pooling over the
        # token axis, a tensor of shape (batch size, no. of channels, 1) is
        # obtained. Remove the last dimension and concatenate along channels
        encoding = torch.cat(
            [torch.squeeze(self.relu(self.pool(conv(embeddings))), dim=-1) for conv in self.convs],
            dim=1,
        )
        outputs = self.decoder(self.dropout(encoding))
        return outputs


def try_all_gpus():
    """List every CUDA device PyTorch can see, falling back to the CPU.

    Returns:
        `[cuda:0, cuda:1, ...]` when at least one GPU exists, else `[cpu]`.
    """
    gpu_count = torch.cuda.device_count()
    if gpu_count == 0:
        return [torch.device("cpu")]
    return [torch.device("cuda", index) for index in range(gpu_count)]

In [None]:
class MyDataset(torch.utils.data.Dataset):
    """Dataset pairing feature rows with labels, both stored as long tensors."""

    def __init__(self, X_train, y_train):
        """Copy features and labels into `torch.long` tensors."""
        long_dtype = {"dtype": torch.long}
        self.x_train = torch.tensor(X_train, **long_dtype)
        self.y_train = torch.tensor(y_train, **long_dtype)

    def __len__(self):
        """Number of samples, taken from the label tensor."""
        return len(self.y_train)

    def __getitem__(self, idx):
        """Return the `(features, label)` pair at position `idx`."""
        features = self.x_train[idx]
        label = self.y_train[idx]
        return features, label


def load_array(data_arrays, batch_size, is_train=True):
    """Wrap tensors in a `TensorDataset` and return a `DataLoader` over it.

    Args:
        data_arrays: Sequence of tensors with a common first dimension.
        batch_size: Number of samples per batch.
        is_train: Whether the loader shuffles the samples.

    Returns:
        The configured `torch.utils.data.DataLoader`.
    """
    return torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(*data_arrays),
        batch_size,
        shuffle=is_train,
    )


def load_data(batch_size, train_data, train_features, test_data, test_features):
    """Build the training and test data loaders.

    Despite their names, `train_data` / `test_data` are passed to `MyDataset`
    as the model inputs and `train_features` / `test_features` as the labels.

    Args:
        batch_size: Number of samples per batch for both loaders.
        train_data: Inputs of the training set.
        train_features: Labels of the training set.
        test_data: Inputs of the test set.
        test_features: Labels of the test set.

    Returns:
        Tuple `(train_iter, test_iter)`; only the training loader shuffles.
    """

    def _make_loader(inputs, targets, shuffle):
        return torch.utils.data.DataLoader(
            MyDataset(inputs, targets),
            batch_size=batch_size,
            shuffle=shuffle,
        )

    train_iter = _make_loader(train_data, train_features, True)
    test_iter = _make_loader(test_data, test_features, False)
    return train_iter, test_iter

## Build the model and load the pre-trained embeddings

In [None]:
def init_weights(m):
    """Xavier-initialize the weight of plain `Linear` and `Conv1d` layers.

    Intended for `Module.apply`; modules of any other exact type (including
    subclasses of the two) are left untouched, as are biases.
    """
    layer_type = type(m)
    if layer_type is torch.nn.Linear or layer_type is torch.nn.Conv1d:
        torch.nn.init.xavier_uniform_(m.weight)

In [None]:
batch_size = 128
# From here on `num_labels` counts the two binary classes (it was 28 when the
# dataset was loaded); the ROC cells below iterate over it.
num_labels = 2
# Build the loaders with 1-D class-index targets.
train_iter, test_iter = load_data(
    batch_size,
    X_train,
    binarize_labels_torch(y_train),
    X_test,
    binarize_labels_torch(y_test),
)

# Matches the second dimension of `embedding_matrix` (100).
embed_size = 100
# One convolution per (kernel size, channel count) pair.
kernel_sizes = [3, 4, 5]
nums_channels = [100, 100, 100]
# The embedding tables get `num_words` rows, the same as `embedding_matrix`.
net = TextCNN(num_words, embed_size, kernel_sizes, nums_channels)

# Xavier-initialize the linear and convolutional layers.
net.apply(init_weights)

# Copy the GloVe vectors into both embedding tables; `embedding` stays
# trainable, `constant_embedding` is frozen.
embeds = torch.Tensor(embedding_matrix)
net.embedding.weight.data.copy_(embeds)
net.constant_embedding.weight.data.copy_(embeds)
net.constant_embedding.weight.requires_grad = False

## Train model

In [None]:
learning_rate = 0.001
num_epochs = 3  # model seems to overfit after 3 epochs
trainer = torch.optim.Adam(net.parameters(), lr=learning_rate)
# reduction="none" keeps one loss value per example; the training code sums
# them per batch itself.
loss = torch.nn.CrossEntropyLoss(reduction="none")
# The held-out test split is also the per-epoch evaluation set.
train(
    net,
    train_iter,
    test_iter,
    loss,
    trainer,
    num_epochs,
    try_all_gpus(),
    early_stopping=True,
)

## Evaluate model

In [None]:
# Switch to evaluation mode so the model's dropout layer is inactive during inference.
net.eval()

In [None]:
# Collect the raw model outputs for the whole test set, batch by batch.
preds = []
with torch.no_grad():
    # i: batch of inputs, j: batch of labels (not needed here)
    for i, j in iter(test_iter):
        # Move the inputs to the GPU when one is available.
        if torch.cuda.is_available():
            x = torch.as_tensor(i, device=torch.device("cuda"))
        else:
            x = i
        out_data = net(x).cpu().detach().numpy()
        # `extend` appends the rows of the batch one by one.
        preds.extend(out_data)
# One row per test example, one column per class score.
preds = np.array(preds)

In [None]:
# One-hot encoded test labels, one column per class, for the metric cells below.
y_test_scikitlearn = binarize_labels_scikitlearn(y_test)

In [None]:
# Per-class ROC curve and area under it, keyed by class index. The model's
# score column for a class is compared with that class's one-hot column.
false_positive_rate, true_positive_rate, roc_auc = {}, {}, {}
for i in range(num_labels):
    false_positive_rate[i], true_positive_rate[i], _ = sklearn.metrics.roc_curve(
        y_test_scikitlearn[:, i], preds[:, i]
    )
    roc_auc[i] = sklearn.metrics.auc(false_positive_rate[i], true_positive_rate[i])

In [None]:
# Plot one ROC curve per class, with its AUC in the legend.
plt.figure()
lw = 2  # line width shared by all curves
for i in range(num_labels):
    plt.plot(
        false_positive_rate[i],
        true_positive_rate[i],
        lw=lw,
        label="ROC curve (area = %0.2f) for %i" % (roc_auc[i], i),
    )
# Dashed diagonal as visual reference.
plt.plot([0, 1], [0, 1], color="navy", lw=lw, linestyle="--")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver operating characteristic example")
plt.legend(loc="lower right")
plt.show()

In [None]:
# Per-class precision/recall/F1. Both the one-hot labels and the model scores
# are reduced to class indices with arg-max (index 0 is named "emotional",
# index 1 "neutral").
print(
    sklearn.metrics.classification_report(
        y_test_scikitlearn.argmax(-1),
        preds.argmax(-1),
        target_names=["emotional", "neutral"],
    )
)

In [None]:
# Confusion matrix of true vs. predicted class indices.
cm = sklearn.metrics.confusion_matrix(y_test_scikitlearn.argmax(-1), preds.argmax(-1))
disp = sklearn.metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["emotional", "neutral"])
disp.plot()
# Rotate the x tick labels by 45 degrees.
ax = plt.gca()
ax.tick_params(axis="x", labelrotation=45)