# Setup

In [1]:
!pip install tensorflow transformers pytorch_lightning datasets evaluate tensorflow_text tensorflow_hub > /dev/null 2>&1

In [2]:
!nvidia-smi

Thu Dec  5 01:35:41 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   60C    P8              10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [3]:
import pytorch_lightning as pl
import torch
import torch.nn as nn
import torch.nn.functional as F
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text

from torch.utils.data import DataLoader
from datasets import Dataset, load_dataset
import evaluate
import numpy as np

from typing import List, Dict

In [4]:
pl.seed_everything(445326, workers=True)

INFO:lightning_fabric.utilities.seed:Seed set to 445326


445326

# Sentence Embeddings

- In the original notebook the link to the model points to TensorFlow Hub, but given that Google acquired Kaggle in 2017, now Kaggle works as the TF Hub and the repository for the Universal Sentence Encoder (USE) is this [USE in Kaggle](https://www.kaggle.com/models/google/universal-sentence-encoder/tensorFlow2/multilingual-large).

In [6]:
MODEL_PATH = "https://www.kaggle.com/models/google/universal-sentence-encoder/TensorFlow2/multilingual-large/2"

In [7]:
encoder = hub.load(MODEL_PATH)

In [8]:
def embed_text(text: List[str]) -> List[np.ndarray]:
    vectors = encoder(text)
    return [vector.numpy() for vector in vectors]

# Data

We'll use the `yelp_polarity` dataset which has a `Text` field with a given text and its label called `Label`.

In [9]:
class YelpDataModule(pl.LightningDataModule):
    def __init__(self,
                 batch_size: int = 32,
                 num_workers: int = 2):
        super().__init__()
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.pin_memory = torch.cuda.is_available()

    def prepare_data(self):
        self.test_ds = load_dataset('yelp_polarity', split="test[:2%]")
        self.train_ds = load_dataset('yelp_polarity', split="train[:2%]")
        self.val_ds = load_dataset('yelp_polarity', split="train[99%:]")

        self.label_names = self.train_ds.unique("label")
        label2int = {str(label): n for n, label in enumerate(self.label_names)}
        self.encoder = encoder_factory(label2int)

    def setup(self):
        # Compute embeddings in batches, so that they fit in the GPU's RAM.
        self.train = self.train_ds.map(self.encoder, batched=True, batch_size=self.batch_size)
        self.train.set_format(type="torch", columns=["embedding", "label"],
                              output_all_columns=True)

        self.val = self.val_ds.map(self.encoder, batched=True, batch_size=self.batch_size)
        self.val.set_format(type="torch", columns=["embedding", "label"],
                            output_all_columns=True)

        self.test = self.test_ds.map(self.encoder, batched=True, batch_size=self.batch_size)
        self.test.set_format(type="torch", columns=["embedding", "label"],
                             output_all_columns=True)

    def train_dataloader(self):
        return DataLoader(self.train,
                          batch_size=self.batch_size,
                          num_workers=self.num_workers,
                          pin_memory=self.pin_memory,
                          shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.val,
                          batch_size=self.batch_size,
                          num_workers=self.num_workers,
                          pin_memory=self.pin_memory)

    def test_dataloader(self):
        return DataLoader(self.test,
                          batch_size=self.batch_size,
                          num_workers=self.num_workers)


def encoder_factory(label2int: Dict[str, int]):
    def encode(batch):
        batch["embedding"] = embed_text(batch["text"])
        batch["label"] = [label2int[str(x)] for x in batch["label"]]
        return batch

    return encode

In [12]:
data = YelpDataModule()

In [13]:
data.prepare_data()

In [14]:
data.setup()



Map:   0%|          | 0/11200 [00:00<?, ? examples/s]

Map:   0%|          | 0/5600 [00:00<?, ? examples/s]

Map:   0%|          | 0/760 [00:00<?, ? examples/s]

In [15]:
print(len(data.train))
print(len(data.val))
print(len(data.test))

11200
5600
760


# Model

- A model worked on Pytorch Lightning is slightly different. It is described here:

![](https://drive.google.com/uc?export=download&id=1qcOpJ0NlOfmHRo9Njf_e-aSw37JNgJIL)

## Multilingual binary classifier

In [16]:
import evaluate

In [17]:
class Model(pl.LightningModule):
    def __init__(self,
                 hidden_dims: List[int] = [768, 128],
                 dropout_prob: float = 0.5,
                 learning_rate: float = 1e-3):
        super().__init__()
        self.train_acc = evaluate.load("accuracy")
        self.val_acc = evaluate.load("accuracy")
        self.test_acc = evaluate.load("accuracy")
        self.hidden_dims = hidden_dims
        self.dropout_prob = dropout_prob
        self.learning_rate = learning_rate

        self.embedding_dim = 512

        layers = []
        prev_dim = self.embedding_dim

        if dropout_prob > 0:
            layers.append(nn.Dropout(dropout_prob))

        for h in hidden_dims:
            layers.append(nn.Linear(prev_dim, h))
            prev_dim = h
            if dropout_prob > 0:
                layers.append(nn.Dropout(dropout_prob))
            layers.append(nn.ReLU())
            if dropout_prob > 0:
                layers.append(nn.Dropout(dropout_prob))
        # output layer
        layers.append(nn.Linear(prev_dim, 2))

        self.layers = nn.Sequential(*layers)

    def forward(self, x):
        # x will be a batch of mUSE vectors
        logits = self.layers(x)
        return logits

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
        return optimizer

    def __compute_loss(self, batch):
        x, y = batch["embedding"], batch["label"]
        logits = self(x)
        preds = torch.argmax(logits, dim=1).detach().cpu().numpy()
        loss = F.cross_entropy(logits, y)
        return loss, preds, y

    def training_step(self, batch, batch_idx):
        loss, preds, y = self.__compute_loss(batch)
        self.train_acc.add_batch(predictions=preds, references=y)
        acc = self.train_acc.compute()["accuracy"]
        values = {"train_loss": loss, "train_accuracy": acc}
        self.log_dict(values, on_step=True, on_epoch=True,
                      prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        loss, preds, y = self.__compute_loss(batch)
        self.val_acc.add_batch(predictions=preds, references=y)
        acc = self.val_acc.compute()["accuracy"]
        values = {"val_loss": loss, "val_accuracy": acc}
        self.log_dict(values, on_step=True, on_epoch=True,
                      prog_bar=True, logger=True)
        return loss

    def test_step(self, batch, batch_idx):
        loss, preds, y = self.__compute_loss(batch)
        self.test_acc.add_batch(predictions=preds, references=y)
        acc = self.test_acc.compute()["accuracy"]
        values = {"test_loss": loss, "test_accuracy": acc}
        self.log_dict(values, on_step=False, on_epoch=True,
                      prog_bar=True, logger=True)
        return loss

## Train

In [18]:
model = Model()

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [29]:
MAX_EPOCHS = 5

checkpoint_callback = pl.callbacks.ModelCheckpoint(
    monitor="val_loss",
    dirpath="model",
    filename="yelp-sentiment-multilingual-{epoch:02d}-{val_loss:.3f}",
    save_top_k=3,
    mode="min")

trainer = pl.Trainer(num_nodes=1, max_epochs=MAX_EPOCHS,
                     callbacks=[checkpoint_callback])

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [30]:
trainer.fit(model, data.train_dataloader(), data.val_dataloader())

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /content/model exists and is not empty.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name   | Type       | Params | Mode 
----------------------------------------------
0 | layers | Sequential | 492 K  | train
----------------------------------------------
492 K     Trainable params
0         Non-trainable params
492 K     Total params
1.971     Total estimated model params size (MB)
11        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=5` reached.


## Test

In [31]:
trainer.test(dataloaders=data.test_dataloader())

INFO:pytorch_lightning.utilities.rank_zero:Restoring states from the checkpoint path at /content/model/yelp-sentiment-multilingual-epoch=01-val_loss=0.246.ckpt
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loaded model weights from the checkpoint at /content/model/yelp-sentiment-multilingual-epoch=01-val_loss=0.246.ckpt


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss': 0.32891881465911865, 'test_accuracy': 0.8539473414421082}]

# Inference

## Example

In [32]:
best_model = Model.load_from_checkpoint(checkpoint_callback.best_model_path)

In [39]:
def predict(text: List[str]):
    embeddings = torch.Tensor(embed_text(text))
    embeddings = embeddings.to(best_model.device)
    print(best_model.device)

    logits = best_model(embeddings)
    preds = torch.argmax(logits, dim=1).detach().cpu().numpy()
    scores = torch.softmax(logits, dim=1).detach().cpu().numpy()

    results = []
    for t, best_index, score_pair in zip(text, preds, scores):
        results.append({
            "text": t,
            "label": "positive" if best_index == 1 else "negative",
            "score": score_pair[best_index]
        })
    return results

In [40]:
predict(["I love that restaurant!", "I hate italian food."])

cuda:0


[{'text': 'I love that restaurant!', 'label': 'positive', 'score': 0.99998915},
 {'text': 'I hate italian food.', 'label': 'negative', 'score': 0.99653685}]

## Inference on non-English text

Since we used USEm embeddings, we should be able to predict sentiment for non-English languages. Let's try it out. mUSE supports 16 languages:

Arabic, Chinese-simplified, Chinese-traditional, English, French, German, Italian, Japanese, Korean, Dutch, Polish, Portuguese, Spanish, Thai, Turkish, Russian

In [41]:
from pprint import PrettyPrinter
pp = PrettyPrinter()

In [42]:
english_text = "Our server was horrid. He messed up the order and didn't even apologize when he spilled wine on my sister's hair!"
german_translation = "Unser Server war schrecklich. Er hat die Bestellung durcheinander gebracht und sich nicht einmal entschuldigt, als er Wein in die Haare meiner Schwester verschüttet hat!"

In [43]:
pp.pprint(predict([english_text, german_translation]))

cuda:0
[{'label': 'negative',
  'score': 0.8970209,
  'text': "Our server was horrid. He messed up the order and didn't even "
          "apologize when he spilled wine on my sister's hair!"},
 {'label': 'negative',
  'score': 0.9948944,
  'text': 'Unser Server war schrecklich. Er hat die Bestellung durcheinander '
          'gebracht und sich nicht einmal entschuldigt, als er Wein in die '
          'Haare meiner Schwester verschüttet hat!'}]


Compare predictions for English and Italian. For kicks, let's also see how it performs on a European language that USEm does not support, Finnish.

In [44]:
english_text = "My least favorite film is Showgirls. I hate it so much. In fact, it's so bad that it makes me angry."
italian_translation = "Il mio film meno preferito è Showgirls. Lo odio così tanto. In effetti, è così brutto che mi fa arrabbiare."
finnish_translation = "Minun lempi elokuva on Showgirls. Vihaan sitä niin paljon. Itse asiassa se on niin paha, että se saa minut vihaiseksi."
spanish_translation = "Mi película menos favorita es Showgirls. La odio tanto. De hecho, es tan mala que me enoja."

In [45]:
pp.pprint(predict([english_text, italian_translation, finnish_translation, spanish_translation]))

cuda:0
[{'label': 'negative',
  'score': 0.87167186,
  'text': 'My least favorite film is Showgirls. I hate it so much. In fact, '
          "it's so bad that it makes me angry."},
 {'label': 'negative',
  'score': 0.8921931,
  'text': 'Il mio film meno preferito è Showgirls. Lo odio così tanto. In '
          'effetti, è così brutto che mi fa arrabbiare.'},
 {'label': 'positive',
  'score': 0.547113,
  'text': 'Minun lempi elokuva on Showgirls. Vihaan sitä niin paljon. Itse '
          'asiassa se on niin paha, että se saa minut vihaiseksi.'},
 {'label': 'negative',
  'score': 0.9480996,
  'text': 'Mi película menos favorita es Showgirls. La odio tanto. De hecho, '
          'es tan mala que me enoja.'}]


USEm even works on Finnish (not so well, but hey, it works). But why? Without digging into things, it would be difficult to know for sure. Our guess is that in the training process, the subword units used in USEm's tokenization lets the Transformer learn which subword units are used across languages. The layers we added onto USEm, which are trained for classification, lets the model learn which subword units are related to positive or negative sentiment. It must be that the subword units used in Finnish are close enough to those in one of the 16 languages that USEm supports.