In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m46.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m53.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m59.9 MB/s[0m eta [36m0:00:00[0m
Col

In [3]:
from transformers import BartTokenizer

model_name = 'facebook/bart-base'

tokenizer = BartTokenizer.from_pretrained(model_name)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

In [4]:
import pandas as pd

dataframe = pd.read_parquet('/content/drive/MyDrive/toxicity.parquet')
dataframe.head()

Unnamed: 0,text,labels
0,"If Alkar is flooding her with psychic waste, t...",0.014195
1,"if Alkar floods her with her mental waste, it ...",0.981983
2,Now you're getting nasty.,0.065473
3,you're becoming disgusting.,0.999039
4,"Well, we could spare your life, for one.",0.213313


In [5]:
texts = dataframe.text.values
labels = dataframe.labels.values

In [6]:
def get_tokenized(text: str):
    encoded_dict = tokenizer.encode_plus(
                        text,
                        add_special_tokens = True,
                        max_length = 512,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        truncation=True,
                        return_tensors = 'pt'
                        )
    return encoded_dict['input_ids']

In [7]:
input_ids = list(map(get_tokenized, texts))



In [8]:
import torch

input_ids = torch.cat(input_ids, dim=0)
labels = torch.from_numpy(labels).unsqueeze(1).float()

In [9]:
from torch.utils.data import TensorDataset, random_split

dataset = TensorDataset(input_ids, labels)

generator = torch.Generator().manual_seed(123)

train_dataset, validation_dataset, test_dataset = random_split(dataset, [0.8, 0.1, 0.1], generator=generator)

In [10]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 16

train_dataloader = DataLoader(
    train_dataset,
    sampler = RandomSampler(train_dataset, generator=generator),
    batch_size = batch_size
)

validation_dataloader = DataLoader(
    validation_dataset,
    sampler = SequentialSampler(validation_dataset),
    batch_size = batch_size
)

test_dataloader = DataLoader(
    test_dataset,
    sampler = SequentialSampler(test_dataset),
    batch_size = batch_size
)

In [11]:
from torch import nn
import torch.nn.functional as F


class ToxicityEvaluation(nn.Module):
    def __init__(self):
        super(ToxicityEvaluation, self).__init__()
        self.embedding = nn.Embedding(tokenizer.vocab_size, 100)
        self.conv1 = nn.Conv1d(100, 128, kernel_size=3)
        self.conv2 = nn.Conv1d(128, 256, kernel_size=3)
        self.fc1 = nn.Linear(256, 128)
        self.fc2 = nn.Linear(128, 1)

    def forward(self, x):
        x = self.embedding(x)
        x = x.permute(0, 2, 1)
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.max_pool1d(x, kernel_size=x.size(2))
        x = x.squeeze(2)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [12]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = ToxicityEvaluation().to(device)

In [13]:
from torch.optim import Adam

optimizer = Adam(model.parameters())

In [14]:
loss = nn.MSELoss()

In [17]:
import random
import numpy as np
from tqdm import tqdm

seed_val = 123

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
best_loss = float('inf')

epochs = 10

for epoch in range(epochs):
    total_train_loss = 0

    train_loop = tqdm(
        train_dataloader, total=len(train_dataloader), desc=f"Epoch {epoch}"
    )

    model.train()

    for batch in train_loop:
        b_input_ids = batch[0].to(device)
        b_labels = batch[1].to(device)

        model.zero_grad()

        output = model(b_input_ids)

        _loss = loss(output, b_labels)

        total_train_loss += _loss.item()
        _loss.backward()

        optimizer.step()
        train_loop.set_postfix({"loss": _loss.item()})

    avg_train_loss = total_train_loss / len(train_dataloader)


    model.eval()
    total_eval_loss = 0
    nb_eval_steps = 0

    validation_loop = tqdm(validation_dataloader, total=len(validation_dataloader), desc="Val")

    for batch in validation_loop:
        b_input_ids = batch[0].to(device)
        b_labels = batch[1].to(device)

        with torch.no_grad():
            output = model(b_input_ids)

        _loss = loss(output, b_labels)

        total_eval_loss += _loss.item()

    avg_val_loss = total_eval_loss / len(validation_dataloader)
    if avg_val_loss > best_loss:
        torch.save(model.state_dict(), 'best.pt')


Epoch 0: 100%|██████████| 28889/28889 [04:39<00:00, 103.18it/s, loss=0.108]
Val: 100%|██████████| 3612/3612 [00:05<00:00, 688.50it/s]
Epoch 1: 100%|██████████| 28889/28889 [04:39<00:00, 103.38it/s, loss=0.089]
Val: 100%|██████████| 3612/3612 [00:05<00:00, 690.23it/s]
Epoch 2: 100%|██████████| 28889/28889 [04:39<00:00, 103.23it/s, loss=0.0552]
Val: 100%|██████████| 3612/3612 [00:05<00:00, 694.08it/s]
Epoch 3: 100%|██████████| 28889/28889 [04:39<00:00, 103.23it/s, loss=0.024]
Val: 100%|██████████| 3612/3612 [00:05<00:00, 685.38it/s]
Epoch 4: 100%|██████████| 28889/28889 [04:40<00:00, 103.03it/s, loss=0.0196]
Val: 100%|██████████| 3612/3612 [00:05<00:00, 616.09it/s]
Epoch 5: 100%|██████████| 28889/28889 [04:38<00:00, 103.89it/s, loss=0.127]
Val: 100%|██████████| 3612/3612 [00:05<00:00, 618.08it/s]
Epoch 6: 100%|██████████| 28889/28889 [04:38<00:00, 103.55it/s, loss=0.00505]
Val: 100%|██████████| 3612/3612 [00:05<00:00, 639.47it/s]
Epoch 7: 100%|██████████| 28889/28889 [04:38<00:00, 103.65

In [24]:
model(get_tokenized('you are so pretty person').to(device))

tensor([[0.2071]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [25]:
torch.save(model.state_dict(), 'best.pt')