In [23]:
# Uncomment the two lines below to install the dependencies if the runtime lacks them.
# !pip install --quiet pytorch_lightning
# !pip install --quiet transformers

In [5]:
import os
import warnings
import numpy as np
import pandas as pd
from pathlib import Path

import torch
from torch import nn, optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchmetrics import AUROC, Accuracy

from pytorch_lightning import LightningDataModule, LightningModule, Trainer, seed_everything
from pytorch_lightning.callbacks.progress import TQDMProgressBar
from pytorch_lightning.loggers import CSVLogger

from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModel,
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup,
)

# Fix the global random seed so runs are repeatable (the log line below confirms 42 is set).
seed_everything(42)

INFO:lightning_lite.utilities.seed:Global seed set to 42


42

In [6]:
# data = pd.read_csv('/content/drive/MyDrive/DLS/jigsaw-toxic-comment-train.csv.zip')
# data.tail()
# data = pd.read_csv('../input/jigsaw-multilingual-toxic-comment-classification/jigsaw-unintended-bias-train.csv')
# data.head()
# data.shape
# pd.read_csv("../input/jigsaw-multilingual-toxic-comment-classification/validation.csv")

In [7]:
class JigsawDataset(Dataset):
    """Map-style dataset that tokenizes one comment per item.

    Args:
        comments: Indexable sequence of raw comment texts; each entry is passed
            through ``str`` before tokenization.
        labels: Indexable sequence of numeric labels aligned with ``comments``.
        tokenizer: Object exposing ``encode_plus`` (e.g. a Hugging Face tokenizer).
        max_seq_length: Length every encoded sequence is truncated/padded to.
    """

    def __init__(self, comments, labels, tokenizer, max_seq_length):
        self.comments = comments
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length

    def __len__(self):
        """Return the number of comments."""
        return len(self.comments)

    def __getitem__(self, index):
        """Tokenize comment ``index`` and return its tensors.

        Returns:
            dict with ``input_ids``, ``attention_mask``, ``token_type_ids``
            (the tokenizer output with its leading batch axis removed) and
            ``labels`` (a float scalar tensor).
        """
        # Collapse every run of whitespace (newlines, tabs, ...) into one space.
        comment_text = " ".join(str(self.comments[index]).split())
        encoding = self.tokenizer.encode_plus(
            comment_text,
            add_special_tokens=True,
            return_token_type_ids=True,
            return_attention_mask=True,
            truncation=True,
            padding='max_length',
            max_length=self.max_seq_length,
            return_tensors='pt',
        )

        # With return_tensors='pt' a single text is expected to come back with a
        # leading batch axis of size 1. Remove only that axis: a bare squeeze()
        # would also drop the sequence axis when max_seq_length == 1 and hand
        # the collate function 0-d tensors.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'token_type_ids': encoding["token_type_ids"].squeeze(0),
            'labels': torch.tensor(self.labels[index], dtype=torch.float),
        }

In [8]:
class JigsawDataModule(LightningDataModule):
    """Builds the train/validation ``JigsawDataset`` objects and their loaders.

    Args:
        model_name_or_path: Name or path passed to ``AutoTokenizer.from_pretrained``.
        max_seq_length: Token length every comment is truncated/padded to.
        train_batch_size: Batch size of the training loader.
        valid_batch_size: Batch size of the validation loader.
        num_workers: Worker count for both loaders.
        input_path: Directory holding ``jigsaw-toxic-comment-train.csv.zip``
            and ``validation.csv.zip``. Defaults to the original hard-coded
            Drive folder.
        train_size: Maximum number of shuffled training rows to keep;
            ``None`` keeps all of them.
    """

    def __init__(
        self,
        model_name_or_path,
        max_seq_length=256,
        train_batch_size=128,
        valid_batch_size=64,
        num_workers=1,
        input_path="/content/drive/MyDrive/DLS/",
        train_size=100000,
    ):
        super().__init__()

        self.input_path = Path(input_path)
        self.model_name_or_path = model_name_or_path
        self.max_seq_length = max_seq_length
        self.train_batch_size = train_batch_size
        self.valid_batch_size = valid_batch_size
        self.num_workers = num_workers
        self.train_size = train_size

        self.num_labels = 1
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path, use_fast=True)

    def prepare_data(self):
        """Instantiate the tokenizer once; the result is discarded (presumably to warm the download cache)."""
        AutoTokenizer.from_pretrained(self.model_name_or_path, use_fast=True)

    def setup(self, stage=None):
        """Create the datasets required by ``stage``.

        ``"fit"`` / ``None`` build both datasets. ``"validate"`` builds only the
        validation dataset, so ``trainer.validate`` also works on a datamodule
        that was never set up for fitting.
        """
        if stage in ("fit", None):
            df_train = pd.read_csv(
                self.input_path / "jigsaw-toxic-comment-train.csv.zip",
                usecols=["comment_text", "toxic"],
            ).fillna("none")
            # Shuffle first, then cap the number of rows.
            df_train = df_train.sample(frac=1).reset_index(drop=True)
            if self.train_size is not None:
                df_train = df_train.head(self.train_size)

            self.train = JigsawDataset(
                comments=df_train.comment_text.values,
                labels=df_train.toxic.values,
                tokenizer=self.tokenizer,
                max_seq_length=self.max_seq_length,
            )

        if stage in ("fit", "validate", None):
            df_valid = pd.read_csv(self.input_path / "validation.csv.zip")

            self.valid = JigsawDataset(
                comments=df_valid.comment_text.values,
                labels=df_valid.toxic.values,
                tokenizer=self.tokenizer,
                max_seq_length=self.max_seq_length,
            )

    def train_dataloader(self):
        """Shuffled training loader; the last incomplete batch is dropped."""
        return DataLoader(
            self.train,
            batch_size=self.train_batch_size,
            num_workers=self.num_workers,
            pin_memory=True,
            shuffle=True,
            drop_last=True,
        )

    def val_dataloader(self):
        """Validation loader in file order; every row is kept."""
        return DataLoader(
            self.valid,
            batch_size=self.valid_batch_size,
            num_workers=self.num_workers,
            pin_memory=True,
            shuffle=False,
            drop_last=False,
        )

In [9]:
class JigsawModel(LightningModule):
    """Transformer encoder with a mean+max pooled single-logit head.

    Args:
        model_name_or_path: Name or path passed to ``AutoConfig`` / ``AutoModel``.
        num_labels: Forwarded to ``AutoConfig.from_pretrained``. The head
            itself always emits one logit per example.
        learning_rate: AdamW learning rate.
        adam_epsilon: AdamW epsilon.
        warmup_steps: Warm-up steps of the linear schedule.
        weight_decay: Weight decay applied to every parameter except biases
            and LayerNorm parameters.
    """

    def __init__(
        self,
        model_name_or_path,
        num_labels,
        learning_rate=2e-5,
        adam_epsilon=1e-8,
        warmup_steps=0,
        weight_decay=0.0001,
    ):
        super().__init__()

        self.save_hyperparameters()

        self.learning_rate = learning_rate
        self.model_name = model_name_or_path
        self.config = AutoConfig.from_pretrained(model_name_or_path, num_labels=num_labels)
        self.model = AutoModel.from_pretrained(model_name_or_path, config=self.config)

        self.dropout = nn.Dropout(0.3)
        # Mean- and max-pooled states are concatenated, hence twice the hidden
        # size. Taking it from the config (768 for the checkpoints listed in
        # this notebook) keeps the head valid for other encoder widths.
        self.out = nn.Linear(self.config.hidden_size * 2, 1)

        self.auc = AUROC(task='binary')
        self.accuracy = Accuracy(task='binary')

    def forward(self, input_ids, attention_masks, token_type_ids=None):
        """Return one raw logit per example, shape ``(batch, 1)``."""
        if token_type_ids is not None:
            output = self.model(input_ids=input_ids, attention_mask=attention_masks, token_type_ids=token_type_ids)
        else:
            output = self.model(input_ids=input_ids, attention_mask=attention_masks)

        hidden = output.last_hidden_state
        # NOTE(review): both poolings run over every position, padding included;
        # the attention mask is not applied here. Left unchanged so existing
        # checkpoints keep producing the same outputs.
        mean_pooled = torch.mean(hidden, 1)
        max_pooled, _ = torch.max(hidden, 1)
        pooled = torch.cat((mean_pooled, max_pooled), 1)

        return self.out(self.dropout(pooled))

    def configure_optimizers(self):
        """AdamW with per-group weight decay plus a per-step linear warm-up schedule."""
        # Use this module's own parameters. The previous code read the
        # notebook-global ``model``, which only worked while that global
        # happened to point at this very instance.
        named_params = list(self.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        grouped_parameters = [
            {
                'params': [p for n, p in named_params if not any(nd in n for nd in no_decay)],
                'weight_decay': self.hparams.weight_decay,
            },
            {
                'params': [p for n, p in named_params if any(nd in n for nd in no_decay)],
                'weight_decay': 0.0,
            },
        ]
        optimizer = optim.AdamW(grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.hparams.warmup_steps,
            num_training_steps=self.trainer.estimated_stepping_batches,
        )
        scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1}
        return [optimizer], [scheduler]

    def loss(self, y_hat, y):
        """Binary cross-entropy on raw logits."""
        return F.binary_cross_entropy_with_logits(y_hat, y)

    def _batch_logits(self, batch):
        """Run the forward pass for a batch dict and return logits of shape ``(batch,)``."""
        # Token type ids are withheld for model names containing "roberta" or
        # "distilbert", exactly as the original step methods did.
        skip_token_types = "roberta" in self.model_name or "distilbert" in self.model_name
        token_type_ids = None if skip_token_types else batch["token_type_ids"]
        logits = self(batch["input_ids"], batch["attention_mask"], token_type_ids)
        # Drop only the trailing singleton axis: a bare squeeze() turns a batch
        # of one example into a 0-d tensor that no longer matches the labels.
        return logits.squeeze(-1)

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        logits = self._batch_logits(batch)
        loss = self.loss(logits, batch["labels"])
        self.log("train_loss", loss, prog_bar=True)

        return loss

    def validation_step(self, batch, batch_idx):
        """Log validation loss and update the AUROC / accuracy metrics."""
        labels = batch["labels"]
        logits = self._batch_logits(batch)

        loss = self.loss(logits, labels)
        self.log("valid_loss", loss, on_step=False, on_epoch=True, prog_bar=True)

        # Give the metrics explicit probabilities and integer targets.
        # torchmetrics decides per update call whether to apply a sigmoid, so
        # raw logits could be stored on mixed scales across batches, and its
        # input validation expects integer ground-truth labels.
        probs = torch.sigmoid(logits)
        targets = (labels >= 0.5).long()

        self.auc.update(probs, targets)
        self.log("valid_auc", self.auc, on_step=False, on_epoch=True, prog_bar=True)

        self.accuracy.update(probs, targets)
        self.log("valid_accuracy", self.accuracy, on_step=False, on_epoch=True, prog_bar=True)

In [6]:
# Checkpoint to fine-tune; the commented lines are alternatives.
MODEL = "bert-base-multilingual-cased"
# MODEL = "bert-base-multilingual-uncased"
# MODEL = "distilbert-base-multilingual-cased"
# MODEL =  "xlm-roberta-base"

EPOCHS = 2
config = dict(
    model_name_or_path = MODEL,
    train_batch_size = 64,
    valid_batch_size = 64,
    max_seq_length = 192,
    # os.cpu_count() may return None when the count cannot be determined,
    # and DataLoader needs an integer worker count.
    num_workers = os.cpu_count() or 1,
)
jigsaw_dm = JigsawDataModule(**config)
jigsaw_dm.setup("fit")

In [7]:
# Single place for the log/checkpoint directory used by the logger and the save below.
LOG_DIR = "../working/logs/"

model = JigsawModel(
    model_name_or_path=MODEL,
    num_labels=jigsaw_dm.num_labels,
)
trainer = Trainer(
    max_epochs=EPOCHS,
    accelerator="auto",
    # "auto" rather than None when no GPU is present: newer Lightning releases
    # reject devices=None, while "auto" lets the accelerator choose.
    devices=1 if torch.cuda.is_available() else "auto",
    callbacks=[TQDMProgressBar(refresh_rate=20)],
    logger=CSVLogger(save_dir=LOG_DIR),
)
trainer.fit(model, jigsaw_dm)

# Make sure the target directory exists before writing the weights.
os.makedirs(LOG_DIR, exist_ok=True)
torch.save(model.state_dict(), os.path.join(LOG_DIR, "model_bert.pth"))

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: Fals

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=2` reached.


In [9]:
# Evaluate the trained model on the datamodule's validation loader; the recorded
# output below reports valid_loss, valid_auc and valid_accuracy.
trainer.validate(model, jigsaw_dm)


INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     Validate metric           DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     valid_accuracy         0.8518750071525574
        valid_auc           0.8600189685821533
       valid_loss           0.5742074251174927
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'valid_loss': 0.5742074251174927,
  'valid_auc': 0.8600189685821533,
  'valid_accuracy': 0.8518750071525574}]

In [13]:
MODEL =  "xlm-roberta-base"

model = JigsawModel(
    model_name_or_path=MODEL,
    num_labels=1,
)
# map_location="cpu" lets a checkpoint saved on a GPU load on a CPU-only runtime.
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
state_dict = torch.load("/content/drive/MyDrive/model.pth", map_location="cpu")
# strict=False tolerates key mismatches; report them instead of dropping them
# silently, since a mismatch leaves the affected weights at their initial values.
load_result = model.load_state_dict(state_dict, strict=False)
if load_result.missing_keys or load_result.unexpected_keys:
    warnings.warn(
        "Checkpoint did not match the model exactly: "
        f"missing keys={load_result.missing_keys}, "
        f"unexpected keys={load_result.unexpected_keys}"
    )

# NOTE(review): `config` is rebuilt here but not used by the code visible in this cell.
config = dict(
    model_name_or_path = MODEL,
    train_batch_size = 64,
    valid_batch_size = 64,
    max_seq_length = 192,
    num_workers = os.cpu_count(),
)
tokenizer =  AutoTokenizer.from_pretrained(MODEL, use_fast=True)

Downloading:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [14]:
def predict(comment, model, tokenizer, max_seq_length=192):
    """Score a single comment with ``model``.

    Args:
        comment: Raw comment text.
        model: Callable module taking ``(input_ids, attention_masks[, token_type_ids])``
            and returning logits. Its ``model_name`` attribute, when present,
            decides whether token type ids are passed.
        tokenizer: Object exposing ``encode_plus`` (e.g. a Hugging Face tokenizer).
        max_seq_length: Length the comment is truncated/padded to.

    Returns:
        CPU tensor holding the sigmoid of the model's squeezed logits.

    Side effects:
        Puts ``model`` into eval mode.
    """
    model.eval()

    # Collapse every run of whitespace into a single space.
    comment_text = " ".join(comment.split())
    encoding = tokenizer.encode_plus(
        comment_text,
        add_special_tokens=True,
        return_token_type_ids=True,
        return_attention_mask=True,
        truncation=True,
        padding='max_length',
        max_length=max_seq_length,
        return_tensors='pt',
    )

    # Run on whatever device the model lives on; fall back to CPU for a
    # parameter-free model.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    # Decide from the model that was actually passed in, not from the
    # notebook-global MODEL, which may name a different checkpoint.
    model_name = getattr(model, "model_name", "")
    skip_token_types = "roberta" in model_name or "distilbert" in model_name

    with torch.no_grad():
        input_ids = encoding['input_ids'].reshape(1, -1).to(device)
        attention_masks = encoding['attention_mask'].reshape(1, -1).to(device)

        if skip_token_types:
            logits = model(input_ids, attention_masks)
        else:
            token_type_ids = encoding["token_type_ids"].reshape(1, -1).to(device)
            logits = model(input_ids, attention_masks, token_type_ids)
        return torch.sigmoid(logits.squeeze()).cpu()

In [24]:
# Example: score one Spanish-language comment. The commented-out assignment
# appears to be an English rendering of the same text.
# comment = "Voucher. I only expose my past. All past time was better, far from it, I would not want to go back 31 years at a particular level. I would pass them over whores. Fernando"
comment = "Vale. Sólo expongo mi pasado. Todo tiempo pasado fue mejor, ni mucho menos, yo no quisiera retroceder 31 años a nivel particular. Las volveria a pasar putas.Fernando "
predict(comment, model, tokenizer)

tensor(0.7136)

In [22]:
# Example: score a second Spanish-language comment. The commented-out assignment
# appears to be an English rendering of the same text.
# comment = "The imbesile ete de la luna doesn't even know, damn it, I'm a bitch!! This is an ecological niche and not a fucking forum of the moon Damn now! add me: metemelamasadentro_ahsi@hotmail.com thank you."
comment = "El imbesil ete dela luna no se entera ni ostias so ijo puta!! esto es un nicho ecológico i no un puto forum dela luna Coño ya!!! agregame: metemelamasadentro_ahsi@hotmail.com acias."
predict(comment, model, tokenizer)

tensor(0.9493)