In [None]:
# Mount Google Drive
from google.colab import drive # import drive from google colab

ROOT = "/content/drive"     # default location for the drive
print(ROOT)                 # print content of ROOT (Optional)

drive.mount(ROOT)           # we mount the google drive at /content/drive

/content/drive
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# cd drive/MyDrive/Code/DETOXIS/Machine-Learning-Tweets-Classification/Bert/Data/

In [None]:
# !curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
# !python pytorch-xla-env-setup.py --apt-packages libomp5 libopenblas-dev

In [None]:
# !apt-get install git-lfs

In [None]:
# !git lfs install
# !git clone https://huggingface.co/bert-base-multilingual-uncased

In [None]:
# !pip install transformers==3

In [1]:
import os
import torch
import pandas as pd
from scipy import stats
import numpy as np

from tqdm import tqdm
from collections import OrderedDict, namedtuple
import torch.nn as nn
from torch.optim import lr_scheduler
import joblib

import logging
import transformers
from transformers import AdamW, get_linear_schedule_with_warmup, get_constant_schedule
import sys
from sklearn import metrics, model_selection

import warnings
import torch_xla
import torch_xla.debug.metrics as met
import torch_xla.distributed.data_parallel as dp
import torch_xla.distributed.parallel_loader as pl
import torch_xla.utils.utils as xu
import torch_xla.core.xla_model as xm
import torch_xla.distributed.xla_multiprocessing as xmp
import torch_xla.test.test_utils as test_utils
import warnings



warnings.filterwarnings("ignore")

class AverageMeter:
    """
    Computes and stores the average and current value
    """
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


class BERTBaseUncased(nn.Module):
    def __init__(self, bert_path):
        super(BERTBaseUncased, self).__init__()
        self.bert_path = bert_path
        self.bert = transformers.BertModel.from_pretrained(self.bert_path)
        self.bert_drop = nn.Dropout(0.3)
        self.out = nn.Linear(768 * 2, 1)

    def forward(
            self,
            ids,
            mask,
            token_type_ids
    ):
        o1, o2 = self.bert(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids)
        
        apool = torch.mean(o1, 1)
        mpool, _ = torch.max(o1, 1)
        cat = torch.cat((apool, mpool), 1)

        bo = self.bert_drop(cat)
        p2 = self.out(bo)
        return p2


class BERTDatasetTraining:
    def __init__(self, comment_text, targets, tokenizer, max_length):
        self.comment_text = comment_text
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.targets = targets

    def __len__(self):
        return len(self.comment_text)

    def __getitem__(self, item):
        comment_text = str(self.comment_text[item])
        comment_text = " ".join(comment_text.split())

        inputs = self.tokenizer.encode_plus(
            comment_text,
            None,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_length,
        )
        ids = inputs["input_ids"]
        token_type_ids = inputs["token_type_ids"]
        mask = inputs["attention_mask"]
        
        padding_length = self.max_length - len(ids)
        
        ids = ids + ([0] * padding_length)
        mask = mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)
        
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[item], dtype=torch.float)
        }

ModuleNotFoundError: ignored

In [None]:
# mx = BERTBaseUncased(bert_path="../content/bert-base-multilingual-uncased/")
# df_train1 = pd.read_csv("../content/drive/MyDrive/Code/DETOXIS/Machine-Learning-Tweets-Classification/Bert/Data/jigsaw-toxic-comment-train.csv", usecols=["comment_text", "toxic"]).fillna("none")
# df_train2 = pd.read_csv("../content/drive/MyDrive/Code/DETOXIS/Machine-Learning-Tweets-Classification/Bert/Data/jigsaw-unintended-bias-train.csv", usecols=["comment_text", "toxic"]).fillna("none")
# df_train_full = pd.concat([df_train1, df_train2], axis=0).reset_index(drop=True)
# df_train = df_train_full.sample(frac=1).reset_index(drop=True).head(200000)

# df_valid = pd.read_csv('../content/drive/MyDrive/Code/DETOXIS/Machine-Learning-Tweets-Classification/Bert/Data/validation.csv', 
#                        usecols=["comment_text", "toxic"])

# df_train = pd.concat([df_train, df_valid], axis=0).reset_index(drop=True)
# df_train = df_train.sample(frac=1).reset_index(drop=True)

In [None]:
# df_valid.head()

In [None]:
# df_train.head()

In [None]:
# df_train['toxic'] = df_train['toxic'].apply(lambda x: 1 if x>= 0.5 else 0)

In [None]:
# df_train.head()

In [None]:
mx = BERTBaseUncased(bert_path="../content/bert-base-multilingual-uncased/")
df_train = pd.read_csv("../content/drive/MyDrive/Code/DETOXIS/Data/train.csv", usecols=["comment", "toxicity"]).fillna("none")
df_train = df_train.rename(columns={"comment": "comment_text", "toxicity": "toxic"})

df_valid = df_train.loc[2770:,['comment_text','toxic']].reset_index(drop=True)
df_valid = df_valid.sample(frac=1).reset_index(drop=True)

df_train = df_train.sample(frac=1).reset_index(drop=True).head(200000)

In [None]:
df_train.head()

Unnamed: 0,comment_text,toxic
0,"Expulsión ya, o políticos que lo toleran fuera.",0
1,"Esta bien que se debate sobre la inmigración, ...",0
2,Hemos llegado a una situación kafkiana en este...,0
3,Suma y sigue...,0
4,Quien se pica ajos come.,0


In [None]:
df_valid.head()

Unnamed: 0,comment_text,toxic
0,Lo que no se es como no pone varias lineas de ...,1
1,"poco le han dado, no justifico la violencia p...",1
2,Venia a esto,0
3,Y esto es lo que pasa con la impunidad y el bu...,1
4,"Que le follen al Islam, ya de por si es una re...",1


In [None]:
def _run():
    def loss_fn(outputs, targets):
        return nn.BCEWithLogitsLoss()(outputs, targets.view(-1, 1))

    def train_loop_fn(data_loader, model, optimizer, device, scheduler=None):
        model.train()
        for bi, d in enumerate(data_loader):
            ids = d["ids"]
            mask = d["mask"]
            token_type_ids = d["token_type_ids"]
            targets = d["targets"]

            ids = ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            token_type_ids = token_type_ids.to(device, dtype=torch.long)
            targets = targets.to(device, dtype=torch.float)

            optimizer.zero_grad()
            outputs = model(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids
            )

            loss = loss_fn(outputs, targets)
            if bi % 10 == 0:
                xm.master_print(f'bi={bi}, loss={loss}')

            loss.backward()
            xm.optimizer_step(optimizer)
            if scheduler is not None:
                scheduler.step()

    def eval_loop_fn(data_loader, model, device):
        model.eval()
        fin_targets = []
        fin_outputs = []
        for bi, d in enumerate(data_loader):
            ids = d["ids"]
            mask = d["mask"]
            token_type_ids = d["token_type_ids"]
            targets = d["targets"]

            ids = ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            token_type_ids = token_type_ids.to(device, dtype=torch.long)
            targets = targets.to(device, dtype=torch.float)

            outputs = model(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids
            )

            targets_np = targets.cpu().detach().numpy().tolist()
            # outputs_np = outputs.cpu().detach().numpy().tolist()
            outputs_np = torch.sigmoid(outputs).cpu().detach().numpy().tolist()
            fin_targets.extend(targets_np)
            fin_outputs.extend(outputs_np)    

        return fin_outputs, fin_targets

    
    MAX_LEN = 192
    TRAIN_BATCH_SIZE = 8
    EPOCHS = 4

    tokenizer = transformers.BertTokenizer.from_pretrained("../content/bert-base-multilingual-uncased/", do_lower_case=True)

    train_targets = df_train.toxic.values
    valid_targets = df_valid.toxic.values

    train_dataset = BERTDatasetTraining(
        comment_text=df_train.comment_text.values,
        targets=train_targets,
        tokenizer=tokenizer,
        max_length=MAX_LEN
    )

    train_sampler = torch.utils.data.distributed.DistributedSampler(
          train_dataset,
          num_replicas=xm.xrt_world_size(),
          rank=xm.get_ordinal(),
          shuffle=True)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=TRAIN_BATCH_SIZE,
        sampler=train_sampler,
        drop_last=True,
        num_workers=1
    )

    valid_dataset = BERTDatasetTraining(
        comment_text=df_valid.comment_text.values,
        targets=valid_targets,
        tokenizer=tokenizer,
        max_length=MAX_LEN
    )

    valid_sampler = torch.utils.data.distributed.DistributedSampler(
          valid_dataset,
          num_replicas=xm.xrt_world_size(),
          rank=xm.get_ordinal(),
          shuffle=False)

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=16,
        sampler=valid_sampler,
        drop_last=False,
        num_workers=1
    )

    device = xm.xla_device()
    model = mx.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]

    lr = 0.4 * 1e-5 * xm.xrt_world_size()
    num_train_steps = int(len(train_dataset) / TRAIN_BATCH_SIZE / xm.xrt_world_size() * EPOCHS)
    xm.master_print(f'num_train_steps = {num_train_steps}, world_size={xm.xrt_world_size()}')

    optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps
    )

    for epoch in range(EPOCHS):
        para_loader = pl.ParallelLoader(train_data_loader, [device])
        train_loop_fn(para_loader.per_device_loader(device), model, optimizer, device, scheduler=scheduler)

        para_loader = pl.ParallelLoader(valid_data_loader, [device])
        o, t = eval_loop_fn(para_loader.per_device_loader(device), model, device)
        
        outputs = list(np.where(np.array(o) > 0.5, 1, 0))
        accuracy = metrics.accuracy_score(t, outputs)

        
        # print(f" output = {o}")
        # print(f" output transformede = {outputs}")
        # print(f"Accuracy Score = {accuracy}")       
        

        xm.save(model.state_dict(), "model.bin")
        # auc = metrics.roc_auc_score(np.array(t) > 0.5, o)
        # xm.master_print(f'AUC = {auc}')
        xm.master_print(f'Accuracy = {accuracy}')

In [None]:
def _mp_fn(rank, flags):
    torch.set_default_tensor_type('torch.FloatTensor')
    a = _run()

FLAGS={}
xmp.spawn(_mp_fn, args=(FLAGS,), nprocs=8, start_method='fork')

num_train_steps = 216, world_size=8
bi=0, loss=0.522037923336029
bi=10, loss=0.5219879150390625
bi=20, loss=0.4478000998497009
bi=30, loss=0.6720322370529175
bi=40, loss=0.8567501306533813
bi=50, loss=0.6391257643699646
Accuracy = 0.7126436781609196
bi=0, loss=0.6371829509735107
bi=10, loss=0.3864307701587677
bi=20, loss=0.302539587020874
bi=30, loss=0.6594336032867432
bi=40, loss=0.7259899973869324
bi=50, loss=0.6465256214141846
Accuracy = 0.8275862068965517
bi=0, loss=0.5253359079360962
bi=10, loss=0.12475106120109558
bi=20, loss=0.07676361501216888
bi=30, loss=0.517693817615509
bi=40, loss=0.9251872897148132
bi=50, loss=0.31703656911849976
Accuracy = 0.8390804597701149
bi=0, loss=0.2429567128419876
bi=10, loss=0.07902736961841583
bi=20, loss=0.06569371372461319
bi=30, loss=0.27807116508483887
bi=40, loss=0.47507619857788086
bi=50, loss=0.4257700443267822
Accuracy = 0.8850574712643678


In [None]:
def _mp_fn(rank, flags):
    torch.set_default_tensor_type('torch.FloatTensor')
    a = _run()

FLAGS={}
xmp.spawn(_mp_fn, args=(FLAGS,), nprocs=8, start_method='fork')

num_train_steps = 108, world_size=8
bi=0, loss=0.6936578154563904
Accuracy = 0.7126436781609196
bi=0, loss=0.6138836741447449
Accuracy = 0.7471264367816092
bi=0, loss=0.587805449962616
Accuracy = 0.7241379310344828
bi=0, loss=0.4813682436943054
Accuracy = 0.7471264367816092
bi=0, loss=0.4692201316356659
Accuracy = 0.7586206896551724
bi=0, loss=0.3989015221595764
Accuracy = 0.7586206896551724
bi=0, loss=0.33032160997390747
Accuracy = 0.8160919540229885
bi=0, loss=0.2905537188053131
Accuracy = 0.8390804597701149
bi=0, loss=0.28658801317214966
Accuracy = 0.8505747126436781
bi=0, loss=0.20122572779655457
Accuracy = 0.8620689655172413
bi=0, loss=0.13385723531246185
Accuracy = 0.8850574712643678
bi=0, loss=0.1290844827890396
Accuracy = 0.8735632183908046
bi=0, loss=0.16487327218055725
Accuracy = 0.8735632183908046
bi=0, loss=0.18263642489910126
Accuracy = 0.8850574712643678
bi=0, loss=0.06743225455284119
Accuracy = 0.896551724137931
bi=0, loss=0.1459122598171234
Accuracy = 0.896551724137931


In [None]:
def _mp_fn(rank, flags):
    torch.set_default_tensor_type('torch.FloatTensor')
    a = _run()

FLAGS={}
xmp.spawn(_mp_fn, args=(FLAGS,), nprocs=8, start_method='fork')

num_train_steps = 54, world_size=8
bi=0, loss=0.6936578154563904
Accuracy = 0.7126436781609196
bi=0, loss=0.6161950826644897
Accuracy = 0.735632183908046
bi=0, loss=0.5965586304664612
Accuracy = 0.7126436781609196
bi=0, loss=0.48412102460861206
Accuracy = 0.7471264367816092
bi=0, loss=0.4748290181159973
Accuracy = 0.7586206896551724
bi=0, loss=0.43007123470306396
Accuracy = 0.7471264367816092
bi=0, loss=0.39868661761283875
Accuracy = 0.7471264367816092
bi=0, loss=0.39363548159599304
Accuracy = 0.7816091954022989


In [None]:
def _mp_fn(rank, flags):
    torch.set_default_tensor_type('torch.FloatTensor')
    a = _run()

FLAGS={}
xmp.spawn(_mp_fn, args=(FLAGS,), nprocs=8, start_method='fork')

num_train_steps = 13, world_size=8
bi=0, loss=0.6936578154563904
Accuracy = 0.7126436781609196
bi=0, loss=0.6221399307250977
Accuracy = 0.7126436781609196


In [None]:
def _mp_fn(rank, flags):
    torch.set_default_tensor_type('torch.FloatTensor')
    a = _run()

FLAGS={}
xmp.spawn(_mp_fn, args=(FLAGS,), nprocs=8, start_method='fork')

num_train_steps = 27, world_size=8
bi=0, loss=0.7086488604545593
Accuracy = 0.6896551724137931
bi=0, loss=0.7508302330970764
Accuracy = 0.6896551724137931
bi=0, loss=0.6652076840400696
Accuracy = 0.6896551724137931
bi=0, loss=0.678643524646759
Accuracy = 0.6896551724137931


In [None]:
def _mp_fn(rank, flags):
    torch.set_default_tensor_type('torch.FloatTensor')
    a = _run()

FLAGS={}
xmp.spawn(_mp_fn, args=(FLAGS,), nprocs=8, start_method='fork')

num_train_steps = 13, world_size=8
bi=0, loss=0.7292062044143677
Accuracy = 0.6781609195402298
bi=0, loss=0.643303632736206
Accuracy = 0.6781609195402298


In [None]:
def _mp_fn(rank, flags):
    torch.set_default_tensor_type('torch.FloatTensor')
    a = _run()

FLAGS={}
xmp.spawn(_mp_fn, args=(FLAGS,), nprocs=8, start_method='fork')

num_train_steps = 13, world_size=8
bi=0, loss=0.6957964301109314
 output = [[0.3479381501674652], [0.27270492911338806], [0.29710906744003296], [0.32393479347229004], [0.31561630964279175], [0.3235277533531189], [0.27226123213768005], [0.2685788869857788], [0.3026195764541626], [0.30227673053741455], [0.29815980792045593], [0.2816947400569916], [0.27408236265182495], [0.2928442358970642], [0.2934141457080841], [0.26663678884506226], [0.2832680344581604], [0.32473024725914], [0.27798908948898315], [0.3471658229827881], [0.30381160974502563], [0.314896821975708], [0.3229576349258423], [0.24998706579208374], [0.3195156455039978], [0.3219197392463684], [0.28678908944129944], [0.2831529378890991], [0.29220110177993774], [0.29602229595184326], [0.31326520442962646], [0.27756333351135254], [0.2935493588447571], [0.3193564713001251], [0.3677065670490265], [0.24919286370277405], [0.26494812965393066], [0.2699984908103943], [0.31119710206985474], [0.2988032102584839], [0.2996219992637634], [0.25

In [None]:
def _mp_fn(rank, flags):
    torch.set_default_tensor_type('torch.FloatTensor')
    a = _run()

FLAGS={}
xmp.spawn(_mp_fn, args=(FLAGS,), nprocs=8, start_method='fork')

num_train_steps = 812, world_size=8
bi=0, loss=0.6596669554710388
bi=10, loss=0.442869633436203
bi=20, loss=0.27767646312713623
bi=30, loss=0.2492581605911255
bi=40, loss=0.24852316081523895
bi=50, loss=0.23251605033874512
bi=60, loss=0.1346609890460968
bi=70, loss=0.24908529222011566
bi=80, loss=0.27563270926475525
bi=90, loss=0.22951166331768036
bi=100, loss=0.13338381052017212
bi=110, loss=0.13826321065425873
bi=120, loss=0.10452469438314438
bi=130, loss=0.17281179130077362
bi=140, loss=0.12374229729175568
bi=150, loss=0.09505578130483627
bi=160, loss=0.10981842130422592
bi=170, loss=0.1613001674413681
bi=180, loss=0.18103162944316864
bi=190, loss=0.12160386145114899
bi=200, loss=0.24840252101421356
bi=210, loss=0.146152064204216
bi=220, loss=0.21994732320308685
bi=230, loss=0.16992920637130737
bi=240, loss=0.22806310653686523
bi=250, loss=0.11159994453191757
bi=260, loss=0.15508773922920227
bi=270, loss=0.10628771781921387
bi=280, loss=0.14105890691280365
bi=290, loss=0.09192243963

In [None]:
# def _mp_fn(rank, flags):
#     torch.set_default_tensor_type('torch.FloatTensor')
#     a = _run()

# FLAGS={}
# xmp.spawn(_mp_fn, args=(FLAGS,), nprocs=8, start_method='fork')

num_train_steps = 812, world_size=8
bi=0, loss=0.6442265510559082
bi=10, loss=0.3530527353286743
bi=20, loss=0.1502324789762497
bi=30, loss=0.2668073773384094
bi=40, loss=0.09927172213792801
bi=50, loss=0.19340580701828003
bi=60, loss=0.08746522665023804
bi=70, loss=0.2013911008834839
bi=80, loss=0.15485046803951263
bi=90, loss=0.14158804714679718
bi=100, loss=0.07098883390426636
bi=110, loss=0.19030413031578064
bi=120, loss=0.22227028012275696
bi=130, loss=0.07657205313444138
bi=140, loss=0.11730407923460007
bi=150, loss=0.12540556490421295
bi=160, loss=0.23628656566143036
bi=170, loss=0.10813035815954208
bi=180, loss=0.053886447101831436
bi=190, loss=0.09251704066991806
bi=200, loss=0.06599804013967514
bi=210, loss=0.17956028878688812
bi=220, loss=0.07320220023393631
bi=230, loss=0.17697928845882416
bi=240, loss=0.1228395625948906
bi=250, loss=0.11344517022371292
bi=260, loss=0.07547663897275925
bi=270, loss=0.05892716348171234
bi=280, loss=0.3614422380924225
bi=290, loss=0.160551831