# Downloading Dependencies

In [None]:
# !curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
# !python pytorch-xla-env-setup.py --apt-packages libomp5 libopenblas-dev

In [None]:
# !apt-get install git-lfs

In [None]:
# !git lfs install
# !git clone https://huggingface.co/dccuchile/bert-base-spanish-wwm-uncased

In [None]:
# !git lfs install
# !git clone https://huggingface.co/bert-base-multilingual-uncased

In [None]:
# !pip install transformers==3

# Load Dependencies

In [None]:
### add NLP dependences
import pickle
import os
import torch
import pandas as pd
from scipy import stats
import numpy as np

from sklearn import metrics
from sklearn.model_selection import StratifiedKFold, train_test_split

from tqdm import tqdm
from collections import OrderedDict, namedtuple
import torch.nn as nn
from torch.optim import lr_scheduler
import joblib

import logging
import transformers
from transformers import AdamW, get_linear_schedule_with_warmup, get_constant_schedule
import sys
from sklearn import metrics, model_selection

import warnings
import torch_xla
import torch_xla.debug.metrics as met
import torch_xla.distributed.data_parallel as dp
import torch_xla.distributed.parallel_loader as pl
import torch_xla.utils.utils as xu
import torch_xla.core.xla_model as xm
import torch_xla.distributed.xla_multiprocessing as xmp
import torch_xla.test.test_utils as test_utils
import warnings

from torch_xla.core.xla_model import mesh_reduce

warnings.filterwarnings("ignore")

In [None]:
# Mount Google Drive so that files stored on the drive can be reached through the local filesystem
from google.colab import drive # Colab helper that exposes `mount`

ROOT = "/content/drive"     # directory where the drive is mounted
print(ROOT)                 # echo the mount point (optional)

drive.mount(ROOT)           # mount the drive at ROOT

/content/drive
Mounted at /content/drive


# Functions

In [None]:
class BERTBaseUncased(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Two heads are created and ``output_bert`` decides which one ``forward`` uses:

    * ``'hidden'``: mean- and max-pool the first encoder output along dim 1,
      concatenate both and classify with ``OutPutHidden``.
    * any other value (default ``'pooler'``): classify the second encoder
      output with ``OutPoller``.

    Args:
        bert_path: Path or identifier handed to ``BertModel.from_pretrained``.
        output_bert: Selects the encoder output that feeds the classifier.
        NumberOfClasses: Number of logits produced per example.
    """

    def __init__(self, bert_path, output_bert='pooler', NumberOfClasses=2):
        super(BERTBaseUncased, self).__init__()
        self.bert_path = bert_path
        self.bert = transformers.BertModel.from_pretrained(self.bert_path)
        self.bert_drop = nn.Dropout(0.3)
        self.output_bert = output_bert
        self.NumberOfClasses = NumberOfClasses
        # Take the width from the loaded checkpoint instead of assuming 768,
        # so checkpoints with another hidden size also get matching heads.
        hidden_size = self.bert.config.hidden_size
        self.OutPutHidden = nn.Linear(hidden_size * 2, NumberOfClasses)
        self.OutPoller = nn.Linear(hidden_size, NumberOfClasses)

    def forward(
            self,
            ids,
            mask,
            token_type_ids
    ):
        """Return the logits for a batch.

        Args:
            ids: Token ids passed to BERT as its first positional argument.
            mask: Passed to BERT as ``attention_mask``.
            token_type_ids: Passed to BERT as ``token_type_ids``.

        Returns:
            Output of the selected linear head.
        """
        encoder_outputs = self.bert(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids)
        # Index instead of tuple-unpacking: positions 0 and 1 are valid both
        # for a plain tuple and for the dict-like output objects of newer
        # transformers releases (unpacking those would yield their keys).
        o1 = encoder_outputs[0]
        o2 = encoder_outputs[1]

        if self.output_bert == 'hidden':
            # NOTE(review): the pooling runs over every position of dim 1;
            # `mask` is not applied here, so padded positions take part.
            apool = torch.mean(o1, 1)
            mpool, _ = torch.max(o1, 1)
            cat = torch.cat((apool, mpool), 1)
            bo = self.bert_drop(cat)
            output = self.OutPutHidden(bo)
        else:
            bo = self.bert_drop(o2)
            output = self.OutPoller(bo)

        return output

In [None]:
class BERTDatasetTraining:
    """Indexable dataset that turns one comment and its label into fixed-length tensors.

    Args:
        comment: Indexable collection of texts; each item is passed through ``str``.
        targets: Indexable collection of labels aligned with ``comment``.
        tokenizer: Object exposing ``encode_plus`` that returns ``input_ids``,
            ``token_type_ids`` and ``attention_mask``.
        max_length: Length every returned sequence is truncated/padded to.
    """

    def __init__(self, comment, targets, tokenizer, max_length):
        self.comment = comment
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.targets = targets

    def __len__(self):
        """Number of comments in the dataset."""
        return len(self.comment)

    def __getitem__(self, item):
        """Tokenize comment ``item`` and right-pad it to ``max_length``.

        Returns:
            dict with ``ids``, ``mask``, ``token_type_ids`` (long tensors of
            length ``max_length``) and ``targets`` (float tensor).
        """
        # Collapse every run of whitespace into a single space.
        comment = " ".join(str(self.comment[item]).split())

        inputs = self.tokenizer.encode_plus(
            comment,
            None,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_length,
        )
        ids = inputs["input_ids"]
        token_type_ids = inputs["token_type_ids"]
        mask = inputs["attention_mask"]

        padding_length = self.max_length - len(ids)

        # Pad the ids with the tokenizer's own padding id: id 0 is not the
        # padding token in every vocabulary. Fall back to 0 when the
        # tokenizer does not define one.
        pad_id = getattr(self.tokenizer, "pad_token_id", None)
        if pad_id is None:
            pad_id = 0

        ids = ids + ([pad_id] * padding_length)
        mask = mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[item], dtype=torch.float)
        }

In [None]:
#### Handcraft metric for multiclass evaluation
def cem_metric(target, output, labels=None):
  """Hand-crafted closeness score for multi-class predictions.

  A proximity matrix is derived from the confusion matrix of ``target`` versus
  ``output`` and the score is the confusion-weighted sum of proximities divided
  by the sum obtained from the matrix diagonal.

  Args:
    target: Sequence of reference labels (first argument of ``confusion_matrix``).
    output: Sequence of predicted labels (second argument of ``confusion_matrix``).
    labels: Optional explicit, ordered list of classes forwarded to
      ``confusion_matrix``. Leave as ``None`` for the classes observed in the
      inputs; pass it when a class may be missing from both inputs and the
      matrix positions must stay stable.

  Returns:
    The score as a numpy float.
  """
  conf_metrix = metrics.confusion_matrix(target, output, labels=labels)
  n_classes = conf_metrix.shape[0]

  # Loop-invariant totals, computed once instead of once per matrix cell.
  column_totals = conf_metrix.sum(axis=0)
  total = conf_metrix.sum()

  # NOTE(review): the totals are column sums of a matrix whose first argument
  # is `target`, i.e. counts per *output* class. Confirm this orientation
  # against the metric definition that was intended.
  cem_metrix = np.zeros(conf_metrix.shape)
  for column in range(n_classes):
    half_own = column_totals[column] / 2
    for row in range(n_classes):
      if row == column:
        between = 0
      elif row < column:
        between = column_totals[row:column].sum()
      else:
        between = column_totals[column + 1:row + 1].sum()
      cem_metrix[row, column] = (half_own + between) / total

  # Cells that are exactly zero are replaced before the logarithm.
  # NOTE(review): the original literal `0000000.1` evaluates to 0.1; kept
  # as-is in case a much smaller epsilon was intended -- confirm.
  cem_metrix = -np.log2(np.where(cem_metrix != 0, cem_metrix, 0.1))

  return np.sum(cem_metrix * conf_metrix.T) / np.sum(np.diag(cem_metrix) * column_totals)

In [None]:
class TrainModel():
    """Fine-tune a BERT classifier on one train/validation split and log per-epoch metrics.

    The constructor only stores the configuration. All work happens in
    ``_run``, which uses ``torch_xla`` primitives (``xm``, ``pl``).

    Args:
        PathSaveFiles: Prefix (including trailing separator) for the results
            pickle and the optional model checkpoint.
        BertVersion: First-level key into the results dictionary.
        BertPath: Path handed to ``BertTokenizer.from_pretrained``.
        OutputBert: Second-level key into the results dictionary.
        LearningRate: Base learning rate; also a results-dictionary key.
        BatchSize: Training batch size per process; also a results-dictionary key.
        Epochs: Number of training epochs.
        FileName: Base name (without extension) of the results pickle and of
            the checkpoint.
        X_train, X_valid: pandas Series with the texts.
        y_train, y_valid: pandas Series with the labels.
        MaxLen: Sequence length used by the datasets.
        SaveModel: When true, the model weights are saved after the last epoch.
    """

    def __init__(self, PathSaveFiles, BertVersion, BertPath,  OutputBert, LearningRate, BatchSize, Epochs, FileName, X_train, X_valid, y_train ,y_valid, MaxLen = 192, SaveModel=False):
        self.BertVersion = BertVersion
        self.BertPath = BertPath
        self.OutputBert = OutputBert
        self.LearningRate = LearningRate
        self.BatchSize = BatchSize
        self.Epochs = Epochs
        self.FileName = FileName
        self.X_train = X_train
        self.X_valid = X_valid
        self.y_train = y_train
        self.y_valid = y_valid
        self.NumberOfLabels = y_train.nunique()
        # sklearn averaging mode used for recall/precision/F1.
        self.average_metrics =  'macro' if self.NumberOfLabels > 2 else 'binary'
        self.PathSaveFiles = PathSaveFiles
        self.MaxLen = MaxLen
        self.SaveModel = SaveModel

    def _run(self):
        """Train for ``Epochs`` epochs, validating and recording metrics after each one.

        Preconditions visible in the code:

        * ``PathSaveFiles + FileName + '.pkl'`` must already exist and contain
          a nested dictionary indexed as
          ``[BertVersion][OutputBert][LearningRate][BatchSize][epoch][metric]``
          whose leaves are lists; it is loaded at epoch 1 and rewritten after
          the last epoch.
        * A module-level variable ``mx`` must hold the model to train, and the
          module-level function ``cem_metric`` must be defined.
        """
        results_file = self.PathSaveFiles + self.FileName + ".pkl"

        def epoch_results(epoch):
            """Return the metric lists of the current configuration for ``epoch``."""
            return self.Results[self.BertVersion][self.OutputBert][self.LearningRate][self.BatchSize][epoch]

        def OpenEndSave(CurrentEpoch, module):
            """Load the results pickle at the first epoch / store it after the last one."""
            if module == 'open' and CurrentEpoch == 1:
                with open(results_file, "rb") as f:
                    self.Results = pickle.load(f)

            elif module == 'save' and CurrentEpoch == self.Epochs:
                # Every recorded value went through mesh_reduce, so all
                # processes hold the same dictionary. Let only the master
                # write it instead of having every process write the same
                # file at the same time.
                if xm.is_master_ordinal():
                    with open(results_file, 'wb') as f:
                        pickle.dump(self.Results, f)

        def loss_fn(outputs, targets):
            """Cross-entropy between logits and class-index targets."""
            return nn.CrossEntropyLoss()(outputs, targets)

        def train_loop_fn(data_loader, model, optimizer, device, scheduler=None, epoch=None):
            """Run one training epoch; every 10th batch the reduced loss is recorded."""
            model.train()
            for bi, d in enumerate(data_loader):
                ids = d["ids"].to(device, dtype=torch.long)
                mask = d["mask"].to(device, dtype=torch.long)
                token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
                # CrossEntropyLoss takes class indices as an integer (long) tensor.
                targets = d["targets"].to(device, dtype=torch.long)

                optimizer.zero_grad()
                outputs = model(
                    ids=ids,
                    mask=mask,
                    token_type_ids=token_type_ids
                )

                loss = loss_fn(outputs, targets)
                if bi % 10 == 0:
                    xm.master_print(f'bi={bi}, loss={loss}')

                    ValueLoss = loss.cpu().detach().numpy().tolist()
                    ValueLoss = xm.mesh_reduce('test_loss', ValueLoss, np.mean)
                    epoch_results(epoch)['loss'].append(ValueLoss)

                loss.backward()
                xm.optimizer_step(optimizer)
                if scheduler is not None:
                    scheduler.step()

        def eval_loop_fn(data_loader, model, device):
            """Predict every batch of ``data_loader``; return (predictions, targets) as lists."""
            model.eval()
            fin_targets = []
            fin_outputs = []
            # No gradients are needed for validation.
            with torch.no_grad():
                for bi, d in enumerate(data_loader):
                    ids = d["ids"].to(device, dtype=torch.long)
                    mask = d["mask"].to(device, dtype=torch.long)
                    token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
                    targets = d["targets"].to(device, dtype=torch.long)

                    outputs = model(
                        ids=ids,
                        mask=mask,
                        token_type_ids=token_type_ids
                    )

                    targets_np = targets.cpu().detach().numpy().tolist()
                    outputs = torch.argmax(outputs, dim=1)
                    outputs_np = outputs.detach().cpu().numpy().tolist()

                    fin_targets.extend(targets_np)
                    fin_outputs.extend(outputs_np)

            return fin_outputs, fin_targets

        # tokenizer
        tokenizer = transformers.BertTokenizer.from_pretrained(self.BertPath, do_lower_case=True)

        train_dataset = BERTDatasetTraining(
            comment=self.X_train.values,
            targets=self.y_train.values,
            tokenizer=tokenizer,
            max_length=self.MaxLen
        )

        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset,
            num_replicas=xm.xrt_world_size(),
            rank=xm.get_ordinal(),
            shuffle=True)

        train_data_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=self.BatchSize,
            sampler=train_sampler,
            drop_last=True,
            num_workers=1
        )

        valid_dataset = BERTDatasetTraining(
            comment=self.X_valid.values,
            targets=self.y_valid.values,
            tokenizer=tokenizer,
            max_length=self.MaxLen
        )

        valid_sampler = torch.utils.data.distributed.DistributedSampler(
            valid_dataset,
            num_replicas=xm.xrt_world_size(),
            rank=xm.get_ordinal(),
            shuffle=False)

        valid_data_loader = torch.utils.data.DataLoader(
            valid_dataset,
            batch_size=16,
            sampler=valid_sampler,
            drop_last=False,
            num_workers=1
        )

        device = xm.xla_device()
        # `mx` is not an attribute or argument: it is read from module scope.
        model = mx.to(device)

        # Biases and LayerNorm parameters are excluded from weight decay.
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]

        # The base learning rate is scaled with the number of processes.
        lr = 0.4 * self.LearningRate * xm.xrt_world_size()
        num_train_steps = int(len(train_dataset) / self.BatchSize / xm.xrt_world_size() * self.Epochs)
        xm.master_print(f'num_train_steps = {num_train_steps}, world_size={xm.xrt_world_size()}')

        optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,
            num_training_steps=num_train_steps
        )

        for epoch in range(1, self.Epochs + 1):
            ## print epoch
            xm.master_print(f'Epoch: {epoch} of {self.Epochs}')
            ## Open file to save results
            OpenEndSave(CurrentEpoch=epoch, module='open')
            res = epoch_results(epoch)

            # Without set_epoch a shuffling DistributedSampler yields the same
            # order in every epoch.
            train_sampler.set_epoch(epoch)

            para_loader = pl.ParallelLoader(train_data_loader, [device])
            train_loop_fn(para_loader.per_device_loader(device), model, optimizer, device, scheduler=scheduler, epoch=epoch)

            para_loader = pl.ParallelLoader(valid_data_loader, [device])
            o, t = eval_loop_fn(para_loader.per_device_loader(device), model, device)

            # Each metric is computed on this process' share of the validation
            # data and then averaged over all processes.
            if self.NumberOfLabels == 2:
                f1 = xm.mesh_reduce('validation_f1', metrics.f1_score(t, o), np.mean)
                res['f1'].append(f1)

            else:
                res['f1_macro'].append(xm.mesh_reduce('validation_f1_macro', metrics.f1_score(t, o, average=self.average_metrics), np.mean))
                # Store the weighted F1 here (this slot used to receive the
                # CEM value of the previous epoch).
                res['f1_weighted'].append(xm.mesh_reduce('validation_f1_weighted', metrics.f1_score(t, o, average='weighted'), np.mean))
                # Reduce the CEM once and store that value.
                cem = xm.mesh_reduce('validation_cem', cem_metric(t, o), np.mean)
                res['cem'].append(cem)

            accuracy = metrics.accuracy_score(t, o)
            accuracy = xm.mesh_reduce('test_accuracy', accuracy, np.mean)
            res['accuracy'].append(accuracy)
            res['recall'].append(xm.mesh_reduce('validation_recall', metrics.recall_score(t, o, average=self.average_metrics), np.mean))
            res['precision'].append(xm.mesh_reduce('validation_precison', metrics.precision_score(t, o, average=self.average_metrics), np.mean))

            ## save file with save results
            OpenEndSave(CurrentEpoch=epoch, module='save')

            ## Save model
            if self.SaveModel and epoch == self.Epochs:
                xm.save(model.state_dict(), self.PathSaveFiles + self.FileName + '.bin')

            ## print accuracy
            xm.master_print(f'Accuracy = {accuracy}')


# Load data

In [None]:
PathDataSet = "../content/drive/MyDrive/Code/DETOXIS/Data/train.csv"
## Task 1
# df_train = pd.read_csv(PathDataSet, usecols=["comment", "toxicity"]).fillna("none")
# NewColumnsNames = {"comment":"Data","toxicity":"Label"}

## Task 2
df_train = pd.read_csv(PathDataSet, usecols=["comment", "toxicity_level"]).fillna("none")
NewColumnsNames = {"comment":"Data","toxicity_level":"Label"}

df_train = df_train.rename(columns=NewColumnsNames)
## Shuffle the rows with a fixed seed: the cross-validation folds below are cut
## from this row order, so the order must be the same on every run for the
## accumulated results to refer to the same folds.
df_train = df_train.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Show the first rows of the renamed and shuffled training frame
df_train.head()

Unnamed: 0,Data,Label
0,"Haz que pase ya Pedro por Dios, porque a mí se...",1
1,"Precisamente por eso, va a acabar discutiendo ...",0
2,Bueno.. así es la izquierda.. desviando la res...,0
3,"No todas tienen dónde ir, ni pueden dejar el t...",0
4,La Policía Nacional ha arrestado ya a uno de l...,0


# Load Weights

In [None]:
def CriateFileName(BertVersionDict, NumberOfClasses):
  """Build the base file name for a run.

  The name is the concatenation of every key of ``BertVersionDict`` (in
  iteration order) followed by ``'Task2'`` when there are more than two
  classes and ``'Task1'`` otherwise.
  """
  task_suffix = 'Task2' if NumberOfClasses > 2 else 'Task1'
  return ''.join(BertVersionDict.keys()) + task_suffix

In [None]:
# BertVersion = {'SpanishBert':'../content/bert-base-spanish-wwm-uncased/', 'MultilingualBert':'../content/bert-base-multilingual-uncased/'}
# OutputBert = ['hidden', 'pooler']
# LearningRate = [1e-5, 3e-5, 5e-5]
# BatchSize = [8, 16, 32 , 64]
# Epochs = 20

In [None]:
## Training parameters: the grid explored by the cross-validation cell
# name of each BERT variant -> directory of its pretrained checkpoint
BertVersion = {'SpanishBert':'../content/bert-base-spanish-wwm-uncased/'}
# classifier input: pooled mean/max of the hidden states, or BERT's pooled output
OutputBert = ['hidden', 'pooler']
LearningRate = [3e-5, 5e-5]
BatchSize = [8, 16, 32 , 64]
Epochs = 20

In [None]:
## Evaluation metrics
###### Task 1
MetricsTask1 = ['accuracy', 'f1', 'recall', 'precision']
###### Task 2
MetricsTask2 = ['accuracy', 'f1_macro', 'f1_weighted', 'recall', 'precision', 'cem']

## Pick the metric set: Task 2 when the label column has more than two distinct values, Task 1 otherwise
Metrics = MetricsTask2 if df_train['Label'].nunique() > 2 else MetricsTask1

## Create the results dictionary:
## bert -> output -> learning rate -> batch size -> epoch -> metric -> empty list ('loss' is always included)
ResultsTask = { bert:{ output:{ lr:{ bat:{ epoc:{ metric:[] for metric in Metrics + ['loss']} for epoc in range(1, Epochs+1) } for bat in BatchSize} for lr in LearningRate} for output in OutputBert } for bert in BertVersion.keys() }

In [None]:
## Where to save files
Path = 'drive/MyDrive/Code/DETOXIS/Machine-Learning-Tweets-Classification/Bert/Results/'

## Main part of the file name for this BERT configuration / task
MainParteBertFileName = CriateFileName(BertVersion, NumberOfClasses=df_train['Label'].nunique())

## File that stores the results
FileResults = MainParteBertFileName + 'Results'
## TrainModel loads this pickle at its first epoch, so it has to exist before
## training starts. Create it only when it is missing: an existing file holds
## the results accumulated so far and must not be overwritten.
if not os.path.exists(Path + FileResults + ".pkl"):
  with open(Path + FileResults + ".pkl",'wb') as f:
    pickle.dump(ResultsTask, f)

# Train

In [None]:
### Cross Validation
## The splitter does not shuffle, so one instance yields the same folds for every configuration
skf = StratifiedKFold(n_splits=10)
FLAGS={}

def _mp_fn(rank, flags):
  # Entry point of every spawned process; `MoDeL` is resolved when it is called
  torch.set_default_tensor_type('torch.FloatTensor')
  MoDeL._run()

for BertV, BertPath in BertVersion.items():
  for OutputB in OutputBert:

    ### Load the pretrained BERT weights once per (BERT variant, output type)
    mx = BERTBaseUncased(bert_path=BertPath, output_bert=OutputB, NumberOfClasses=df_train['Label'].nunique())

    for lr in LearningRate:
      for Batch in BatchSize:

        ## One training run per stratified fold
        fold = 1
        for train_index, valid_index in skf.split(df_train['Data'], df_train['Label']):
          X_train, y_train = df_train.loc[train_index, 'Data'], df_train.loc[train_index, 'Label']
          X_valid, y_valid = df_train.loc[valid_index, 'Data'], df_train.loc[valid_index, 'Label']

          print(f'parameters: Bertmodel: {BertV}, Output: {OutputB}, lr: {lr}, Batch: {Batch}, Totsl Num. Epochs: {Epochs}, Fold: {fold}')
          fold += 1

          MoDeL = TrainModel(
              PathSaveFiles=Path,
              BertVersion=BertV,
              BertPath=BertPath,
              OutputBert=OutputB,
              LearningRate=lr,
              BatchSize=Batch,
              Epochs=Epochs,
              FileName=FileResults,
              X_train=X_train,
              X_valid=X_valid,
              y_train=y_train,
              y_valid=y_valid,
          )

          xmp.spawn(_mp_fn, args=(FLAGS,), nprocs=8, start_method='fork')

Some weights of the model checkpoint at ../content/bert-base-spanish-wwm-uncased/ were not used when initializing BertModel: ['bert.embeddings.position_ids']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at ../content/bert-base-spanish-wwm-uncased/ and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch: 19 of 20
bi=0, loss=0.00428386265411973
bi=10, loss=0.0068955314345657825
Accuracy = 0.7329545454545454
Epoch: 20 of 20
bi=0, loss=0.004021701868623495
bi=10, loss=0.005889202933758497
Accuracy = 0.7301136363636365
parameters: Bertmodel: SpanishBert, Output: pooler, lr: 3e-05, Batch: 32, Totsl Num. Epochs: 20, Fold: 5
num_train_steps = 243, world_size=8
Epoch: 1 of 20
bi=0, loss=1.3605506420135498
bi=10, loss=0.8074816465377808
Accuracy = 0.6761363636363636
Epoch: 2 of 20
bi=0, loss=0.9153426289558411
bi=10, loss=0.7944733500480652
Accuracy = 0.6903409090909092
Epoch: 3 of 20
bi=0, loss=0.743665337562561
bi=10, loss=0.571084201335907
Accuracy = 0.7244318181818181
Epoch: 4 of 20
bi=0, loss=0.5918833017349243
bi=10, loss=0.5661506056785583
Accuracy = 0.7130681818181819
Epoch: 5 of 20
bi=0, loss=0.463252991437912
bi=10, loss=0.5183402895927429
Accuracy = 0.6619318181818181
Epoch: 6 of 20
bi=0, loss=0.49587884545326233

In [None]:
def AveragResults(FileName, Path):
  """Replace every list of per-run values in a results pickle by its mean.

  Reads ``Path + FileName + '.pkl'``, which must hold a dictionary nested five
  levels deep above the metric names, averages each metric list with
  ``np.mean`` and writes the averaged dictionary twice: to
  ``'Average' + FileName + '.pkl'`` in the working directory and to the same
  file name under ``Path``.

  Returns:
    The averaged dictionary (the loaded object, modified in place).
  """
  with open(Path + FileName + ".pkl", "rb") as source:
    Results = pickle.load(source)

  for by_output in Results.values():
    for by_learning_rate in by_output.values():
      for by_batch_size in by_learning_rate.values():
        for by_epoch in by_batch_size.values():
          for metric_values in by_epoch.values():
            for metric_name in list(metric_values):
              metric_values[metric_name] = np.mean(metric_values[metric_name])

  averaged_file = 'Average' + FileName + '.pkl'
  for destination in (averaged_file, Path + averaged_file):
    with open(destination, 'wb') as target:
      pickle.dump(Results, target)

  return Results

In [None]:
## Average the cross-validation results stored in the results pickle and save the averaged copy
AverageResultsTask = AveragResults(FileName=FileResults, Path=Path)

In [None]:
### create dataframe for our results
def create_Data_Frame(all_resultas):
  """Flatten the nested results dictionary into a pandas DataFrame.

  ``all_resultas`` is indexed as
  ``[bert type][output type][learning rate][batch size][epoch]`` and each leaf
  is a mapping of metric name to value. Every leaf becomes one row whose index
  is the tuple of the five keys; the metric names become the columns.
  """
  rows = {}
  for bert_type, by_output in all_resultas.items():
    for output_type, by_learning_rate in by_output.items():
      for learning_rate, by_batch_size in by_learning_rate.items():
        for batch_size, by_epoch in by_batch_size.items():
          for epoch, epoch_metrics in by_epoch.items():
            rows[(bert_type, output_type, learning_rate, batch_size, epoch)] = epoch_metrics

  return pd.DataFrame.from_dict(rows, orient='index')

In [None]:
## Build a DataFrame from the averaged results (one row per configuration and epoch)
DfResultsTask = create_Data_Frame(all_resultas=AverageResultsTask)

### Save the averaged results to a CSV file next to the pickles
DfResultsTask.to_csv(Path + 'Average' + FileResults + '_CSV_' + '.csv')

### Display the averaged results
DfResultsTask

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,accuracy,f1_macro,f1_weighted,recall,precision,cem,loss
SpanishBert,hidden,0.00001,8,1,0.685795,0.288260,0.000000,0.314073,0.331216,0.722565,
SpanishBert,hidden,0.00001,8,2,0.714205,0.385249,0.722565,0.388204,0.424997,0.769625,
SpanishBert,hidden,0.00001,8,3,0.716477,0.413840,0.769625,0.415084,0.461104,0.774677,
SpanishBert,hidden,0.00001,8,4,0.705398,0.416486,0.774677,0.421735,0.456023,0.764872,
SpanishBert,hidden,0.00001,8,5,0.715341,0.416753,0.764872,0.421896,0.459225,0.773905,
SpanishBert,...,...,...,...,...,...,...,...,...,...,...
SpanishBert,pooler,0.00005,64,16,0.690341,0.414261,0.751552,0.428517,0.430052,0.752674,0.205362
SpanishBert,pooler,0.00005,64,17,0.690625,0.415757,0.752674,0.427383,0.433727,0.752244,0.136129
SpanishBert,pooler,0.00005,64,18,0.690341,0.412469,0.752244,0.426431,0.425709,0.753042,0.109672
SpanishBert,pooler,0.00005,64,19,0.689205,0.414175,0.753042,0.428855,0.425632,0.751826,0.093740


In [None]:
## Print the averaged results as a LaTeX table; the results file name is used as the table label
LabelTaskTable = FileResults
print(DfResultsTask.to_latex(multicolumn=True, multirow=False, label=LabelTaskTable))

\begin{table}
\centering
\label{SpanishBertTask2Results}
\begin{tabular}{lllllrrrrrrr}
\toprule
            &        &         &    &    &  accuracy &  f1\_macro &  f1\_weighted &    recall &  precision &       cem &      loss \\
\midrule
SpanishBert & hidden & 0.00001 & 8  & 1  &  0.685795 &  0.288260 &     0.000000 &  0.314073 &   0.331216 &  0.722565 &       NaN \\
            &        &         &    & 2  &  0.714205 &  0.385249 &     0.722565 &  0.388204 &   0.424997 &  0.769625 &       NaN \\
            &        &         &    & 3  &  0.716477 &  0.413840 &     0.769625 &  0.415084 &   0.461104 &  0.774677 &       NaN \\
            &        &         &    & 4  &  0.705398 &  0.416486 &     0.774677 &  0.421735 &   0.456023 &  0.764872 &       NaN \\
            &        &         &    & 5  &  0.715341 &  0.416753 &     0.764872 &  0.421896 &   0.459225 &  0.773905 &       NaN \\
            &        &         &    & 6  &  0.706818 &  0.435896 &     0.773905 &  0.446014 &   0.461

# Inference

## Train the model with the full training dataset

In [None]:
## 10 best results, ranked by 'cem' when there are more than two labels and by 'f1' otherwise
MetricForBestResults = 'cem' if df_train['Label'].nunique() > 2 else 'f1'
DfResultsTask.nlargest(n=10, columns= MetricForBestResults )

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,accuracy,f1_macro,f1_weighted,recall,precision,cem,loss
SpanishBert,hidden,1e-05,16,4,0.717045,0.40355,0.76777,0.409123,0.446873,0.77686,
SpanishBert,hidden,1e-05,8,3,0.716477,0.41384,0.769625,0.415084,0.461104,0.774677,
SpanishBert,hidden,3e-05,32,6,0.71875,0.409604,0.74827,0.417309,0.435515,0.774593,
SpanishBert,hidden,3e-05,64,5,0.714773,0.417772,0.763167,0.423512,0.446093,0.774551,0.541801
SpanishBert,hidden,1e-05,8,5,0.715341,0.416753,0.764872,0.421896,0.459225,0.773905,
SpanishBert,hidden,1e-05,8,20,0.717614,0.45032,0.770964,0.452513,0.482516,0.772752,
SpanishBert,hidden,1e-05,8,16,0.717898,0.451593,0.771236,0.452176,0.486291,0.772296,
SpanishBert,hidden,5e-05,8,2,0.708807,0.413074,0.754429,0.431097,0.465479,0.771905,
SpanishBert,hidden,1e-05,8,17,0.716761,0.449948,0.772296,0.451223,0.484324,0.771558,
SpanishBert,hidden,1e-05,8,8,0.710227,0.423075,0.764952,0.431958,0.454768,0.771399,


In [None]:
## Get the best parameters from the cross-validation DataFrame:
## the index (bert, output, learning rate, batch size, epoch) of the row with the highest ranking metric
BestResultParameters = DfResultsTask.sort_values(MetricForBestResults, ascending=False)[:1].index
print(f'Best parameters : {BestResultParameters}')

Best parameters : MultiIndex([('SpanishBert', 'hidden', 1e-05, 16, 4)],
           )


In [None]:
## Put the best parameters into the variables used by the final training run
## NOTE(review): this overwrites the grid variables (BertVersion, OutputBert, LearningRate,
## BatchSize, Epochs) defined for the cross-validation; re-running earlier cells afterwards
## will see the reduced values.
BertPath = BertVersion[BestResultParameters[0][0]]
BertVersion = {BestResultParameters[0][0] : BertVersion[BestResultParameters[0][0]]}
OutputBert = [BestResultParameters[0][1]]
LearningRate = [float(BestResultParameters[0][2])]
BatchSize = [int(BestResultParameters[0][3])]
# the best epoch becomes the number of epochs of the final run
Epochs = int(BestResultParameters[0][4])

In [None]:
## Create the (empty) results dictionary for the best configuration; same nesting as the cross-validation one
ResultsTaskBestParameters = { bert:{ output:{ lr:{ bat:{ epoc:{ metric:[] for metric in Metrics + ['loss']} for epoc in range(1, Epochs+1) } for bat in BatchSize} for lr in LearningRate} for output in OutputBert } for bert in BertVersion.keys() }

## Create the file that stores the results of the best parameters
#### File name
FileResultsBestModel = FileResults + 'BestModel'
#### Write the empty dictionary; an existing file with this name is overwritten
with open(Path + FileResultsBestModel + ".pkl",'wb') as f:
  pickle.dump(ResultsTaskBestParameters, f)

In [None]:
## Train with the best parameters

## Best parameters
BertV = BestResultParameters[0][0]
BertPath = BertVersion[BestResultParameters[0][0]]
OutputB = OutputBert[0]
lr = LearningRate[0]
Batch = BatchSize[0]
Epochs = Epochs

### Load the pretrained BERT weights (TrainModel._run reads this module-level `mx`)
mx = BERTBaseUncased(bert_path=BertPath, output_bert=OutputB, NumberOfClasses=df_train['Label'].nunique())

## Train on the full frame; evaluate on a 33% sample of it
## NOTE(review): X_test/y_test are drawn from df_train, which is also the training set,
## so the metrics reported by this run are measured on data the model was trained on.
X_train = df_train['Data']
y_train = df_train['Label']
_, X_test, _, y_test = train_test_split(df_train['Data'], df_train['Label'], test_size=0.33, random_state=42)

print(f'parameters: Bertmodel: {BertV}, Output: {OutputB}, lr: {lr}, Batch: {Batch}, Totsl Num. Epochs: {Epochs}')
MoDeL = TrainModel(PathSaveFiles = Path,
                  BertVersion=BertV,
                  BertPath=BertPath,
                  OutputBert=OutputB,
                  LearningRate=lr,
                  BatchSize=Batch,
                  Epochs=Epochs,
                  FileName= FileResultsBestModel,
                  X_train=X_train, 
                  X_valid=X_test,
                  y_train=y_train,
                  y_valid=y_test,
                  SaveModel=True)


# Entry point executed in each of the spawned processes
def _mp_fn(rank, flags):
  torch.set_default_tensor_type('torch.FloatTensor')
  a = MoDeL._run()

FLAGS={}
xmp.spawn(_mp_fn, args=(FLAGS,), nprocs=8, start_method='fork')

Some weights of the model checkpoint at ../content/bert-base-spanish-wwm-uncased/ were not used when initializing BertModel: ['bert.embeddings.position_ids']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at ../content/bert-base-spanish-wwm-uncased/ and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


parameters: Bertmodel: SpanishBert, Output: hidden, lr: 1e-05, Batch: 16, Totsl Num. Epochs: 4
num_train_steps = 108, world_size=8
Epoch: 1 of 4
bi=0, loss=1.690983772277832
bi=10, loss=0.8270350694656372
bi=20, loss=1.0059112310409546
Accuracy = 0.701048951048951
Epoch: 2 of 4
bi=0, loss=0.8150615096092224
bi=10, loss=0.7906311750411987
bi=20, loss=0.6265846490859985
Accuracy = 0.7736013986013985
Epoch: 3 of 4
bi=0, loss=0.7270472049713135
bi=10, loss=0.5866357088088989
bi=20, loss=0.547871470451355
Accuracy = 0.8155594405594406
Epoch: 4 of 4
bi=0, loss=0.550194263458252
bi=10, loss=0.5447127819061279
bi=20, loss=0.5726341605186462
Accuracy = 0.8374125874125874


In [None]:
## Average the results recorded for the best-parameter run and save the averaged copy
AverageResultsTaskBestModel = AveragResults(FileName=FileResultsBestModel, Path=Path)

In [None]:
## Build a DataFrame from the averaged results of the best-parameter run
DfResultsTaskBestModel = create_Data_Frame(all_resultas=AverageResultsTaskBestModel)

### Save the results to a CSV file next to the pickles
DfResultsTaskBestModel.to_csv(Path + 'Average' + FileResultsBestModel + '_CSV_' + '.csv')

### Display the averaged results
DfResultsTaskBestModel

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,accuracy,f1_macro,f1_weighted,recall,precision,cem,loss
SpanishBert,hidden,1e-05,16,1,0.701049,0.273298,0.0,0.29125,0.320689,0.751806,1.076546
SpanishBert,hidden,1e-05,16,2,0.773601,0.456156,0.751806,0.444534,0.488135,0.82771,0.674776
SpanishBert,hidden,1e-05,16,3,0.815559,0.515485,0.82771,0.516504,0.524467,0.862626,0.592203
SpanishBert,hidden,1e-05,16,4,0.837413,0.559376,0.862626,0.540827,0.617811,0.885832,0.507718


## Inference on Test Dataset

### Load data

In [None]:
PathDataSet = "../content/drive/MyDrive/Code/DETOXIS/Data/test.csv"
df_test = pd.read_csv(PathDataSet, usecols=["comment_id","comment"]).fillna("none")
NewColumnsNames = {"comment":"Data"}
df_test = df_test.rename(columns=NewColumnsNames)

In [None]:
class BERTDatasetTest:
    """Map-style dataset that tokenizes unlabeled comments for inference.

    Each item is a dict of three ``torch.long`` tensors of length ``max_length``:
    ``ids`` (token ids), ``mask`` (attention mask) and ``token_type_ids``.

    Args:
        comment_text: indexable collection of comments; each entry is passed through ``str()``.
        tokenizer: object exposing ``encode_plus`` that returns ``input_ids``,
            ``token_type_ids`` and ``attention_mask``.
        max_length: length every encoded sequence is truncated / right-padded to.
    """

    def __init__(self, comment_text, tokenizer, max_length):
        self.comment_text = comment_text
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of comments in the dataset."""
        return len(self.comment_text)

    def __getitem__(self, item):
        """Tokenize, truncate and right-pad the comment at position ``item``."""
        # Collapse every run of whitespace (spaces, tabs, newlines) into a single space.
        text = " ".join(str(self.comment_text[item]).split())

        encoded = self.tokenizer.encode_plus(
            text,
            None,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_length,
        )

        # Right-pad all three sequences with zeros up to max_length.
        # NOTE(review): input ids are padded with 0, while the model printout in this
        # notebook reports padding_idx=1 for the word embeddings -- confirm this matches
        # the padding used at training time before changing it.
        pad = [0] * (self.max_length - len(encoded["input_ids"]))
        features = {
            'ids': encoded["input_ids"] + pad,
            'mask': encoded["attention_mask"] + pad,
            'token_type_ids': encoded["token_type_ids"] + pad,
        }
        return {
            name: torch.tensor(values, dtype=torch.long)
            for name, values in features.items()
        }

In [None]:
## BERT tokenizer loaded from BertPath, the same path passed to the model as bert_path;
## do_lower_case=True is requested because the checkpoint is an uncased one.
tokenizer = transformers.BertTokenizer.from_pretrained(BertPath, do_lower_case=True)

In [None]:
## Load the best model checkpoint for inference
device = torch.device("xla")
FileBestModel = Path + FileResultsBestModel + '.bin'

## Rebuild the architecture with one output per training label, then restore the weights
model = BERTBaseUncased(
    bert_path=BertPath,
    output_bert=OutputB,
    NumberOfClasses=df_train['Label'].nunique(),
)
model = model.to(device)
model.load_state_dict(torch.load(FileBestModel))

## Switch to evaluation mode (disables dropout); as the last expression this also
## prints the module structure.
model.eval()

Some weights of the model checkpoint at ../content/bert-base-spanish-wwm-uncased/ were not used when initializing BertModel: ['bert.embeddings.position_ids']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at ../content/bert-base-spanish-wwm-uncased/ and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTBaseUncased(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [None]:
## Preparing the data: wrap the test comments in the inference dataset
valid_dataset = BERTDatasetTest(comment_text=df_test['Data'].values, tokenizer=tokenizer, max_length=192)

## Batched loader; shuffle=False keeps the predictions in the same order as the rows of
## df_test, and drop_last=False keeps the final partial batch.
valid_data_loader = torch.utils.data.DataLoader(
    dataset=valid_dataset,
    batch_size=Batch,
    shuffle=False,
    num_workers=4,
    drop_last=False,
)

In [None]:
## Making the inferences
## Collects, for every test comment, the list of raw class scores returned by the model.
fin_outputs = []
with torch.no_grad():
    # Wrap the loader itself rather than enumerate(...): tqdm can then read len() of the
    # loader and show a real progress bar with total and ETA. The batch index was unused.
    for d in tqdm(valid_data_loader, desc="inference"):
        ids = d["ids"].to(device, dtype=torch.long)
        mask = d["mask"].to(device, dtype=torch.long)
        token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)

        outputs = model(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )

        # Copy the scores back to the CPU and store them as plain Python lists (one per row).
        fin_outputs.extend(outputs.detach().cpu().numpy().tolist())

56it [00:04, 13.99it/s]


In [None]:
## Build a DataFrame from the list of results: one row per test comment, one column per class score.
## (The former bare `fin_outputs` line was removed: a non-final expression in a cell is
## evaluated and discarded, so it never displayed anything.)
df_results = pd.DataFrame.from_records(fin_outputs)

## Model inference: the column label holding the highest score in each row
df_results['Inference'] = df_results.idxmax(axis=1)

## Visualize results
df_results.head()

Unnamed: 0,0,1,2,3,Inference
0,1.658062,0.943345,-1.198898,-2.257798,0
1,2.619709,0.493857,-1.665284,-2.801999,0
2,-0.769244,0.947111,-0.199099,-2.062467,1
3,0.17536,0.991688,-0.431575,-2.116301,1
4,1.066046,1.194659,-0.406725,-2.346515,1


In [None]:
## Attach the comment_id of every test row to its prediction.
## Both frames carry the default 0..n-1 index, so the values are aligned row by row.
## assign() overwrites an existing 'comment_id' column, which keeps this cell re-runnable
## (a second join() would fail on the overlapping column name).
df_results = df_results.assign(comment_id=df_test["comment_id"])

### save results to a CSV file
df_results.to_csv(Path + 'ModelInfereneces' + FileResultsBestModel + '_CSV_' + '.csv')

## Visualize results
df_results.head()

Unnamed: 0,0,1,2,3,Inference,comment_id
0,1.658062,0.943345,-1.198898,-2.257798,0,10_001
1,2.619709,0.493857,-1.665284,-2.801999,0,10_002
2,-0.769244,0.947111,-0.199099,-2.062467,1,10_003
3,0.17536,0.991688,-0.431575,-2.116301,1,10_004
4,1.066046,1.194659,-0.406725,-2.346515,1,10_005


In [None]:
# Change the data to the DETOXIS submission format
### create a data frame with only the predicted labels and the comment ids
df_SubmationResults = df_results.loc[:, ['Inference', 'comment_id']]
df_SubmationResults.head()

Unnamed: 0,Inference,comment_id
0,0,10_001
1,0,10_002
2,1,10_003
3,1,10_004
4,1,10_005


In [None]:
## Add a sequential 0-based id column and keep only id + label, in that order
## (this drops the old comment_id column).
df_SubmationResults = (
    df_SubmationResults
    .assign(id=np.arange(len(df_SubmationResults)))
    .loc[:, ['id', 'Inference']]
)
## submission format
df_SubmationResults.head()

Unnamed: 0,id,Inference
0,0,0
1,1,0
2,2,1
3,3,1
4,4,1


In [None]:
## Save the inferences as a tab-separated file without header or index (id <TAB> label).
## NOTE(review): this rebinds the notebook-wide `Path`; earlier cells that use `Path`
## will point at this directory if they are re-run afterwards.
Path = 'drive/MyDrive/Code/DETOXIS/Machine-Learning-Tweets-Classification/Bert/Results/'
FileName = 'AI-UPV_subtask2_1'
df_SubmationResults.to_csv(
    Path + FileName + '.tsv',
    sep='\t',
    header=False,
    index=False,
)

# Reviewing results

In [None]:
## Load the 'Submited_' copy of the predictions file (headerless TSV) and name its two columns
df = (
    pd.read_csv(Path + 'Submited_' + FileName + '.tsv', sep='\t', header=None)
    .rename(columns={0: 'id', 1: 'Inference'})
)
df.head()

Unnamed: 0,id,Inference
0,0,1
1,1,0
2,2,1
3,3,1
4,4,1


In [None]:
## Put the predictions of the current run next to the ones read from the 'Submited_' file.
## NOTE(review): the column names are inverted with respect to their contents --
## 'Inference' holds the values loaded from the 'Submited_' file, while
## 'Submited_Inference' holds the predictions just produced in df_SubmationResults.
## The mismatch count computed afterwards is symmetric, so it is not affected.
df['Submited_Inference'] = df_SubmationResults['Inference']
df.head()

Unnamed: 0,id,Inference,Submited_Inference
0,0,1,0
1,1,0,0
2,2,1,1
3,3,1,1
4,4,1,1


In [None]:
## Flag the rows where the two prediction columns disagree (1 = different, 0 = equal).
## A vectorized comparison replaces the row-wise apply/lambda: same values, no Python-level
## loop, and it also works when the frame is empty.
df['check'] = (df['Inference'] != df['Submited_Inference']).astype('int64')
df.head()

Unnamed: 0,id,Inference,Submited_Inference,check
0,0,1,0,1
1,1,0,0,0
2,2,1,1,0
3,3,1,1,0
4,4,1,1,0


In [None]:
## Number of rows where the two prediction columns differ
df['check'].sum()

112

In [None]:
# df_SubmationResults['Inference'].unique()

array([0, 1, 2])

# Utilities for when the process stops suddenly

In [None]:
# import pickle
# with open('drive/MyDrive/Code/DETOXIS/Machine-Learning-Tweets-Classification/Bert/Results/SpanishBertTask2Results' + ".pkl", "rb") as f:
#   Re = pickle.load(f)
# Re

In [None]:
# def CleanBrokeTrain(FileName, Path, NumberOfFoldes=10):
#   with open(Path + FileName + ".pkl", "rb") as f:
#               Results = pickle.load(f)

#   for BT, ModelBertType,  in Results.items():
#     for OP, OutPut in ModelBertType.items():
#       for LR, LearningRate in OutPut.items():
#         for BS, BatchSize in LearningRate.items():
#           for EP, Epoch in BatchSize.items():
#             for Metrics, ValuesCrossValidation in  Epoch.items():
 
#               if len(ValuesCrossValidation) != 0 and not len(ValuesCrossValidation) == NumberOfFoldes:
#                 Results[BT][OP][LR][BS][EP][Metrics] = []
            
#   with open(FileName + '.pkl','wb') as f:
#     pickle.dump(Results, f)

#   with open(Path + FileName + '.pkl','wb') as f:
#     pickle.dump(Results, f)

In [None]:
# Path = 'drive/MyDrive/Code/DETOXIS/Machine-Learning-Tweets-Classification/Bert/Results/'
# File = 'SpanishBertTask2Results'

In [None]:
# CleanBrokeTrain(FileName=File, Path=Path, NumberOfFoldes=10)

In [None]:
# import pickle
# with open('drive/MyDrive/Code/DETOXIS/Machine-Learning-Tweets-Classification/Bert/Results/SpanishBertTask2Results' + ".pkl", "rb") as f:
#   RE = pickle.load(f)
# RE