# Downloading Dependencies

In [None]:
# !curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
# !python pytorch-xla-env-setup.py --apt-packages libomp5 libopenblas-dev

In [None]:
# !apt-get install git-lfs

In [None]:
# !git lfs install
# !git clone https://huggingface.co/dccuchile/bert-base-spanish-wwm-uncased

In [None]:
# !git lfs install
# !git clone https://huggingface.co/bert-base-uncased

In [None]:
# !git lfs install
# !git clone https://huggingface.co/bert-base-multilingual-uncased

In [None]:
# !pip install transformers==3

# Load Dependencies

In [1]:
### Add NLP dependencies
import pickle
import os
import torch
import pandas as pd
from scipy import stats
import numpy as np
import os

from sklearn import metrics
from sklearn.model_selection import StratifiedKFold, train_test_split

from tqdm import tqdm
from collections import OrderedDict, namedtuple
import torch.nn as nn
from torch.optim import lr_scheduler
import joblib

import logging
import transformers
from transformers import AdamW, get_linear_schedule_with_warmup, get_constant_schedule
import sys
from sklearn import metrics, model_selection

import warnings
import torch_xla
import torch_xla.debug.metrics as met
import torch_xla.distributed.data_parallel as dp
import torch_xla.distributed.parallel_loader as pl
import torch_xla.utils.utils as xu
import torch_xla.core.xla_model as xm
import torch_xla.distributed.xla_multiprocessing as xmp
import torch_xla.test.test_utils as test_utils
import warnings

from torch_xla.core.xla_model import mesh_reduce

warnings.filterwarnings("ignore")

In [2]:
# Mount Google Drive so that files under ROOT can be read and written from this Colab runtime.
from google.colab import drive # Colab helper that exposes Google Drive as a local folder

ROOT = "/content/drive"     # mount point passed to drive.mount below
print(ROOT)                 # echo the mount point (optional)

drive.mount(ROOT)           # mount Google Drive at ROOT

/content/drive
Mounted at /content/drive


# Functions

In [3]:
class BERTBaseUncased(nn.Module):
    """BERT encoder followed by dropout and one linear classification layer.

    Two classification heads are created; `output_bert` selects which one
    `forward` uses:

    * ``'hidden'`` - mean-pool and max-pool the per-token encoder output over the
      sequence dimension, concatenate both, and classify with `OutPutHidden`.
    * anything else (default ``'pooler'``) - classify the encoder's pooled output
      with `OutPoller`.

    Args:
        bert_path: name or path handed to `transformers.BertModel.from_pretrained`.
        output_bert: which encoder output feeds the classifier (see above).
        NumberOfClasses: number of logits produced per example.
    """

    def __init__(self, bert_path, output_bert='pooler', NumberOfClasses=2):
        super(BERTBaseUncased, self).__init__()
        self.bert_path = bert_path
        self.bert = transformers.BertModel.from_pretrained(self.bert_path)
        self.bert_drop = nn.Dropout(0.3)
        self.output_bert = output_bert
        self.NumberOfClasses = NumberOfClasses
        # Read the width from the loaded checkpoint instead of hard-coding 768,
        # so checkpoints with a different hidden size also work.
        hidden_size = self.bert.config.hidden_size
        self.OutPutHidden = nn.Linear(hidden_size * 2, NumberOfClasses)
        self.OutPoller = nn.Linear(hidden_size, NumberOfClasses)

    def forward(
            self,
            ids,
            mask,
            token_type_ids
    ):
        """Return the raw class logits (no softmax) for a batch of encoded texts."""
        encoder_outputs = self.bert(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids)

        # Index instead of tuple-unpacking: the encoder may return more than two
        # entries (e.g. hidden states / attentions) or a ModelOutput object,
        # both of which break `o1, o2 = ...`.
        sequence_output = encoder_outputs[0]
        pooled_output = encoder_outputs[1]

        if self.output_bert == 'hidden':
            # NOTE: pooling runs over every position, padding included (the
            # attention mask is not applied here).
            apool = torch.mean(sequence_output, 1)
            mpool, _ = torch.max(sequence_output, 1)
            cat = torch.cat((apool, mpool), 1)
            bo = self.bert_drop(cat)
            output = self.OutPutHidden(bo)
        else:
            bo = self.bert_drop(pooled_output)
            output = self.OutPoller(bo)

        return output

In [4]:
class BERTDatasetTraining:
    """Indexable dataset that turns (text, label) pairs into fixed-length BERT inputs.

    Args:
        comment: indexable collection of texts.
        targets: indexable collection of labels, aligned with `comment`.
        tokenizer: object exposing `encode_plus` (a BERT tokenizer).
        max_length: length every encoded sequence is truncated / zero-padded to.
    """

    def __init__(self, comment, targets, tokenizer, max_length):
        self.comment = comment
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.targets = targets

    def __len__(self):
        return len(self.comment)

    def __getitem__(self, item):
        """Encode example `item` and return a dict of tensors.

        Keys: 'ids', 'mask', 'token_type_ids' (long, zero-padded to `max_length`)
        and 'targets' (float).
        """
        # Collapse every run of whitespace into a single space before tokenizing.
        text = " ".join(str(self.comment[item]).split())

        encoded = self.tokenizer.encode_plus(
            text,
            None,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_length,
        )

        # Right-pad every sequence with zeros up to max_length.
        pad = [0] * (self.max_length - len(encoded["input_ids"]))

        sample = {}
        for out_key, encoded_key in (('ids', "input_ids"),
                                     ('mask', "attention_mask"),
                                     ('token_type_ids', "token_type_ids")):
            sample[out_key] = torch.tensor(encoded[encoded_key] + pad, dtype=torch.long)
        sample['targets'] = torch.tensor(self.targets[item], dtype=torch.float)
        return sample

In [5]:
class TrainModel():
  """Fine-tune and validate one BERT configuration on XLA devices.

  The constructor only stores the configuration. `_run` does the work and is
  written to be executed inside every process started by `xmp.spawn`.

  Per-epoch scores are appended to the nested results dictionary stored in
  `PathSaveFiles + FileName + '.pkl'`, addressed as
  `Results[BertVersion][OutputBert][LearningRate][BatchSize][epoch][metric]`.
  That file must already exist when `_run` starts.

  Args:
    PathSaveFiles: folder prefix (including trailing separator) for the results
      pickle and the optional model checkpoint.
    BertVersion: key of the BERT variant inside the results dictionary.
    BertPath: path/name given to `BertTokenizer.from_pretrained`.
    OutputBert: key of the classifier-head variant inside the results dictionary.
    LearningRate: base learning rate (also a results-dictionary key).
    BatchSize: per-process training batch size (also a results-dictionary key).
    Epochs: number of training epochs.
    FileName: results file name without the '.pkl' extension.
    X_train, X_valid: texts; must expose `.values`.
    y_train, y_valid: labels; must expose `.values`, and `y_train` `.nunique()`.
    MaxLen: token length every example is truncated / padded to.
    SaveModel: if True, save the model's state dict after the last epoch.
    Model: optional `nn.Module` to train. When None (default), `_run` falls back
      to the notebook-level global `mx`, as it always did.
  """

  def __init__(self, PathSaveFiles, BertVersion, BertPath,  OutputBert, LearningRate, BatchSize, Epochs, FileName, X_train, X_valid, y_train ,y_valid, MaxLen = 110, SaveModel=False, Model=None):
    self.BertVersion = BertVersion
    self.BertPath = BertPath
    self.OutputBert = OutputBert
    self.LearningRate = LearningRate
    self.BatchSize = BatchSize
    self.Epochs = Epochs
    self.FileName = FileName
    self.X_train = X_train
    self.X_valid = X_valid
    self.y_train = y_train
    self.y_valid = y_valid
    self.NumberOfLabels = y_train.nunique()
    # sklearn averaging mode for recall / precision (and macro F1 in the multi-class case)
    self.average_metrics =  'macro' if self.NumberOfLabels > 2 else 'binary'
    self.PathSaveFiles = PathSaveFiles
    self.MaxLen = MaxLen
    self.SaveModel = SaveModel
    self.Model = Model


  def _run(self):
    """Train for `Epochs` epochs, validating and recording metrics after each one."""
    results_path = self.PathSaveFiles + self.FileName + ".pkl"

    def OpenEndSave(CurrentEpoch, module):
      """Load the results pickle before the first epoch; write it back after the last."""
      if module == 'open' and CurrentEpoch == 1:
        # NOTE: pickle.load executes arbitrary code; only open results files this notebook wrote.
        with open(results_path, "rb") as f:
          self.Results = pickle.load(f)

      elif module == 'save' and CurrentEpoch == self.Epochs:
        # Every process holds the same mesh-reduced values, so a single writer is
        # enough; letting all processes write the same file at once can corrupt it.
        if xm.is_master_ordinal():
          with open(results_path, 'wb') as f:
            pickle.dump(self.Results, f)

    def epoch_results(epoch):
      """Return the {metric: [values]} dict of the current configuration for `epoch`."""
      return self.Results[self.BertVersion][self.OutputBert][self.LearningRate][self.BatchSize][epoch]

    def loss_fn(outputs, targets):
      # CrossEntropyLoss takes class indices as an integer tensor; the dataset
      # yields float targets, so cast here.
      return nn.CrossEntropyLoss()(outputs, targets.long())

    def train_loop_fn(data_loader, model, optimizer, device, scheduler=None, epoch=None):
      """Run one training epoch; record the mesh-averaged loss every 10th batch."""
      model.train()
      for bi, d in enumerate(data_loader):
        ids = d["ids"].to(device, dtype=torch.long)
        mask = d["mask"].to(device, dtype=torch.long)
        token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
        targets = d["targets"].to(device, dtype=torch.long)

        optimizer.zero_grad()
        outputs = model(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )

        loss = loss_fn(outputs, targets)
        if bi % 10 == 0:
          xm.master_print(f'bi={bi}, loss={loss}')

          ValueLoss = loss.cpu().detach().numpy().tolist()
          # mesh_reduce is a collective call: every process must reach it.
          ValueLoss = xm.mesh_reduce('test_loss', ValueLoss, np.mean)
          epoch_results(epoch)['loss'].append(ValueLoss)

        loss.backward()
        xm.optimizer_step(optimizer)
        if scheduler is not None:
          scheduler.step()

    def eval_loop_fn(data_loader, model, device):
      """Return (predicted class ids, target values) for this process's validation shard."""
      model.eval()
      fin_targets = []
      fin_outputs = []
      # No gradients are needed for validation; skipping them saves memory and time.
      with torch.no_grad():
        for bi, d in enumerate(data_loader):
          ids = d["ids"].to(device, dtype=torch.long)
          mask = d["mask"].to(device, dtype=torch.long)
          token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
          targets = d["targets"].to(device, dtype=torch.float)

          outputs = model(
              ids=ids,
              mask=mask,
              token_type_ids=token_type_ids
          )

          targets_np = targets.cpu().detach().numpy().tolist()
          outputs = torch.argmax(outputs, dim=1)
          outputs_np = outputs.detach().cpu().numpy().tolist()

          fin_targets.extend(targets_np)
          fin_outputs.extend(outputs_np)

      return fin_outputs, fin_targets

    # tokenizer
    tokenizer = transformers.BertTokenizer.from_pretrained(self.BertPath, do_lower_case=True)

    train_dataset = BERTDatasetTraining(
        comment=self.X_train.values,
        targets=self.y_train.values,
        tokenizer=tokenizer,
        max_length=self.MaxLen
    )

    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset,
        num_replicas=xm.xrt_world_size(),
        rank=xm.get_ordinal(),
        shuffle=True)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=self.BatchSize,
        sampler=train_sampler,
        drop_last=True,
        num_workers=1
    )

    valid_dataset = BERTDatasetTraining(
        comment=self.X_valid.values,
        targets=self.y_valid.values,
        tokenizer=tokenizer,
        max_length=self.MaxLen
    )

    valid_sampler = torch.utils.data.distributed.DistributedSampler(
        valid_dataset,
        num_replicas=xm.xrt_world_size(),
        rank=xm.get_ordinal(),
        shuffle=False)

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=16,
        sampler=valid_sampler,
        drop_last=False,
        num_workers=1
    )

    device = xm.xla_device()
    # Prefer the explicitly supplied model; otherwise fall back to the notebook
    # global `mx` created by the training cell.
    base_model = self.Model if self.Model is not None else mx
    model = base_model.to(device)

    # No weight decay for biases and LayerNorm parameters.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]

    # NOTE(review): the learning rate is scaled by 0.4 * world size; the 0.4 factor is
    # not explained anywhere in the notebook - confirm it is intended.
    lr = 0.4 * self.LearningRate * xm.xrt_world_size()
    num_train_steps = int(len(train_dataset) / self.BatchSize / xm.xrt_world_size() * self.Epochs)
    xm.master_print(f'num_train_steps = {num_train_steps}, world_size={xm.xrt_world_size()}')

    optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps
    )

    for epoch in range(1, self.Epochs+1):
      ## print epoch
      xm.master_print(f'Epoch: {epoch} of {self.Epochs}')
      ## Open file to save results
      OpenEndSave(CurrentEpoch=epoch, module='open')

      # Without set_epoch a shuffling DistributedSampler yields the same order every epoch.
      train_sampler.set_epoch(epoch)

      para_loader = pl.ParallelLoader(train_data_loader, [device])
      train_loop_fn(para_loader.per_device_loader(device), model, optimizer, device, scheduler=scheduler, epoch=epoch)

      para_loader = pl.ParallelLoader(valid_data_loader, [device])
      o, t = eval_loop_fn(para_loader.per_device_loader(device), model, device)

      # Each metric is computed on this process's shard and then averaged across processes.
      scores = epoch_results(epoch)
      if self.NumberOfLabels == 2:
        f1 = xm.mesh_reduce('validation_f1', metrics.f1_score(t, o), np.mean)
        scores['f1'].append(f1)

      else:
        scores['f1_macro'].append(xm.mesh_reduce('validation_f1_macro', metrics.f1_score(t, o, average=self.average_metrics), np.mean))
        scores['f1_weighted'].append(xm.mesh_reduce('validation_f1_weighted', metrics.f1_score(t, o, average='weighted'), np.mean))

      accuracy = metrics.accuracy_score(t, o)
      accuracy = xm.mesh_reduce('test_accuracy', accuracy, np.mean)
      scores['accuracy'].append(accuracy)
      scores['recall'].append(xm.mesh_reduce('validation_recall', metrics.recall_score(t, o, average=self.average_metrics), np.mean))
      scores['precision'].append(xm.mesh_reduce('validation_precison', metrics.precision_score(t, o, average=self.average_metrics), np.mean))

      ## save file with save results
      OpenEndSave(CurrentEpoch=epoch, module='save')

      ## Save model
      if self.SaveModel and epoch == self.Epochs:
        xm.save(model.state_dict(), self.PathSaveFiles + self.FileName + '.bin')

      ## print accuracy
      xm.master_print(f'Accuracy = {accuracy}')


# Load data

In [6]:
# Load the training data and derive numeric label columns

#### Location of the dataset
PathDataSet = '../content/drive/MyDrive/Code/EXITS/Data/'
FileDataset = 'EXIST2021_translatedTraining'
#### Read the CSV file into a DataFrame (first column is the index)
df_train = pd.read_csv(PathDataSet + FileDataset + '.csv', index_col=0)

#### Task 1 label: 1 for 'sexist', 0 for everything else
df_train['LabelTask1'] = df_train['task1'].apply(lambda task1_name: int(task1_name == 'sexist'))

#### Task 2 label: position of the category in a list that starts with 'non-sexist'
CategorisList = df_train['task2'].unique().tolist()
CategorisList.insert(0, CategorisList.pop(CategorisList.index('non-sexist')))
CategoriSexism = {category: position for position, category in enumerate(CategorisList)}
df_train['LabelTask2'] = df_train['task2'].apply(lambda task2_name: CategoriSexism[task2_name])

#### Keep the column names for later reference
TestColumnNames = df_train.columns.tolist()
#### Preview the data
df_train.head()

Unnamed: 0,test_case,id,source,language,text,task1,task2,English,Spanish,LabelTask1,LabelTask2
0,EXIST2021,1,twitter,en,"She calls herself ""anti-feminazi"" how about sh...",sexist,ideological-inequality,"She calls herself ""anti-feminazi"" how about sh...","Ella se llama ""anti-feminazi"", ¿cómo se acerca...",1,1
1,EXIST2021,2,twitter,en,"Now, back to these women, the brave and the be...",non-sexist,non-sexist,"Now, back to these women, the brave and the be...","Ahora, de vuelta a estas mujeres, la valiente ...",0,0
2,EXIST2021,3,twitter,en,"@CurvyBandida @Xalynne_B Wow, your skirt is ve...",sexist,objectification,"@CurvyBandida @Xalynne_B Wow, your skirt is ve...","@Curvybandida @xalynne_b wow, tu falda es muy ...",1,2
3,EXIST2021,4,twitter,en,@AurelieGuiboud Incredible! Beautiful!But I l...,non-sexist,non-sexist,@AurelieGuiboud Incredible! Beautiful!But I l...,@Aurelieguiboud increíble!¡Hermoso! Pero me re...,0,0
4,EXIST2021,5,twitter,en,i find it extremely hard to believe that kelly...,non-sexist,non-sexist,i find it extremely hard to believe that kelly...,Me parece extremadamente difícil creer que Kel...,0,0


In [7]:
######################################################
############## Modify CODE ###########################
######################################################

#### Choose which label / text columns are used for training
LabelColumn = "LabelTask2"      ## options: "LabelTask1", "LabelTask2"
DataColumn = "English"          ## options: "text", "English", "Spanish"

#### The chosen columns are exposed under the generic names "Data" and "Label"
NewColumnsNames = {
    DataColumn: "Data",
    LabelColumn: "Label",
}
df_train = df_train.rename(columns=NewColumnsNames)
# Optional shuffle, currently disabled:
# df_train = df_train.sample(frac=1).reset_index(drop=True)

#### Show the renamed frame
df_train

Unnamed: 0,test_case,id,source,language,text,task1,task2,Data,Spanish,LabelTask1,Label
0,EXIST2021,1,twitter,en,"She calls herself ""anti-feminazi"" how about sh...",sexist,ideological-inequality,"She calls herself ""anti-feminazi"" how about sh...","Ella se llama ""anti-feminazi"", ¿cómo se acerca...",1,1
1,EXIST2021,2,twitter,en,"Now, back to these women, the brave and the be...",non-sexist,non-sexist,"Now, back to these women, the brave and the be...","Ahora, de vuelta a estas mujeres, la valiente ...",0,0
2,EXIST2021,3,twitter,en,"@CurvyBandida @Xalynne_B Wow, your skirt is ve...",sexist,objectification,"@CurvyBandida @Xalynne_B Wow, your skirt is ve...","@Curvybandida @xalynne_b wow, tu falda es muy ...",1,2
3,EXIST2021,4,twitter,en,@AurelieGuiboud Incredible! Beautiful!But I l...,non-sexist,non-sexist,@AurelieGuiboud Incredible! Beautiful!But I l...,@Aurelieguiboud increíble!¡Hermoso! Pero me re...,0,0
4,EXIST2021,5,twitter,en,i find it extremely hard to believe that kelly...,non-sexist,non-sexist,i find it extremely hard to believe that kelly...,Me parece extremadamente difícil creer que Kel...,0,0
...,...,...,...,...,...,...,...,...,...,...,...
6972,EXIST2021,6973,twitter,es,"Estamos igual sin pareja, pero puedes besar a ...",non-sexist,non-sexist,"We are the same without a partner, but you can...","Estamos igual sin pareja, pero puedes besar a ...",0,0
6973,EXIST2021,6974,twitter,es,2020 hijo de re mil putas,non-sexist,non-sexist,2020 son of re thousand whores,2020 hijo de re mil putas,0,0
6974,EXIST2021,6975,twitter,es,SEGURAMENTE ESTA CHICA NO COBRA EL DINERO QUE ...,non-sexist,non-sexist,Surely this girl does not charge the money I w...,SEGURAMENTE ESTA CHICA NO COBRA EL DINERO QUE ...,0,0
6975,EXIST2021,6976,twitter,es,@safetyaitana mi madre dice q va fea y i agree,sexist,objectification,@safetyaitana my mother says that goes ugly an...,@safetyaitana mi madre dice q va fea y i agree,1,2


In [8]:
######################################################
############## Modify CODE ###########################
######################################################

## Select the data used for training
LanguageTrain = 'en'        ## 'whole', 'en', 'es'

# Select rows by language with a boolean mask. The previous label-range slice
# (.loc[first_index:last_index]) only works while all rows of a language are
# contiguous; otherwise it silently pulls in rows of the other language.
# .copy() makes these independent frames so later column assignments are safe.
df_train_es = df_train[df_train['language'] == 'es'].copy()
df_train_en = df_train[df_train['language'] == 'en'].copy()

In [9]:
## Stratified hold-out: sample 20% of the rows of every label, per language (fixed seed)
df_test_es = df_train_es.groupby(['Label']).apply(lambda x: x.sample(frac=0.2, random_state=48))
df_test_en = df_train_en.groupby(['Label']).apply(lambda x: x.sample(frac=0.2, random_state=48))
df_test_whole = pd.concat([df_test_es,df_test_en])

# Select the train / test frames for the chosen language.
# The comparison is case-insensitive so that 'Whole' and 'whole' both work.
_language_choice = LanguageTrain.lower()
if _language_choice == 'whole':
  df_test = df_test_whole
elif _language_choice == 'es':
  df_test = df_test_es
  df_train = df_train_es
elif _language_choice == 'en':
  df_test = df_test_en
  df_train = df_train_en
else:
  # Previously this only printed 'wrong data' and the cell then failed with a
  # NameError on df_test; fail with a clear message instead.
  raise ValueError(f"LanguageTrain must be 'whole', 'es' or 'en', got {LanguageTrain!r}")

df_test.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,test_case,id,source,language,text,task1,task2,Data,Spanish,LabelTask1,Label
Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,1540,EXIST2021,1541,twitter,en,"But yeah, Chuck yer pound and tin of beans in ...",non-sexist,non-sexist,"But yeah, Chuck yer pound and tin of beans in ...","Pero sí, Chuck yer libra y lata de frijoles y ...",0,0
0,175,EXIST2021,176,twitter,en,@idew2 @MsButterflyyy In a tweet about systemi...,non-sexist,non-sexist,@idew2 @MsButterflyyy In a tweet about systemi...,@ IDEW2 @MSButterflyyy en un tweet sobre el ra...,0,0
0,1081,EXIST2021,1082,twitter,en,@Crryptiic @nagitoosimp Isn’t it just sexual h...,non-sexist,non-sexist,@Crryptiic @nagitoosimp Isn’t it just sexual h...,@Crryptiic @nagitoosimp no es solo acoso sexua...,0,0
0,450,EXIST2021,451,twitter,en,@realDonaldTrump these are past Speech Topics ...,non-sexist,non-sexist,@realDonaldTrump these are past Speech Topics ...,@realdonaldtrump Estos son los temas anteriore...,0,0
0,2953,EXIST2021,2954,twitter,en,Like I don't really care and misandry ain't re...,non-sexist,non-sexist,Like I don't really care and misandry ain't re...,Como realmente no me importa y no sea realment...,0,0


In [10]:
# groupby(...).apply(...) added 'Label' as an outer index level; drop it so that
# only the original row labels remain. df_test is included because the next
# cell uses its index to remove these rows from the training frame.
df_test_es, df_test_en, df_test_whole, df_test = [
    frame.reset_index(level=0, drop=True)
    for frame in (df_test_es, df_test_en, df_test_whole, df_test)
]

# Preview the result
df_test.head()

Unnamed: 0,test_case,id,source,language,text,task1,task2,Data,Spanish,LabelTask1,Label
1540,EXIST2021,1541,twitter,en,"But yeah, Chuck yer pound and tin of beans in ...",non-sexist,non-sexist,"But yeah, Chuck yer pound and tin of beans in ...","Pero sí, Chuck yer libra y lata de frijoles y ...",0,0
175,EXIST2021,176,twitter,en,@idew2 @MsButterflyyy In a tweet about systemi...,non-sexist,non-sexist,@idew2 @MsButterflyyy In a tweet about systemi...,@ IDEW2 @MSButterflyyy en un tweet sobre el ra...,0,0
1081,EXIST2021,1082,twitter,en,@Crryptiic @nagitoosimp Isn’t it just sexual h...,non-sexist,non-sexist,@Crryptiic @nagitoosimp Isn’t it just sexual h...,@Crryptiic @nagitoosimp no es solo acoso sexua...,0,0
450,EXIST2021,451,twitter,en,@realDonaldTrump these are past Speech Topics ...,non-sexist,non-sexist,@realDonaldTrump these are past Speech Topics ...,@realdonaldtrump Estos son los temas anteriore...,0,0
2953,EXIST2021,2954,twitter,en,Like I don't really care and misandry ain't re...,non-sexist,non-sexist,Like I don't really care and misandry ain't re...,Como realmente no me importa y no sea realment...,0,0


In [11]:
# Take the held-out test rows (matched by index label) out of the training frame
df_train = df_train.drop(index=df_test.index)
df_train.head()

Unnamed: 0,test_case,id,source,language,text,task1,task2,Data,Spanish,LabelTask1,Label
0,EXIST2021,1,twitter,en,"She calls herself ""anti-feminazi"" how about sh...",sexist,ideological-inequality,"She calls herself ""anti-feminazi"" how about sh...","Ella se llama ""anti-feminazi"", ¿cómo se acerca...",1,1
1,EXIST2021,2,twitter,en,"Now, back to these women, the brave and the be...",non-sexist,non-sexist,"Now, back to these women, the brave and the be...","Ahora, de vuelta a estas mujeres, la valiente ...",0,0
2,EXIST2021,3,twitter,en,"@CurvyBandida @Xalynne_B Wow, your skirt is ve...",sexist,objectification,"@CurvyBandida @Xalynne_B Wow, your skirt is ve...","@Curvybandida @xalynne_b wow, tu falda es muy ...",1,2
3,EXIST2021,4,twitter,en,@AurelieGuiboud Incredible! Beautiful!But I l...,non-sexist,non-sexist,@AurelieGuiboud Incredible! Beautiful!But I l...,@Aurelieguiboud increíble!¡Hermoso! Pero me re...,0,0
5,EXIST2021,6,twitter,en,@Smithcouple971 Hello....m raj....m with good ...,sexist,sexual-violence,@Smithcouple971 Hello....m raj....m with good ...,@ Smithcouple971 Hola .... M Raj .... M con de...,1,3


In [12]:
# For task 2: drop the non-sexist rows and re-base the labels; then reset every index.

def _drop_non_sexist_and_shift(frame):
  """Return `frame` without label-0 rows and with the remaining labels reduced by one.

  Uses `.assign` on the filtered frame, so no column is assigned on a slice
  (no chained-assignment warning) and the input frame is left untouched.
  """
  kept = frame[frame['Label'] != 0]
  return kept.assign(Label=kept['Label'] - 1)


#### Remove non-sexist rows if task 2 (more than two distinct labels)
if df_train['Label'].nunique() > 2:
  # Train
  df_train, df_train_es, df_train_en = [
      _drop_non_sexist_and_shift(frame)
      for frame in (df_train, df_train_es, df_train_en)
  ]
  # Test
  df_test, df_test_whole, df_test_en, df_test_es = [
      _drop_non_sexist_and_shift(frame)
      for frame in (df_test, df_test_whole, df_test_en, df_test_es)
  ]

#### Reset index
df_train, df_train_es, df_train_en = [
    frame.reset_index(drop=True)
    for frame in (df_train, df_train_es, df_train_en)
]
df_test, df_test_whole, df_test_en, df_test_es = [
    frame.reset_index(drop=True)
    for frame in (df_test, df_test_whole, df_test_en, df_test_es)
]

df_test.head()

Unnamed: 0,test_case,id,source,language,text,task1,task2,Data,Spanish,LabelTask1,Label
0,EXIST2021,1166,twitter,en,@DJVeronica what you describe as #sexism sti...,sexist,ideological-inequality,@DJVeronica what you describe as #sexism sti...,@DJVERONICA ¿Qué describe como #SEXISIO todaví...,1,0
1,EXIST2021,1769,twitter,en,WW 84 is a major disappointment. This powerful...,sexist,ideological-inequality,WW 84 is a major disappointment. This powerful...,WW 84 es una gran decepción.¿Este poderoso gue...,1,0
2,EXIST2021,2704,twitter,en,@Ellaschmella Cause women fearing men in wigs ...,sexist,ideological-inequality,@Ellaschmella Cause women fearing men in wigs ...,@Ellaschmella porque las mujeres que temen a l...,1,0
3,EXIST2021,3083,twitter,en,There's a lot of talk about respect and equali...,sexist,ideological-inequality,There's a lot of talk about respect and equali...,Hay mucha conversación sobre el respeto y la i...,1,0
4,EXIST2021,3384,twitter,en,Talked to a feminazi about this and she explai...,sexist,ideological-inequality,Talked to a feminazi about this and she explai...,¡Hablé con un feminazi sobre esto y explicó có...,1,0


# Load Weights

In [13]:
def CriateFileName(BertVersionDict, NumberOfClasses):
  """Build the base file name: all keys of `BertVersionDict` concatenated, plus a task suffix.

  The suffix is 'Task2' when `NumberOfClasses` is greater than 2, otherwise 'Task1'.
  """
  task_suffix = 'Task2' if NumberOfClasses > 2 else 'Task1'
  return ''.join(BertVersionDict.keys()) + task_suffix

In [None]:
# BertVersion = {'EnglishBert':'../content/bert-base-uncased/', 'SpanishBert':'../content/bert-base-spanish-wwm-uncased/', 'MultilingualBert':'../content/bert-base-multilingual-uncased/'}
# OutputBert = ['hidden', 'pooler']
# LearningRate = [2e-5, 3e-5, 5e-5]
# BatchSize = [32, 64]
# Epochs = 8

In [14]:
######################################################
############## Modify CODE - BERT model ##############
######################################################

## Hyper-parameter grid explored by the training loop
# BERT variant name -> local checkpoint folder
BertVersion = {
    'EnglishBert': '../content/bert-base-uncased/',
}
# Which encoder output feeds the classifier head
OutputBert = ['hidden', 'pooler']
# Base learning rates
LearningRate = [2e-5, 3e-5, 5e-5]
# Training batch sizes
BatchSize = [32, 64]
# Number of epochs per run
Epochs = 8

In [15]:
## Evaluation metrics
###### Task 1
MetricsTask1 = ['accuracy', 'f1', 'recall', 'precision']
###### Task 2
MetricsTask2 = ['accuracy', 'f1_macro', 'f1_weighted', 'recall', 'precision']

## More than two distinct labels means task 2 (multi-class); otherwise task 1 (binary)
if df_train['Label'].nunique() > 2:
  Metrics = MetricsTask2
else:
  Metrics = MetricsTask1

## Empty results dictionary:
## bert -> output -> learning rate -> batch size -> epoch -> metric -> list of values
ResultsTask = {
    bert_name: {
        output_name: {
            lr_value: {
                batch_size: {
                    epoch_number: {metric_name: [] for metric_name in Metrics + ['loss']}
                    for epoch_number in range(1, Epochs+1)
                }
                for batch_size in BatchSize
            }
            for lr_value in LearningRate
        }
        for output_name in OutputBert
    }
    for bert_name in BertVersion.keys()
}

In [16]:
## Output folder: <results root>/<bert names joined by '_'>_<language>DataTrain/
BertModels = ''.join(bert_name + '_' for bert_name in BertVersion.keys())
Folder = BertModels + LanguageTrain
Path = 'drive/MyDrive/Code/EXITS/Machine-Learning-Tweets-Classification/Bert/Results/' + Folder + 'DataTrain' + '/'

## Create the output folder when it is missing
if not os.path.exists(Path):
  print(f'Criate folder : {Folder}' )
  print(f'Path : {Path}')
  os.makedirs(Path)

## Common prefix of every file written for this configuration
MainParteBertFileName = CriateFileName(BertVersion, NumberOfClasses=df_train['Label'].nunique()) + LanguageTrain

## Seed the results file with the empty results dictionary when it is missing
FileResults = MainParteBertFileName + 'DataTrain' + '_Results'
if not os.path.exists(Path + FileResults + '.pkl'):
  print(f'Creating File for results : {FileResults}.pkl')
  print(f'File Path : {Path}')
  with open(Path + FileResults + ".pkl",'wb') as results_file:
    pickle.dump(ResultsTask, results_file)

Creating File for results : EnglishBertTask2enDataTrain_Results.pkl
File Path : drive/MyDrive/Code/EXITS/Machine-Learning-Tweets-Classification/Bert/Results/EnglishBert_enDataTrain/


# Train

In [17]:
### Cross validation: 10 stratified folds for every (BERT, output, learning rate, batch size)
for BertV, BertPath in BertVersion.items():
  for OutputB in OutputBert:

    ### Load the pre-trained BERT weights.
    ### NOTE: TrainModel._run reads this model through the global name `mx`; keep the name.
    mx = BERTBaseUncased(bert_path=BertPath, output_bert=OutputB, NumberOfClasses=df_train['Label'].nunique())

    for lr in LearningRate:
      for Batch in BatchSize:

        ## StratifiedKFold
        skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=48)
        for fold, (train_index, valid_index) in enumerate(skf.split(df_train['Data'], df_train['Label']), start=1):
          # split() yields positional indices, so select with .iloc; .loc only
          # matched by accident while the index happened to be 0..n-1.
          X_train, X_valid = df_train['Data'].iloc[train_index], df_train['Data'].iloc[valid_index]
          y_train, y_valid = df_train['Label'].iloc[train_index], df_train['Label'].iloc[valid_index]

          print(f'parameters: Bertmodel: {BertV}, Output: {OutputB}, lr: {lr}, Batch: {Batch}, Totsl Num. Epochs: {Epochs}, Fold: {fold}')
          MoDeL = TrainModel(PathSaveFiles = Path,
                            BertVersion=BertV,
                            BertPath=BertPath,
                            OutputBert=OutputB,
                            LearningRate=lr,
                            BatchSize=Batch,
                            Epochs=Epochs,
                            FileName= FileResults,
                            X_train=X_train,
                            X_valid=X_valid,
                            y_train=y_train,
                            y_valid=y_valid)

          def _mp_fn(rank, flags):
            # Entry point executed in every spawned process.
            torch.set_default_tensor_type('torch.FloatTensor')
            MoDeL._run()

          FLAGS={}
          # 8 forked processes; each one runs _mp_fn (and therefore MoDeL._run)
          xmp.spawn(_mp_fn, args=(FLAGS,), nprocs=8, start_method='fork')

parameters: Bertmodel: EnglishBert, Output: hidden, lr: 2e-05, Batch: 32, Totsl Num. Epochs: 8, Fold: 1
num_train_steps = 36, world_size=8
Epoch: 1 of 8
bi=0, loss=1.7083185911178589
Accuracy = 0.33088235294117646
Epoch: 2 of 8
bi=0, loss=1.5461724996566772
Accuracy = 0.4117647058823529
Epoch: 3 of 8
bi=0, loss=1.3166544437408447
Accuracy = 0.5
Epoch: 4 of 8
bi=0, loss=1.310128092765808
Accuracy = 0.5661764705882353
Epoch: 5 of 8
bi=0, loss=1.1363345384597778
Accuracy = 0.5808823529411765
Epoch: 6 of 8
bi=0, loss=1.0931962728500366
Accuracy = 0.6029411764705883
Epoch: 7 of 8
bi=0, loss=0.9068434238433838
Accuracy = 0.6176470588235294
Epoch: 8 of 8
bi=0, loss=0.7984212636947632
Accuracy = 0.6323529411764706
parameters: Bertmodel: EnglishBert, Output: hidden, lr: 2e-05, Batch: 32, Totsl Num. Epochs: 8, Fold: 2
num_train_steps = 36, world_size=8
Epoch: 1 of 8
bi=0, loss=1.673231601715088
Accuracy = 0.36764705882352944
Epoch: 2 of 8
bi=0, loss=1.5407172441482544
Accuracy = 0.42647058823529

In [18]:
def AveragResults(FileName, Path):
  """Replace every list of per-fold values in a results pickle by its mean and save the result.

  Reads `Path + FileName + '.pkl'`, averages each metric list with `np.mean`, and
  writes the averaged dictionary to 'Average<FileName>.pkl' twice: once in the
  current working directory and once under `Path`.

  Returns:
    The averaged results dictionary.
  """
  # NOTE: pickle.load can execute arbitrary code; only read files produced by this notebook.
  with open(Path + FileName + ".pkl", "rb") as source_file:
    Results = pickle.load(source_file)

  # Nesting: bert -> output -> learning rate -> batch size -> epoch -> metric -> values
  for per_bert in Results.values():
    for per_output in per_bert.values():
      for per_lr in per_output.values():
        for per_batch in per_lr.values():
          for epoch_metrics in per_batch.values():
            for metric_name in list(epoch_metrics):
              epoch_metrics[metric_name] = np.mean(epoch_metrics[metric_name])

  average_name = 'Average' + FileName + '.pkl'
  for destination in (average_name, Path + average_name):
    with open(destination, 'wb') as target_file:
      pickle.dump(Results, target_file)

  return Results

In [19]:
## Average the per-fold values stored in the results file and save the averaged dictionary
AverageResultsTask = AveragResults(FileName=FileResults, Path=Path)

In [20]:
### Build a DataFrame from the nested results dictionary
def create_Data_Frame(all_resultas):
  """Flatten a five-level nested results dict into a pandas DataFrame.

  Args:
    all_resultas: Dict nested as Bert type -> output type -> learning rate ->
      batch size -> epoch, whose innermost value is a dict of metric -> value.

  Returns:
    A DataFrame with one row per (Bert type, output type, learning rate,
    batch size, epoch) tuple and one column per metric.
  """
  rows = {}
  for BertType, by_output in all_resultas.items():
    for OutpuType, by_learning_rate in by_output.items():
      for LearningRate, by_batch_size in by_learning_rate.items():
        for BactSize, by_epoch in by_batch_size.items():
          for Epochs, metric_values in by_epoch.items():
            # The 5-tuple of grid coordinates becomes the row label.
            rows[(BertType, OutpuType, LearningRate, BactSize, Epochs)] = metric_values

  df_results = pd.DataFrame.from_dict(rows, orient='index')
  return df_results

In [21]:
## Flatten the averaged results into a DataFrame (one row per configuration and epoch)
DfResultsTask = create_Data_Frame(all_resultas=AverageResultsTask)

### Persist the averaged results next to the pickles as CSV
DfResultsTask.to_csv(f"{Path}Average{FileResults}_CSV_.csv")

### Display the averaged results
DfResultsTask

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,accuracy,f1_macro,f1_weighted,recall,precision,loss
EnglishBert,hidden,0.00002,32,1,0.359559,0.231029,0.278085,0.316124,0.215693,1.687969
EnglishBert,hidden,0.00002,32,2,0.450735,0.350277,0.409299,0.401131,0.381529,1.528070
EnglishBert,hidden,0.00002,32,3,0.516912,0.443300,0.495002,0.475812,0.497231,1.390925
EnglishBert,hidden,0.00002,32,4,0.545588,0.478486,0.531959,0.504625,0.526306,1.266111
EnglishBert,hidden,0.00002,32,5,0.585294,0.530026,0.578710,0.555557,0.572571,1.128491
EnglishBert,...,...,...,...,...,...,...,...,...,...
EnglishBert,pooler,0.00005,64,4,0.382353,0.262199,0.311148,0.330961,0.276264,1.544226
EnglishBert,pooler,0.00005,64,5,0.461029,0.360397,0.416171,0.411669,0.383740,1.467250
EnglishBert,pooler,0.00005,64,6,0.511029,0.428818,0.483117,0.463879,0.467274,1.359216
EnglishBert,pooler,0.00005,64,7,0.550735,0.481228,0.530060,0.513569,0.511699,1.227243


In [22]:
## Print the averaged results as a LaTeX table, using the results file name as the table label
LabelTaskTable = FileResults
print(
    DfResultsTask.to_latex(
        label=LabelTaskTable,
        multirow=False,
        multicolumn=True,
    )
)

\begin{table}
\centering
\label{EnglishBertTask2enDataTrain_Results}
\begin{tabular}{lllllrrrrrr}
\toprule
            &        &         &    &   &  accuracy &  f1\_macro &  f1\_weighted &    recall &  precision &      loss \\
\midrule
EnglishBert & hidden & 0.00002 & 32 & 1 &  0.359559 &  0.231029 &     0.278085 &  0.316124 &   0.215693 &  1.687969 \\
            &        &         &    & 2 &  0.450735 &  0.350277 &     0.409299 &  0.401131 &   0.381529 &  1.528070 \\
            &        &         &    & 3 &  0.516912 &  0.443300 &     0.495002 &  0.475812 &   0.497231 &  1.390925 \\
            &        &         &    & 4 &  0.545588 &  0.478486 &     0.531959 &  0.504625 &   0.526306 &  1.266111 \\
            &        &         &    & 5 &  0.585294 &  0.530026 &     0.578710 &  0.555557 &   0.572571 &  1.128491 \\
            &        &         &    & 6 &  0.598529 &  0.542111 &     0.593318 &  0.565247 &   0.584045 &  1.013834 \\
            &        &         &    & 7 &  0.6058

# Inference

## Train the model with the full training dataset

In [23]:
## Ranking metric: macro F1 when there are more than two labels, accuracy otherwise
if df_train['Label'].nunique() > 2:
  MetricForBestResults = 'f1_macro'
else:
  MetricForBestResults = 'accuracy'

## The 10 best results according to that metric
DfResultsTask.nlargest(10, MetricForBestResults)

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,accuracy,f1_macro,f1_weighted,recall,precision,loss
EnglishBert,hidden,3e-05,32,8,0.661029,0.610116,0.659015,0.633289,0.647172,0.461132
EnglishBert,hidden,5e-05,32,8,0.659559,0.608296,0.659217,0.628988,0.646696,0.231184
EnglishBert,hidden,5e-05,32,7,0.654412,0.601484,0.651375,0.62258,0.638208,0.327207
EnglishBert,hidden,3e-05,32,7,0.656618,0.600296,0.651703,0.623763,0.639326,0.57349
EnglishBert,pooler,5e-05,32,8,0.655147,0.596975,0.651494,0.621545,0.629893,0.391648
EnglishBert,pooler,5e-05,32,7,0.652941,0.596693,0.648557,0.619618,0.638363,0.51478
EnglishBert,hidden,5e-05,32,6,0.656618,0.59643,0.65188,0.620368,0.634687,0.477059
EnglishBert,hidden,5e-05,32,5,0.647794,0.596282,0.647,0.621449,0.642448,0.699217
EnglishBert,hidden,3e-05,32,6,0.643382,0.590333,0.642672,0.610534,0.628509,0.731223
EnglishBert,pooler,5e-05,32,6,0.643382,0.582051,0.63553,0.611128,0.614558,0.713569


In [24]:
## Index (Bert type, output, learning rate, batch size, epoch) of the top-ranked row
BestResultParameters = (
    DfResultsTask
    .sort_values(by=MetricForBestResults, ascending=False)
    .head(1)
    .index
)
print(f'Best parameters : {BestResultParameters}')

Best parameters : MultiIndex([('EnglishBert', 'hidden', 3e-05, 32, 8)],
           )


In [25]:
## Narrow the search variables down to the single best configuration for the final training.
## The tuple layout is (Bert type, output, learning rate, batch size, epoch).
_best = BestResultParameters[0]
BertPath = BertVersion[_best[0]]
BertVersion = {_best[0]: BertPath}    # keep only the winning Bert entry
OutputBert = [_best[1]]
LearningRate = [float(_best[2])]
BatchSize = [int(_best[3])]
Epochs = int(_best[4])

In [26]:
## Create the results dictionary: Bert -> output -> learning rate -> batch size -> epoch -> metric -> []
## (every metric, plus 'loss', starts with its own empty list)
ResultsTaskBestParameters = {
    bert_name: {
        output_kind: {
            rate: {
                batch: {
                    epoch: {metric_name: [] for metric_name in Metrics + ['loss']}
                    for epoch in range(1, Epochs + 1)
                }
                for batch in BatchSize
            }
            for rate in LearningRate
        }
        for output_kind in OutputBert
    }
    for bert_name in BertVersion.keys()
}

## Create the file that stores the results of the best-parameter run
#### File name
FileResultsBestModel = FileResults + 'BestModel'
#### Save the empty results structure
with open(f"{Path}{FileResultsBestModel}.pkl", 'wb') as f:
  pickle.dump(ResultsTaskBestParameters, f)

In [27]:
## Train with the best parameters

## Best parameters (Epochs was already set to the best epoch count when the
## best parameters were extracted, so it needs no reassignment here)
BertV = BestResultParameters[0][0]
BertPath = BertVersion[BertV]
OutputB = OutputBert[0]
lr = LearningRate[0]
Batch = BatchSize[0]

### Loading Bert trained weights
mx = BERTBaseUncased(bert_path=BertPath, output_bert=OutputB, NumberOfClasses=df_train['Label'].nunique())

## Train on the complete training set
X_train = df_train['Data']
y_train = df_train['Label']
## NOTE(review): the evaluation split below is a 33% sample of the SAME rows used
## for training, so the metrics logged during this run are computed on seen data
## and are optimistic; rely on the held-out test sets for real performance.
_, X_test, _, y_test = train_test_split(df_train['Data'], df_train['Label'], test_size=0.33, random_state=42)

print(f'parameters: Bertmodel: {BertV}, Output: {OutputB}, lr: {lr}, Batch: {Batch}, Totsl Num. Epochs: {Epochs}')
MoDeL = TrainModel(PathSaveFiles = Path,
                  BertVersion=BertV,
                  BertPath=BertPath,
                  OutputBert=OutputB,
                  LearningRate=lr,
                  BatchSize=Batch,
                  Epochs=Epochs,
                  FileName= FileResultsBestModel,
                  X_train=X_train, 
                  X_valid=X_test,
                  y_train=y_train,
                  y_valid=y_test,
                  SaveModel=True)


def _mp_fn(rank, flags):
  """Entry point executed by each process started by xmp.spawn.

  Args:
    rank: Process index supplied by xmp.spawn (unused here).
    flags: The FLAGS dict forwarded through ``args`` (unused here).
  """
  torch.set_default_tensor_type('torch.FloatTensor')
  MoDeL._run()

FLAGS={}
## Fork 8 worker processes, each running _mp_fn
xmp.spawn(_mp_fn, args=(FLAGS,), nprocs=8, start_method='fork')

parameters: Bertmodel: EnglishBert, Output: hidden, lr: 3e-05, Batch: 32, Totsl Num. Epochs: 8
num_train_steps = 40, world_size=8
Epoch: 1 of 8
bi=0, loss=1.6103203296661377
Accuracy = 0.46296296296296297
Epoch: 2 of 8
bi=0, loss=1.5058629512786865
Accuracy = 0.587962962962963
Epoch: 3 of 8
bi=0, loss=1.281524658203125
Accuracy = 0.6342592592592593
Epoch: 4 of 8
bi=0, loss=1.0643668174743652
Accuracy = 0.7361111111111112
Epoch: 5 of 8
bi=0, loss=0.748503565788269
Accuracy = 0.787037037037037
Epoch: 6 of 8
bi=0, loss=0.5491239428520203
Accuracy = 0.8518518518518519
Epoch: 7 of 8
bi=0, loss=0.3385702669620514
Accuracy = 0.8865740740740741
Epoch: 8 of 8
bi=0, loss=0.3341461420059204
Accuracy = 0.8888888888888888


In [28]:
## Replace every per-metric value list in the best-model results pickle by its mean
## and save the averaged structure (to the working directory and to Path)
AverageResultsTaskBestModel = AveragResults(FileName=FileResultsBestModel, Path=Path)

In [29]:
## Flatten the averaged best-model results into a DataFrame (one row per epoch)
DfResultsTaskBestModel = create_Data_Frame(all_resultas=AverageResultsTaskBestModel)

### Persist the averaged best-model results as CSV
DfResultsTaskBestModel.to_csv(f"{Path}Average{FileResultsBestModel}_CSV_.csv")

### Display the averaged results
DfResultsTaskBestModel

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,accuracy,f1_macro,f1_weighted,recall,precision,loss
EnglishBert,hidden,3e-05,32,1,0.462963,0.3895,0.430038,0.433554,0.404798,1.691646
EnglishBert,hidden,3e-05,32,2,0.587963,0.529285,0.556457,0.551311,0.616015,1.520659
EnglishBert,hidden,3e-05,32,3,0.634259,0.583519,0.621047,0.586162,0.664011,1.290446
EnglishBert,hidden,3e-05,32,4,0.736111,0.709751,0.72915,0.712611,0.7326,1.085929
EnglishBert,hidden,3e-05,32,5,0.787037,0.758111,0.780778,0.756706,0.787446,0.837148
EnglishBert,hidden,3e-05,32,6,0.851852,0.834898,0.849724,0.828096,0.864771,0.675964
EnglishBert,hidden,3e-05,32,7,0.886574,0.869205,0.883381,0.86229,0.896688,0.597493
EnglishBert,hidden,3e-05,32,8,0.888889,0.872953,0.886153,0.865902,0.899537,0.504556


# Test

In [30]:
class BERTDatasetTest:
    """Label-free dataset that turns raw texts into fixed-length BERT inputs.

    Args:
        comment_text: Indexable collection of texts (items are passed through ``str``).
        tokenizer: Object exposing ``encode_plus`` that returns ``input_ids``,
            ``token_type_ids`` and ``attention_mask``.
        max_length: Length every encoded sequence is truncated / padded to.
    """

    def __init__(self, comment_text, tokenizer, max_length):
        self.comment_text = comment_text
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.comment_text)

    def __getitem__(self, item):
        """Encode one text and return its ``ids``, ``mask`` and ``token_type_ids`` tensors."""
        # Collapse any run of whitespace into a single space before tokenising.
        text = " ".join(str(self.comment_text[item]).split())

        encoded = self.tokenizer.encode_plus(
            text,
            None,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_length,
        )
        ids = encoded["input_ids"]
        token_type_ids = encoded["token_type_ids"]
        mask = encoded["attention_mask"]

        # Right-pad all three sequences with zeros up to max_length
        # (an empty filler results when the sequence is already long enough).
        filler = [0] * (self.max_length - len(ids))
        padded = {
            'ids': ids + filler,
            'mask': mask + filler,
            'token_type_ids': token_type_ids + filler,
        }
        return {key: torch.tensor(values, dtype=torch.long) for key, values in padded.items()}

In [31]:
## BERT tokenizer loaded from the selected model path (BertPath);
## do_lower_case=True makes the tokenizer lower-case its input
tokenizer = transformers.BertTokenizer.from_pretrained(BertPath, do_lower_case=True)

In [32]:
## Rebuild the classifier with the best configuration and restore the saved weights
FileBestModel = Path + FileResultsBestModel + '.bin'
device = torch.device("xla")
model = BERTBaseUncased(
    bert_path=BertPath,
    output_bert=OutputB,
    NumberOfClasses=df_train['Label'].nunique(),
)
model = model.to(device)
model.load_state_dict(torch.load(FileBestModel))
## Switch to evaluation mode; the module returned by eval() is displayed as the cell output
model.eval()

BERTBaseUncased(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

### Test Whole Data

In [33]:
## Prepare the whole test set: encode the 'Data' column to 110 tokens per text
valid_dataset = BERTDatasetTest(comment_text=df_test_whole['Data'].values, tokenizer=tokenizer, max_length=110)

## Batch it in the original row order (no shuffling, keep the last partial batch)
valid_data_loader = torch.utils.data.DataLoader(
    dataset=valid_dataset,
    batch_size=Batch,
    shuffle=False,
    num_workers=4,
    drop_last=False,
)

In [34]:
## Run the model over every batch and collect its raw outputs (one list of scores per text)
model.eval()  # idempotent; keeps the cell correct even if it is run out of order
fin_outputs = []
with torch.no_grad():
    # Iterate the loader directly (the batch index was unused) so tqdm can read
    # its length and display a total / ETA instead of a bare counter.
    for d in tqdm(valid_data_loader):
        ids = d["ids"].to(device, dtype=torch.long)
        mask = d["mask"].to(device, dtype=torch.long)
        token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)

        outputs = model(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )

        # Move the batch outputs to the host and store them as plain Python lists.
        outputs_np = outputs.detach().cpu().numpy().tolist()
        fin_outputs.extend(outputs_np)

22it [00:09,  2.30it/s]


In [35]:
## Create a DataFrame from the list of model outputs (one row per text, one column per class)
df_results = pd.DataFrame.from_records(fin_outputs)

## Model inference: the column label holding the highest score in each row
df_results['Inference'] = df_results.idxmax(axis=1)

## Visualize results
df_results.head()

Unnamed: 0,0,1,2,3,4,Inference
0,4.242957,-1.845282,-2.344079,-0.403579,-0.388582,0
1,1.690034,0.508515,-1.710589,-0.313468,-1.470104,0
2,3.565148,-0.473462,-3.20926,0.311266,-2.021156,0
3,4.519514,-1.710716,-2.72342,-0.669214,-0.483582,0
4,4.6423,-1.382578,-2.450377,-0.687019,-1.01296,0


In [36]:
## Ids and true labels of the test rows. The predictions are in loader (row) order
## with a 0..n-1 index, so reset the index here to make the join align by position
## rather than by whatever index labels df_test_whole carries.
df_idex = df_test_whole.loc[:,["id", "Label"]].reset_index(drop=True)

## Attach id and Label to the results dataframe
df_results = df_results.join(df_idex)

### Save results to a CSV file
df_save_results = df_results.copy()
if df_train['Label'].nunique() > 2:
  # Multiclass: the saved file uses 1-based class ids (columns, labels and inferences)
  df_save_results = df_save_results.rename({0:1, 1:2, 2:3, 3:4, 4:5}, axis='columns')
  df_save_results['Label'] = df_save_results['Label'] + 1
  df_save_results['Inference'] = df_save_results['Inference'] + 1

df_save_results.to_csv(Path + 'ModelInfereneces_' + FileResultsBestModel + '_WholeSetTest' +'_CSV_' + '.csv')

## Visualize results
df_results.head()

Unnamed: 0,0,1,2,3,4,Inference,id,Label
0,4.242957,-1.845282,-2.344079,-0.403579,-0.388582,0,5920,0
1,1.690034,0.508515,-1.710589,-0.313468,-1.470104,0,5605,0
2,3.565148,-0.473462,-3.20926,0.311266,-2.021156,0,6875,0
3,4.519514,-1.710716,-2.72342,-0.669214,-0.483582,0,6806,0
4,4.6423,-1.382578,-2.450377,-0.687019,-1.01296,0,6074,0


In [37]:
## Calculation of performance metrics.
## Columns are selected by name instead of by position so the cell keeps working
## if columns are ever added to or reordered in df_results.
Target = df_results['Label'].tolist()
Output = df_results['Inference'].tolist()

## Macro averaging for more than two labels, binary averaging otherwise
average_metrics = 'macro' if df_train['Label'].nunique() > 2 else 'binary'
print(f'Accuracy : {metrics.accuracy_score(Target, Output)}')
print(f'Recall : {metrics.recall_score(Target, Output, average = average_metrics)}')
print(f'Precision : {metrics.precision_score(Target, Output, average = average_metrics)}')
print(f'f1-score : {metrics.f1_score(Target, Output, average= average_metrics)}')

Accuracy : 0.6124260355029586
Recall : 0.5971838757810236
Precision : 0.6068191925799739
f1-score : 0.595108227762001


In [38]:
## Calculation of performance metrics.
## NOTE(review): this cell repeats the previous metrics cell verbatim and can be removed.
## Columns are selected by name instead of by position so the cell keeps working
## if columns are ever added to or reordered in df_results.
Target = df_results['Label'].tolist()
Output = df_results['Inference'].tolist()

## Macro averaging for more than two labels, binary averaging otherwise
average_metrics = 'macro' if df_train['Label'].nunique() > 2 else 'binary'
print(f'Accuracy : {metrics.accuracy_score(Target, Output)}')
print(f'Recall : {metrics.recall_score(Target, Output, average = average_metrics)}')
print(f'Precision : {metrics.precision_score(Target, Output, average = average_metrics)}')
print(f'f1-score : {metrics.f1_score(Target, Output, average= average_metrics)}')

Accuracy : 0.6124260355029586
Recall : 0.5971838757810236
Precision : 0.6068191925799739
f1-score : 0.595108227762001


### Test Only English

In [39]:
## Prepare the English test set: encode the 'Data' column to 110 tokens per text
valid_dataset = BERTDatasetTest(comment_text=df_test_en['Data'].values, tokenizer=tokenizer, max_length=110)

## Batch it in the original row order (no shuffling, keep the last partial batch)
valid_data_loader = torch.utils.data.DataLoader(
    dataset=valid_dataset,
    batch_size=Batch,
    shuffle=False,
    num_workers=4,
    drop_last=False,
)

In [40]:
## Run the model over every batch and collect its raw outputs (one list of scores per text)
model.eval()  # idempotent; keeps the cell correct even if it is run out of order
fin_outputs = []
with torch.no_grad():
    # Iterate the loader directly (the batch index was unused) so tqdm can read
    # its length and display a total / ETA instead of a bare counter.
    for d in tqdm(valid_data_loader):
        ids = d["ids"].to(device, dtype=torch.long)
        mask = d["mask"].to(device, dtype=torch.long)
        token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)

        outputs = model(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )

        # Move the batch outputs to the host and store them as plain Python lists.
        outputs_np = outputs.detach().cpu().numpy().tolist()
        fin_outputs.extend(outputs_np)

11it [00:05,  2.14it/s]


In [41]:
## Create a DataFrame from the list of model outputs (one row per text, one column per class)
df_results = pd.DataFrame.from_records(fin_outputs)

## Model inference: the column label holding the highest score in each row
df_results['Inference'] = df_results.idxmax(axis=1)

## Visualize results
df_results.head()

Unnamed: 0,0,1,2,3,4,Inference
0,3.320736,-2.297317,-1.866211,-0.956359,0.927677,0
1,-0.249876,0.980628,-2.732001,2.152179,-1.502851,3
2,2.783257,-0.065578,-2.405887,0.219107,-2.042831,0
3,3.146954,-1.770834,-2.235059,2.011472,-1.293255,0
4,3.230182,-2.169609,-2.291314,0.601574,0.04781,0


In [42]:
## Ids and true labels of the test rows. The predictions are in loader (row) order
## with a 0..n-1 index, so reset the index here to make the join align by position
## rather than by whatever index labels df_test_en carries.
df_idex = df_test_en.loc[:,["id", "Label"]].reset_index(drop=True)

## Attach id and Label to the results dataframe
df_results = df_results.join(df_idex)

### Save results to a CSV file
df_save_results = df_results.copy()
if df_train['Label'].nunique() > 2:
  # Multiclass: the saved file uses 1-based class ids (columns, labels and inferences)
  df_save_results = df_save_results.rename({0:1, 1:2, 2:3, 3:4, 4:5}, axis='columns')
  df_save_results['Label'] = df_save_results['Label'] + 1
  df_save_results['Inference'] = df_save_results['Inference'] + 1

df_save_results.to_csv(Path + 'ModelInfereneces_' + FileResultsBestModel + '_EnglishSetTest' +'_CSV_' + '.csv')

## Visualize results
df_results.head()

Unnamed: 0,0,1,2,3,4,Inference,id,Label
0,3.320736,-2.297317,-1.866211,-0.956359,0.927677,0,1166,0
1,-0.249876,0.980628,-2.732001,2.152179,-1.502851,3,1769,0
2,2.783257,-0.065578,-2.405887,0.219107,-2.042831,0,2704,0
3,3.146954,-1.770834,-2.235059,2.011472,-1.293255,0,3083,0
4,3.230182,-2.169609,-2.291314,0.601574,0.04781,0,3384,0


In [43]:
## Calculation of performance metrics.
## Columns are selected by name instead of by position so the cell keeps working
## if columns are ever added to or reordered in df_results.
Target = df_results['Label'].tolist()
Output = df_results['Inference'].tolist()

## Macro averaging for more than two labels, binary averaging otherwise
average_metrics = 'macro' if df_train['Label'].nunique() > 2 else 'binary'
print(f'Accuracy : {metrics.accuracy_score(Target, Output)}')
print(f'Recall : {metrics.recall_score(Target, Output, average = average_metrics)}')
print(f'Precision : {metrics.precision_score(Target, Output, average = average_metrics)}')
print(f'f1-score : {metrics.f1_score(Target, Output, average= average_metrics)}')

Accuracy : 0.636085626911315
Recall : 0.6184256556964771
Precision : 0.6237024379129643
f1-score : 0.6187254276483163


### Test Only Spanish

In [44]:
## Prepare the Spanish test set: encode the 'Data' column to 110 tokens per text
valid_dataset = BERTDatasetTest(comment_text=df_test_es['Data'].values, tokenizer=tokenizer, max_length=110)

## Batch it in the original row order (no shuffling, keep the last partial batch)
valid_data_loader = torch.utils.data.DataLoader(
    dataset=valid_dataset,
    batch_size=Batch,
    shuffle=False,
    num_workers=4,
    drop_last=False,
)

In [45]:
## Run the model over every batch and collect its raw outputs (one list of scores per text)
model.eval()  # idempotent; keeps the cell correct even if it is run out of order
fin_outputs = []
with torch.no_grad():
    # Iterate the loader directly (the batch index was unused) so tqdm can read
    # its length and display a total / ETA instead of a bare counter.
    for d in tqdm(valid_data_loader):
        ids = d["ids"].to(device, dtype=torch.long)
        mask = d["mask"].to(device, dtype=torch.long)
        token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)

        outputs = model(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )

        # Move the batch outputs to the host and store them as plain Python lists.
        outputs_np = outputs.detach().cpu().numpy().tolist()
        fin_outputs.extend(outputs_np)

11it [00:05,  2.13it/s]


In [46]:
## Create a DataFrame from the list of model outputs (one row per text, one column per class)
df_results = pd.DataFrame.from_records(fin_outputs)

## Model inference: the column label holding the highest score in each row
df_results['Inference'] = df_results.idxmax(axis=1)

## Visualize results
df_results.head()

Unnamed: 0,0,1,2,3,4,Inference
0,4.242957,-1.845282,-2.344079,-0.403579,-0.388582,0
1,1.690034,0.508515,-1.710589,-0.313468,-1.470104,0
2,3.565148,-0.473462,-3.20926,0.311266,-2.021156,0
3,4.519514,-1.710716,-2.72342,-0.669214,-0.483582,0
4,4.6423,-1.382578,-2.450377,-0.687019,-1.01296,0


In [47]:
## Ids and true labels of the test rows. The predictions are in loader (row) order
## with a 0..n-1 index, so reset the index here to make the join align by position
## rather than by whatever index labels df_test_es carries.
df_idex = df_test_es.loc[:,["id", "Label"]].reset_index(drop=True)

## Attach id and Label to the results dataframe
df_results = df_results.join(df_idex)

### Save results to a CSV file
df_save_results = df_results.copy()
if df_train['Label'].nunique() > 2:
  # Multiclass: the saved file uses 1-based class ids (columns, labels and inferences)
  df_save_results = df_save_results.rename({0:1, 1:2, 2:3, 3:4, 4:5}, axis='columns')
  df_save_results['Label'] = df_save_results['Label'] + 1
  df_save_results['Inference'] = df_save_results['Inference'] + 1

df_save_results.to_csv(Path + 'ModelInfereneces_' + FileResultsBestModel + '_SpanishSetTest' + '_CSV_' + '.csv')

## Visualize results
df_results.head()

Unnamed: 0,0,1,2,3,4,Inference,id,Label
0,4.242957,-1.845282,-2.344079,-0.403579,-0.388582,0,5920,0
1,1.690034,0.508515,-1.710589,-0.313468,-1.470104,0,5605,0
2,3.565148,-0.473462,-3.20926,0.311266,-2.021156,0,6875,0
3,4.519514,-1.710716,-2.72342,-0.669214,-0.483582,0,6806,0
4,4.6423,-1.382578,-2.450377,-0.687019,-1.01296,0,6074,0


In [48]:
## Calculation of performance metrics.
## Columns are selected by name instead of by position so the cell keeps working
## if columns are ever added to or reordered in df_results.
Target = df_results['Label'].tolist()
Output = df_results['Inference'].tolist()

## Macro averaging for more than two labels, binary averaging otherwise
average_metrics = 'macro' if df_train['Label'].nunique() > 2 else 'binary'
print(f'Accuracy : {metrics.accuracy_score(Target, Output)}')
print(f'Recall : {metrics.recall_score(Target, Output, average = average_metrics)}')
print(f'Precision : {metrics.precision_score(Target, Output, average = average_metrics)}')
print(f'f1-score : {metrics.f1_score(Target, Output, average= average_metrics)}')

Accuracy : 0.5873925501432665
Recall : 0.5859976687304135
Precision : 0.5960084091292395
f1-score : 0.5701836391031962


## Inference

### Load data

In [51]:
# Load the unlabeled data for inference

#### Location and base name of the dataset file
PathDataSet = '../content/drive/MyDrive/Code/EXITS/Data/'
FileDataset = 'EXIST2021_translatedTest'

#### Read the CSV (first column is the row index) and rename its columns with
#### the NewColumnsNames mapping so they match the names used for training
df_RealData = (
    pd.read_csv(f'{PathDataSet}{FileDataset}.csv', index_col=0)
    .rename(columns=NewColumnsNames)
)

#### Visualize data
df_RealData.head()

Unnamed: 0,test_case,id,source,language,text,Data,Spanish
0,EXIST2021,6978,gab,en,Pennsylvania State Rep horrifies with opening ...,Pennsylvania State Rep horrifies with opening ...,La representante del estado de Pensilvania se ...
1,EXIST2021,6979,twitter,en,"@iilovegrapes He sounds like as ass, and very ...","@iilovegrapes He sounds like as ass, and very ...","@iilovegrapes Suena como un idiota, y muy cond..."
2,EXIST2021,6980,twitter,en,"@averyangryskel1 @4ARealistParty LOL! ""This be...","@averyangryskel1 @4ARealistParty LOL! ""This be...","@ averyangryskel1 @ 4ARealistParty ¡LOL! ""¡Est..."
3,EXIST2021,6981,twitter,en,@WanderOrange @stalliontwink Rights?I mean yea...,@WanderOrange @stalliontwink Rights?I mean yea...,@WanderOrange @stalliontwink ¿Derechos? Quiero...
4,EXIST2021,6982,twitter,en,the jack manifold appreciation i’m seeing is o...,the jack manifold appreciation i’m seeing is o...,la apreciación de jack manifold que estoy vien...


In [52]:
## Prepare the unlabeled data: encode the 'Data' column to 110 tokens per text
valid_dataset = BERTDatasetTest(comment_text=df_RealData['Data'].values, tokenizer=tokenizer, max_length=110)

## Batch it in the original row order (no shuffling, keep the last partial batch)
valid_data_loader = torch.utils.data.DataLoader(
    dataset=valid_dataset,
    batch_size=Batch,
    shuffle=False,
    num_workers=4,
    drop_last=False,
)

In [53]:
## Run the model over every batch and collect its raw outputs (one list of scores per text)
model.eval()  # idempotent; keeps the cell correct even if it is run out of order
fin_outputs = []
with torch.no_grad():
    # Iterate the loader directly (the batch index was unused) so tqdm can read
    # its length and display a total / ETA instead of a bare counter.
    for d in tqdm(valid_data_loader):
        ids = d["ids"].to(device, dtype=torch.long)
        mask = d["mask"].to(device, dtype=torch.long)
        token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)

        outputs = model(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )

        # Move the batch outputs to the host and store them as plain Python lists.
        outputs_np = outputs.detach().cpu().numpy().tolist()
        fin_outputs.extend(outputs_np)

137it [00:14,  9.57it/s]


In [54]:
## Create a DataFrame from the list of model outputs (one row per text, one column per class)
df_results = pd.DataFrame.from_records(fin_outputs)

## Multiclass task: relabel the class columns from 0..4 to 1..5 BEFORE taking the
## arg-max, so the 'Inference' column is expressed in 1-based class ids
if df_train['Label'].nunique() > 2:
  df_results = df_results.rename({0:1, 1:2, 2:3, 3:4, 4:5}, axis='columns')

## Model inference: the column label holding the highest score in each row
df_results['Inference'] = df_results.idxmax(axis=1)

## Visualize results
df_results.head()

Unnamed: 0,1,2,3,4,5,Inference
0,2.523081,-2.385136,-1.700898,0.646595,-0.094977,1
1,-2.446868,1.13502,-0.047037,-0.080157,1.925244,5
2,4.672415,-1.370831,-2.945409,-0.263661,-1.112644,1
3,4.298006,-1.787846,-2.777997,-0.487159,-0.616419,1
4,-2.309859,2.511712,0.087705,-0.047756,-0.913645,2


In [55]:
## Ids of the inference rows. The predictions are in loader (row) order with a
## 0..n-1 index, so reset the index here to make the join align by position
## rather than by whatever index labels df_RealData carries.
df_idex = df_RealData.loc[:,["id"]].reset_index(drop=True)

## Attach the ids to the results dataframe
df_results = df_results.join(df_idex)

### Save results to a CSV file
### NOTE(review): unlike the test-set exports, this file name has no '_' after
### 'ModelInfereneces' - confirm whether that is intended before changing it.
df_results.to_csv(Path + 'ModelInfereneces' + FileResultsBestModel + '_RealData' + '_CSV_' + '.csv')

## Visualize results
df_results.head()

Unnamed: 0,1,2,3,4,5,Inference,id
0,2.523081,-2.385136,-1.700898,0.646595,-0.094977,1,6978
1,-2.446868,1.13502,-0.047037,-0.080157,1.925244,5,6979
2,4.672415,-1.370831,-2.945409,-0.263661,-1.112644,1,6980
3,4.298006,-1.787846,-2.777997,-0.487159,-0.616419,1,6981
4,-2.309859,2.511712,0.087705,-0.047756,-0.913645,2,6982


# Task2 - Multiclass

# Utilities for when the process stops suddenly

In [None]:
# Path = 'drive/MyDrive/Code/EXITS/Machine-Learning-Tweets-Classification/Bert/Results/EnglishBert_enDataTrain/'
# File = 'EnglishBertTask2enDataTrain_Results'

In [None]:
# import pickle
# with open('drive/MyDrive/Code/EXITS/Machine-Learning-Tweets-Classification/Bert/Results/EnglishBert_enDataTrain/EnglishBertTask2enDataTrain_Results' + ".pkl", "rb") as f:
#   Re = pickle.load(f)
# Re

{'EnglishBert': {'hidden': {2e-05: {32: {1: {'accuracy': [0.3014705882352941,
       0.2941176470588235,
       0.338235294117647,
       0.3382352941176471,
       0.30147058823529416,
       0.3308823529411764,
       0.3161764705882353,
       0.34558823529411764,
       0.32352941176470584,
       0.34558823529411764],
      'f1_macro': [0.16622641247641246,
       0.15167320596500164,
       0.19431930986110552,
       0.1948361768950004,
       0.16115070150867367,
       0.20351966388731096,
       0.20068038579067993,
       0.19212546134421135,
       0.18979963019436702,
       0.19223942723942722],
      'f1_weighted': [0.19612923840865015,
       0.1840706213998875,
       0.2389137229148156,
       0.2447824138568083,
       0.1852482724255853,
       0.24596365044029403,
       0.23189558423122436,
       0.23213351191292364,
       0.23607647512446273,
       0.22720575992634817],
      'loss': [1.6918650269508362,
       1.6352583318948746,
       1.608773112297058,
   

In [None]:
# DfResultsTask = pd.read_csv(Path + 'AverageSpanishBertTask1Results_CSV_.csv', index_col=[0,1], skipinitialspace=True)

In [None]:
# def CleanBrokeTrain(FileName, Path, NumberOfFoldes=10):
#   with open(Path + FileName + ".pkl", "rb") as f:
#               Results = pickle.load(f)

#   for BT, ModelBertType,  in Results.items():
#     for OP, OutPut in ModelBertType.items():
#       for LR, LearningRate in OutPut.items():
#         for BS, BatchSize in LearningRate.items():
#           for EP, Epoch in BatchSize.items():
#             for Metrics, ValuesCrossValidation in  Epoch.items():
 
#               if len(ValuesCrossValidation) != 0 and not len(ValuesCrossValidation) == NumberOfFoldes:
#                 Results[BT][OP][LR][BS][EP][Metrics] = []
            
#   with open(FileName + '.pkl','wb') as f:
#     pickle.dump(Results, f)

#   with open(Path + FileName + '.pkl','wb') as f:
#     pickle.dump(Results, f)

In [None]:
# CleanBrokeTrain(FileName=FileResults, Path=Path, NumberOfFoldes=10)

In [None]:
# LengPhrase = df_train['text'].str.split().str.len().tolist()
# LengPhrase.sort()
# LengPhrase[-13:]

In [None]:
# LengPhrase = df_RealData['text'].str.split().str.len().tolist()
# LengPhrase.sort()

# LengPhrase[-50:]

# Alternative code for inference

In [None]:
# class BERTDatasetTest:
#     def __init__(self, comment_text, targets, tokenizer, max_length):
#         self.comment_text = comment_text
#         self.tokenizer = tokenizer
#         self.max_length = max_length
#         self.targets = targets

#     def __len__(self):
#         return len(self.comment_text)

#     def __getitem__(self, item):
#         comment_text = str(self.comment_text[item])
#         comment_text = " ".join(comment_text.split())

#         inputs = self.tokenizer.encode_plus(
#             comment_text,
#             None,
#             truncation=True,
#             add_special_tokens=True,
#             max_length=self.max_length,
#         )
#         ids = inputs["input_ids"]
#         token_type_ids = inputs["token_type_ids"]
#         mask = inputs["attention_mask"]
        
#         padding_length = self.max_length - len(ids)
        
#         ids = ids + ([0] * padding_length)
#         mask = mask + ([0] * padding_length)
#         token_type_ids = token_type_ids + ([0] * padding_length)
        
#         return {
#             'ids': torch.tensor(ids, dtype=torch.long),
#             'mask': torch.tensor(mask, dtype=torch.long),
#             'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
#             'targets': torch.tensor(self.targets[item], dtype=torch.float)
#         }

In [None]:
# ## Bert tozenizer
# tokenizer = transformers.BertTokenizer.from_pretrained(BertPath, do_lower_case=True)

In [None]:
# ## Loading the best model
# device = torch.device("xla")
# model = BERTBaseUncased(bert_path=BertPath, output_bert=OutputB, NumberOfClasses=df_train['Label'].nunique()).to(device)
# FileBestModel = Path + FileResultsBestModel + '.bin'
# model.load_state_dict(torch.load(FileBestModel))
# model.eval()

BERTBaseUncased(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

### Test Whole Data

In [None]:
# ## Prepresing the data
# valid_dataset = BERTDatasetTest(
#         comment_text=df_test_whole['Data'].values,
#         targets=df_test_whole['Label'].values,
#         tokenizer=tokenizer,
#         max_length=110
# )

# valid_data_loader = torch.utils.data.DataLoader(
#     valid_dataset,
#     batch_size=Batch,
#     drop_last=False,
#     num_workers=4,
#     shuffle=False
# )

In [None]:
# with torch.no_grad():
#           model.eval()
#           fin_targets = []
#           fin_outputs = []
#           for bi, d in tqdm(enumerate(valid_data_loader)):
#               ids = d["ids"]
#               mask = d["mask"]
#               token_type_ids = d["token_type_ids"]
#               targets = d["targets"]

#               ids = ids.to(device, dtype=torch.long)
#               mask = mask.to(device, dtype=torch.long)
#               token_type_ids = token_type_ids.to(device, dtype=torch.long)
#               targets = targets.to(device, dtype=torch.float)

#               outputs = model(
#                   ids=ids,
#                   mask=mask,
#                   token_type_ids=token_type_ids
#               )

#               targets_np = targets.cpu().detach().numpy().tolist()
#               outputs = torch.argmax(outputs, dim=1)
#               outputs_np = outputs.detach().cpu().numpy().tolist()

#               fin_targets.extend(targets_np)
#               fin_outputs.extend(outputs_np)    

11it [00:01,  7.08it/s]


In [None]:
# ## caculation of performace metric
# Target = fin_targets
# Output = fin_outputs

# average_metrics = 'macro' if df_train['Label'].nunique() > 2 else 'binary'
# print(f'Accuracy : {metrics.accuracy_score(Target, Output)}')
# print(f'Recall : {metrics.recall_score(Target, Output, average = average_metrics)}')
# print(f'Precision : {metrics.precision_score(Target, Output, average = average_metrics)}')
# print(f'f1-score : {metrics.f1_score(Target, Output, average= average_metrics)}')

Accuracy : 0.5591715976331361
Recall : 0.5536807360198497
Precision : 0.43499226525034806
f1-score : 0.48487293344990956


In [None]:
# import collections

# t=collections.Counter(Target)
# print(t)
# o=collections.Counter(Output)
# print(o)

Counter({0.0: 173, 3.0: 162, 4.0: 137, 2.0: 104, 1.0: 100})
Counter({0: 299, 2: 201, 3: 131, 4: 40, 1: 5})
