In [1]:
import pandas as pd
import numpy as np

import random 

import torch
import torch.nn as nn
import torch.nn.functional as F

import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import confusion_matrix

torch.manual_seed(666)

from sklearn.utils import shuffle
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report

from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
from transformers import AdamW

import warnings
warnings.filterwarnings('ignore')

print(torch.__version__)

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import train_test_split
# pip install seaborn
#import seaborn as sns
import matplotlib.pyplot as plt


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

from azureml.core.datastore import Datastore
from azureml.core.dataset import Dataset
from azureml.core import Workspace

1.8.1


In [2]:
# get the configs for the Azure Workspace
ws = Workspace.from_config()
# initialize location of the data (blob)
datastore_name = 'mlthesisdatablob'
dataset_name='data.cleaned.raw'
#load the data of english examples
datastore = Datastore.get(ws, datastore_name)
datapath = datastore.path('data_all_lang_downsampling.csv')
dataset = Dataset.auto_read_files(datapath)

d = dataset.register(workspace=ws, name=dataset_name, exist_ok=True, update_if_exist=True)

dataset = Dataset.get(ws,dataset_name)
# create a dataframe
df = dataset.to_pandas_dataframe()

print(len(df))

#Alternatively: Load data from csv in directory 
# import pandas as pd

# df = pd.read_csv('data_all_lang_downsampling.csv')

#df.head()
## Preprocessing 

# create a dataset for each language

df_en = df[df.lang_final=='en']
df_es = df[df.lang_final=='es']
df_it = df[df.lang_final=='it']
df_hi = df[df.lang_final=='hi']
df_pt = df[df.lang_final=='pt']
df_fr = df[df.lang_final=='fr']

# reset the indicies

df_en.reset_index(drop=True, inplace=True)
df_es.reset_index(drop=True, inplace=True)
df_it.reset_index(drop=True, inplace=True)
df_hi.reset_index(drop=True, inplace=True)
df_pt.reset_index(drop=True, inplace=True)
df_fr.reset_index(drop=True, inplace=True)

"Dataset.auto_read_files" is deprecated after version 1.0.69. Please use "Dataset.Tabular.from_delimited_files" instead. See Dataset API change notice at https://aka.ms/dataset-deprecation.
DatasetDefinition class is deprecated after version 1.0.69. See Dataset API change notice at https://aka.ms/dataset-deprecation.
The constructor of Dataset is deprecated after version 1.0.69. Please use factory methods from "Dataset.Tabular" and "Dataset.File" to create dataset instances. See Dataset API change notice at https://aka.ms/dataset-deprecation.
"Dataset.get" is deprecated after version 1.0.69. Please use "Dataset.get_by_name" and "Dataset.get_by_id" to retrieve dataset. See Dataset API change notice at https://aka.ms/dataset-deprecation.


7794


In [3]:
df.duplicated().any()

False

In [4]:
def train_val_test_split_df(df, test_size , val_size, random_state, stratify_column):
    # create a intermediary df and the test set
    val_s = val_size/(1-test_size)
    df_int, df_test = train_test_split(df,
                                       stratify=df[stratify_column],
                                       test_size=test_size,
                                       random_state=random_state)
    df_train, df_val = train_test_split(df_int,
                                       stratify=df_int[stratify_column],
                                       test_size=test_size,
                                       random_state=random_state)
    df_train = df_train.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)
    df_val = df_val.reset_index(drop=True)
    
    return df_train, df_val, df_test

In [5]:
# initialize tokenizer

MODEL_TYPE = 'xlm-roberta-base'

tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_TYPE)

#from transformers import XLMRobertaForSequenceClassification

model = XLMRobertaForSequenceClassification.from_pretrained(
    MODEL_TYPE, 
    num_labels = 2, # The number of output labels. 2 for binary classification.
)

# Send the model to the device.
model.to(device)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense

XLMRobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (La

In [6]:
"""The code for the following cell was created using inspiration from : 
https://www.kaggle.com/vbookshelf/basics-of-bert-and-xlm-roberta-pytorch"""
class CompDataset(Dataset):
    
    def __init__(self, df):
        self.df_data = df
        
    def __getitem__(self, index):
        
        text = self.df_data.loc[index, 'text_feat_clean']
        
        
        encoded_dict = tokenizer.encode_plus(text,
                                             add_special_tokens = True,
                                             max_length = 512,
                                             truncation=True,
                                             pad_to_max_length=True,
                                             return_attention_mask=True,
                                             return_tensors ='pt')
        
        # These are torch tensors already.
        padded_token_list = encoded_dict['input_ids'][0]
        att_mask = encoded_dict['attention_mask'][0]

        
        # Convert the target to a torch tensor
        target = torch.tensor(self.df_data.loc[index, 'target'])

        sample = (padded_token_list, att_mask, target)

        return sample
    
    def __len__(self):
        return len(self.df_data)


In [7]:
def train(train_dataloader, optimizer, NUM_EPOCHS, model_name):    

    """The code for the following cell was created using inspiration from : 
    https://www.kaggle.com/vbookshelf/basics-of-bert-and-xlm-roberta-pytorch"""

    # Set the seed.
    seed_val = 101

    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

    # Store the average loss after each epoch so we can plot them.
    loss_values = []


    # For each epoch...
    for epoch in range(0, NUM_EPOCHS):

        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch + 1, NUM_EPOCHS))


        stacked_val_labels = []
        targets_list = []

        # ========================================
        #               Training
        # ========================================

        print('Training...')

        # put the model into train mode
        model.train()

        # This turns gradient calculations on and off.
        torch.set_grad_enabled(True)


        # Reset the total loss for this epoch.
        total_train_loss = 0

        for i, batch in enumerate(train_dataloader):

            train_status = 'Batch ' + str(i) + ' of ' + str(len(train_dataloader))

            print(train_status, end='\r')


            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            model.zero_grad()        


            outputs = model(b_input_ids, 
                        attention_mask=b_input_mask,
                        labels=b_labels)

            # Get the loss from the outputs tuple: (loss, logits)
            loss = outputs[0]

            # Convert the loss from a torch tensor to a number.
            # Calculate the total loss.
            total_train_loss = total_train_loss + loss.item()

            # Zero the gradients
            optimizer.zero_grad()

            # Perform a backward pass to calculate the gradients.
            loss.backward()


            # Clip the norm of the gradients to 1.0.
            # This is to help prevent the "exploding gradients" problem.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            # Use the optimizer to update the weights.
            optimizer.step() 



        print('Train loss:' ,total_train_loss/len(train_dataloader))


        # ========================================
        #               Validation
        # ========================================

        print('\nValidation...')

        # Put the model in evaluation mode.
        model.eval()

        # Turn off the gradient calculations.
        # This tells the model not to compute or store gradients.
        # This step saves memory and speeds up validation.
        torch.set_grad_enabled(False)


        # Reset the total loss for this epoch.
        total_val_loss = 0


        for j, batch in enumerate(val_dataloader):

            val_status = 'Batch ' + str(j) + ' of ' + str(len(val_dataloader))

            print(val_status, end='\r')

            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)      


            outputs = model(b_input_ids, 
                    attention_mask=b_input_mask, 
                    labels=b_labels)

            # Get the loss from the outputs tuple: (loss, logits)
            loss = outputs[0]

            # Convert the loss from a torch tensor to a number.
            # Calculate the total loss.
            total_val_loss = total_val_loss + loss.item()


            # Get the preds
            preds = outputs[1]


            # Move preds to the CPU
            val_preds = preds.detach().cpu().numpy()

            # Move the labels to the cpu
            targets_np = b_labels.to('cpu').numpy()

            # Append the labels to a numpy list
            targets_list.extend(targets_np)

            if j == 0:  # first batch
                stacked_val_preds = val_preds

            else:
                stacked_val_preds = np.vstack((stacked_val_preds, val_preds))


        # Calculate the validation accuracy
        y_true = targets_list
        y_pred = np.argmax(stacked_val_preds, axis=1)

        val_acc = accuracy_score(y_true, y_pred)


        print('Val loss:' ,total_val_loss/len(val_dataloader))
        print('Val acc: ', val_acc)


        # Save the Model
        torch.save(model.state_dict(), model_name+'.pt')

    return model, loss_values


def evaluate(test_dataloader, model):    
    targets_list = []
    for j, batch in enumerate(test_dataloader):

            inference_status = 'Batch ' + str(j+1) + ' of ' + str(len(test_dataloader))

            print(inference_status, end='\r')

            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            outputs = model(b_input_ids, 
                    attention_mask=b_input_mask)


            # Get the preds
            preds = outputs[0]


            # Move preds to the CPU
            preds = preds.detach().cpu().numpy()

            # Move the labels to the cpu
            targets_np = b_labels.to('cpu').numpy()

            # Append the labels to a numpy list
            targets_list.extend(targets_np)

            # Stack the predictions.

            if j == 0:  # first batch
                stacked_preds = preds

            else:
                stacked_preds = np.vstack((stacked_preds, preds))
            
    y_true = targets_list
    y_pred = np.argmax(stacked_preds, axis=1)
    
    return y_true, y_pred
            

def x_language_eval(dataloader, model, fig_name):
    y_true, y_pred = evaluate(dataloader, model)
    print(classification_report(y_true, y_pred))
    print(confusion_matrix(y_true, y_pred))
    #figure = sns.heatmap(confusion_matrix(y_true, y_pred), annot=True, cmap='Blues')
    #fig = figure.get_figure()
    #fig.savefig(fig_name)
    
def ouput_creator(train_lang, model, d_dataloader):
    
    for i in d_dataloader:
        if i != train_lang:
            print('============='+train_lang +' - '+i+'=============')
            x_language_eval(d_dataloader[i], model, train_lang+'_'+i+'_b.png')
        else:
            print('============='+train_lang +' - '+i+'=============')
            x_language_eval(test_dataloader, model, i+'_'+i+'_b.png')


In [8]:
L_RATE = 2e-5

NUM_EPOCHS = 3
BATCH_SIZE = 8
NUM_CORES = 6

DF = df_fr

MODEL_NAME = 'model_fr_downsampling'

# Define the optimizer
optimizer = AdamW(model.parameters(),
              lr = L_RATE, 
              eps = 1e-8 
            )


df_train, df_val, df_test = train_val_test_split_df(DF, 0.1,0.2,565, 'target')

train_data = CompDataset(df_train)
val_data = CompDataset(df_val)
test_data = CompDataset(df_test)


train_dataloader = torch.utils.data.DataLoader(train_data,
                                        batch_size=BATCH_SIZE,
                                        sampler = RandomSampler(train_data),
                                       num_workers=NUM_CORES)

val_dataloader = torch.utils.data.DataLoader(val_data,
                                        batch_size=BATCH_SIZE,
                                        sampler = SequentialSampler(val_data),
                                       num_workers=NUM_CORES)

test_dataloader = torch.utils.data.DataLoader(test_data,
                                              batch_size=BATCH_SIZE,
                                              sampler=SequentialSampler(test_data), 
                                              num_workers=NUM_CORES)


english_dataloader = torch.utils.data.DataLoader(CompDataset(df_en),
                                        batch_size=BATCH_SIZE,
                                        shuffle=True,
                                       num_workers=NUM_CORES)



spanish_dataloader = torch.utils.data.DataLoader(CompDataset(df_es),
                                        batch_size=BATCH_SIZE,
                                        shuffle=True,
                                       num_workers=NUM_CORES)

italian_dataloader = torch.utils.data.DataLoader(CompDataset(df_it),
                                        batch_size=BATCH_SIZE,
                                        shuffle=True,
                                       num_workers=NUM_CORES)

hindi_dataloader = torch.utils.data.DataLoader(CompDataset(df_hi),
                                        batch_size=BATCH_SIZE,
                                        shuffle=True,
                                       num_workers=NUM_CORES)

portuguese_dataloader = torch.utils.data.DataLoader(CompDataset(df_pt),
                                        batch_size=BATCH_SIZE,
                                        shuffle=True,
                                       num_workers=NUM_CORES)

french_dataloader = torch.utils.data.DataLoader(CompDataset(df_fr),
                                        batch_size=BATCH_SIZE,
                                        shuffle=True,
                                       num_workers=NUM_CORES)

d_dataloader = {'en':english_dataloader,
                'es':spanish_dataloader,
                'it':italian_dataloader,
                'hi':hindi_dataloader,
                'pt':portuguese_dataloader, 
                'fr':french_dataloader}

In [9]:
model, loss_stats = train(train_dataloader, optimizer, NUM_EPOCHS, MODEL_NAME)    


Training...
Train loss: 0.6986815184354782

Validation...
Val loss: 0.6793171167373657
Val acc:  0.5333333333333333

Training...
Train loss: 0.5634130742400885

Validation...
Val loss: 0.4686014652252197
Val acc:  0.9333333333333333

Training...
Train loss: 0.2650660932995379

Validation...
Val loss: 0.06836574524641037
Val acc:  1.0


In [10]:
# model downsampling

ouput_creator('en', model,d_dataloader)

              precision    recall  f1-score   support

           0       0.88      0.99      0.93       330
           1       0.99      0.86      0.92       330

    accuracy                           0.93       660
   macro avg       0.93      0.93      0.93       660
weighted avg       0.93      0.93      0.93       660

[[326   4]
 [ 45 285]]
              precision    recall  f1-score   support

           0       0.83      0.92      0.87       168
           1       0.91      0.81      0.86       168

    accuracy                           0.87       336
   macro avg       0.87      0.87      0.87       336
weighted avg       0.87      0.87      0.87       336

[[155  13]
 [ 32 136]]
              precision    recall  f1-score   support

           0       0.79      0.90      0.84        42
           1       0.89      0.76      0.82        42

    accuracy                           0.83        84
   macro avg       0.84      0.83      0.83        84
weighted avg       0.84     

In [10]:
ouput_creator('es', model,d_dataloader)

              precision    recall  f1-score   support

           0       0.91      0.49      0.64      3298
           1       0.65      0.95      0.77      3298

    accuracy                           0.72      6596
   macro avg       0.78      0.72      0.70      6596
weighted avg       0.78      0.72      0.70      6596

[[1609 1689]
 [ 157 3141]]
              precision    recall  f1-score   support

           0       0.80      0.94      0.86        17
           1       0.93      0.76      0.84        17

    accuracy                           0.85        34
   macro avg       0.86      0.85      0.85        34
weighted avg       0.86      0.85      0.85        34

[[16  1]
 [ 4 13]]
              precision    recall  f1-score   support

           0       0.87      0.62      0.72        42
           1       0.70      0.90      0.79        42

    accuracy                           0.76        84
   macro avg       0.79      0.76      0.76        84
weighted avg       0.79     

In [10]:
ouput_creator('it', model,d_dataloader)

              precision    recall  f1-score   support

           0       0.50      1.00      0.67      3298
           1       0.88      0.02      0.03      3298

    accuracy                           0.51      6596
   macro avg       0.69      0.51      0.35      6596
weighted avg       0.69      0.51      0.35      6596

[[3291    7]
 [3247   51]]
              precision    recall  f1-score   support

           0       0.51      1.00      0.67       168
           1       1.00      0.03      0.06       168

    accuracy                           0.51       336
   macro avg       0.75      0.51      0.37       336
weighted avg       0.75      0.51      0.37       336

[[168   0]
 [163   5]]
              precision    recall  f1-score   support

           0       0.67      0.80      0.73         5
           1       0.67      0.50      0.57         4

    accuracy                           0.67         9
   macro avg       0.67      0.65      0.65         9
weighted avg       0.67 

In [10]:
ouput_creator('hi', model,d_dataloader)

              precision    recall  f1-score   support

           0       0.78      0.62      0.69      3298
           1       0.69      0.83      0.75      3298

    accuracy                           0.72      6596
   macro avg       0.73      0.72      0.72      6596
weighted avg       0.73      0.72      0.72      6596

[[2047 1251]
 [ 572 2726]]
              precision    recall  f1-score   support

           0       0.77      0.52      0.62       168
           1       0.64      0.84      0.72       168

    accuracy                           0.68       336
   macro avg       0.70      0.68      0.67       336
weighted avg       0.70      0.68      0.67       336

[[ 88  80]
 [ 27 141]]
              precision    recall  f1-score   support

           0       0.75      0.36      0.48        42
           1       0.58      0.88      0.70        42

    accuracy                           0.62        84
   macro avg       0.66      0.62      0.59        84
weighted avg       0.66 

In [10]:
ouput_creator('pt', model,d_dataloader)

              precision    recall  f1-score   support

           0       0.72      0.27      0.39      3298
           1       0.55      0.90      0.68      3298

    accuracy                           0.58      6596
   macro avg       0.64      0.58      0.54      6596
weighted avg       0.64      0.58      0.54      6596

[[ 884 2414]
 [ 337 2961]]
              precision    recall  f1-score   support

           0       0.77      0.44      0.56       168
           1       0.61      0.87      0.72       168

    accuracy                           0.65       336
   macro avg       0.69      0.65      0.64       336
weighted avg       0.69      0.65      0.64       336

[[ 74  94]
 [ 22 146]]
              precision    recall  f1-score   support

           0       0.58      0.62      0.60        42
           1       0.59      0.55      0.57        42

    accuracy                           0.58        84
   macro avg       0.58      0.58      0.58        84
weighted avg       0.58 

In [10]:
ouput_creator('fr', model,d_dataloader)

              precision    recall  f1-score   support

           0       0.76      0.32      0.45      3298
           1       0.57      0.90      0.70      3298

    accuracy                           0.61      6596
   macro avg       0.66      0.61      0.57      6596
weighted avg       0.66      0.61      0.57      6596

[[1068 2230]
 [ 346 2952]]
              precision    recall  f1-score   support

           0       0.79      0.59      0.68       168
           1       0.67      0.85      0.75       168

    accuracy                           0.72       336
   macro avg       0.73      0.72      0.71       336
weighted avg       0.73      0.72      0.71       336

[[ 99  69]
 [ 26 142]]
              precision    recall  f1-score   support

           0       0.95      0.43      0.59        42
           1       0.63      0.98      0.77        42

    accuracy                           0.70        84
   macro avg       0.79      0.70      0.68        84
weighted avg       0.79 