## LIAR multiclass pipeline

In [None]:
!pip3 install transformers
!pip3 install sentencepiece
!pip3 install wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### Importing packages

In [None]:
from google.colab import drive
import os
import torch
from torch.utils.data import Dataset, TensorDataset
import numpy as np
import random
import transformers
import pandas as pd
import time
import datetime
import torch
from transformers import (
    AutoTokenizer,
    BartForSequenceClassification,
    BartTokenizer,
    BertForSequenceClassification,
    BertTokenizer,
    BigBirdForSequenceClassification,
    BigBirdTokenizer,
    ConvBertForSequenceClassification,
    ConvBertTokenizer,
    CTRLForSequenceClassification,
    CTRLTokenizer,
    DebertaForSequenceClassification,
    DebertaTokenizer,
    DebertaV2ForSequenceClassification,
    DebertaV2Tokenizer,
    DistilBertForSequenceClassification,
    DistilBertTokenizer,
    ElectraForSequenceClassification,
    ElectraTokenizer,
    FNetForSequenceClassification,
    FNetTokenizer,
    FunnelForSequenceClassification,
    FunnelTokenizer,
    GPT2ForSequenceClassification,
    GPT2Tokenizer,
    LongformerForSequenceClassification,
    LongformerTokenizer,
    LukeForSequenceClassification,
    LukeTokenizer,
    MobileBertForSequenceClassification,
    MobileBertTokenizer,
    MPNetForSequenceClassification,
    MPNetTokenizer,
    OpenAIGPTForSequenceClassification,
    OpenAIGPTTokenizer,
    RobertaForSequenceClassification,
    RobertaTokenizer,
    SqueezeBertForSequenceClassification,
    SqueezeBertTokenizer,
    XLMForSequenceClassification,
    XLMRobertaForSequenceClassification,
    XLMRobertaTokenizer,
    XLMTokenizer,
    XLNetForSequenceClassification,
    XLNetTokenizer
)
import wandb
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import random_split
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset

In [None]:
# Mount Google Drive at /content/drive; the training cell reads the LIAR JSONL
# splits from a folder underneath this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is normally a debugging aid (kernel
# launches become synchronous so errors surface at the failing call, at the
# cost of speed) — confirm it is still wanted before running a full sweep.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [None]:
# Pick the compute device once and keep it in the module-level `device`
# variable, which the training cell reads when moving batches.
if not torch.cuda.is_available():
    # CPU fallback when no CUDA device is visible.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")

    # Report how many GPUs are visible and which one is at index 0.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: NVIDIA A100-SXM4-40GB


#### Initializing Weights and Biases for experiment tracking later on

In [None]:
# Authenticate with Weights & Biases; the sweep and the runs created in later
# cells are logged through this session.
wandb.login()

True

In [None]:
# Full hyperparameter grid (3 learning rates x 3 batch sizes x 3 epoch counts).
# Kept for reference: it is NOT the configuration passed to `wandb.sweep` below.
sweep_config = {
    'method': 'grid', #grid, random
    'metric': {
      'name': 'test_accuracy',
      'goal': 'maximize'
    },
    'parameters': {
        'learning_rate': {
            'values': [ 5e-5, 3e-5, 2e-5]
        },
        'batch_size': {
            'values': [8, 16, 32]
        },
        'epochs':{
            'values':[2, 3, 4]
        }
    }
}

# Single-point grid: one run with lr=5e-5, batch_size=32, epochs=2.
sweep_defaults = {
    'method': 'grid', #grid, random
    'metric': {
      'name': 'test_accuracy',
      'goal': 'maximize'
    },
    'parameters': {
        'learning_rate': {
            'values': [5e-5]
        },
        'batch_size': {
            'values': [32]
        },
        'epochs':{
            'values':[2]
        }
    }
}

# Create the sweep in the same project that `train_train` passes to
# `wandb.init`. Without an explicit project the sweep (and therefore its runs)
# is filed under "uncategorized".
sweep_id = wandb.sweep(sweep_defaults, project="liar-multiclass")

Create sweep with ID: awmdo9rh
Sweep URL: https://wandb.ai/meilina/uncategorized/sweeps/awmdo9rh


### Exploratory data analysis 

In [None]:
# TODO: exploratory data analysis of the LIAR splits has not been written yet.

### Dataset preparation

Run the following cells if you want to create your own dataset, and prepare your own train/test/val splits.

In [None]:
def create_dataset(sentences, tokenizer, max_length, labels):
    """Tokenize `sentences` and bundle them with `labels` in a TensorDataset.

    NOTE: a later cell of this notebook defines another `create_dataset`
    (one that also stores sentence ids). When both cells are run, that later
    definition replaces this one.

    Args:
        sentences: Iterable of raw text strings.
        tokenizer: Object exposing `encode_plus` (e.g. a Hugging Face tokenizer).
        max_length: Length every encoded sequence is padded or truncated to.
        labels: Sequence of labels aligned with `sentences`.

    Returns:
        TensorDataset whose columns are (input_ids, attention_masks, labels).
    """
    input_ids = []
    attention_masks = []
    for sent in sentences:
        encoded_dict = tokenizer.encode_plus(
            sent,
            add_special_tokens=True,
            max_length=max_length,
            # `pad_to_max_length` is deprecated. Ask for padding and truncation
            # explicitly so that every row has exactly `max_length` tokens and
            # `torch.cat` below never sees rows of different widths.
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    # Each encoding has shape (1, max_length); stack them along the batch axis.
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels)

    return TensorDataset(input_ids, attention_masks, labels)


In [None]:
# Families whose checkpoint name starts with the family id
# (e.g. 'bert-base-uncased' -> 'bert'). Used as the fallback lookup.
MODELS = {
    'bert': (BertForSequenceClassification, BertTokenizer),
    'ctrl': (CTRLForSequenceClassification, CTRLTokenizer),
    'distilbert': (DistilBertForSequenceClassification, DistilBertTokenizer),
    'gpt2': (GPT2ForSequenceClassification, GPT2Tokenizer),
    'longformer': (LongformerForSequenceClassification, LongformerTokenizer),
    'roberta': (RobertaForSequenceClassification, RobertaTokenizer),
}

# Ordered (substring, model class, tokenizer class) rules, checked against the
# lower-cased checkpoint name. ORDER MATTERS: more specific substrings must come
# before the generic ones they contain (e.g. 'deberta-v3' before 'deberta').
_NAME_RULES = (
    ('conv', ConvBertForSequenceClassification, ConvBertTokenizer),        # YituTech/conv-bert-base
    ('bigbird', BigBirdForSequenceClassification, BigBirdTokenizer),
    ('luke', LukeForSequenceClassification, LukeTokenizer),                # studio-ousia/luke-base
    # DeBERTa v2/v3 checkpoints use the V2 classes; names without a version
    # marker (e.g. microsoft/deberta-base) are treated as the original DeBERTa.
    ('deberta-v2', DebertaV2ForSequenceClassification, DebertaV2Tokenizer),
    ('deberta-v3', DebertaV2ForSequenceClassification, DebertaV2Tokenizer),
    ('deberta', DebertaForSequenceClassification, DebertaTokenizer),
    # Original XLM checkpoints (xlm-mlm-*, xlm-clm-*) need the XLM classes;
    # every other 'xlm' name keeps mapping to XLM-RoBERTa as before.
    ('xlm-mlm', XLMForSequenceClassification, XLMTokenizer),
    ('xlm-clm', XLMForSequenceClassification, XLMTokenizer),
    ('xlm', XLMRobertaForSequenceClassification, XLMRobertaTokenizer),     # xlm-roberta-base
    ('xlnet', XLNetForSequenceClassification, XLNetTokenizer),             # xlnet-base-cased
    ('squeezebert', SqueezeBertForSequenceClassification, SqueezeBertTokenizer),
    ('openai-gpt', OpenAIGPTForSequenceClassification, OpenAIGPTTokenizer),
    ('mpnet', MPNetForSequenceClassification, MPNetTokenizer),
    ('mobile', MobileBertForSequenceClassification, MobileBertTokenizer),
    ('electra', ElectraForSequenceClassification, ElectraTokenizer),
    ('funnel', FunnelForSequenceClassification, FunnelTokenizer),          # funnel-transformer/small-base
    ('fnet', FNetForSequenceClassification, FNetTokenizer),                # fnet-base
)


def _resolve_classes(model_name):
    """Return the (model class, tokenizer class) pair for a checkpoint name.

    Raises:
        KeyError: if no substring rule matches and the leading family id of the
            name is not a key of `MODELS`.
    """
    lowered = model_name.lower()
    for marker, model_class, tokenizer_class in _NAME_RULES:
        if marker in lowered:
            return model_class, tokenizer_class

    # Fallback: drop any 'org/' prefix, then take the text before the first '-'.
    base_name = lowered.rstrip('/').split('/')[-1]
    model_type = base_name.split('-')[0]
    if model_type not in MODELS:
        raise KeyError(
            f"Unsupported model '{model_name}': no name rule matched and "
            f"'{model_type}' is not one of {sorted(MODELS)}"
        )
    return MODELS[model_type]


def load_model(model_name, tokenizer_name, num_labels, output_attentions=False, output_hidden_states=False):
    """Load a sequence-classification model and its tokenizer.

    Args:
        model_name: Checkpoint name or path passed to `from_pretrained` for the model.
        tokenizer_name: Checkpoint name or path passed to `from_pretrained` for the tokenizer.
        num_labels: Number of output classes for the classification head.
        output_attentions: Forwarded to the model's `from_pretrained`.
        output_hidden_states: Forwarded to the model's `from_pretrained`.

    Returns:
        (model, tokenizer); the model is moved to CUDA when available, else CPU.

    Raises:
        KeyError: if `model_name` cannot be mapped to a known model family.
    """
    # Resolve the classes first so an unsupported name fails before any download.
    model_class, tokenizer_class = _resolve_classes(model_name)

    target_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model_class.from_pretrained(
        model_name,
        num_labels=num_labels,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
    ).to(target_device)
    tokenizer = tokenizer_class.from_pretrained(tokenizer_name)
    return model, tokenizer

In [None]:
def get_max_length(sentences, tokenizer, model):
    """Return the token count of the longest encoded sentence.

    Each sentence is encoded with special tokens included. Returns 0 when
    `sentences` is empty. `model` is accepted but not used.
    """
    token_counts = (
        len(tokenizer.encode(text, add_special_tokens=True))
        for text in sentences
    )
    return max(token_counts, default=0)

In [None]:
import torch

def tokenize_sentences(sentences, tokenizer, max_length, labels, length_cap=120):
    """Encode sentences to fixed-length id/mask tensors plus ids and labels.

    Previously `max_length` was ignored and every sentence was padded or
    truncated to a hard-coded 120 tokens. The length is now
    `min(max_length, length_cap)`, so a corpus-wide maximum that exceeds the
    cap still yields 120-token rows, while shorter corpora are no longer
    over-padded.

    Args:
        sentences: Iterable of raw text strings.
        tokenizer: Object exposing `encode_plus` (e.g. a Hugging Face tokenizer).
        max_length: Requested sequence length. A falsy value (None or 0) means
            "use `length_cap`".
        labels: Sequence of labels aligned with `sentences`.
        length_cap: Upper bound applied to `max_length` (default 120, the
            previous fixed length). Pass None to use `max_length` unchanged.

    Returns:
        dict with keys 'input_ids', 'attention_masks', 'sentence_ids' and
        'labels'. 'sentence_ids' holds 0..N-1 in input order.
    """
    if length_cap is None:
        pad_length = max_length
    elif max_length:
        pad_length = min(max_length, length_cap)
    else:
        pad_length = length_cap

    input_ids = []
    attention_masks = []
    for sent in sentences:
        encoded_dict = tokenizer.encode_plus(
            sent,
            add_special_tokens=True,      # e.g. '[CLS]' / '[SEP]' for BERT-style tokenizers
            max_length=pad_length,
            # Explicit replacements for the deprecated `pad_to_max_length=True`
            # and for the truncation that was previously only implicit.
            padding='max_length',
            truncation=True,
            return_attention_mask=True,   # marks real tokens vs. padding
            return_tensors='pt',
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    # One id per encoded sentence, in input order.
    sentence_ids = torch.tensor(list(range(len(input_ids))))

    # Each encoding has shape (1, pad_length); stack them along the batch axis.
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels)

    return {'input_ids': input_ids, 'attention_masks': attention_masks, 'sentence_ids': sentence_ids, 'labels': labels}


In [None]:
def create_dataset(sentences, tokenizer, max_length, labels):
    """Tokenize `sentences` and wrap the result in a four-column TensorDataset.

    Returns:
        TensorDataset with columns (sentence_ids, input_ids, attention_masks,
        labels), in that order.
    """
    encoded = tokenize_sentences(sentences, tokenizer, max_length, labels)

    # Column order matters: `index_remover` unpacks each row positionally as
    # (sentence id, input ids, attention mask, label).
    column_order = ('sentence_ids', 'input_ids', 'attention_masks', 'labels')
    return TensorDataset(*(encoded[column] for column in column_order))


In [None]:
def index_remover(tensordata):
    """Rebuild a four-column dataset without its leading sentence-id column.

    Args:
        tensordata: Iterable of (sentence_id, input_ids, attention_mask, label)
            tensor rows, e.g. the dataset built by `create_dataset` or a
            subset of it.

    Returns:
        TensorDataset with columns (input_ids, attention_masks, labels).
    """
    # Materialise every row as plain Python lists, discarding the sentence id.
    rows = [
        (token_ids.tolist(), mask.tolist(), label.tolist())
        for _sentence_id, token_ids, mask, label in tensordata
    ]

    input_ids = torch.tensor([row[0] for row in rows])
    attention_masks = torch.tensor([row[1] for row in rows])
    labels = torch.tensor([row[2] for row in rows])

    return TensorDataset(input_ids, attention_masks, labels)

In [None]:
def create_train_val_test_datasets(dataset, split_ratio=(0.8, 0.1, 0.1)):
    """Randomly split `dataset` into train/validation/test TensorDatasets.

    Args:
        dataset: Four-column dataset (sentence id first), as expected by
            `index_remover`.
        split_ratio: Fractions for (train, val, test). Only the first two are
            read; the test split receives whatever remains.

    Returns:
        (train_dataset, val_dataset, test_dataset), each without the
        sentence-id column.
    """
    total = len(dataset)
    train_size = int(split_ratio[0] * total)
    val_size = int(split_ratio[1] * total)
    # The remainder goes to the test split so the three sizes always sum to `total`.
    test_size = total - train_size - val_size

    # Random partition, then strip the sentence ids from each part in turn.
    train_dataset, val_dataset, test_dataset = (
        index_remover(part)
        for part in random_split(dataset, [train_size, val_size, test_size])
    )

    # Report the size of each split.
    print('{:>5,} training samples'.format(train_size))
    print('{:>5,} validation samples'.format(val_size))
    print('{:>5,} testing samples'.format(test_size))

    return train_dataset, val_dataset, test_dataset


### Hyperparameter setting

In [None]:
def get_data_loaders(train_dataset, val_dataset, test_dataset):
    """Build the train/validation/test DataLoaders for the current W&B run.

    The batch size for all three loaders is read from `wandb.config.batch_size`.

    Returns:
        (train_dataloader, val_dataloader, test_dataloader). The training
        loader samples randomly; the other two iterate sequentially.
    """
    batch_size = wandb.config.batch_size

    def _sequential_loader(dataset):
        # Evaluation loaders keep the dataset order.
        return DataLoader(
            dataset,
            sampler=SequentialSampler(dataset),
            batch_size=batch_size
        )

    # Training batches are drawn in random order.
    train_dataloader = DataLoader(
        train_dataset,
        sampler=RandomSampler(train_dataset),
        batch_size=batch_size
    )
    val_dataloader = _sequential_loader(val_dataset)
    test_dataloader = _sequential_loader(test_dataset)

    return train_dataloader, val_dataloader, test_dataloader

In [None]:
def get_optimizer(model):
    """Return an AdamW optimizer over all parameters of `model`.

    The learning rate comes from `wandb.config.learning_rate`; epsilon is
    fixed at 1e-8.
    """
    learning_rate = wandb.config.learning_rate
    return AdamW(model.parameters(), lr=learning_rate, eps=1e-8)

In [None]:
def get_scheduler(dataloader, optimizer):
    """Return a linear learning-rate schedule with no warm-up steps.

    The schedule spans `len(dataloader) * wandb.config.epochs` steps, i.e. one
    step per training batch (not per training sample).
    """
    steps_per_epoch = len(dataloader)
    total_steps = steps_per_epoch * wandb.config.epochs

    return get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,  # Default value in run_glue.py
        num_training_steps=total_steps,
    )

In [None]:
import numpy as np

# Accuracy helper: share of rows whose arg-max prediction equals the label.
def flat_accuracy(preds, labels):
    """Return the fraction of predictions that match `labels`.

    Args:
        preds: 2-D array of per-class scores, one row per example; the
            predicted class is the arg-max along axis 1.
        labels: Array of true class ids; it is flattened before comparison.
    """
    predicted = np.argmax(preds, axis=1).ravel()
    expected = labels.ravel()
    n_correct = np.sum(predicted == expected)
    return n_correct / len(expected)
    
import time
import datetime

def format_time(elapsed):
    '''
    Convert a duration in seconds to the string form of a timedelta
    (h:mm:ss), after rounding to a whole number of seconds.
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
from torch.utils.data import random_split

def create_train_val_test_datasets2(dataset, split_ratio=(0.8, 0.1, 0.1)):
    """Alias of `create_train_val_test_datasets`, kept for existing callers.

    This function used to be a line-for-line copy of
    `create_train_val_test_datasets`. It now delegates to it so the split
    logic lives in one place.

    Args:
        dataset: Four-column dataset (sentence id first).
        split_ratio: Fractions for (train, val, test).

    Returns:
        (train_dataset, val_dataset, test_dataset) without sentence ids.
    """
    return create_train_val_test_datasets(dataset, split_ratio)


In [None]:
import pandas as pd
from torch.utils.data import TensorDataset
from sklearn.metrics import f1_score
import json

def _set_seed(seed_val):
    """Seed the Python, NumPy and PyTorch (CPU and all CUDA devices) RNGs."""
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)


def _read_liar_split(path):
    """Read one JSONL split and return its 'text' and 'label' columns as lists."""
    frame = pd.read_json(path, lines=True, orient='records')
    return frame['text'].tolist(), frame['label'].tolist()


def _train_one_epoch(model, dataloader, optimizer, scheduler, run_device):
    """Run one optimisation pass over `dataloader`.

    Returns:
        (average loss over all batches, loss of the final batch).
    """
    epoch_start = time.time()
    total_train_loss = 0

    # `train()` only switches the mode (dropout active); it does not train.
    model.train()

    for step, batch in enumerate(dataloader):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - epoch_start)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(dataloader), elapsed))

        # Batch layout: [0] input ids, [1] attention masks, [2] labels.
        b_input_ids = batch[0].to(run_device)
        b_input_mask = batch[1].to(run_device)
        b_labels = batch[2].to(run_device)

        # Gradients accumulate by default, so clear them before each backward pass.
        model.zero_grad()

        # `token_type_ids` is deliberately not passed: None is already the
        # default for models that accept it, and some supported models
        # (e.g. DistilBERT) reject the keyword altogether.
        outputs = model(b_input_ids,
                        attention_mask=b_input_mask,
                        labels=b_labels)
        loss = outputs.loss

        total_train_loss += loss.item()

        loss.backward()

        # Clip the gradient norm to 1.0 to guard against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        # Advance the learning-rate schedule once per batch.
        scheduler.step()

    avg_train_loss = total_train_loss / len(dataloader)
    return avg_train_loss, loss.item()


def _evaluate(model, dataloader, run_device):
    """Score `model` on every batch of `dataloader` without gradients.

    Returns:
        (average loss, true labels, predicted labels, class probabilities),
        the last three as plain Python lists in dataloader order.
    """
    # Evaluation mode: dropout layers are disabled.
    model.eval()

    total_loss = 0
    true_labels = []
    pred_labels = []
    all_probs = []

    for batch in dataloader:
        # Batch layout: [0] input ids, [1] attention masks, [2] labels.
        b_input_ids = batch[0].to(run_device)
        b_input_mask = batch[1].to(run_device)
        b_labels = batch[2].to(run_device)

        # No compute graph is needed for a forward-only pass.
        with torch.no_grad():
            outputs = model(b_input_ids,
                            attention_mask=b_input_mask,
                            labels=b_labels)

        total_loss += outputs.loss.item()

        # Logits are the raw scores before softmax; move them to the CPU.
        logits = outputs.logits.detach().cpu()
        all_probs.extend(torch.softmax(logits, dim=-1).numpy().tolist())
        pred_labels.extend(np.argmax(logits.numpy(), axis=1).tolist())
        true_labels.extend(b_labels.to('cpu').numpy().tolist())

    avg_loss = total_loss / len(dataloader)
    return avg_loss, true_labels, pred_labels, all_probs


def train_train():
    """Fine-tune and evaluate one sweep configuration on LIAR (6 classes).

    Reads `epochs`, `batch_size` and `learning_rate` from `wandb.config`,
    trains for the configured number of epochs, and after every epoch
    evaluates on the test split. Per epoch it logs the training loss, test
    loss, accuracy and weighted F1 to W&B, and overwrites `labels_probs.json`
    with that epoch's predicted labels and class probabilities.
    """
    wandb.init(project="liar-multiclass", name="xlmroberta-base")

    # Seed BEFORE the model is built: the classification head is randomly
    # initialised, so seeding afterwards leaves runs irreproducible.
    _set_seed(42)

    # Read the data
    train_sentences, train_labels = _read_liar_split("/content/drive/MyDrive/misinfo/LIAR_train.jsonl")
    val_sentences, val_labels = _read_liar_split("/content/drive/MyDrive/misinfo/LIAR_val.jsonl")
    test_sentences, test_labels = _read_liar_split("/content/drive/MyDrive/misinfo/LIAR_test.jsonl")

    # Checkpoint under test. Others tried with this pipeline: bert-base-uncased,
    # roberta-base, microsoft/deberta-base, microsoft/deberta-v3-base,
    # YituTech/conv-bert-base, funnel-transformer/small-base,
    # studio-ousia/luke-base, squeezebert/squeezebert-uncased.
    model_name = 'xlm-roberta-base'
    model, tokenizer = load_model(model_name, model_name, num_labels=6, output_attentions=False, output_hidden_states=False)

    # Longest encoded sentence across all three splits.
    max_length = get_max_length(train_sentences + val_sentences + test_sentences, tokenizer, model)

    # Build each dataset, then drop the sentence-id column so that batches are
    # (input ids, attention masks, labels).
    train_dataset = index_remover(create_dataset(train_sentences, tokenizer, max_length, train_labels))
    val_dataset = index_remover(create_dataset(val_sentences, tokenizer, max_length, val_labels))
    test_dataset = index_remover(create_dataset(test_sentences, tokenizer, max_length, test_labels))

    # NOTE(review): the validation loader is built but never used; per-epoch
    # evaluation (and the sweep metric) run on the test split. Consider
    # selecting hyperparameters on validation instead — confirm intent.
    train_dataloader, validation_dataloader, testing_dataloader = get_data_loaders(train_dataset, val_dataset, test_dataset)

    optimizer = get_optimizer(model)
    scheduler = get_scheduler(train_dataloader, optimizer)

    # Send batches to wherever `load_model` placed the model, rather than
    # relying on a global defined in another cell.
    run_device = next(model.parameters()).device

    # Measure the total training time for the whole run.
    total_t0 = time.time()
    epochs = wandb.config.epochs

    for epoch_i in range(0, epochs):

        # ========================================
        #               Training
        # ========================================
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')

        t0 = time.time()
        avg_train_loss, last_batch_loss = _train_one_epoch(
            model, train_dataloader, optimizer, scheduler, run_device)
        training_time = format_time(time.time() - t0)

        wandb.log({'avg_train_loss':avg_train_loss, 'train_batch_loss':last_batch_loss})

        print("")
        print("  average training loss: {0:.2f}".format(avg_train_loss))
        print("  training epoch took: {:}".format(training_time))

        # ========================================
        #               Testing
        # ========================================
        print("")
        print("Running testing...")

        t0 = time.time()
        avg_test_loss, true_labels, pred_labels, all_probs = _evaluate(
            model, testing_dataloader, run_device)

        # Write the predictions once per epoch (previously the file was
        # rewritten after every batch). 'labels' holds the PREDICTED labels.
        output_file = 'labels_probs.json'
        with open(output_file, 'w') as f:
            json.dump({'labels': pred_labels, 'probs': all_probs}, f)

        # Accuracy over all test examples. Averaging per-batch accuracies
        # would over-weight a short final batch.
        avg_test_accuracy = float(np.mean(np.asarray(pred_labels) == np.asarray(true_labels)))
        print("  accuracy: {0:.2f}".format(avg_test_accuracy))

        f1 = f1_score(true_labels, pred_labels, average='weighted')
        print("  f1 score: {0:.2f}".format(f1))

        testing_time = format_time(time.time() - t0)
        wandb.log({'test_accuracy':avg_test_accuracy,'avg_test_loss':avg_test_loss, 'f1': f1})
        print("  testing loss: {0:.2f}".format(avg_test_loss))
        print("  testing took: {:}".format(testing_time))

    print("")
    print("training complete!")

    print("total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))




In [None]:
# Start a sweep agent for the sweep created above; it calls `train_train` for
# the configurations it is handed and exits when no jobs remain.
wandb.agent(sweep_id,function=train_train)

[34m[1mwandb[0m: Agent Starting Run: zn1yrxln with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 2
[34m[1mwandb[0m: 	learning_rate: 5e-05


Downloading (…)lve/main/config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (839 > 512). Running this sequence through the model will result in indexing errors
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.



Training...
  Batch    40  of    320.    Elapsed: 0:00:08.
  Batch    80  of    320.    Elapsed: 0:00:17.
  Batch   120  of    320.    Elapsed: 0:00:25.
  Batch   160  of    320.    Elapsed: 0:00:33.
  Batch   200  of    320.    Elapsed: 0:00:42.
  Batch   240  of    320.    Elapsed: 0:00:50.
  Batch   280  of    320.    Elapsed: 0:00:58.

  average training loss: 1.77
  training epoch took: 0:01:06

Running testing...
  accuracy: 0.21
  f1 score: 0.07
  testing loss: 1.75
  testing took: 0:00:03

Training...
  Batch    40  of    320.    Elapsed: 0:00:08.
  Batch    80  of    320.    Elapsed: 0:00:17.
  Batch   120  of    320.    Elapsed: 0:00:25.
  Batch   160  of    320.    Elapsed: 0:00:33.
  Batch   200  of    320.    Elapsed: 0:00:41.
  Batch   240  of    320.    Elapsed: 0:00:50.
  Batch   280  of    320.    Elapsed: 0:00:58.

  average training loss: 1.76
  training epoch took: 0:01:06

Running testing...
  accuracy: 0.21
  f1 score: 0.07
  testing loss: 1.75
  testing took: 0:

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
avg_test_loss,█▁
avg_train_loss,█▁
f1,▁▁
test_accuracy,▁▁
train_batch_loss,▁█

0,1
avg_test_loss,1.751
avg_train_loss,1.76274
f1,0.07236
test_accuracy,0.20863
train_batch_loss,1.7822


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.
