In [1]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

torch.cuda.set_device(1)
print(torch.cuda.current_device())
print(torch.cuda.get_device_properties(1))

1
_CudaDeviceProperties(name='NVIDIA A100-PCIE-40GB', major=8, minor=0, total_memory=40370MB, multi_processor_count=108)


In [3]:
import random, time, datetime
import pandas as pd
import numpy as np
import seaborn as sns
import tensorflow as tf

from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification 
from transformers import AdamW, get_linear_schedule_with_warmup

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from matplotlib import pyplot as plt
from statistics import mean

2023-05-10 23:27:53.364566: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
df = pd.read_csv("../data/figlang_all.tsv", sep="\t", encoding="utf-8")
print(df.shape)
print(df["label"].value_counts())
df.head()

(10848, 4)
0    6506
1    2212
2     884
3     625
4     621
Name: label, dtype: int64


Unnamed: 0,text,label,label_binary,source
0,I can't believe my ex didn't pay his car note ...,0,0,Sarcasm_premise
1,But then the paper would not find out about yo...,0,0,Idiom_premise
2,Last week my kid said some really mean things ...,0,0,CreativeParaphrase_premise
3,"The gravy was so fatty, it made the meat taste...",0,0,Metaphor_premise
4,He pulls a giant disc out and flashes it like ...,3,1,Simile_hypothesis


# 1. Set the model to use

**Uncomment one of the lines below to set the model.**

In [5]:
#MODEL_NAME = "bert-base-uncased" 
#MODEL_NAME = "roberta-base" 
MODEL_NAME = "xlnet-base-cased" 

# 2. K-fold validation

In [6]:
df_figlang = df.loc[df["label"] != 0]

# Re-map the label to start from 0, otherwise the training does not work:
label_map = {1: 0,
             2: 1,
             3: 2,
             4: 3}

df_figlang["label"] = df_figlang["label"].map(label_map)

X = df_figlang["text"].values
y = df_figlang["label"].values

print(np.unique(y))

[0 1 2 3]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_figlang["label"] = df_figlang["label"].map(label_map)


In [7]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer

XLNetTokenizerFast(name_or_path='xlnet-base-cased', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '<sep>', 'pad_token': '<pad>', 'cls_token': '<cls>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False), 'additional_special_tokens': ['<eop>', '<eod>']})

**Run a sample tokenization on all sequences in the training set to get the max_len:**

In [8]:
test_max_len = 0

for text in df["text"]:
    input_ids = tokenizer.encode(text, 
                                 add_special_tokens=True)
    test_max_len = max(test_max_len, len(input_ids))

print('Max sentence length: ', test_max_len)

Max sentence length:  59


In [9]:
# Maximal length of input sequence
MAX_LEN = test_max_len

# Number of labels 
NUM_LABELS = 4

# Specifying batch size: For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32.
batch_size = 16 

# Number of training epochs (authors recommend between 2 and 4)
epochs = 4

In [10]:
# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []
attention_masks = []

# For every text...
for idx, text in enumerate(X):
    encoded_dict = tokenizer.encode_plus(
        text,                      # Sentence to encode.
        add_special_tokens = True, 
        padding = "max_length",
        truncation = True,
        max_length = MAX_LEN,
        return_attention_mask = True,
        return_tensors = 'pt'
    )
    
    # Add the encoded sentence to the list.
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])
    
    # ----- Qi: Only for checking the results of tokenization
    #print(text)
    #print(tokenizer.convert_ids_to_tokens(encoded_dict['input_ids'][0])) 
    #print(len(tokenizer.convert_ids_to_tokens(encoded_dict['input_ids'][0])))
    #print("-----") 
    
print("Done with tokenization.")

Done with tokenization.


In [11]:
# Convert to tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(y)

In [12]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

# Helper function for formatting elapsed times.
def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [13]:
N_SPLIT = 10
kf = KFold(n_splits=N_SPLIT, shuffle=False)

In [14]:
all_metrics = []

for kf_idx, (train_idx, test_idx) in enumerate(kf.split(X)):
    print("FOLD {}:".format(kf_idx+1))
    
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME, 
        num_labels = NUM_LABELS, 
        output_attentions = False, 
        output_hidden_states = True,
    )
    
    # Tell pytorch to run this model on the GPU.
    model.cuda()
    
    # Note: AdamW is a class from the huggingface library (as opposed to pytorch) 
    # I believe the 'W' stands for 'Weight Decay fix"
    optimizer = AdamW(model.parameters(),
                      lr = 2e-5,   # args.learning_rate - default is 5e-5, our notebook had 2e-5
                      eps = 1e-8   # args.adam_epsilon  - default is 1e-8.
                     )    
    
    # For each epoch...
    for epoch_i in range(0, epochs):
        # ========================================
        #               Training
        # ========================================
        # Perform one full pass over the training set.

        print("")
        print('  ======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('  Training...')

        # Measure how long the training epoch takes.
        t0 = time.time()

        # Reset the total loss for this epoch.
        total_loss = 0
        
        # Put the model into training mode. Don't be mislead--the call to `train` just changes the *mode*, it doesn't *perform* the training.
        # `dropout` and `batchnorm` layers behave differently during training vs. test 
        # (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
        model.train()
        
        # Create the DataLoader for our training set.
        train_data = TensorDataset(input_ids[train_idx], attention_masks[train_idx], labels[train_idx])
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
        
        # Total number of training steps is number of batches * number of epochs.
        total_steps = len(train_dataloader) * epochs

        # Create the learning rate scheduler.
        scheduler = get_linear_schedule_with_warmup(optimizer, 
                                                    num_warmup_steps = 0, # Default value in run_glue.py
                                                    num_training_steps = total_steps)
        
        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):
            
            # Progress update every 40 batches.
            if step % 40 == 0 and not step == 0:
                # Calculate elapsed time in minutes.
                elapsed = format_time(time.time() - t0)
            
                # Report progress.
                print('    Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
                
            # Unpack this training batch from our dataloader. 
            #
            # As we unpack the batch, we'll also copy each tensor to the GPU using the `to` method.
            #
            # `batch` contains three pytorch tensors:
            #   [0]: input ids 
            #   [1]: attention masks
            #   [2]: labels 
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)  
            
            # Always clear any previously calculated gradients before performing a backward pass. 
            # PyTorch doesn't do this automatically because accumulating the gradients is "convenient while training RNNs". 
            # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
            model.zero_grad()  
            
            # Perform a forward pass (evaluate the model on this training batch).
            # This will return the loss (rather than the model output) because we have provided the `labels`.
            # The documentation for this `model` function is here: 
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            outputs = model(b_input_ids, 
                            #token_type_ids=None, 
                            attention_mask=b_input_mask, 
                            labels=b_labels)
            
            # The call to `model` always returns a tuple, so we need to pull the loss value out of the tuple.
            loss = outputs[0]
            
            # Accumulate the training loss over all of the batches so that we can calculate the average loss at the end. 
            # `loss` is a Tensor containing a single value; the `.item()` function just returns the Python value from the tensor.
            total_loss += loss.item()
            
            # Perform a backward pass to calculate the gradients.
            loss.backward()
            
            # Clip the norm of the gradients to 1.0.
            # This is to help prevent the "exploding gradients" problem.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            
            # Update parameters and take a step using the computed gradient.
            # The optimizer dictates the "update rule"--how the parameters are modified based on their gradients, the learning rate, etc.
            optimizer.step()
            
            # Update the learning rate.
            scheduler.step()
    
        # Calculate the average loss over the training data.
        avg_train_loss = total_loss / len(train_dataloader)  
        
        
        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epoch took: {:}".format(format_time(time.time() - t0)))    
        
        
    # ========================================
    #               Test
    # ========================================
        
    print("")
    print("  Running Test...")
        
    # Put the model in evaluation mode--the dropout layers behave differently during evaluation.
    model.eval()
        
    # Create the DataLoader.
    prediction_data = TensorDataset(input_ids[test_idx], attention_masks[test_idx], labels[test_idx])
    prediction_sampler = SequentialSampler(prediction_data)
    prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)
    
    print('  Predicting labels for {:,} test sentences...'.format(len(input_ids[test_idx])))
    
    # Tracking variables 
    predictions , true_labels = [], []
    
    # Predict 
    for batch in prediction_dataloader:
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        
        # Telling the model not to compute or store gradients, saving memory and speeding up prediction
        with torch.no_grad():
            # Forward pass, calculate logit predictions
            outputs = model(b_input_ids, 
                            #token_type_ids=None, 
                            attention_mask=b_input_mask)
            
        logits = outputs[0]
        
        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        # Store predictions and true labels
        predictions.append(logits)
        true_labels.append(label_ids)
        
    # Combine the results across all batches. 
    flat_predictions = np.concatenate(predictions, axis=0)

    # For each sample, pick the label (0 or 1) with the higher score.
    flat_predictions = np.argmax(flat_predictions, axis=1).flatten()

    # Combine the correct labels for each batch into a single list.
    flat_true_labels = np.concatenate(true_labels, axis=0)
    
    metric = f1_score(flat_true_labels, flat_predictions, average="macro")
    all_metrics.append(metric)
    print("  F1 of fold {}:".format(kf_idx),  round(metric, 10))
    print()
    
print("Done.")
print("Average Macro-F1 ({} folds): ".format(kf_idx+1), mean(all_metrics))

FOLD 1:


Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a


  Training...
    Batch    40  of    245.    Elapsed: 0:00:06.
    Batch    80  of    245.    Elapsed: 0:00:11.
    Batch   120  of    245.    Elapsed: 0:00:16.
    Batch   160  of    245.    Elapsed: 0:00:21.
    Batch   200  of    245.    Elapsed: 0:00:26.
    Batch   240  of    245.    Elapsed: 0:00:31.

  Average training loss: 0.38
  Training epoch took: 0:00:32

  Training...
    Batch    40  of    245.    Elapsed: 0:00:05.
    Batch    80  of    245.    Elapsed: 0:00:10.
    Batch   120  of    245.    Elapsed: 0:00:15.
    Batch   160  of    245.    Elapsed: 0:00:20.
    Batch   200  of    245.    Elapsed: 0:00:25.
    Batch   240  of    245.    Elapsed: 0:00:30.

  Average training loss: 0.13
  Training epoch took: 0:00:31

  Training...
    Batch    40  of    245.    Elapsed: 0:00:05.
    Batch    80  of    245.    Elapsed: 0:00:10.
    Batch   120  of    245.    Elapsed: 0:00:15.
    Batch   160  of    245.    Elapsed: 0:00:20.
    Batch   200  of    245.    Elapsed: 0:00:24

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a


  Training...
    Batch    40  of    245.    Elapsed: 0:00:05.
    Batch    80  of    245.    Elapsed: 0:00:10.
    Batch   120  of    245.    Elapsed: 0:00:15.
    Batch   160  of    245.    Elapsed: 0:00:20.
    Batch   200  of    245.    Elapsed: 0:00:25.
    Batch   240  of    245.    Elapsed: 0:00:30.

  Average training loss: 0.37
  Training epoch took: 0:00:31

  Training...
    Batch    40  of    245.    Elapsed: 0:00:05.
    Batch    80  of    245.    Elapsed: 0:00:10.
    Batch   120  of    245.    Elapsed: 0:00:15.
    Batch   160  of    245.    Elapsed: 0:00:20.
    Batch   200  of    245.    Elapsed: 0:00:25.
    Batch   240  of    245.    Elapsed: 0:00:30.

  Average training loss: 0.16
  Training epoch took: 0:00:30

  Training...
    Batch    40  of    245.    Elapsed: 0:00:05.
    Batch    80  of    245.    Elapsed: 0:00:10.
    Batch   120  of    245.    Elapsed: 0:00:15.
    Batch   160  of    245.    Elapsed: 0:00:20.
    Batch   200  of    245.    Elapsed: 0:00:25

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a


  Training...
    Batch    40  of    245.    Elapsed: 0:00:05.
    Batch    80  of    245.    Elapsed: 0:00:10.
    Batch   120  of    245.    Elapsed: 0:00:15.
    Batch   160  of    245.    Elapsed: 0:00:20.
    Batch   200  of    245.    Elapsed: 0:00:25.
    Batch   240  of    245.    Elapsed: 0:00:30.

  Average training loss: 0.38
  Training epoch took: 0:00:31

  Training...
    Batch    40  of    245.    Elapsed: 0:00:06.
    Batch    80  of    245.    Elapsed: 0:00:12.
    Batch   120  of    245.    Elapsed: 0:00:17.
    Batch   160  of    245.    Elapsed: 0:00:23.
    Batch   200  of    245.    Elapsed: 0:00:28.
    Batch   240  of    245.    Elapsed: 0:00:33.

  Average training loss: 0.14
  Training epoch took: 0:00:33

  Training...
    Batch    40  of    245.    Elapsed: 0:00:05.
    Batch    80  of    245.    Elapsed: 0:00:10.
    Batch   120  of    245.    Elapsed: 0:00:15.
    Batch   160  of    245.    Elapsed: 0:00:20.
    Batch   200  of    245.    Elapsed: 0:00:26

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a


  Training...
    Batch    40  of    245.    Elapsed: 0:00:05.
    Batch    80  of    245.    Elapsed: 0:00:10.
    Batch   120  of    245.    Elapsed: 0:00:16.
    Batch   160  of    245.    Elapsed: 0:00:21.
    Batch   200  of    245.    Elapsed: 0:00:27.
    Batch   240  of    245.    Elapsed: 0:00:34.

  Average training loss: 0.34
  Training epoch took: 0:00:35

  Training...
    Batch    40  of    245.    Elapsed: 0:00:05.
    Batch    80  of    245.    Elapsed: 0:00:11.
    Batch   120  of    245.    Elapsed: 0:00:16.
    Batch   160  of    245.    Elapsed: 0:00:21.
    Batch   200  of    245.    Elapsed: 0:00:26.
    Batch   240  of    245.    Elapsed: 0:00:30.

  Average training loss: 0.15
  Training epoch took: 0:00:31

  Training...
    Batch    40  of    245.    Elapsed: 0:00:05.
    Batch    80  of    245.    Elapsed: 0:00:10.
    Batch   120  of    245.    Elapsed: 0:00:15.
    Batch   160  of    245.    Elapsed: 0:00:20.
    Batch   200  of    245.    Elapsed: 0:00:25

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a


  Training...
    Batch    40  of    245.    Elapsed: 0:00:05.
    Batch    80  of    245.    Elapsed: 0:00:10.
    Batch   120  of    245.    Elapsed: 0:00:15.
    Batch   160  of    245.    Elapsed: 0:00:20.
    Batch   200  of    245.    Elapsed: 0:00:25.
    Batch   240  of    245.    Elapsed: 0:00:30.

  Average training loss: 0.39
  Training epoch took: 0:00:31

  Training...
    Batch    40  of    245.    Elapsed: 0:00:05.
    Batch    80  of    245.    Elapsed: 0:00:10.
    Batch   120  of    245.    Elapsed: 0:00:15.
    Batch   160  of    245.    Elapsed: 0:00:20.
    Batch   200  of    245.    Elapsed: 0:00:25.
    Batch   240  of    245.    Elapsed: 0:00:30.

  Average training loss: 0.17
  Training epoch took: 0:00:30

  Training...
    Batch    40  of    245.    Elapsed: 0:00:05.
    Batch    80  of    245.    Elapsed: 0:00:10.
    Batch   120  of    245.    Elapsed: 0:00:15.
    Batch   160  of    245.    Elapsed: 0:00:20.
    Batch   200  of    245.    Elapsed: 0:00:25

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a


  Training...
    Batch    40  of    245.    Elapsed: 0:00:05.
    Batch    80  of    245.    Elapsed: 0:00:10.
    Batch   120  of    245.    Elapsed: 0:00:15.
    Batch   160  of    245.    Elapsed: 0:00:20.
    Batch   200  of    245.    Elapsed: 0:00:25.
    Batch   240  of    245.    Elapsed: 0:00:29.

  Average training loss: 0.42
  Training epoch took: 0:00:30

  Training...
    Batch    40  of    245.    Elapsed: 0:00:05.
    Batch    80  of    245.    Elapsed: 0:00:10.
    Batch   120  of    245.    Elapsed: 0:00:15.
    Batch   160  of    245.    Elapsed: 0:00:20.
    Batch   200  of    245.    Elapsed: 0:00:25.
    Batch   240  of    245.    Elapsed: 0:00:30.

  Average training loss: 0.15
  Training epoch took: 0:00:31

  Training...
    Batch    40  of    245.    Elapsed: 0:00:05.
    Batch    80  of    245.    Elapsed: 0:00:10.
    Batch   120  of    245.    Elapsed: 0:00:15.
    Batch   160  of    245.    Elapsed: 0:00:20.
    Batch   200  of    245.    Elapsed: 0:00:25

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a


  Training...
    Batch    40  of    245.    Elapsed: 0:00:05.
    Batch    80  of    245.    Elapsed: 0:00:10.
    Batch   120  of    245.    Elapsed: 0:00:15.
    Batch   160  of    245.    Elapsed: 0:00:20.
    Batch   200  of    245.    Elapsed: 0:00:25.
    Batch   240  of    245.    Elapsed: 0:00:30.

  Average training loss: 0.38
  Training epoch took: 0:00:31

  Training...
    Batch    40  of    245.    Elapsed: 0:00:05.
    Batch    80  of    245.    Elapsed: 0:00:10.
    Batch   120  of    245.    Elapsed: 0:00:16.
    Batch   160  of    245.    Elapsed: 0:00:20.
    Batch   200  of    245.    Elapsed: 0:00:25.
    Batch   240  of    245.    Elapsed: 0:00:30.

  Average training loss: 0.13
  Training epoch took: 0:00:31

  Training...
    Batch    40  of    245.    Elapsed: 0:00:05.
    Batch    80  of    245.    Elapsed: 0:00:10.
    Batch   120  of    245.    Elapsed: 0:00:15.
    Batch   160  of    245.    Elapsed: 0:00:20.
    Batch   200  of    245.    Elapsed: 0:00:25

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a


  Training...
    Batch    40  of    245.    Elapsed: 0:00:05.
    Batch    80  of    245.    Elapsed: 0:00:10.
    Batch   120  of    245.    Elapsed: 0:00:16.
    Batch   160  of    245.    Elapsed: 0:00:22.
    Batch   200  of    245.    Elapsed: 0:00:28.
    Batch   240  of    245.    Elapsed: 0:00:33.

  Average training loss: 0.37
  Training epoch took: 0:00:33

  Training...
    Batch    40  of    245.    Elapsed: 0:00:06.
    Batch    80  of    245.    Elapsed: 0:00:11.
    Batch   120  of    245.    Elapsed: 0:00:17.
    Batch   160  of    245.    Elapsed: 0:00:22.
    Batch   200  of    245.    Elapsed: 0:00:28.
    Batch   240  of    245.    Elapsed: 0:00:33.

  Average training loss: 0.15
  Training epoch took: 0:00:34

  Training...
    Batch    40  of    245.    Elapsed: 0:00:05.
    Batch    80  of    245.    Elapsed: 0:00:10.
    Batch   120  of    245.    Elapsed: 0:00:15.
    Batch   160  of    245.    Elapsed: 0:00:20.
    Batch   200  of    245.    Elapsed: 0:00:24

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a


  Training...
    Batch    40  of    245.    Elapsed: 0:00:05.
    Batch    80  of    245.    Elapsed: 0:00:10.
    Batch   120  of    245.    Elapsed: 0:00:15.
    Batch   160  of    245.    Elapsed: 0:00:19.
    Batch   200  of    245.    Elapsed: 0:00:24.
    Batch   240  of    245.    Elapsed: 0:00:29.

  Average training loss: 0.37
  Training epoch took: 0:00:30

  Training...
    Batch    40  of    245.    Elapsed: 0:00:05.
    Batch    80  of    245.    Elapsed: 0:00:10.
    Batch   120  of    245.    Elapsed: 0:00:15.
    Batch   160  of    245.    Elapsed: 0:00:20.
    Batch   200  of    245.    Elapsed: 0:00:25.
    Batch   240  of    245.    Elapsed: 0:00:30.

  Average training loss: 0.15
  Training epoch took: 0:00:30

  Training...
    Batch    40  of    245.    Elapsed: 0:00:05.
    Batch    80  of    245.    Elapsed: 0:00:10.
    Batch   120  of    245.    Elapsed: 0:00:15.
    Batch   160  of    245.    Elapsed: 0:00:20.
    Batch   200  of    245.    Elapsed: 0:00:25

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a


  Training...
    Batch    40  of    245.    Elapsed: 0:00:05.
    Batch    80  of    245.    Elapsed: 0:00:10.
    Batch   120  of    245.    Elapsed: 0:00:15.
    Batch   160  of    245.    Elapsed: 0:00:20.
    Batch   200  of    245.    Elapsed: 0:00:24.
    Batch   240  of    245.    Elapsed: 0:00:29.

  Average training loss: 0.39
  Training epoch took: 0:00:30

  Training...
    Batch    40  of    245.    Elapsed: 0:00:05.
    Batch    80  of    245.    Elapsed: 0:00:10.
    Batch   120  of    245.    Elapsed: 0:00:15.
    Batch   160  of    245.    Elapsed: 0:00:19.
    Batch   200  of    245.    Elapsed: 0:00:24.
    Batch   240  of    245.    Elapsed: 0:00:29.

  Average training loss: 0.16
  Training epoch took: 0:00:30

  Training...
    Batch    40  of    245.    Elapsed: 0:00:06.
    Batch    80  of    245.    Elapsed: 0:00:11.
    Batch   120  of    245.    Elapsed: 0:00:17.
    Batch   160  of    245.    Elapsed: 0:00:22.
    Batch   200  of    245.    Elapsed: 0:00:27

In [15]:
f = open("../results/classification_results/" + str(N_SPLIT) + "folds_" + MODEL_NAME + ".txt", "w")

for i in range(N_SPLIT):
    f.write("Fold " + str(i+1) + ": " + str(all_metrics[i]) + "\n")

f.write("\nAVERAGE MACRO-F1 of " + str(N_SPLIT) + "FOLDS: " + str(mean(all_metrics)))
f.close()