In [1]:
!pip install --upgrade transformers torch transformers[torch] tokenizers huggingface_hub pytorch-crf
!pip install protobuf==3.20.3

Defaulting to user installation because normal site-packages is not writeable
Collecting huggingface_hub
  Downloading huggingface_hub-0.23.0-py3-none-any.whl.metadata (12 kB)
Downloading huggingface_hub-0.23.0-py3-none-any.whl (401 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m401.2/401.2 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: huggingface_hub
  Attempting uninstall: huggingface_hub
    Found existing installation: huggingface-hub 0.22.2
    Uninstalling huggingface-hub-0.22.2:
      Successfully uninstalled huggingface-hub-0.22.2
Successfully installed huggingface_hub-0.23.0
Defaulting to user installation because normal site-packages is not writeable


In [2]:
import torch
torch.cuda.empty_cache()

assert torch.cuda.is_available()

In [3]:
device_name = torch.cuda.get_device_name()
n_gpu = torch.cuda.device_count()
print(f"Found device: {device_name}, n_gpu: {n_gpu}")
device = torch.device("cuda")

Found device: NVIDIA A100-SXM4-80GB MIG 1g.10gb, n_gpu: 1


In [4]:
import random
import numpy as np
import pandas as pd

def seed_everything(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

seed_everything()

In [5]:
import pandas as pd

def read_conll(file_path):
    sentences = []
    labels = []
    current_sentence = []
    current_labels = []

    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()
            if line:
                columns = line.split()
                word, label = columns[0], columns[-1]
                current_sentence.append(word)
                current_labels.append(label)
                
                # Check if the current word is a sentence boundary
                if word == '.' and label == 'O':
                    sentences.append(' '.join(current_sentence))
                    labels.append(current_labels)
                    current_sentence = []
                    current_labels = []

    # Create a DataFrame from the accumulated sentences and labels
    df = pd.DataFrame({
        'sentences': sentences,
        'labels': labels
    })
    return df


In [6]:
import torch
max_length=128

def tokenize_and_format(sentences, tokenizer, max_length=max_length):
    """
    Tokenizes sentences and returns formatted input IDs and attention masks.
    
    Parameters:
    sentences: List of sentence strings to be tokenized.
    tokenizer: Tokenizer instance used for tokenizing the sentences.
    """
    input_ids = []
    attention_masks = []

    # Encode each sentence
    for sentence in sentences:
        encoded_dict = tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
            max_length=max_length,  # Adjust based on your model's maximum input length
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # Store the input ID and the attention mask of this sentence
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    # Convert lists of tensors to single tensors
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

In [7]:
#!pip install --no-cache-dir transformers sentencepiece
#!pip install sentencepiece
from transformers import AutoTokenizer

model_name = 'worldbank/econberta'
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True, use_fast=False)



In [8]:
label_dict = {
    'O': 0,
    'B-intervention': 1,
    'I-intervention': 2,
    'B-outcome': 3,
    'I-outcome': 4,
    'B-population': 5,
    'I-population': 6,
    'B-effect_size': 7,
    'I-effect_size': 8,
    'B-coreference': 9,
    'I-coreference': 10
}

In [9]:
reverse_label_dict = {v: k for k, v in label_dict.items()}

In [10]:
import numpy as np
import torch

def get_dataset(df, tokenizer, label_dict, max_length=max_length):
    """
    Processes a DataFrame to return a dataset suitable for training/testing an NER model.
    
    Parameters:
    df: DataFrame containing 'Tokens' and 'Labels' columns.
    tokenizer: Tokenizer to use for encoding the sentences.
    label_dict: Dictionary mapping label names to indices.
    max_length: Maximum length of the tokenized input.
    """
    sentences = df.sentences.values
    
    # Tokenize sentences
    input_ids, attention_masks = tokenize_and_format(sentences, tokenizer, max_length)

    # Prepare labels
    label_list = []
    for labels in df.labels.values:
        # Initialize a list to hold the encoded labels for each sentence
        encoded_labels = [label_dict[label] for label in labels]
        
        # Truncate or pad the labels to match the max_length
        encoded_labels = encoded_labels[:max_length]  # Truncate if needed
        encoded_labels += [label_dict['O']] * (max_length - len(encoded_labels))  # Pad with 'O' if needed
        
        label_list.append(encoded_labels)

    # Convert label_list to a tensor
    labels = torch.tensor(label_list, dtype=torch.long)

    # Create the dataset
    dataset = [(input_ids[i], attention_masks[i], labels[i]) for i in range(len(df))]

    return dataset, sentences

In [11]:
seed_everything()

train_df = pd.read_csv('spanish_train.csv')
val_df = pd.read_csv('spanish_val.csv')
test_df = pd.read_csv('spanish_test.csv')
def preprocess_labels(labels):
    # Remove leading and trailing whitespace
    labels = labels.strip()
    # Remove outer square brackets and split by comma
    labels = labels[1:-1].split(',')
    # Remove extra spaces and quotes from each label
    labels = [label.strip().strip("'\"") for label in labels]
    return labels
train_df['labels'] = train_df['labels'].apply(preprocess_labels)
val_df['labels'] = val_df['labels'].apply(preprocess_labels)
test_df['labels'] = test_df['labels'].apply(preprocess_labels)


train_set, train_sentences = get_dataset(train_df, tokenizer, label_dict)
val_set, val_sentences = get_dataset(val_df, tokenizer, label_dict)
test_set, test_sentences = get_dataset(test_df, tokenizer, label_dict)

In [12]:
train_set[0]

(tensor([     1,    287,    435,    285,   1770,    268, 100734,  12283,    266,
           2181,  12128,   8843,  22203,    718,    865, 100306,   1110,   2368,
           1488,   1110,    865,  90114,   1628,   4770,    795,    260,      2,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,    

In [13]:
# Set the hyperparameters according to Table 8
dropout = 0.2
learning_rates = [5e-5, 6e-5, 7e-5]  # Perform hyperparameter search
batch_size = 12
gradient_accumulation_steps = 4
weight_decay = 0
max_epochs = 10
lr_decay = "slanted_triangular"
fraction_of_steps = 0.06
adam_epsilon = 1e-8
adam_beta1 = 0.9
adam_beta2 = 0.999

seed_everything()

In [14]:
def preprocess_entities(labels, tokens):
    """
    Extract entities from token-label pairs.
    
    Args:
    labels (list of int): List of label indices corresponding to each token.
    tokens (list of str): List of tokens corresponding to each label index.
    
    Returns:
    list of tuples: Each tuple represents an entity with (entity_type, start_index, end_index, entity_text).
    """
    entities = []
    current_entity = None

    for i, (token, label) in enumerate(zip(tokens, labels)):
        if label.startswith("B-"):
            if current_entity:
                entities.append(current_entity)
            entity_type = label.split("-")[1]
            current_entity = (entity_type, i, i, token)
        elif label.startswith("I-") and current_entity and label.split("-")[1] == current_entity[0]:
            current_entity = (current_entity[0], current_entity[1], i, current_entity[3] + " " + token)
        else:
            if current_entity:
                entities.append(current_entity)
                current_entity = None

    if current_entity:
        entities.append(current_entity)

    return entities

In [15]:
def compute_entity_level_metrics(true_entities, pred_entities):
    metrics = {"EM": 0, "EB": 0, "PM": 0, "PB": 0, "ML": 0, "FA": 0}
    true_matched = set()
    pred_matched = set()

    # Check for exact and partial matches
    for i, true_entity in enumerate(true_entities):
        for j, pred_entity in enumerate(pred_entities):
            if j in pred_matched:
                continue
            if true_entity == pred_entity:
                metrics["EM"] += 1
                true_matched.add(i)
                pred_matched.add(j)
                break
            elif true_entity[0] == pred_entity[0] and is_overlapping((true_entity[1], true_entity[2]), (pred_entity[1], pred_entity[2])):
                if true_entity[1] == pred_entity[1] and true_entity[2] == pred_entity[2]:
                    metrics["EB"] += 1
                else:
                    metrics["PM"] += 1
                true_matched.add(i)
                pred_matched.add(j)
                break
            elif is_overlapping((true_entity[1], true_entity[2]), (pred_entity[1], pred_entity[2])):
                metrics["PB"] += 1
                true_matched.add(i)
                pred_matched.add(j)
                break

    # Check for missed labels (entities in true but not in pred)
    for i, true_entity in enumerate(true_entities):
        if i not in true_matched:
            metrics["ML"] += 1

    # Check for false alarms (entities in pred but not in true)
    for j, pred_entity in enumerate(pred_entities):
        if j not in pred_matched:
            metrics["FA"] += 1

    return metrics

In [16]:
def is_overlapping(span1, span2):
    """
    Check if two spans overlap.
    Args:
    span1, span2 (tuple): (start_index, end_index) of the span.

    Returns:
    bool: True if spans overlap, False otherwise.
    """
    assert len(span1) == 2 and len(span2) == 2, "Each span must be a tuple of two elements (start_index, end_index)"
    start1, end1 = span1
    start2, end2 = span2
    return max(start1, start2) <= min(end1, end2)

In [17]:
print(label_dict)

{'O': 0, 'B-intervention': 1, 'I-intervention': 2, 'B-outcome': 3, 'I-outcome': 4, 'B-population': 5, 'I-population': 6, 'B-effect_size': 7, 'I-effect_size': 8, 'B-coreference': 9, 'I-coreference': 10}


In [18]:
from collections import defaultdict

def analyze_generalization(model, data, tokenizer, train_words):
    grouped_entities = defaultdict(lambda: ([], []))  # {group_name: (true_entities, pred_entities)}

    for i, (input_ids, attention_mask, label_tensor) in enumerate(data):
        input_ids = input_ids.unsqueeze(0).to(device)
        attention_mask = attention_mask.unsqueeze(0).to(device)
        
        # Call model without labels to get the decoded labels
        with torch.no_grad():
            decoded_labels = model(input_ids, attention_mask=attention_mask)["decoded"][0]
            # No need to use argmax since CRF.decode returns the most likely tag sequence
        
        # Convert the decoded labels to label names using label_dict
        pred_labels = [reverse_label_dict.get(label) for label in decoded_labels]

        # Convert input_ids to tokens
        tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist(), skip_special_tokens=True)

        # Assuming true_labels are provided in a similar structure
        true_labels = [reverse_label_dict.get(l.item()) for l in label_tensor]
        
        # Preprocess entities for true and predicted labels
        true_entities = preprocess_entities(true_labels, tokens)
        pred_entities = preprocess_entities(pred_labels, tokens)
        
        for true_entity, pred_entity in zip(true_entities, pred_entities):
            length = true_entity[2] - true_entity[1]

            seen = any(word in train_words for word in true_entity[3].split())  # Check if any word in entity text was seen in training

            group_name = f"Length {length} - {'Seen' if seen else 'Unseen'}"
            grouped_entities[group_name][0].append(true_entity)
            grouped_entities[group_name][1].append(pred_entity)

    for group_name, group_data in grouped_entities.items():
        group_true_entities, group_pred_entities = group_data
        metrics = compute_entity_level_metrics(group_true_entities, group_pred_entities)
        print(f"Group: {group_name}, Metrics: {metrics}")

In [19]:
from sklearn.metrics import classification_report
import torch
import numpy as np

def get_validation_performance(val_set, model, device, label_dict, batch_size):
    # Put the model in evaluation mode
    model.eval()

    # Tracking variables
    total_eval_loss = 0
    all_pred_labels = []
    all_true_labels = []

    num_batches = int(len(val_set) / batch_size) + (1 if len(val_set) % batch_size != 0 else 0)

    for i in range(num_batches):
        end_index = min(batch_size * (i + 1), len(val_set))
        batch = val_set[i * batch_size:end_index]

        if len(batch) == 0:
            continue

        input_id_tensors = torch.stack([data[0] for data in batch])
        input_mask_tensors = torch.stack([data[1] for data in batch])
        label_tensors = torch.stack([data[2] for data in batch])

        # Move tensors to the GPU
        b_input_ids = input_id_tensors.to(device)
        b_input_mask = input_mask_tensors.to(device)
        b_labels = label_tensors.to(device)
        b_labels = b_labels.long()

        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():
            # Forward pass, calculate logit predictions.
            outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs["loss"]
            logits = outputs["logits"]

            # Accumulate the validation loss.
            total_eval_loss += loss.item()

            # Move logits and labels to CPU
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            # Get the predicted labels
            pred_labels = np.argmax(logits, axis=2).flatten()
            true_labels = label_ids.flatten()

            # Convert labels to their original names
            pred_labels = [reverse_label_dict.get(label) for label in pred_labels]
            true_labels = [reverse_label_dict.get(label) for label in true_labels]

            # Filter out special tokens ('O' label is used for non-entity and special tokens)
            filtered_pred_labels = [pred for pred, true in zip(pred_labels, true_labels) if true != 'O']
            filtered_true_labels = [true for true in true_labels if true != 'O']
            
            # After filtering out special tokens
            if not filtered_pred_labels or not filtered_true_labels:
                print("Warning: No non-'O' labels found in this batch.")
            else:
                all_pred_labels.extend(filtered_pred_labels)
                all_true_labels.extend(filtered_true_labels)
            
    # After processing all batches, check if we have any labels to report on
    if not all_true_labels or not all_pred_labels:
        print("Error: No non-'O' labels found in the entire validation set.")
        default_labels = [list(label_dict.values())[0]]  # Use the first label as a placeholder
        report = classification_report(default_labels, default_labels, digits=4, zero_division=0)
    else:
        # Calculate precision, recall, and F1 score
        report = classification_report(all_true_labels, all_pred_labels, digits=4, zero_division=0)

    return report


In [20]:
from torchcrf import CRF
from transformers import AutoModel

class CRFTagger(torch.nn.Module):
    def __init__(self, model_name, num_labels):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = torch.nn.Dropout(dropout)
        self.classifier = torch.nn.Linear(self.bert.config.hidden_size, num_labels)
        self.crf = CRF(num_labels, batch_first=True)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        sequence_output = outputs[0]
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        # Mask should be of type 'bool' in newer PyTorch versions
        mask = attention_mask.type(torch.bool) if hasattr(torch, 'bool') else attention_mask.byte()
        
        if labels is not None:
            loss = -self.crf(logits, labels, mask=mask, reduction='mean')
            return {'loss': loss, 'logits': logits, 'decoded': self.crf.decode(logits, mask=mask)}
        else:
            decoded_labels = self.crf.decode(logits, mask=mask)
            return {'decoded': decoded_labels, 'logits': logits}

In [21]:
seed_everything()

# Load the pre-trained model
model = CRFTagger(model_name, len(label_dict))
model.dropout = torch.nn.Dropout(dropout)
model.to(device)



CRFTagger(
  (bert): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
              (dropou

In [23]:
from transformers import get_linear_schedule_with_warmup, AdamW
import time

# Calculate the total number of training steps
total_steps = (len(train_set) // (batch_size * gradient_accumulation_steps)) * max_epochs


for lr in learning_rates:
    print(f"Current learning rate: {lr}")

    # Create the optimizer with the specified hyperparameters
    optimizer = AdamW(model.parameters(), lr=lr, eps=adam_epsilon, betas=(adam_beta1, adam_beta2), weight_decay=weight_decay, no_deprecation_warning=True)

    # Create the learning rate scheduler
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(total_steps * fraction_of_steps), num_training_steps=total_steps)

    # Training loop
    for epoch_i in range(max_epochs):
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, max_epochs))
        print('Training...')

        total_train_loss = 0
        model.train()

        num_batches = int(len(train_set) / batch_size) + (1 if len(train_set) % batch_size != 0 else 0)

        for i in range(num_batches):
            end_index = min(batch_size * (i + 1), len(train_set))
            batch = train_set[i * batch_size:end_index]

            if len(batch) == 0:
                continue

            input_id_tensors = torch.stack([data[0] for data in batch])
            input_mask_tensors = torch.stack([data[1] for data in batch])
            label_tensors = torch.stack([data[2] for data in batch])

            b_input_ids = input_id_tensors.to(device)
            b_input_mask = input_mask_tensors.to(device)
            b_labels = label_tensors.long().to(device)

            model.zero_grad()
            outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs["loss"]
            total_train_loss += loss.item()

            # Accumulate gradients
            loss = loss / gradient_accumulation_steps
            loss.backward()

            # Perform optimizer step after accumulating gradients for gradient_accumulation_steps
            if (i + 1) % gradient_accumulation_steps == 0 or i == num_batches - 1:  # Ensure step is taken on the last batch
                optimizer.step()
                scheduler.step()
                model.zero_grad()

        print(f"Total loss: {total_train_loss}")
        report = get_validation_performance(val_set, model, device, label_dict, batch_size)
        print(report)
        analyze_generalization(model, val_set, tokenizer, train_sentences)

    print("")
    print(f"Training complete at learning rate: {lr}!")

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    print(f"BERT multilingual Model saved at: {timestamp}")
    torch.save(model.state_dict(), f'DeBertamultilingmodel_lr-{lr}_{timestamp}.pth')

print("")
print(f"Training complete!")

Current learning rate: 5e-05

Training...
Total loss: 13800.402116775513
                precision    recall  f1-score   support

 B-coreference     0.0000    0.0000    0.0000     255.0
 B-effect_size     0.0000    0.0000    0.0000     142.0
B-intervention     0.0000    0.0000    0.0000     704.0
     B-outcome     0.0000    0.0000    0.0000    1035.0
  B-population     0.0000    0.0000    0.0000     598.0
 I-coreference     0.0000    0.0000    0.0000       7.0
 I-effect_size     0.0000    0.0000    0.0000     247.0
I-intervention     0.0000    0.0000    0.0000    1600.0
     I-outcome     0.0000    0.0000    0.0000    1760.0
  I-population     0.0000    0.0000    0.0000     618.0
             O     0.0000    0.0000    0.0000       0.0

      accuracy                         0.0000    6966.0
     macro avg     0.0000    0.0000    0.0000    6966.0
  weighted avg     0.0000    0.0000    0.0000    6966.0


Training...
Total loss: 13043.54767036438
                precision    recall  f1-s

In [None]:
model_name = model_name
lr = learning_rates[0]

# Load state_dict of the model
model.load_state_dict(torch.load(f'{model_name}-model_lr-{lr}_1.pth'))

In [24]:
print(get_validation_performance(test_set, model, device, label_dict, batch_size))

                precision    recall  f1-score   support

 B-coreference     0.4459    0.1273    0.1980       550
 B-effect_size     0.3068    0.0844    0.1324       320
B-intervention     0.3759    0.1549    0.2194      1330
     B-outcome     0.3346    0.1490    0.2062      1718
  B-population     0.3497    0.1473    0.2073      1018
 I-coreference     0.0000    0.0000    0.0000        22
 I-effect_size     0.4530    0.1100    0.1770       482
I-intervention     0.7246    0.2647    0.3878      2663
     I-outcome     0.6299    0.2226    0.3289      3235
  I-population     0.6146    0.2469    0.3523      1227
             O     0.0000    0.0000    0.0000         0

      accuracy                         0.1982     12565
     macro avg     0.3850    0.1370    0.2008     12565
  weighted avg     0.5343    0.1982    0.2883     12565



In [25]:
analyze_generalization(model, test_set, tokenizer, train_sentences)

Group: Length 3 - Unseen, Metrics: {'EM': 0, 'EB': 2, 'PM': 87, 'PB': 168, 'ML': 19, 'FA': 19}
Group: Length 4 - Unseen, Metrics: {'EM': 2, 'EB': 1, 'PM': 65, 'PB': 84, 'ML': 9, 'FA': 9}
Group: Length 0 - Unseen, Metrics: {'EM': 22, 'EB': 202, 'PM': 103, 'PB': 1044, 'ML': 104, 'FA': 104}
Group: Length 2 - Unseen, Metrics: {'EM': 2, 'EB': 8, 'PM': 143, 'PB': 286, 'ML': 18, 'FA': 18}
Group: Length 7 - Unseen, Metrics: {'EM': 0, 'EB': 0, 'PM': 14, 'PB': 27, 'ML': 4, 'FA': 4}
Group: Length 1 - Unseen, Metrics: {'EM': 2, 'EB': 34, 'PM': 292, 'PB': 631, 'ML': 44, 'FA': 44}
Group: Length 5 - Unseen, Metrics: {'EM': 1, 'EB': 0, 'PM': 40, 'PB': 65, 'ML': 9, 'FA': 9}
Group: Length 9 - Unseen, Metrics: {'EM': 0, 'EB': 0, 'PM': 7, 'PB': 12, 'ML': 6, 'FA': 6}
Group: Length 10 - Unseen, Metrics: {'EM': 0, 'EB': 0, 'PM': 5, 'PB': 6, 'ML': 2, 'FA': 2}
Group: Length 8 - Unseen, Metrics: {'EM': 1, 'EB': 0, 'PM': 23, 'PB': 22, 'ML': 3, 'FA': 3}
Group: Length 11 - Unseen, Metrics: {'EM': 0, 'EB': 0, 'PM':