In [1]:
import pandas as pd

df = pd.read_parquet('train.parquet')

In [2]:
import spacy

from spacy.training.iob_utils import biluo_to_iob, doc_to_biluo_tags
from tqdm.autonotebook import tqdm
tqdm.pandas()

df.trigger_words = df.trigger_words.apply(lambda x: [] if x is None else x)
df['target'] = df.trigger_words.apply(lambda x: [[y[0], y[1], 'TRIGGER'] for y in x])

def resolve_overlapping_spans(spans):
    if not spans:
        return []
    spans = sorted(spans, key=lambda x: x[0])  # Sort by start index
    resolved = [spans[0]]
    for current in spans[1:]:
        last = resolved[-1]
        if current[0] < last[1]:  # Overlap
            new_span = (last[0], max(last[1], current[1]), 'TRIGGER')
            resolved[-1] = new_span
            print('resolved')
        else:
            resolved.append(current)
    return resolved

df['target'] = df.target.apply(resolve_overlapping_spans)

nlp = spacy.blank("xx")

def convert_to_conll(row):
    data = {
        "text": row['content'],
        "label": row['target']
    }
    doc = nlp(data["text"])
    ents = []
    for start, end, label in data["label"]:
        span = doc.char_span(start, end, label=label)
        if span is not None:
            ents.append(span)
        else:
            pass
        #TODO fix not align to token case
        '''
            print(
                "Skipping span (does not align to tokens):",
                start,
                end,
                label,
                doc.text[start:end],
            )
        '''
    doc.ents = ents
    return {
        'tokens': list([t.text for t in doc]),
        'labels': list(biluo_to_iob(doc_to_biluo_tags(doc)))
    }

df['conll'] = df.progress_apply(convert_to_conll, axis=1)

resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved


  from tqdm.autonotebook import tqdm


  0%|          | 0/3822 [00:00<?, ?it/s]

In [3]:
import numpy as np


df['is_valid'] = np.random.binomial(1, 0.2, df.shape[0])

In [4]:
label2id = {'O': 0, 'B-TRIGGER': 1, 'I-TRIGGER': 2}

df['tokens'] = df.conll.str['tokens']
df['ner_tags'] = df.conll.str['labels'].apply(lambda x: [label2id[t] for t in x])

df_train = df[df.is_valid == 0]
df_valid = df[df.is_valid == 1]

In [5]:
df_train[['tokens', 'ner_tags']].to_json(
    './data/train_processed.json', orient='records', lines=True)
df_valid[['tokens', 'ner_tags']].to_json(
    './data/valid_processed.json', orient='records', lines=True)

In [6]:
from datasets import load_dataset

raw_datasets_ua = load_dataset(
    "json",
    data_files={
        'train': './data/train_processed.json',
        'val': './data/valid_processed.json'
    }
)
raw_datasets_ua

Generating train split: 0 examples [00:00, ? examples/s]

Generating val split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 3050
    })
    val: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 772
    })
})

In [30]:
from transformers import AutoTokenizer, AutoModelForTokenClassification


id2label = {v: k for k, v in label2id.items()}

model = AutoModelForTokenClassification.from_pretrained(
    'bert-base-multilingual-cased',
    id2label=id2label,
    label2id=label2id,
)
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)


def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [32]:
tokenized_datasets_ua = raw_datasets_ua.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets_ua["train"].column_names,
)

In [33]:
from transformers import AdamW, get_linear_schedule_with_warmup


EPOCHS = 5

optimizer = AdamW([
    {'params': list(model.bert.parameters()), 'lr': 2e-5},
    {'params': list(model.classifier.parameters()), 'lr': 1e-4}
])

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0.1*EPOCHS*(tokenized_datasets_ua['train'].num_rows/16),
    num_training_steps=EPOCHS*(tokenized_datasets_ua['train'].num_rows/16)
)



In [34]:
from transformers import TrainingArguments

args = TrainingArguments(
    "bert-ua-loc-ner",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=EPOCHS,
    save_total_limit=5
)

In [35]:
import evaluate
import numpy as np

metric = evaluate.load("seqeval")

label_names = list(label2id.keys())

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [36]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets_ua["train"],
    eval_dataset=tokenized_datasets_ua["val"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    optimizers=(optimizer, scheduler)
)
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.527702,0.01295,0.011152,0.011984,0.782783
2,No log,0.490371,0.024936,0.030359,0.027382,0.79151
3,0.478500,0.512497,0.029768,0.036555,0.032814,0.793369
4,0.478500,0.551838,0.036019,0.047088,0.040816,0.79224
5,0.478500,0.584749,0.03257,0.045849,0.038085,0.789679


TrainOutput(global_step=955, training_loss=0.3991094139858066, metrics={'train_runtime': 352.8504, 'train_samples_per_second': 43.219, 'train_steps_per_second': 2.707, 'total_flos': 3669170396055840.0, 'train_loss': 0.3991094139858066, 'epoch': 5.0})

In [39]:
preds = trainer.predict(tokenized_datasets_ua["val"])

In [71]:
from transformers import pipeline

# Replace this with your own checkpoint
model_checkpoint = "./bert-ua-loc-ner/checkpoint-955/"
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)

Device set to use cuda:0


In [76]:
preds = token_classifier.predict(df_valid.content.tolist())

In [77]:
val_sub = [str([(p['start'], p['end']) for p in row]) for row in preds]

In [78]:
from copy import deepcopy

def safe_string(row):
    if row is None:
        return '[]'
    else:
        return str([(s[0], s[1]) for s in row])

valid_sub = deepcopy(df_valid)
valid_sub['trigger_words'] = valid_sub.trigger_words.apply(safe_string)
valid_sub_gt = deepcopy(valid_sub[['id', 'trigger_words']])
valid_sub_hat = deepcopy(valid_sub[['id', 'trigger_words']])
valid_sub_hat['trigger_words'] = val_sub

In [79]:
score(solution=valid_sub_gt, submission=valid_sub_hat, row_id_column_name='id')

0.45892063205111544

In [80]:
test = pd.read_csv('test.csv')

In [82]:
preds_test = token_classifier.predict(test.content.tolist())

In [83]:
test_sub = [str([(p['start'], p['end']) for p in row]) for row in preds_test]

In [85]:
ss = pd.read_csv('sample_submission.csv')

In [87]:
ss['trigger_words'] = test_sub

In [89]:
ss.to_csv('submissions/bert-base-ml-cv0.459.csv', index=False)

In [42]:
import pandas as pd
import pandas.api.types
from sklearn.metrics import f1_score
import ast


class ParticipantVisibleError(Exception):
    """Custom exception for participant-visible errors."""
    pass


def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str) -> float:
    """
    Compute span-level F1 score based on overlap.

    Parameters:
    - solution (pd.DataFrame): Ground truth DataFrame with row ID and token labels.
    - submission (pd.DataFrame): Submission DataFrame with row ID and token labels.
    - row_id_column_name (str): Column name for the row identifier.

    Returns:
    - float: The token-level weighted F1 score.

    Example:
    >>> solution = pd.DataFrame({
    ...     "id": [1, 2, 3],
    ...     "trigger_words": [[(612, 622), (725, 831)], [(300, 312)], []]
    ... })
    >>> submission = pd.DataFrame({
    ...     "id": [1, 2, 3],
    ...     "trigger_words": [[(612, 622), (700, 720)], [(300, 312)], [(100, 200)]]
    ... })
    >>> score(solution, submission, "id")
    0.16296296296296295
    """
    if not all(col in solution.columns for col in ["id", "trigger_words"]):
        raise ValueError("Solution DataFrame must contain 'id' and 'trigger_words' columns.")
    if not all(col in submission.columns for col in ["id", "trigger_words"]):
        raise ValueError("Submission DataFrame must contain 'id' and 'trigger_words' columns.")
    
    def safe_parse_spans(trigger_words):
        if isinstance(trigger_words, str):
            try:
                return ast.literal_eval(trigger_words)
            except (ValueError, SyntaxError):
                return []
        if isinstance(trigger_words, (list, tuple)):
            return trigger_words
        return []

    def extract_tokens_from_spans(spans):
        tokens = set()
        for start, end in spans:
            tokens.update(range(start, end))
        return tokens
    
    solution = solution.copy()
    submission = submission.copy()

    solution["trigger_words"] = solution["trigger_words"].apply(safe_parse_spans)
    submission["trigger_words"] = submission["trigger_words"].apply(safe_parse_spans)

    merged = pd.merge(
        solution,
        submission,
        on="id",
        suffixes=("_solution", "_submission")
    )

    total_true_tokens = 0
    total_pred_tokens = 0
    overlapping_tokens = 0

    for _, row in merged.iterrows():
        true_spans = row["trigger_words_solution"]
        pred_spans = row["trigger_words_submission"]

        true_tokens = extract_tokens_from_spans(true_spans)
        pred_tokens = extract_tokens_from_spans(pred_spans)

        total_true_tokens += len(true_tokens)
        total_pred_tokens += len(pred_tokens)
        overlapping_tokens += len(true_tokens & pred_tokens)

    precision = overlapping_tokens / total_pred_tokens if total_pred_tokens > 0 else 0
    recall = overlapping_tokens / total_true_tokens if total_true_tokens > 0 else 0
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return f1