In [1]:
# Restrict this kernel to one GPU.
# NOTE(review): the device index 7 is hard-coded — adjust it for the host you run on.
%env CUDA_VISIBLE_DEVICES=7

env: CUDA_VISIBLE_DEVICES=7


In [2]:
import sys
sys.path.append("..")
sys.path.append("../../evaluation")
import helper

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import calc_metrics

In [4]:
import datetime
from functools import partial
from glob import glob
from pathlib import Path
import json
from itertools import chain

In [5]:
from datasets import Dataset
from sklearn.model_selection import train_test_split
import evaluate
import numpy as np
import pandas as pd
import torch

from transformers import AutoModelForTokenClassification,\
                         BertTokenizerFast, BertTokenizer, AutoTokenizer,\
                         TrainingArguments, Trainer, DataCollatorForTokenClassification,\
                         BertForTokenClassification

In [6]:
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [7]:
# Root directory for run logs; it is combined with experiment_name when the results are saved.
log_path_base = "initial_model"

## Dataset

### Read the Train Dataset (Sentences and Labels)

In [8]:
fn_train_sentences = '../../data/subtask2/subtask2_train.data.txt'
fn_train_labels = '../../data/subtask2/subtask2_train.labels.txt'
entity_label_grouping = False


def _read_token_lines(path):
    """Read a whitespace-tokenised text file.

    Args:
        path: Path of a UTF-8 text file holding one sequence per line.

    Returns:
        A list with one entry per line of the file; each entry is the list
        of whitespace-separated tokens found on that line.
    """
    with open(path, 'r', encoding='utf-8') as file:
        return [line.split() for line in file]


# Sentences and labels are stored in two parallel files (same line order).
train_sentences = _read_token_lines(fn_train_sentences)
train_labels = _read_token_lines(fn_train_labels)

# Fail fast on misaligned files: pairing the two lists with zip() would
# otherwise drop the surplus lines without any warning.
if len(train_sentences) != len(train_labels):
    raise ValueError(
        f"{fn_train_sentences} has {len(train_sentences)} lines but "
        f"{fn_train_labels} has {len(train_labels)} lines"
    )

# Every token needs exactly one label.
_mismatched_lines = [
    line_no
    for line_no, (tokens, labels) in enumerate(zip(train_sentences, train_labels), start=1)
    if len(tokens) != len(labels)
]
if _mismatched_lines:
    raise ValueError(
        f"Token/label count differs on line(s) {_mismatched_lines[:10]} "
        f"({len(_mismatched_lines)} line(s) affected in total)"
    )

# Optionally rewrite the labels with helper.reduce_to_entity_type_labels
# (disabled by default via entity_label_grouping).
if entity_label_grouping:
    train_labels = [helper.reduce_to_entity_type_labels(labels) for labels in train_labels]


In [9]:
# One row per sentence: the token list next to its label list.
dataset = pd.DataFrame(
    data=[*zip(train_sentences, train_labels)],
    columns=["sentence", "label"],
)
dataset.head()

Unnamed: 0,sentence,label
0,"[Here, we, report, a, comprehensive, suite, fo...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,"[The, resource, is, available, free, of, charg...","[O, O, O, O, O, O, O, O, O, O, O, O, O, B-URL, O]"
2,"[In, this, work, ,, we, described, the, DelPhi...","[O, O, O, O, O, O, O, O, O, O, O, O, O]"
3,"["", Project, name, :, DelPhi, Project, home, p...","[O, O, O, O, O, O, O, O, O, O, B-URL, O, O, O,..."
4,"[We, have, developed, ANDES, ,, a, software, l...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


## Create label id lookup

In [10]:
# Distinct labels in a stable order: "O" first, then grouped by the text after
# the two-character prefix, and within a group by the first character.
possible_labels = sorted(
    set(chain.from_iterable(train_labels)),
    key=lambda label: (label != "O", label[2:], label[0]),
)

In [11]:
# Forward (id -> label) and reverse (label -> id) lookup tables over the sorted labels.
id_to_label = {index: label for index, label in enumerate(possible_labels)}
label_to_id = {label: index for index, label in id_to_label.items()}
id_to_label

{0: 'O',
 1: 'B-Abbreviation',
 2: 'B-AlternativeName',
 3: 'I-AlternativeName',
 4: 'B-Citation',
 5: 'I-Citation',
 6: 'B-Developer',
 7: 'I-Developer',
 8: 'B-Extension',
 9: 'I-Extension',
 10: 'B-License',
 11: 'I-License',
 12: 'B-Release',
 13: 'I-Release',
 14: 'B-URL',
 15: 'I-URL',
 16: 'B-Version',
 17: 'I-Version'}

## Prepare and Train the dataset

In [17]:
# Parameters
eval_size = 0.25      # fraction of the rows held out for validation
random_state = 42     # seed handed to train_test_split
# Was misspelled "experimen_name", which left the intended variable undefined here.
experiment_name = "billy"

### Split the dataset into Train and Validation

In [18]:
# Hold out `eval_size` of the rows for validation; the fixed seed keeps the split repeatable.
train, validation = train_test_split(
    dataset,
    test_size=eval_size,
    random_state=random_state,
)
(len(train), len(validation))

(1764, 589)

### Tokenize and map labels

In [20]:
# Pretrained checkpoint name; both the tokenizer and the token-classification model are loaded from it.
base_model = 'bert-base-uncased'

In [22]:
%%time
tokenizer = BertTokenizerFast.from_pretrained(base_model)
tokenizer_params = dict(
    truncation=True,
    is_split_into_words=True,
    padding="do_not_pad",
    max_length=512)

Train_dataset = tokenizer(train['sentence'].to_list(), **tokenizer_params)
label_ids = train['label'].apply(lambda labels: [label_to_id.get(label) for label in labels]).to_list()
helper.map_labels(Train_dataset, label_ids)
Train_dataset = Dataset.from_dict(Train_dataset)

Val_dataset = tokenizer(validation['sentence'].to_list(), **tokenizer_params)
label_ids = validation['label'].apply(lambda labels: [label_to_id.get(label) for label in labels]).to_list()
helper.map_labels(Val_dataset, label_ids)
Val_dataset = Dataset.from_dict(Val_dataset)

dataset_info = dict(
    n_train=len(Train_dataset),
    n_eval=len(Val_dataset),
    eval_size=eval_size,
)
dataset_info

CPU times: user 4.32 s, sys: 142 ms, total: 4.47 s
Wall time: 661 ms


{'n_train': 1764, 'n_eval': 589, 'eval_size': 0.25}

### Helpers for reading the trainer state

In [23]:
def get_trainer_state(model_path):
    """Return the furthest-advanced trainer state stored under ``model_path``.

    Looks for ``trainer_state.json`` files exactly one directory level below
    ``model_path``, parses each of them and returns the one with the largest
    ``"epoch"`` value. On a tie the file listed first by ``glob`` wins.

    Args:
        model_path: Directory that contains the checkpoint sub-directories.

    Returns:
        The parsed JSON content (a dict) of the selected state file, or
        ``None`` when no state file exists.

    Raises:
        KeyError: If a state file has no ``"epoch"`` entry.
    """
    state_files = glob(f"{model_path}/*/trainer_state.json")
    if not state_files:
        return None

    states = []
    for state_file in state_files:
        # The context manager closes each handle right after parsing;
        # the previous json.load(open(...)) left them open.
        with open(state_file, encoding="utf-8") as f:
            states.append(json.load(f))

    # Only the maximum is needed, so a full sort is unnecessary.
    return max(states, key=lambda state: state["epoch"])

def get_best_model_checkpoint(model_path):
    """Return the best checkpoint path recorded in the latest trainer state.

    Args:
        model_path: Directory that contains the checkpoint sub-directories.

    Returns:
        The value stored under ``"best_model_checkpoint"`` in the state
        returned by ``get_trainer_state(model_path)``.

    Raises:
        FileNotFoundError: If no trainer state exists under ``model_path``.
        KeyError: If the state has no ``"best_model_checkpoint"`` entry.
    """
    state = get_trainer_state(model_path)
    if state is None:
        # Without this guard the lookup below fails with an opaque
        # "'NoneType' object is not subscriptable".
        raise FileNotFoundError(
            f"No trainer_state.json found in any sub-directory of {model_path!r}"
        )
    return state["best_model_checkpoint"]

### Training

In [87]:
# Settings noted from a mail (sent Thursday, 11 July 2024, 13:39):
# epochs=5, train_batch_size=64, validation_batch_size=32, lr=2e-5
# NOTE(review): the values configured below differ from that note (8 epochs,
# lr 1e-4, eval batch size 8) — confirm which set is intended.

model_base_path =  '../../models/subtask2'

# Each run writes its checkpoints to a fresh directory named after its start time.
start = f"{datetime.datetime.now():%Y-%m-%d_%H-%M-%S}"
model_path = f'{model_base_path}/{start}'

# Keyword arguments for TrainingArguments.
# 8 samples per device x 8 accumulation steps = 64 samples per optimizer step per device.
training_args = {
    "save_total_limit": 2,
    "save_strategy": 'epoch',
    "metric_for_best_model": "overall_f1",
    "num_train_epochs": 8,
    "learning_rate": 1e-4,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "gradient_accumulation_steps": 8,
    "weight_decay": 0.01,
    "warmup_ratio": 0.1,
    "evaluation_strategy": 'epoch',
    "disable_tqdm": False,
    "output_dir": model_path,
}
model_path

'../../models/subtask2/2024-07-13_01-03-19'

In [88]:
# Name of this run; it becomes the sub-directory of log_path_base that receives the results.
experiment_name = "john"

In [89]:
# The classification head is newly initialised (see the from_pretrained
# warning), so seed torch first to make that initialisation repeatable.
torch.manual_seed(random_state)

model = AutoModelForTokenClassification.from_pretrained(
    base_model,
    id2label=id_to_label,
    label2id=label_to_id,  # store the reverse mapping in the saved config as well
    num_labels=len(id_to_label),
)

trainer = Trainer(
    model=model,
    args=TrainingArguments(**training_args),
    # Metrics are computed by helper.ner_eval_metrics, which receives the id -> label table.
    compute_metrics=partial(helper.ner_eval_metrics, id_to_label=id_to_label),
    train_dataset=Train_dataset,
    eval_dataset=Val_dataset,
    # The datasets were tokenised without padding, so pad each batch to its longest member.
    data_collator=DataCollatorForTokenClassification(tokenizer, padding="longest", max_length=512),
    tokenizer=tokenizer,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [90]:
# old time: 1:23 => 19 minutes
# only positive => 

In [91]:
# Fine-tune the model; the returned TrainOutput carries the global step, training loss and runtime metrics.
dataset_performance = trainer.train()

Epoch,Training Loss,Validation Loss,Abbreviation,Alternativename,Citation,Developer,Extension,License,Release,Url,Version,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
0,No log,0.142534,"{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 15}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 10}","{'precision': 0.6850828729281768, 'recall': 0.9538461538461539, 'f1': 0.797427652733119, 'number': 130}","{'precision': 0.5047619047619047, 'recall': 0.3419354838709677, 'f1': 0.40769230769230763, 'number': 155}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 10}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 9}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 16}","{'precision': 0.9047619047619048, 'recall': 0.95, 'f1': 0.9268292682926829, 'number': 80}","{'precision': 0.8292682926829268, 'recall': 0.9543859649122807, 'f1': 0.8874388254486134, 'number': 285}",0.752149,0.739437,0.745739,0.972446
1,No log,0.069983,"{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 15}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 10}","{'precision': 0.6888888888888889, 'recall': 0.9538461538461539, 'f1': 0.8000000000000002, 'number': 130}","{'precision': 0.8220858895705522, 'recall': 0.864516129032258, 'f1': 0.8427672955974844, 'number': 155}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 10}","{'precision': 0.3333333333333333, 'recall': 0.2222222222222222, 'f1': 0.26666666666666666, 'number': 9}","{'precision': 0.25, 'recall': 0.375, 'f1': 0.3, 'number': 16}","{'precision': 0.9382716049382716, 'recall': 0.95, 'f1': 0.9440993788819876, 'number': 80}","{'precision': 0.8947368421052632, 'recall': 0.9543859649122807, 'f1': 0.9235993208828523, 'number': 285}",0.810026,0.864789,0.836512,0.982902
2,No log,0.053682,"{'precision': 0.6428571428571429, 'recall': 0.6, 'f1': 0.6206896551724138, 'number': 15}","{'precision': 0.375, 'recall': 0.3, 'f1': 0.33333333333333326, 'number': 10}","{'precision': 0.7423312883435583, 'recall': 0.9307692307692308, 'f1': 0.8259385665529012, 'number': 130}","{'precision': 0.8375, 'recall': 0.864516129032258, 'f1': 0.8507936507936507, 'number': 155}","{'precision': 1.0, 'recall': 0.5, 'f1': 0.6666666666666666, 'number': 10}","{'precision': 0.7142857142857143, 'recall': 0.5555555555555556, 'f1': 0.6250000000000001, 'number': 9}","{'precision': 0.36363636363636365, 'recall': 0.5, 'f1': 0.4210526315789474, 'number': 16}","{'precision': 0.9620253164556962, 'recall': 0.95, 'f1': 0.9559748427672956, 'number': 80}","{'precision': 0.9275862068965517, 'recall': 0.9438596491228071, 'f1': 0.9356521739130435, 'number': 285}",0.842246,0.887324,0.864198,0.986531
3,No log,0.045711,"{'precision': 0.7142857142857143, 'recall': 0.6666666666666666, 'f1': 0.689655172413793, 'number': 15}","{'precision': 0.5, 'recall': 0.5, 'f1': 0.5, 'number': 10}","{'precision': 0.8053691275167785, 'recall': 0.9230769230769231, 'f1': 0.8602150537634408, 'number': 130}","{'precision': 0.7966101694915254, 'recall': 0.9096774193548387, 'f1': 0.8493975903614457, 'number': 155}","{'precision': 1.0, 'recall': 0.8, 'f1': 0.888888888888889, 'number': 10}","{'precision': 0.5555555555555556, 'recall': 0.5555555555555556, 'f1': 0.5555555555555556, 'number': 9}","{'precision': 0.3103448275862069, 'recall': 0.5625, 'f1': 0.4, 'number': 16}","{'precision': 0.9382716049382716, 'recall': 0.95, 'f1': 0.9440993788819876, 'number': 80}","{'precision': 0.9351535836177475, 'recall': 0.9614035087719298, 'f1': 0.9480968858131488, 'number': 285}",0.841558,0.912676,0.875676,0.988376
4,No log,0.046815,"{'precision': 0.875, 'recall': 0.4666666666666667, 'f1': 0.608695652173913, 'number': 15}","{'precision': 0.5, 'recall': 0.4, 'f1': 0.4444444444444445, 'number': 10}","{'precision': 0.8650793650793651, 'recall': 0.8384615384615385, 'f1': 0.8515625, 'number': 130}","{'precision': 0.8888888888888888, 'recall': 0.8774193548387097, 'f1': 0.8831168831168831, 'number': 155}","{'precision': 1.0, 'recall': 0.8, 'f1': 0.888888888888889, 'number': 10}","{'precision': 0.6, 'recall': 0.6666666666666666, 'f1': 0.631578947368421, 'number': 9}","{'precision': 0.5555555555555556, 'recall': 0.625, 'f1': 0.5882352941176471, 'number': 16}","{'precision': 0.95, 'recall': 0.95, 'f1': 0.9500000000000001, 'number': 80}","{'precision': 0.9358108108108109, 'recall': 0.9719298245614035, 'f1': 0.9535283993115319, 'number': 285}",0.895332,0.891549,0.893437,0.98936
5,No log,0.044506,"{'precision': 0.8, 'recall': 0.8, 'f1': 0.8000000000000002, 'number': 15}","{'precision': 0.4444444444444444, 'recall': 0.4, 'f1': 0.4210526315789474, 'number': 10}","{'precision': 0.8188405797101449, 'recall': 0.8692307692307693, 'f1': 0.8432835820895522, 'number': 130}","{'precision': 0.8742138364779874, 'recall': 0.896774193548387, 'f1': 0.8853503184713376, 'number': 155}","{'precision': 1.0, 'recall': 0.8, 'f1': 0.888888888888889, 'number': 10}","{'precision': 0.75, 'recall': 0.6666666666666666, 'f1': 0.7058823529411765, 'number': 9}","{'precision': 0.6875, 'recall': 0.6875, 'f1': 0.6875, 'number': 16}","{'precision': 0.9382716049382716, 'recall': 0.95, 'f1': 0.9440993788819876, 'number': 80}","{'precision': 0.9611307420494699, 'recall': 0.9543859649122807, 'f1': 0.9577464788732394, 'number': 285}",0.894003,0.902817,0.898388,0.989913
6,No log,0.04255,"{'precision': 0.6666666666666666, 'recall': 0.8, 'f1': 0.7272727272727272, 'number': 15}","{'precision': 0.4444444444444444, 'recall': 0.4, 'f1': 0.4210526315789474, 'number': 10}","{'precision': 0.8201438848920863, 'recall': 0.8769230769230769, 'f1': 0.8475836431226766, 'number': 130}","{'precision': 0.8588957055214724, 'recall': 0.9032258064516129, 'f1': 0.8805031446540881, 'number': 155}","{'precision': 1.0, 'recall': 0.8, 'f1': 0.888888888888889, 'number': 10}","{'precision': 0.75, 'recall': 0.6666666666666666, 'f1': 0.7058823529411765, 'number': 9}","{'precision': 0.6, 'recall': 0.75, 'f1': 0.6666666666666665, 'number': 16}","{'precision': 0.9625, 'recall': 0.9625, 'f1': 0.9625000000000001, 'number': 80}","{'precision': 0.952054794520548, 'recall': 0.9754385964912281, 'f1': 0.9636048526863086, 'number': 285}",0.883311,0.916901,0.899793,0.99059
7,No log,0.043122,"{'precision': 0.7058823529411765, 'recall': 0.8, 'f1': 0.7500000000000001, 'number': 15}","{'precision': 0.625, 'recall': 0.5, 'f1': 0.5555555555555556, 'number': 10}","{'precision': 0.8053691275167785, 'recall': 0.9230769230769231, 'f1': 0.8602150537634408, 'number': 130}","{'precision': 0.8588957055214724, 'recall': 0.9032258064516129, 'f1': 0.8805031446540881, 'number': 155}","{'precision': 1.0, 'recall': 0.8, 'f1': 0.888888888888889, 'number': 10}","{'precision': 0.75, 'recall': 0.6666666666666666, 'f1': 0.7058823529411765, 'number': 9}","{'precision': 0.65, 'recall': 0.8125, 'f1': 0.7222222222222223, 'number': 16}","{'precision': 0.9625, 'recall': 0.9625, 'f1': 0.9625000000000001, 'number': 80}","{'precision': 0.952054794520548, 'recall': 0.9754385964912281, 'f1': 0.9636048526863086, 'number': 285}",0.884564,0.928169,0.905842,0.990897


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Performance of dataset

In [92]:
dataset_performance

TrainOutput(global_step=216, training_loss=0.15579555652759694, metrics={'train_runtime': 122.9287, 'train_samples_per_second': 114.798, 'train_steps_per_second': 1.757, 'total_flos': 521041893310224.0, 'train_loss': 0.15579555652759694, 'epoch': 7.819004524886878})

### Save training stats and parameters

In [93]:
# Everything recorded for this run goes into <log_path_base>/<experiment_name>/.
log_path = f"{log_path_base}/{experiment_name}/"
Path(log_path).mkdir(parents=True, exist_ok=True)


def _dump_json(payload, file_name):
    """Serialise ``payload`` as JSON into ``file_name`` inside the log directory."""
    with open(f'{log_path}/{file_name}', 'w') as f:
        json.dump(payload, f)


_dump_json(dataset_info, 'data_stats.json')
_dump_json(training_args, 'training_args.json')
trainer_state = get_trainer_state(model_path)
_dump_json(trainer_state, 'trainer_state.json')

# Marker file holding the time at which the run's results were written.
with open(f'{log_path}/finished.txt', 'w') as file:
    file.write(str(datetime.datetime.now()))

## Predict Test Set

### Load the saved Model and Tokenizer

In [94]:
# Test sentences and the matching gold labels.
fn_test = '../../data/subtask2/subtask2_test.data.txt'
fn_gold = "../../data/subtask2/subtask2_test.labels.txt"
# Checkpoint recorded as "best_model_checkpoint" in the trainer state of the run stored under model_path.
model_name = get_best_model_checkpoint(model_path)
model_name

'../../models/subtask2/2024-07-13_01-03-19/checkpoint-216'

In [95]:
# Reload the selected checkpoint for inference: move the weights to `device`
# and switch to evaluation mode; the tokenizer comes from the same checkpoint.
model = BertForTokenClassification.from_pretrained(model_name).to(device).eval()
tokenizer = BertTokenizerFast.from_pretrained(model_name)

In [96]:
# One test sentence per line; wrap every raw line in a record for the Dataset.
with open(fn_test, 'r', encoding='utf-8') as file:
    test = Dataset.from_list([{"sentence": line} for line in file], split="Test")

In [97]:
def predict(batch, model=None):
    """Predict one label sequence per sentence for a batch of raw sentences.

    Written for ``Dataset.map(..., batched=True)``. Uses the module-level
    ``tokenizer`` and ``device``.

    Args:
        batch: Mapping with a "sentence" entry holding a list of raw
            sentences; each sentence is split on whitespace into words.
            An existing "prediction" entry is discarded and recomputed.
        model: Token-classification model whose ``config.id2label`` maps
            class ids to label strings. Required, despite the ``None``
            default (kept so existing ``partial(predict, model=...)``
            callers keep working).

    Returns:
        The same ``batch`` object with "prediction" set to a list holding
        one list of label strings per sentence.

    Raises:
        ValueError: If ``model`` is ``None``.
    """
    if model is None:
        raise ValueError("predict() needs a model, e.g. partial(predict, model=model)")
    id_to_label = model.config.id2label
    if "prediction" in batch:
        del batch["prediction"]
    texts = [t.split() for t in batch["sentence"]]
    tokenized = tokenizer(texts,
                          padding="longest",
                          # max_length alone does not shorten anything; enable
                          # truncation so the limit is enforced, matching the
                          # tokenizer settings used for training.
                          # NOTE(review): words cut off by truncation probably get
                          # no prediction — confirm how helper.align_labels copes.
                          truncation=True,
                          max_length=512,
                          is_split_into_words=True,
                          return_tensors="pt")
    # Collect the token -> word index mapping of every sentence before moving the tensors.
    word_ids = [tokenized.word_ids(idx) for idx in range(len(tokenized.input_ids))]
    tokenized.to(device)
    with torch.no_grad():
        outputs = model(**tokenized)
    # Highest-scoring class id for every token position.
    prediction = outputs.logits.argmax(dim=2)
    pred_labels = []
    for pred, w_ids in zip(prediction, word_ids):
        # presumably collapses token-level ids to one id per word — verify in helper.align_labels
        predictions_for_words = helper.align_labels(w_ids, pred.tolist())
        predicted_labels = [id_to_label[label_id] for label_id in predictions_for_words]
        pred_labels.append(predicted_labels)
    batch["prediction"] = pred_labels
    return batch

In [98]:
%%time
# Batched inference over the test sentences; predict() stores its labels under the "prediction" key.
test = test.map(partial(predict, model=model), batched=True, batch_size=128)

Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 374/374 [00:01<00:00, 350.99 examples/s]

CPU times: user 2.54 s, sys: 345 ms, total: 2.88 s
Wall time: 1.13 s





In [99]:
# One line per sentence, predicted labels separated by single spaces.
with open(f'{log_path}/prediction.txt', 'w') as file:
    file.writelines(" ".join(labels) + "\n" for labels in test["prediction"])

In [100]:
# Test performance: score the written predictions against the gold labels.
fn_pred = f'{log_path}/prediction.txt'
preds = calc_metrics.load_bio(fn_pred)
gold = calc_metrics.load_bio(fn_gold)
# Print both counts so that a size mismatch between predictions and gold is visible.
print(len(preds), len(gold))
metrics_df = calc_metrics.compute(preds, gold)
# Store the metrics table in the run's log directory.
metrics_df.to_json(f"{log_path}/test_metrics.json")

375 375


In [101]:
metrics_df

Unnamed: 0,precision,recall,f1,number
Abbreviation,0.5,0.7,0.583333,10.0
AlternativeName,0.666667,1.0,0.8,2.0
Citation,0.816901,0.878788,0.846715,66.0
Developer,0.794643,0.847619,0.820276,105.0
Extension,1.0,0.4,0.571429,10.0
License,0.916667,1.0,0.956522,11.0
Release,0.5,1.0,0.666667,7.0
URL,1.0,0.923077,0.96,65.0
Version,0.870748,0.941176,0.904594,136.0
micro,0.837529,0.88835,0.862191,
