In [1]:
%env CUDA_VISIBLE_DEVICES=7

env: CUDA_VISIBLE_DEVICES=7


In [None]:
import sys
sys.path.append("..")
sys.path.append("../../evaluation")
import helper

In [None]:
import calc_metrics

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
import datetime
from functools import partial
from glob import glob
from pathlib import Path
import json
from itertools import chain

In [None]:
from datasets import Dataset
from sklearn.model_selection import train_test_split
import evaluate
import numpy as np
import pandas as pd
import torch

from transformers import AutoModelForTokenClassification,\
                         BertTokenizerFast, BertTokenizer, AutoTokenizer,\
                         TrainingArguments, Trainer, DataCollatorForTokenClassification,\
                         BertForTokenClassification

In [None]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [133]:
# Root folder (relative to the notebook) below which one sub-folder per experiment is created.
log_path_base = "negative_sampling_rate"

## Dataset

### Read the Train Dataset (Sentences and Labels)

In [None]:
# Locations of the training sentences and of the matching label sequences.
fn_train_sentences = '../../data/subtask1/subtask1_train.data.txt'
fn_train_labels = '../../data/subtask1/subtask1_train.labels.txt'
entity_label_grouping = False

# Sentences: every line of the data file becomes a list of whitespace-separated tokens.
with open(fn_train_sentences, 'r', encoding='utf-8') as file:
    train_sentences = [line.split() for line in file]

# Labels: every line of the label file becomes a list of label strings.
with open(fn_train_labels, 'r', encoding='utf-8') as file:
    train_labels = [line.split() for line in file]

# Optionally rewrite every label sequence with helper.reduce_to_entity_type_labels
# (switched off by default through entity_label_grouping = False).
if entity_label_grouping:
    train_labels = list(map(helper.reduce_to_entity_type_labels, train_labels))


In [24]:
# Fail loudly when the two input files are misaligned: zip() would otherwise
# silently drop the surplus sentences or label rows.
if len(train_sentences) != len(train_labels):
    raise ValueError(
        f"Sentence/label count mismatch: {len(train_sentences)} sentences vs. "
        f"{len(train_labels)} label rows"
    )
# One row per sentence: its token list and its label list.
dataset = pd.DataFrame(zip(train_sentences, train_labels), columns=["sentence", "label"])
dataset.head()

Unnamed: 0,sentence,label
0,"[Here, we, report, a, comprehensive, suite, fo...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,"[The, resource, is, available, free, of, charg...","[O, B-SoftwareCoreference_Deposition, O, O, O,..."
2,"[In, this, work, ,, we, described, the, DelPhi...","[O, O, O, O, O, O, O, B-Application_Creation, ..."
3,"["", Project, name, :, DelPhi, Project, home, p...","[O, O, O, O, B-Application_Deposition, O, O, O..."
4,"[We, have, developed, ANDES, ,, a, software, l...","[O, O, O, B-Application_Creation, O, O, O, O, ..."


## Create label id lookup

In [20]:
# Collect every distinct label and order them: "O" first, then grouped by the text
# after the two-character prefix, and inside a group by the first character.
possible_labels = sorted(
    set(chain.from_iterable(train_labels)),
    key=lambda label: (label != "O", label[2:], label[0]),
)

In [21]:
# Two-way lookup between integer class ids and label strings.
id_to_label = {idx: label for idx, label in enumerate(possible_labels)}
label_to_id = {label: idx for idx, label in id_to_label.items()}
id_to_label

{0: 'O',
 1: 'B-Application_Creation',
 2: 'I-Application_Creation',
 3: 'B-Application_Deposition',
 4: 'I-Application_Deposition',
 5: 'B-Application_Mention',
 6: 'I-Application_Mention',
 7: 'B-Application_Usage',
 8: 'I-Application_Usage',
 9: 'B-OperatingSystem_Mention',
 10: 'I-OperatingSystem_Mention',
 11: 'B-OperatingSystem_Usage',
 12: 'I-OperatingSystem_Usage',
 13: 'B-PlugIn_Creation',
 14: 'I-PlugIn_Creation',
 15: 'B-PlugIn_Deposition',
 16: 'I-PlugIn_Deposition',
 17: 'B-PlugIn_Mention',
 18: 'I-PlugIn_Mention',
 19: 'B-PlugIn_Usage',
 20: 'I-PlugIn_Usage',
 21: 'B-ProgrammingEnvironment_Mention',
 22: 'I-ProgrammingEnvironment_Mention',
 23: 'B-ProgrammingEnvironment_Usage',
 24: 'I-ProgrammingEnvironment_Usage',
 25: 'B-SoftwareCoreference_Deposition',
 26: 'I-SoftwareCoreference_Deposition'}

## Prepare the dataset and train the model

In [135]:
# Experiment parameters
eval_size = 0.25            # share of sentences held out for validation
random_state = 42           # seed of the train/validation split
negative_sample_rate = 5.   # set to None if all negative samples should be used

# Folder-friendly experiment name: the rate with one decimal, "." replaced by "_".
if negative_sample_rate is None:
    experiment_name = "None"
else:
    experiment_name = f'{negative_sample_rate:2.1f}'.replace(".", "_")
#experiment_name = "sunny"
experiment_name

'5_0'

### Split the dataset into Train and Validation

In [136]:
# Hold out `eval_size` of the sentences for validation; the fixed seed keeps the split reproducible.
train, validation = train_test_split(
    dataset,
    test_size=eval_size,
    random_state=random_state,
)
(len(train), len(validation))

(29826, 9942)

### Subsample the negative training sentences (all positive sentences are kept)

In [137]:
if negative_sample_rate is not None:
    # Positive samples are sentences with at least one label other than "O".
    train_contains_labels = train.label.apply(
        lambda labels: any(tag != "O" for tag in labels)
    )
    print("Positive Samples:", str(train_contains_labels.value_counts().to_dict()))
    pos_sample_ids = train_contains_labels[train_contains_labels].index
    neg_sample_ids = train_contains_labels[~train_contains_labels].index
    # Keep every positive sentence plus the first `rate * n_positive` negative ones.
    pivot_idx = int(len(pos_sample_ids) * negative_sample_rate)
    train_ids = [*pos_sample_ids, *neg_sample_ids[:pivot_idx]]
    train = train.loc[train_ids].copy()

Positive Samples: {False: 28088, True: 1738}


In [138]:
# Sizes of the (possibly subsampled) training split and of the validation split.
len(train), len(validation)

(10428, 9942)

### Tokenize and map labels

In [139]:
# Name of the pretrained checkpoint used for both the tokenizer and the model.
base_model = "bert-base-uncased"

In [140]:
%%time
tokenizer = BertTokenizerFast.from_pretrained(base_model)
# Sentences are already split into words; over-long inputs are truncated and no
# padding is applied at this stage.
tokenizer_params = dict(
    truncation=True,
    is_split_into_words=True,
    padding="do_not_pad",
    max_length=512)


def _encode_split(frame):
    """Tokenize one split and attach its label ids.

    Args:
        frame: DataFrame with a ``sentence`` column (lists of words) and a
            ``label`` column (lists of label strings).

    Returns:
        A ``Dataset`` built from the tokenizer output after
        ``helper.map_labels`` has processed it together with the label ids.

    Raises:
        KeyError: if a label is not contained in ``label_to_id``.
    """
    encoded = tokenizer(frame['sentence'].to_list(), **tokenizer_params)
    # Index instead of .get(): an unknown label must fail here rather than turn into None.
    label_ids = [[label_to_id[label] for label in labels] for labels in frame['label']]
    # NOTE(review): presumably stores the aligned label ids inside `encoded` in place —
    # confirm against helper.map_labels.
    helper.map_labels(encoded, label_ids)
    return Dataset.from_dict(encoded)


Train_dataset = _encode_split(train)
Val_dataset = _encode_split(validation)

# Summary of the data setup; stored next to the results of the experiment.
dataset_info = dict(
    n_train=len(Train_dataset),
    n_eval=len(Val_dataset),
    negative_sample_rate=negative_sample_rate,
    eval_size=eval_size,
)
dataset_info

CPU times: user 22.6 s, sys: 461 ms, total: 23.1 s
Wall time: 2.2 s


{'n_train': 10428,
 'n_eval': 9942,
 'negative_sample_rate': 5.0,
 'eval_size': 0.25}

### Helpers for reading the trainer checkpoint state

In [141]:
def get_trainer_state(model_path):
    """Return the most recent ``trainer_state.json`` found below ``model_path``.

    Looks exactly one directory level below ``model_path`` for
    ``trainer_state.json`` files and picks the one with the highest ``"epoch"``.

    Args:
        model_path: Directory that contains the checkpoint sub-directories.

    Returns:
        The parsed JSON content of the state with the largest epoch, or ``None``
        when no state file exists.
    """
    state_files = glob(f"{model_path}/*/trainer_state.json")
    if not state_files:
        return None
    states = []
    for state_file in state_files:
        # Context manager so each handle is closed right away instead of leaking.
        with open(state_file, encoding="utf-8") as f:
            states.append(json.load(f))
    # max() returns the first maximal element, i.e. the same entry the former
    # stable descending sort put in front.
    return max(states, key=lambda state: state["epoch"])

def get_best_model_checkpoint(model_path):
    """Return the ``best_model_checkpoint`` entry of the latest trainer state.

    Args:
        model_path: Directory that contains the checkpoint sub-directories.

    Returns:
        The value stored under ``"best_model_checkpoint"`` in the state returned
        by ``get_trainer_state``.

    Raises:
        FileNotFoundError: if no ``trainer_state.json`` exists below ``model_path``.
    """
    state = get_trainer_state(model_path)
    if state is None:
        # Without this guard the lookup below fails with an opaque TypeError on None.
        raise FileNotFoundError(f"No trainer_state.json found below {model_path!r}")
    return state["best_model_checkpoint"]

In [142]:
# Leftover cell: only emits an empty line.
print()




### Training

In [143]:
# Hyper-parameters as given in the mail sent on Thursday, 11 July 2024, 13:39:
# Epochs=5 , train_batch_size=64, validation_batch_size=32, lr=2e-5
# The train batch size of 64 is reached as 8 samples per device x 8 accumulation steps.

model_base_path = '../../models/subtask1'

training_args = {
    "save_total_limit": 2,
    "save_strategy": 'epoch',
    "metric_for_best_model": "overall_f1",
    "num_train_epochs": 5,
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "gradient_accumulation_steps": 8,
    "weight_decay": 0.01,
    "warmup_ratio": 0.0,
    "evaluation_strategy": 'epoch',
    "disable_tqdm": False,
}
# Every run writes into its own directory named after the start time.
now = datetime.datetime.now()
start = now.strftime('%Y-%m-%d_%H-%M-%S')
model_path = f'{model_base_path}/{start}'
training_args["output_dir"] = model_path
model_path

'../../models/subtask1/2024-07-13_00-22-22'

In [144]:
# Pretrained encoder with a new classification head sized to the label set. Both
# mappings are passed so the saved config holds the real label names in either
# direction (without label2id the reverse map keeps generic placeholder names).
model = AutoModelForTokenClassification.from_pretrained(
    base_model,
    id2label=id_to_label,
    num_labels=len(id_to_label),
    label2id=label_to_id,
)
trainer = Trainer(
    model=model,
    args=TrainingArguments(**training_args),
    train_dataset=Train_dataset,
    eval_dataset=Val_dataset,
    # Pads every batch to its longest sequence (the tokenization step applied no padding).
    data_collator=DataCollatorForTokenClassification(tokenizer, padding="longest", max_length=512),
    # Metrics are computed by helper.ner_eval_metrics with the id -> label map bound.
    compute_metrics=partial(helper.ner_eval_metrics, id_to_label=id_to_label),
    tokenizer=tokenizer,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [145]:
# old time: 1:23 => 19 minutes
# only positive => 

In [None]:
# Runs the training; `dataset_performance` holds whatever Trainer.train() returns
# (presumably the training summary object — confirm for the installed transformers version).
dataset_performance = trainer.train()

Epoch,Training Loss,Validation Loss,Application Creation,Application Deposition,Application Mention,Application Usage,Operatingsystem Mention,Operatingsystem Usage,Plugin Creation,Plugin Deposition,Plugin Mention,Plugin Usage,Programmingenvironment Mention,Programmingenvironment Usage,Softwarecoreference Deposition,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
1,No log,0.017149,"{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 44}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 25}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 45}","{'precision': 0.4524207011686144, 'recall': 0.5122873345935728, 'f1': 0.4804964539007093, 'number': 529}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 4}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 30}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 12}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 3}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 7}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 55}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 6}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 92}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 12}",0.452421,0.313657,0.370472,0.996672
2,No log,0.010397,"{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 44}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 25}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 45}","{'precision': 0.604135893648449, 'recall': 0.7731568998109641, 'f1': 0.6782752902155887, 'number': 529}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 4}","{'precision': 0.6756756756756757, 'recall': 0.8333333333333334, 'f1': 0.746268656716418, 'number': 30}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 12}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 3}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 7}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 55}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 6}","{'precision': 0.6698113207547169, 'recall': 0.7717391304347826, 'f1': 0.7171717171717172, 'number': 92}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 12}",0.615104,0.584491,0.599407,0.997805
3,No log,0.009492,"{'precision': 0.20833333333333334, 'recall': 0.22727272727272727, 'f1': 0.21739130434782608, 'number': 44}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 25}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 45}","{'precision': 0.5614035087719298, 'recall': 0.8468809073724007, 'f1': 0.6752072343632253, 'number': 529}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 4}","{'precision': 0.6829268292682927, 'recall': 0.9333333333333333, 'f1': 0.7887323943661972, 'number': 30}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 12}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 3}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 7}","{'precision': 0.1111111111111111, 'recall': 0.03636363636363636, 'f1': 0.0547945205479452, 'number': 55}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 6}","{'precision': 0.6528925619834711, 'recall': 0.8586956521739131, 'f1': 0.7417840375586856, 'number': 92}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 12}",0.552632,0.65625,0.6,0.997755


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Training result

In [None]:
# Display the value returned by trainer.train().
dataset_performance

## Save training stats and parameters

In [None]:
# Per-experiment log directory. No trailing slash: every path built from it adds
# its own separator, which previously resulted in a doubled "//".
log_path = f"{log_path_base}/{experiment_name}"
Path(log_path).mkdir(parents=True, exist_ok=True)

# State of the latest checkpoint of this run (None when no state file was found).
trainer_state = get_trainer_state(model_path)

# File name -> JSON payload written into the log directory.
log_payloads = {
    'data_stats.json': dataset_info,
    'training_args.json': training_args,
    'trainer_state.json': trainer_state,
}
for file_name, payload in log_payloads.items():
    with open(f'{log_path}/{file_name}', 'w', encoding='utf-8') as f:
        json.dump(payload, f)

# Marker file holding the time at which this cell finished writing the logs.
with open(f'{log_path}/finished.txt', 'w', encoding='utf-8') as file:
    file.write(str(datetime.datetime.now()))

## Predict Test Set

### Load the saved Model and Tokenizer

In [None]:
# Test sentences and their gold labels.
fn_test = "../../data/subtask1/subtask1_test.data.txt"
fn_gold = "../../data/subtask1/subtask1_test.labels.txt"
# Checkpoint recorded as best in the latest trainer state of this run.
model_name = get_best_model_checkpoint(model_path)
model_name

In [None]:
# Reload tokenizer and model from the selected checkpoint; the model is moved to
# the target device and switched to evaluation mode.
tokenizer = BertTokenizerFast.from_pretrained(model_name)
model = BertForTokenClassification.from_pretrained(model_name).to(device).eval()

In [None]:
# Read the raw test lines and wrap each one as a {"sentence": ...} record.
with open(fn_test, 'r', encoding='utf-8') as file:
    test = [{"sentence": line} for line in file]
# prepare test samples
test = Dataset.from_list(test, split="Test")

In [None]:
def predict(batch, model=None):
    """Predict one label sequence per sentence of a batch.

    Relies on the notebook-level ``tokenizer`` and ``device``.

    Args:
        batch: Mapping with a ``"sentence"`` entry holding raw sentence strings;
            each is split on whitespace. An existing ``"prediction"`` entry is
            replaced.
        model: Token-classification model whose ``config.id2label`` maps class
            ids to label strings. Must be passed despite the ``None`` default.

    Returns:
        The same ``batch`` with a ``"prediction"`` entry: one list of label
        strings per sentence.
    """
    id_to_label = model.config.id2label
    if "prediction" in batch:
        del batch["prediction"]
    texts = [t.split() for t in batch["sentence"]]
    # truncation=True mirrors the training tokenization; max_length alone does not
    # shorten anything, so over-long sentences used to reach the model uncut.
    tokenized = tokenizer(texts,
                          padding="longest",
                          truncation=True,
                          max_length=512,
                          is_split_into_words=True,
                          return_tensors="pt")
    # Word ids are collected before the tensors are moved to the device.
    word_ids = [tokenized.word_ids(idx) for idx in range(len(tokenized.input_ids))]
    tokenized.to(device)
    with torch.no_grad():
        outputs = model(**tokenized)
    prediction = outputs.logits.argmax(dim=2)
    pred_labels = []
    for words, pred, w_ids in zip(texts, prediction, word_ids):
        # NOTE(review): assumes helper.align_labels yields one class id per word that
        # survived tokenization — confirm against the helper.
        predictions_for_words = helper.align_labels(w_ids, pred.tolist())
        predicted_labels = [id_to_label[label_id] for label_id in predictions_for_words]
        # Words cut off by truncation get no prediction; pad with "O" so the output
        # keeps one label per input word.
        missing = len(words) - len(predicted_labels)
        if missing > 0:
            predicted_labels.extend(["O"] * missing)
        pred_labels.append(predicted_labels)
    batch["prediction"] = pred_labels
    return batch

In [None]:
%%time
# Add a "prediction" column by running `predict` over batches of 128 sentences.
test = test.map(
    partial(predict, model=model),
    batched=True,
    batch_size=128,
)

In [None]:
# One line per test sentence: its predicted labels joined by single spaces.
with open(f'{log_path}/prediction.txt', 'w') as file:
    file.writelines(f'{" ".join(prediction)}\n' for prediction in test["prediction"])

In [None]:
# Test performance: compare the written predictions with the gold labels and
# store the resulting metrics next to the other logs.
fn_pred = f'{log_path}/prediction.txt'
preds, gold = calc_metrics.load_bio(fn_pred), calc_metrics.load_bio(fn_gold)
print(len(preds), len(gold))
metrics_df = calc_metrics.compute(preds, gold)
metrics_df.to_json(f"{log_path}/test_metrics.json")

In [None]:
# Display the test metrics returned by calc_metrics.compute.
metrics_df