In [1]:
# !pip install evaluate
# !pip install seqeval

In [2]:
# !pip install accelerate -U
# !pip install transformers[torch]

In [3]:
import sys
sys.path.insert(1, 'C:/Users/fschr/Desktop/Masterarbeit/master-thesis-software-ie/experiments')
import helper

# Dataset

## Read the Dataset (Sentences and Labels)

In [4]:
with open('../../data/subtask2/subtask2_train.data.txt', 'r', encoding='utf-8') as file:
    sentences = file.readlines()

In [5]:
with open('../../data/subtask2/subtask2_train.labels.txt', 'r', encoding='utf-8') as file:
    labels = file.readlines()

## Convert the Datasets into a pandas Dataframe

In [6]:
import pandas as pd

dataset = pd.DataFrame(list(zip(sentences, labels)), columns=['sentences', 'labels'])

## Split Sentences and Labels

In [7]:
dataset['sentences'] = dataset['sentences'].apply(lambda row:row.split())
dataset['labels'] = dataset['labels'].apply(lambda row:row.split())

In [8]:
dataset.head()

Unnamed: 0,sentences,labels
0,"[Here, we, report, a, comprehensive, suite, fo...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,"[The, resource, is, available, free, of, charg...","[O, O, O, O, O, O, O, O, O, O, O, O, O, B-URL, O]"
2,"[In, this, work, ,, we, described, the, DelPhi...","[O, O, O, O, O, O, O, O, O, O, O, O, O]"
3,"["", Project, name, :, DelPhi, Project, home, p...","[O, O, O, O, O, O, O, O, O, O, B-URL, O, O, O,..."
4,"[We, have, developed, ANDES, ,, a, software, l...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


## The class labels

In [9]:
class_labels = []
for line in labels:
    labels_list = line.split()
    for label in labels_list:
        if label not in class_labels:
            class_labels.append(label)
            
class_labels

['O',
 'B-URL',
 'B-AlternativeName',
 'I-AlternativeName',
 'B-License',
 'I-License',
 'B-Version',
 'B-Abbreviation',
 'B-Citation',
 'B-Release',
 'B-Developer',
 'I-Developer',
 'I-Citation',
 'B-Extension',
 'I-Extension',
 'I-Version',
 'I-URL',
 'I-Release']

### Entity Labels

In [10]:
entity_labels = ['O']
for label in class_labels:
    if label == 'O':
        continue
    entity_label = label.split('_')[0]
    if entity_label not in entity_labels:
        entity_labels.append(entity_label)

entity_label_grouping = True
entity_labels

['O',
 'B-URL',
 'B-AlternativeName',
 'I-AlternativeName',
 'B-License',
 'I-License',
 'B-Version',
 'B-Abbreviation',
 'B-Citation',
 'B-Release',
 'B-Developer',
 'I-Developer',
 'I-Citation',
 'B-Extension',
 'I-Extension',
 'I-Version',
 'I-URL',
 'I-Release']

### Change the labels, if we have Entity Labels

In [11]:
if entity_label_grouping:
    dataset['labels'] = dataset['labels'].apply(helper.reduce_to_entity_type_labels)

### Label to ID

In [12]:
if entity_label_grouping:
    class_labels = entity_labels
label_to_id = {label:id for id, label in enumerate(class_labels)}

### ID to Labels

In [13]:
id_to_label = {id:label for id, label in enumerate(class_labels)}

## Map Labels to Class_Labels

In [14]:
dataset['ids'] = dataset['labels'].apply(lambda row: [label_to_id.get(label) for label in row])

In [15]:
dataset.head()

Unnamed: 0,sentences,labels,ids
0,"[Here, we, report, a, comprehensive, suite, fo...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[The, resource, is, available, free, of, charg...","[O, O, O, O, O, O, O, O, O, O, O, O, O, B-URL, O]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]"
2,"[In, this, work, ,, we, described, the, DelPhi...","[O, O, O, O, O, O, O, O, O, O, O, O, O]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3,"["", Project, name, :, DelPhi, Project, home, p...","[O, O, O, O, O, O, O, O, O, O, B-URL, O, O, O,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."
4,"[We, have, developed, ANDES, ,, a, software, l...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


## Prepare and Train the dataset

### Split the dataset into Trian and Validation

In [16]:
from sklearn.model_selection import train_test_split

train, validation = train_test_split(dataset, test_size=0.25, random_state=42)

### Write the Train and Validation dataset

In [17]:
train.to_csv('../../data/subtask2/subtask2_split.train.txt', header=False, index=False, sep='\n', encoding='utf-8')

In [18]:
validation.to_csv('../../data/subtask2/subtask2_split.validation.txt', header=False, index=False, sep='\n', encoding='utf-8')

## Tokenize and convert the labels from tokenized into ids

In [19]:
from transformers import BertTokenizerFast, AutoTokenizer

tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

## Positive Samples (Sentences with labels)

In [20]:
train_contains_labels = train.ids.apply(sum) != 0
validation_contains_labels = validation.ids.apply(sum) != 0

reduce_to_positive_samples = False

## Negative Samples (Sentences without any labels)

In [21]:
train_without_labels = train.ids.apply(sum) == 0
validation_without_labels = validation.ids.apply(sum) == 0

reduce_to_negative_samples = False

## Reduce the dataset to positive samples

In [22]:
if reduce_to_positive_samples:
    train = train[train_contains_labels].copy()
    validation = validation[validation_contains_labels].copy()
elif reduce_to_negative_samples:
    train = train[train_without_labels].copy()
    validation = validation[validation_without_labels].copy()

## Tokenize and convert the labels from tokenized into ids

In [23]:
def tokenization(input_data):
    tokenized_train_inputs = tokenizer(
        input_data,
        return_tensors="pt",
        truncation=True,
        is_split_into_words=True,
        padding="max_length",
        max_length=512,
    )
    
    return tokenized_train_inputs

### Tokenize Train

In [24]:
train['tokenized'] = train.apply(lambda row: helper.tokenized_align_labels(tokenization(row['sentences']), row['ids']), axis=1)


### Tokenize Validation

In [25]:
validation['tokenized'] = validation.apply(lambda row: helper.tokenized_align_labels(tokenization(row['sentences']), row['ids']), axis=1)

## Prepare Train Dataset for fine-tuning

In [26]:
Train_dataset = train.tokenized.apply(
    lambda x: {
        k: v[0]
        if type(v) is not list
        else torch.tensor(v)
        for k, v in x.items()}).to_list()

## Prepare Train Dataset for fine-tuning

In [27]:
Val_dataset = validation.tokenized.apply(
    lambda x: {
        k: v[0]
        if type(v) is not list
        else torch.tensor(v)
        for k, v in x.items()}).to_list()

## Load Pre_trained Model

In [28]:
from transformers import AutoModelForTokenClassification
### todo guck mal nach cased und uncased ob es mit cased besser funktioniert oder mit uncased
model = AutoModelForTokenClassification.from_pretrained('bert-base-uncased', id2label=id_to_label, num_labels=len(class_labels))

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Evaluation

In [29]:
import evaluate
import numpy as np
metric = evaluate.load('seqeval')


def compute_metrics(eval_preds, label_class=class_labels):
    logits, labels = eval_preds
    # becase the logics and probabilities both are in the same order, we don't need to aply softmax here
    predictions = np.argmax(logits, axis=-1)
    # now we need to remove all the values, where the label is -100
    # before passing to metric.compute we should have these inputs as a list
    true_labels = [[label_class[l] for l in label if l != -100]
                   for label in labels]

    true_predictions = [[label_class[p] for p,l in zip(prediction, label) if l != -100]
                        for prediction, label in zip(predictions, labels)]

    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)

    return all_metrics

## Training

In [30]:
from transformers import  TrainingArguments, Trainer

training_args = TrainingArguments(output_dir='../../data/subtask2/sub_ner',
                                  num_train_epochs=3,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=8,
                                  per_device_eval_batch_size=8,
                                  weight_decay=0.01,
                                  evaluation_strategy='epoch',
                                  disable_tqdm=False)


trainer = Trainer(model=model, args=training_args, compute_metrics=compute_metrics, train_dataset=Train_dataset,
                  eval_dataset=Val_dataset,tokenizer=tokenizer)


dataset_performance = trainer.train()

Epoch,Training Loss,Validation Loss,Abbreviation,Citation,Developer,Extension,License,Release,Url,Version,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
1,No log,0.472182,"{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 2}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 22}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 28}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 2}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 3}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 12}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 53}",0.0,0.0,0.0,0.937388
2,No log,0.35592,"{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 2}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 22}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 28}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 2}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 3}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 12}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 53}",0.0,0.0,0.0,0.937388
3,No log,0.330088,"{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 2}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 22}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 28}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 2}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 3}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 12}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 53}",0.0,0.0,0.0,0.937388


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 2}" of type <class 'dict'> for key "eval/Abbreviation" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 22}" of type <class 'dict'> for key "eval/Citation" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 28}" of type <class 'dict'> for key "eval/Developer" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 2}" of type <class 

## Performance of dataset

In [31]:
dataset_performance

TrainOutput(global_step=39, training_loss=0.8326547084710537, metrics={'train_runtime': 1506.685, 'train_samples_per_second': 0.199, 'train_steps_per_second': 0.026, 'total_flos': 78400366387200.0, 'train_loss': 0.8326547084710537, 'epoch': 3.0})

## Save the fine-tuned Model and Tokenizer

In [32]:
model.save_pretrained("../../data/subtask2/dataset_ner_model")
tokenizer.save_pretrained("../../data/subtask2/dataset_tokenizer")

('../../data/subtask2/dataset_tokenizer\\tokenizer_config.json',
 '../../data/subtask2/dataset_tokenizer\\special_tokens_map.json',
 '../../data/subtask2/dataset_tokenizer\\vocab.txt',
 '../../data/subtask2/dataset_tokenizer\\added_tokens.json',
 '../../data/subtask2/dataset_tokenizer\\tokenizer.json')

## Read the Test dataset and Prediction for it

In [33]:
with open('../../data/subtask2/subtask2_test.data.txt', 'r', encoding='utf-8') as file:
    test = file.readlines()

## Post-process

### convert ids to label

In [34]:
import torch

def convert_ids_to_labels(sentence, model, tokenizer, id_to_label=id_to_label):
      sentence = sentence.split()
      inputs = tokenizer([sentence], truncation=True,is_split_into_words = True, padding=True,return_tensors='pt')
      word_ids = inputs.word_ids()
      with torch.no_grad():
          model.eval()
          outputs = model(**inputs)
      prediction = outputs.logits.argmax(dim=2)
      prediction = prediction[0].tolist()
      predictions_for_words = helper.align_labels(word_ids, prediction)
      predicted_labels = [id_to_label[id] for id in predictions_for_words]
      labels = ' '.join([p_label for p_label in predicted_labels]) ### Todo guck mal ob du Ã¼berhaupt hier die str brauchst
      return labels

### Predict

In [35]:
all_predicted_entities_dataset = []
for sentence in test:
    labels = convert_ids_to_labels(sentence=sentence, model=model, tokenizer=tokenizer, id_to_label=id_to_label)
    all_predicted_entities_dataset.append(labels)

In [36]:
with open('../../data/subtask2/prediction/prediction_dataset.txt', 'w') as file:
  for prediction in all_predicted_entities_dataset:
    file.write(f'{prediction}\n')