In [1]:
# !pip install evaluate
# !pip install seqeval

In [2]:
# !pip install accelerate -U
# !pip install transformers[torch]

In [3]:
import sys
sys.path.insert(1, 'C:/Users/fschr/Desktop/Masterarbeit/master-thesis-software-ie/experiments')
import helper

## Dataset

### Read the Train Dataset (Sentences and Labels)

In [4]:
# Load the raw training sentences: one list entry per line, trailing newline kept
with open('../../data/subtask1/subtask1_train.data.txt', 'r', encoding='utf-8') as sentence_file:
    sentences = list(sentence_file)

# Load the label lines; they are paired position-by-position with `sentences` later on
with open('../../data/subtask1/subtask1_train.labels.txt', 'r', encoding='utf-8') as label_file:
    labels = list(label_file)

### Convert the Datasets into a pandas Dataframe

In [5]:
import pandas as pd

# Pair every sentence line with the label line at the same position; one row per pair
paired_rows = [(sentence_line, label_line) for sentence_line, label_line in zip(sentences, labels)]
dataset = pd.DataFrame(paired_rows, columns=['sentences', 'labels'])

### Split sentences and labels

In [6]:
# Turn each whitespace-separated line into a list of words / labels
for column in ('sentences', 'labels'):
    dataset[column] = dataset[column].map(str.split)

### The class labels

In [7]:
# Collect the distinct labels in order of first appearance.
# dict keys keep insertion order, so this de-duplicates without reordering.
class_labels = list(dict.fromkeys(
    tag for label_line in labels for tag in label_line.split()
))
class_labels

['O',
 'B-Application_Creation',
 'B-Application_Mention',
 'B-SoftwareCoreference_Deposition',
 'B-Application_Deposition',
 'B-OperatingSystem_Usage',
 'B-ProgrammingEnvironment_Usage',
 'B-Application_Usage',
 'I-SoftwareCoreference_Deposition',
 'B-PlugIn_Creation',
 'B-ProgrammingEnvironment_Mention',
 'B-PlugIn_Deposition',
 'B-PlugIn_Mention',
 'I-PlugIn_Mention',
 'I-Application_Creation',
 'I-PlugIn_Creation',
 'I-PlugIn_Deposition',
 'B-PlugIn_Usage',
 'I-PlugIn_Usage',
 'I-ProgrammingEnvironment_Usage',
 'I-ProgrammingEnvironment_Mention',
 'I-Application_Usage',
 'I-Application_Mention',
 'I-Application_Deposition',
 'B-OperatingSystem_Mention',
 'I-OperatingSystem_Usage',
 'I-OperatingSystem_Mention']

### Entity Labels

In [28]:
# Switch read by the following cells: use the reduced entity labels instead of the full class labels
entity_label_grouping = True

# Keep only the part before the first underscore of every non-'O' label
# (e.g. 'B-Application_Creation' -> 'B-Application'); 'O' always comes first
# and duplicates are dropped while preserving first-seen order.
entity_labels = list(dict.fromkeys(
    ['O'] + [label.split('_')[0] for label in class_labels if label != 'O']
))
entity_labels

['O',
 'B-Application',
 'B-SoftwareCoreference',
 'B-OperatingSystem',
 'B-ProgrammingEnvironment',
 'I-SoftwareCoreference',
 'B-PlugIn',
 'I-PlugIn',
 'I-Application',
 'I-ProgrammingEnvironment',
 'I-OperatingSystem']

### Change the labels, if we have Entity Labels

In [26]:
# When grouping is enabled, rewrite every row's label list with
# helper.reduce_to_entity_type_labels (defined in the external helper module).
# NOTE(review): presumably this strips the part after the underscore so the labels
# match `entity_labels` - confirm in helper.
if entity_label_grouping:
    dataset['labels'] = dataset['labels'].apply(helper.reduce_to_entity_type_labels)

#### label to id

In [29]:
# With entity grouping enabled, the reduced label set becomes the active one
if entity_label_grouping:
    class_labels = entity_labels
# A label's id is its position in class_labels
label_to_id = dict(zip(class_labels, range(len(class_labels))))

#### id to label

In [30]:
# Position in class_labels -> label name
id_to_label = dict(enumerate(class_labels))
id_to_label

{0: 'O',
 1: 'B-Application',
 2: 'B-SoftwareCoreference',
 3: 'B-OperatingSystem',
 4: 'B-ProgrammingEnvironment',
 5: 'I-SoftwareCoreference',
 6: 'B-PlugIn',
 7: 'I-PlugIn',
 8: 'I-Application',
 9: 'I-ProgrammingEnvironment',
 10: 'I-OperatingSystem'}

### Map Labels to Class_Labels

In [31]:
# Translate each row's labels to ids; dict.get yields None for a label that is not in label_to_id
dataset['ids'] = dataset['labels'].apply(lambda tags: list(map(label_to_id.get, tags)))

In [32]:
dataset.head()

Unnamed: 0,sentences,labels,ids
0,"[Here, we, report, a, comprehensive, suite, fo...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[The, resource, is, available, free, of, charg...","[O, B-SoftwareCoreference, O, O, O, O, O, O, O...","[0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,"[In, this, work, ,, we, described, the, DelPhi...","[O, O, O, O, O, O, O, B-Application, O, O, O, ...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]"
3,"["", Project, name, :, DelPhi, Project, home, p...","[O, O, O, O, B-Application, O, O, O, O, O, O, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[We, have, developed, ANDES, ,, a, software, l...","[O, O, O, B-Application, O, O, O, O, O, O, O, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


## Prepare and Train the dataset

### Split the dataset into Train and Validation

In [33]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for validation; the fixed random_state makes the split repeatable
train, validation = train_test_split(dataset, test_size=0.25, random_state=42)

#### Write the Train and Validation dataset

In [34]:
# Persist the training split without header or index. Because the separator is a
# newline, the fields of a row are written on consecutive lines rather than side by side.
train.to_csv('../../data/subtask1/subtask1_split.train.txt', header=False, index=False, sep='\n', encoding='utf-8')

In [35]:
# Persist the validation split without header or index. Because the separator is a
# newline, the fields of a row are written on consecutive lines rather than side by side.
validation.to_csv('../../data/subtask1/subtask1_split.validation.txt', header=False, index=False, sep='\n', encoding='utf-8')

### Load the Tokenizer

In [37]:
from transformers import BertTokenizerFast, AutoTokenizer

# The fast tokenizer is needed: the prediction code calls word_ids() on the encoding,
# which is only available for fast tokenizers.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

### Positive Samples (Sentences with labels)

In [38]:
# Boolean masks for rows with at least one entity label: id 0 is 'O' and ids come
# from enumerate (non-negative), so a non-zero sum means some token is not 'O'.
train_contains_labels = train['ids'].map(sum).ne(0)
validation_contains_labels = validation['ids'].map(sum).ne(0)

### Negative Samples (Sentences without any labels)

In [39]:
# Boolean masks for rows whose ids sum to zero, i.e. every token is 'O' (id 0)
train_without_labels = train['ids'].map(sum).eq(0)
validation_without_labels = validation['ids'].map(sum).eq(0)

### Reduce the dataset to positive samples

In [41]:
# Choose which subset is used for fine-tuning:
#   True  -> keep only sentences that contain at least one entity label (positive samples)
#   False -> keep only sentences without any entity label (negative samples)
# .copy() detaches the subset so later column assignments do not touch the split frames.
reduce_to_positive_samples = True
if reduce_to_positive_samples:
    train = train[train_contains_labels].copy()
    validation = validation[validation_contains_labels].copy()
else:
    train = train[train_without_labels].copy()
    validation = validation[validation_without_labels].copy()


### Tokenize and convert the labels from tokenized into ids

In [42]:
def tokenization(input_data):
    """Encode one pre-split sentence with the notebook-level ``tokenizer``.

    Args:
        input_data: The sentence as a sequence of words; it is passed on with
            ``is_split_into_words=True``.

    Returns:
        The tokenizer's result for these settings: PyTorch tensors
        (``return_tensors="pt"``), truncated and padded to ``max_length=512``.
    """
    encode_options = dict(
        return_tensors="pt",
        truncation=True,
        is_split_into_words=True,
        padding="max_length",
        max_length=512,
    )
    return tokenizer(input_data, **encode_options)

#### Tokenize Train

In [43]:
# Row by row: tokenize the word list, then hand the encoding and the row's word-level ids
# to helper.tokenized_align_labels (external helper).
# NOTE(review): presumably it spreads the word-level ids over the sub-word tokens - confirm in helper.
train['tokenized'] = train.apply(lambda row: helper.tokenized_align_labels(tokenization(row['sentences']), row['ids']), axis=1)

#### Tokenize Validation

In [44]:
# Row by row: tokenize the word list, then hand the encoding and the row's word-level ids
# to helper.tokenized_align_labels (external helper).
# NOTE(review): presumably it spreads the word-level ids over the sub-word tokens - confirm in helper.
validation['tokenized'] = validation.apply(lambda row: helper.tokenized_align_labels(tokenization(row['sentences']), row['ids']), axis=1)

### Prepare Train Dataset for fine-tuning

In [45]:
# torch is used below but is otherwise only imported in a later cell of this notebook;
# importing it here keeps the cell runnable on a fresh kernel (Restart & Run All).
import torch

# Build one plain dict per training row for the Trainer:
#   - values that are Python lists are converted to tensors,
#   - every other value is indexed with [0]
#     (presumably dropping the batch dimension added by return_tensors="pt" - TODO confirm).
Train_dataset = train.tokenized.apply(
    lambda encoding: {
        key: torch.tensor(value) if type(value) is list else value[0]
        for key, value in encoding.items()
    }
).to_list()

### Prepare Validation Dataset for fine-tuning

In [46]:
# torch is used below but is otherwise only imported in a later cell of this notebook;
# importing it here keeps the cell runnable on a fresh kernel (Restart & Run All).
import torch

# Build one plain dict per validation row for the Trainer:
#   - values that are Python lists are converted to tensors,
#   - every other value is indexed with [0]
#     (presumably dropping the batch dimension added by return_tensors="pt" - TODO confirm).
Val_dataset = validation.tokenized.apply(
    lambda encoding: {
        key: torch.tensor(value) if type(value) is list else value[0]
        for key, value in encoding.items()
    }
).to_list()

### Load Pre-trained Model

In [47]:
from transformers import AutoModelForTokenClassification
# TODO: compare the cased and the uncased checkpoint to see which one works better.
# Both mappings are passed so the model config translates ids to label names and
# label names back to ids consistently (it is saved together with the model later).
model = AutoModelForTokenClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(class_labels),
    id2label=id_to_label,
    label2id=label_to_id,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Evaluation

In [48]:
import evaluate
import numpy as np
metric = evaluate.load('seqeval')


def compute_metrics(eval_preds, label_class=class_labels):
    """Score token-level predictions with the notebook-level seqeval ``metric``.

    Args:
        eval_preds: Pair ``(logits, labels)`` as handed over by the Trainer.
        label_class: Sequence mapping a label id to its label name.

    Returns:
        The result of ``metric.compute`` for the predicted and the reference label names.
    """
    logits, labels = eval_preds
    # Softmax is monotonic, so the arg-max over the raw logits already gives the predicted id.
    predictions = np.argmax(logits, axis=-1)

    # Positions whose reference label is -100 are excluded from both sequences;
    # metric.compute expects plain lists of label names.
    true_labels = []
    for gold_row in labels:
        gold_names = []
        for gold_id in gold_row:
            if gold_id != -100:
                gold_names.append(label_class[gold_id])
        true_labels.append(gold_names)

    true_predictions = []
    for predicted_row, gold_row in zip(predictions, labels):
        predicted_names = []
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            if gold_id != -100:
                predicted_names.append(label_class[predicted_id])
        true_predictions.append(predicted_names)

    return metric.compute(predictions=true_predictions, references=true_labels)

### Training

In [49]:
from transformers import  TrainingArguments, Trainer

# Fine-tuning configuration; evaluation runs once per epoch
training_args = TrainingArguments(
    output_dir='../../data/subtask1/sub_ner',
    evaluation_strategy='epoch',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    disable_tqdm=False,
)

# Wire model, data, tokenizer and metric function together
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=Train_dataset,
    eval_dataset=Val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Keep the value returned by train() so a later cell can display it
dataset_performance = trainer.train()

Epoch,Training Loss,Validation Loss,Application,Operatingsystem,Plugin,Programmingenvironment,Softwarecoreference,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
1,No log,0.42459,"{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 104}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 8}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 16}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 12}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}",0.0,0.0,0.0,0.9314
2,No log,0.347046,"{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 104}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 8}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 16}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 12}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}",0.0,0.0,0.0,0.9314
3,No log,0.33137,"{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 104}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 8}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 16}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 12}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}",0.0,0.0,0.0,0.9314


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 104}" of type <class 'dict'> for key "eval/Application" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 8}" of type <class 'dict'> for key "eval/OperatingSystem" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 16}" of type <class 'dict'> for key "eval/PlugIn" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 12}" of type <c

### Performance of dataset

In [50]:
dataset_performance

TrainOutput(global_step=39, training_loss=0.6284570449437851, metrics={'train_runtime': 1561.9986, 'train_samples_per_second': 0.192, 'train_steps_per_second': 0.025, 'total_flos': 78395405414400.0, 'train_loss': 0.6284570449437851, 'epoch': 3.0})

### Save the fine-tuned Model and Tokenizer

In [51]:
# Persist the fine-tuned model and the tokenizer in separate directories;
# the next section reloads both from exactly these paths.
model.save_pretrained("../../data/subtask1/dataset_ner_model")
tokenizer.save_pretrained("../../data/subtask1/dataset_tokenizer")

('../../data/subtask1/dataset_tokenizer\\tokenizer_config.json',
 '../../data/subtask1/dataset_tokenizer\\special_tokens_map.json',
 '../../data/subtask1/dataset_tokenizer\\vocab.txt',
 '../../data/subtask1/dataset_tokenizer\\added_tokens.json',
 '../../data/subtask1/dataset_tokenizer\\tokenizer.json')

### Load the saved Model and Tokenizer

In [52]:
from transformers import BertTokenizer, BertForTokenClassification

# Reload tokenizer and model from the directories written by the save cell, so the
# prediction below uses the persisted artefacts. Note that this rebinds the
# notebook-level names `tokenizer` and `model`.
tokenizer = BertTokenizerFast.from_pretrained('../../data/subtask1/dataset_tokenizer')
model = BertForTokenClassification.from_pretrained('../../data/subtask1/dataset_ner_model')

### Read the Test dataset and predict labels for it

In [53]:
# Load the test sentences: one list entry per line, trailing newline kept
with open('../../data/subtask1/subtask1_test.data.txt', 'r', encoding='utf-8') as test_file:
    test = list(test_file)

#### Post-process

##### convert ids to label

In [54]:
import torch

def convert_ids_to_labels(sentence, model, tokenizer, id_to_label=id_to_label):
    """Predict labels for one whitespace-separated sentence.

    Args:
        sentence: The raw sentence; it is split on whitespace into words.
        model: Token-classification model whose output exposes ``logits``.
        tokenizer: Tokenizer whose encoding provides ``word_ids()``.
        id_to_label: Mapping from predicted id to label name.

    Returns:
        The predicted label names joined by single spaces.
    """
    words = sentence.split()
    inputs = tokenizer(
        [words],
        truncation=True,
        is_split_into_words=True,
        padding=True,
        return_tensors='pt',
    )
    word_ids = inputs.word_ids()

    # Inference only: evaluation mode, no gradient tracking
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)

    # Highest-scoring class id per token; [0] selects the single sentence of the batch
    token_predictions = outputs.logits.argmax(dim=2)[0].tolist()
    # helper.align_labels is an external helper
    # NOTE(review): presumably it maps token-level predictions back to one id per word - confirm in helper.
    word_predictions = helper.align_labels(word_ids, token_predictions)
    # TODO: check whether the result really needs to be a string here
    return ' '.join(id_to_label[label_id] for label_id in word_predictions)

#### Predict

In [55]:
# Predict one label string per test sentence.
# A comprehension is used so no loop variable is left behind: the original loop rebound
# the notebook-level name `labels` (the training label lines) to a prediction string,
# which corrupts the class-label cell if it is re-run afterwards.
all_predicted_entities_dataset = [
    convert_ids_to_labels(sentence=sentence, model=model, tokenizer=tokenizer, id_to_label=id_to_label)
    for sentence in test
]

In [56]:
# Write one line of space-separated labels per test sentence.
# The encoding is given explicitly, like for every other file opened in this notebook,
# so the output does not depend on the platform's default encoding.
with open('../../data/subtask1/prediction/prediction_dataset.txt', 'w', encoding='utf-8') as file:
    file.writelines(f'{prediction}\n' for prediction in all_predicted_entities_dataset)