In [1]:
# !pip install evaluate
# !pip install seqeval

In [2]:
# !pip install accelerate -U
# !pip install transformers[torch]

In [5]:
import sys

sys.path.append('..')
import helper
import collections

## Dataset

### Read the Train Dataset (Sentences and Labels)

In [6]:
# Open the Data of Sentences
with open('../../data/subtask1/subtask1_train.data.txt', 'r', encoding='utf-8') as file:
    sentences = file.readlines()


# Open the label data
with open('../../data/subtask1/subtask1_train.labels.txt', 'r', encoding='utf-8') as file:
    labels = file.readlines()

### Convert the Dataset into a pandas dataframe

In [7]:
import pandas as pd

dataset = pd.DataFrame(list(zip(sentences, labels)), columns=['sentences', 'labels'])
# split sentences and labels
dataset['sentences'] = dataset['sentences'].apply(lambda row: row.split())
dataset['labels'] = dataset['labels'].apply(lambda row: row.split())

### Unique labels

In [8]:
unique_labels = []

for line in labels:
    labels_list = line.split()
    for label in labels_list:
        if label not in unique_labels:
            unique_labels.append(label)
            
uniqueLabel_to_ID = {unique_label: ID for ID, unique_label in enumerate(unique_labels)}
ID_to_uniqueLable = {ID: unique_label for ID, unique_label in enumerate(unique_labels)}

### Map each label to its ID

In [9]:
dataset['IDs'] = dataset['labels'].apply(lambda row: [uniqueLabel_to_ID.get(label) for label in row])

# Prepare the train and validation dataset

In [10]:
from sklearn.model_selection import train_test_split

train, validation = train_test_split(dataset, test_size=0.25, random_state=42)

## Tokenize and convet the labels from tokenized into IDs

In [11]:
from transformers import BertTokenizerFast, AutoTokenizer

tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [12]:
def tokenization(input_data):
    tokenized_train_inputs = tokenizer(
        input_data,
        return_tensors="pt",
        truncation=True,
        is_split_into_words=True,
        padding="do_not_pad",
        max_length=512,
    )
    
    return tokenized_train_inputs

### Trian Tokenization

In [13]:
train['tokenized'] = train.apply(lambda row: helper.tokenized_align_labels(tokenization(row['sentences']), row['IDs']), axis=1)

### Validation Tokenization

In [14]:
validation['tokenized'] = validation.apply(lambda row: helper.tokenized_align_labels(tokenization(row['sentences']), row['IDs']), axis=1)

### Prepare Train dataset for fine-tuning

In [15]:
Train_dataset = train.tokenized.apply(
    lambda x: {
        k: v[0]
        if type(v) is not list
        else torch.tensor(v)
        for k, v in x.items()}).to_list()

### Prepare validation dataset for fine-tuning

In [16]:
Val_dataset = validation.tokenized.apply(
    lambda x: {
        k: v[0]
        if type(v) is not list
        else torch.tensor(v)
        for k, v in x.items()}).to_list()

## Load pretrained model

In [17]:
from transformers import AutoModelForTokenClassification
### todo guck mal nach cased und uncased ob es mit cased besser funktioniert oder mit uncased
model_name = "bert-base-uncased"
model = AutoModelForTokenClassification.from_pretrained(model_name, id2label=ID_to_uniqueLable, num_labels=len(unique_labels))

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Evaluation

In [18]:
import evaluate
import numpy as np
metric = evaluate.load('seqeval')


def compute_metrics(eval_preds, label_class=unique_labels):
    logits, labels = eval_preds
    # becase the logics and probabilities both are in the same order, we don't need to aply softmax here
    predictions = np.argmax(logits, axis=-1)
    # now we need to remove all the values, where the label is -100
    # before passing to metric.compute we should have these inputs as a list
    true_labels = [[label_class[l] for l in label if l != -100]
                   for label in labels]

    true_predictions = [[label_class[p] for p,l in zip(prediction, label) if l != -100]
                        for prediction, label in zip(predictions, labels)]

    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)

    return all_metrics

# Parameter optimization

## Define the hyperparameter search

In [19]:
# !pip install transformers optuna datasets

In [20]:
import optuna
from transformers import Trainer, TrainingArguments, BertForTokenClassification
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)
def model_init():
    return BertForTokenClassification.from_pretrained(model_name, num_labels=len(unique_labels))

def compute_metrics(eval_preds, label_class=unique_labels):
    logits, labels = eval_preds
    # becase the logics and probabilities both are in the same order, we don't need to aply softmax here
    predictions = np.argmax(logits, axis=-1)
    # now we need to remove all the values, where the label is -100
    # before passing to metric.compute we should have these inputs as a list
    true_labels = [[label_class[l] for l in label if l != -100]
                   for label in labels]

    true_predictions = [[label_class[p] for p,l in zip(prediction, label) if l != -100]
                        for prediction, label in zip(predictions, labels)]

    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)

    return all_metrics

def objective(trial):
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
    batch_size = trial.suggest_categorical('batch_size', [16, 32, 64])
    epochs = trial.suggest_int("num_train_epochs", 7, 8, 9)
    weight_decay = trial.suggest_loguniform("weight_decay", 0.1, 0.3)
    
    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="epoch",
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        weight_decay=weight_decay,
        logging_dir='./logs',
        logging_steps=10,
    )



    trainer = Trainer(
        model_init=model_init,
        args=training_args,
        train_dataset=Train_dataset,
        eval_dataset=Val_dataset,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    
    trainer.train()
    
    eval_result = trainer.evaluate()
    return eval_result

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10)

print("Best trial:")
trial = study.best_trial
print(trial.values)
print("Best hyperparameters: {}".format(trial.params))


[I 2024-07-13 18:59:49,862] A new study created in memory with name: no-name-eaec023f-d28e-4064-ae4b-0210fd895a6b
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
  epochs = trial.suggest_int("num_train_epochs", 7, 8, 9)
  weight_decay = trial.suggest_loguniform("weight_decay", 0.1, 0.3)
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[W 2024-07-13 18:59:57,217] Trial 0 failed with parameters: {'learning_rate': 1.8351014706990397e-05, 'batch

KeyboardInterrupt: 

## Retrain with best parameters