In [1]:

import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
torch.cuda.empty_cache()
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
from sklearn.metrics import confusion_matrix
from torch.nn import CrossEntropyLoss
import random
import nlpaug.augmenter.word as naw
import nltk
from nltk.corpus import wordnet



from peft import LoraConfig, TaskType
from peft import get_peft_model


2024-02-25 12:48:49.764168: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-25 12:48:49.764232: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-25 12:48:49.764262: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-25 12:48:49.772609: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
df = pd.read_csv("de_4.0.csv")
df.columns[df.notnull().any()]

Index(['Unnamed: 0', 'policy_id', 'policy_type', 'segments', 'Segment Text',
       'First Party', 'Third Party', 'Information Type', 'Purpose',
       'Collection Process', 'Legal Basis for Collection',
       'Third-Party Entity', 'Information Type_Computer information',
       'Information Type_Contact information',
       'Information Type_Cookies and tracking elements',
       'Information Type_Demographic data', 'Information Type_Financial',
       'Information Type_Generic personal information',
       'Information Type_IP address and device IDs',
       'Information Type_Location', 'Purpose_Advertising or marketing',
       'Purpose_Analytics or research', 'Purpose_Essential service or feature',
       'Purpose_Legal requirement', 'Purpose_Service operation and security',
       'Collection Process_Collected on first-party website/app',
       'Legal Basis for Collection_Legitimate interests of first or third party',
       'Information Type_User online activities',
       'Col

In [6]:
target_column = 'Collection Process_Collected on first-party website/app'
batch_size = 8
epochs = 3

learning_rates =  [1e-5, 2e-5, 3e-5, 5e-5, 1e-4, 2e-4, 3e-4]

# original

In [24]:
best_lr = None
lowest_eval_loss = float('inf')


best_f1_lr = None
highest_f1 = 0


for lr in learning_rates:
    
#--------------------------------------------------------------------------------    
    tokenizer = AutoTokenizer.from_pretrained("uklfr/gottbert-base")
    model = AutoModelForSequenceClassification.from_pretrained("uklfr/gottbert-base")


 
#-----------------------------------------------------------------------------------           

    X_train = list(df['Segment Text'].loc[(df['policy_type'] == 'TRAIN')])
    y_train = list(df[target_column].loc[(df['policy_type'] == 'TRAIN')])


    # Split the data into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.20, random_state=0)


    # mapping labels

    label_mapping = {0: 0, 1: 1}
    y_train_integers = [label_mapping[label] for label in y_train]
    y_val_integers = [label_mapping[label] for label in y_val]

    train_encodings = tokenizer(X_train, max_length=512, truncation=True, padding='max_length')
    val_encodings = tokenizer(X_val, max_length=512, truncation=True, padding='max_length')

    #
    class TextDataset(torch.utils.data.Dataset):
        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
            item['labels'] = self.labels[idx]  # Removed torch.tensor here
            return item

        def __len__(self):
            return len(self.labels)


    train_dataset = TextDataset(train_encodings, y_train_integers)
    val_dataset = TextDataset(val_encodings, y_val_integers)
    
#---------------------------------------------------------------------------  
    
    # Set up training arguments with the current learning rate
    training_args = TrainingArguments(
        output_dir='./results',
        learning_rate=lr,  # Set learning rate here
        num_train_epochs=epochs ,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=100,
        evaluation_strategy="steps",
        eval_steps=3000
    )

    # Set up the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )


    # Train the model with the current learning rate
    print(f"Training with learning rate: {lr}")
    trainer.train()
    
 # Evaluate the model
    eval_result = trainer.evaluate()
    print(f"Evaluation loss for learning rate {lr}: {eval_result['eval_loss']}")
    
    
    predictions = trainer.predict(val_dataset).predictions
    predictions_label = np.argmax(predictions, axis=1)

    cm=confusion_matrix( y_val_integers, predictions_label)
    print(cm)

    accuracy = accuracy_score( y_val_integers, predictions_label)
    precision, recall, f1, _ = precision_recall_fscore_support(y_val_integers, predictions_label, average='binary')
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")

    if eval_result['eval_loss'] < lowest_eval_loss:
        best_lr = lr
        lowest_eval_loss = eval_result['eval_loss']

    if f1 > highest_f1:
        best_f1_lr = lr
        highest_f1 = f1
        

print(f"Best Learning Rate: {best_lr} with loss: {lowest_eval_loss}")
print(f"Best f1 Learning Rat: {best_f1_lr} with highest f1: { highest_f1}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training with learning rate: 1e-05


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 1e-05: 0.16487768292427063
[[1039   24]
 [  16   55]]
Accuracy: 0.9647266313932981
Precision: 0.6962025316455697
Recall: 0.7746478873239436
F1 Score: 0.7333333333333333


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training with learning rate: 2e-05


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 2e-05: 0.1583331823348999
[[1039   24]
 [  20   51]]
Accuracy: 0.9611992945326279
Precision: 0.68
Recall: 0.7183098591549296
F1 Score: 0.6986301369863014


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training with learning rate: 3e-05


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 3e-05: 0.1545935720205307
[[1039   24]
 [  17   54]]
Accuracy: 0.9638447971781305
Precision: 0.6923076923076923
Recall: 0.7605633802816901
F1 Score: 0.7248322147651006


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training with learning rate: 5e-05


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 5e-05: 0.1748676300048828
[[1038   25]
 [  22   49]]
Accuracy: 0.9585537918871252
Precision: 0.6621621621621622
Recall: 0.6901408450704225
F1 Score: 0.6758620689655171


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training with learning rate: 0.0001


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 0.0001: 0.2784275412559509
[[1063    0]
 [  71    0]]
Accuracy: 0.937389770723104
Precision: 0.0
Recall: 0.0
F1 Score: 0.0


  _warn_prf(average, modifier, msg_start, len(result))
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training with learning rate: 0.0002


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 0.0002: 0.27561426162719727
[[1063    0]
 [  71    0]]
Accuracy: 0.937389770723104
Precision: 0.0
Recall: 0.0
F1 Score: 0.0


  _warn_prf(average, modifier, msg_start, len(result))
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training with learning rate: 0.0003


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 0.0003: 0.27934563159942627
[[1063    0]
 [  71    0]]
Accuracy: 0.937389770723104
Precision: 0.0
Recall: 0.0
F1 Score: 0.0
Best Learning Rate: 3e-05 with loss: 0.1545935720205307
Best f1 Learning Rat: 1e-05 with highest f1: 0.7333333333333333


  _warn_prf(average, modifier, msg_start, len(result))


### Retrain the model with certain learning rate


In [25]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-german-cased")
model = BertForSequenceClassification.from_pretrained("bert-base-german-cased")

#-----------------------------------------------------------------------------------     

X_train=list(df['Segment Text'].loc[(df['policy_type'] == 'TRAIN')])
X_test=list(df['Segment Text'].loc[(df['policy_type'] == 'TEST')])

y_train=list(df[target_column].loc[(df['policy_type'] == 'TRAIN')])
y_test=list(df[target_column].loc[(df['policy_type'] == 'TEST')])


train_encodings = tokenizer(X_train, max_length=512, truncation=True, padding='max_length')
test_encodings = tokenizer(X_test, max_length=512, truncation=True, padding='max_length')

#-----------

label_mapping = {0: 0, 1: 1}
y_train_integers = [label_mapping[label] for label in y_train]
y_test_integers = [label_mapping[label] for label in y_test]


class TextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]  # Removed torch.tensor here
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = TextDataset(train_encodings, y_train_integers)
test_dataset = TextDataset(test_encodings, y_test_integers)

#--------------------------

training_args = TrainingArguments(
    output_dir='./results',
    learning_rate= best_f1_lr , # Set learning rate here
    num_train_epochs=epochs ,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    evaluation_strategy="steps",
    eval_steps=3000

)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

trainer.train()


predictions = trainer.predict(test_dataset).predictions
predictions_label = np.argmax(predictions, axis=1)

cm=confusion_matrix(y_test_integers, predictions_label)
print(cm)

accuracy = accuracy_score(y_test_integers, predictions_label)
precision, recall, f1, _ = precision_recall_fscore_support(y_test_integers, predictions_label, average='binary')
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

print(f"Best f1 Learning Rat: {best_f1_lr}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


[[1134   18]
 [  10   41]]
Accuracy: 0.9767248545303409
Precision: 0.6949152542372882
Recall: 0.803921568627451
F1 Score: 0.7454545454545455
Best f1 Learning Rat: 1e-05


# Fine Tuning last 2 layers 

In [7]:

best_lr = None
lowest_eval_loss = float('inf')

best_f1_lr = None
highest_f1 = 0



for lr in learning_rates:
    
#--------------------------------------------------------------------------------    
    tokenizer = AutoTokenizer.from_pretrained("uklfr/gottbert-base")
    model = AutoModelForSequenceClassification.from_pretrained("uklfr/gottbert-base")


    # Freeze all parameters
    for param in model.parameters():
        param.requires_grad = False

    # Unfreeze the last two layers of the RoBERTa model
    # Access the layers as model.roberta.encoder.layer
    for module in [model.roberta.encoder.layer[-2], model.roberta.encoder.layer[-1]]:
        for param in module.parameters():
            param.requires_grad = True
            
#-----------------------------------------------------------------------------------           

    X_train = list(df['Segment Text'].loc[(df['policy_type'] == 'TRAIN')])
    y_train = list(df[target_column].loc[(df['policy_type'] == 'TRAIN')])


    # Split the data into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.20, random_state=0)


    # mapping labels

    label_mapping = {0: 0, 1: 1}
    y_train_integers = [label_mapping[label] for label in y_train]
    y_val_integers = [label_mapping[label] for label in y_val]

    train_encodings = tokenizer(X_train, max_length=512, truncation=True, padding='max_length')
    val_encodings = tokenizer(X_val, max_length=512, truncation=True, padding='max_length')

    #
    class TextDataset(torch.utils.data.Dataset):
        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
            item['labels'] = self.labels[idx]  # Removed torch.tensor here
            return item

        def __len__(self):
            return len(self.labels)


    train_dataset = TextDataset(train_encodings, y_train_integers)
    val_dataset = TextDataset(val_encodings, y_val_integers)
    
#---------------------------------------------------------------------------  
    
    # Set up training arguments with the current learning rate
    training_args = TrainingArguments(
        output_dir='./results',
        learning_rate=lr,  # Set learning rate here
        num_train_epochs=epochs ,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=100,
        evaluation_strategy="steps",
        eval_steps=3000
    )

    # Set up the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )


    # Train the model with the current learning rate
    print(f"Training with learning rate: {lr}")
    trainer.train()
    
 # Evaluate the model
    eval_result = trainer.evaluate()
    print(f"Evaluation loss for learning rate {lr}: {eval_result['eval_loss']}")
    
    
    predictions = trainer.predict(val_dataset).predictions
    predictions_label = np.argmax(predictions, axis=1)

    cm=confusion_matrix( y_val_integers, predictions_label)
    print(cm)

    accuracy = accuracy_score( y_val_integers, predictions_label)
    precision, recall, f1, _ = precision_recall_fscore_support(y_val_integers, predictions_label, average='binary')
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")

    if eval_result['eval_loss'] < lowest_eval_loss:
        best_lr = lr
        lowest_eval_loss = eval_result['eval_loss']
        
    if f1 > highest_f1:
        best_f1_lr = lr
        highest_f1 = f1
        
print(f"Best Learning Rate: {best_lr} with loss: {lowest_eval_loss}")
print(f"Best f1 Learning Rat: {best_f1_lr} with highest f1: { highest_f1}")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at uklfr/gottbert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training with learning rate: 1e-05


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 1e-05: 0.19040130078792572
[[1074    0]
 [  60    0]]
Accuracy: 0.9470899470899471
Precision: 0.0
Recall: 0.0
F1 Score: 0.0


  _warn_prf(average, modifier, msg_start, len(result))
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at uklfr/gottbert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training with learning rate: 2e-05


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 2e-05: 0.181021049618721
[[1074    0]
 [  60    0]]
Accuracy: 0.9470899470899471
Precision: 0.0
Recall: 0.0
F1 Score: 0.0


  _warn_prf(average, modifier, msg_start, len(result))
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at uklfr/gottbert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training with learning rate: 3e-05


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 3e-05: 0.1750197410583496
[[1071    3]
 [  56    4]]
Accuracy: 0.9479717813051146
Precision: 0.5714285714285714
Recall: 0.06666666666666667
F1 Score: 0.11940298507462686


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at uklfr/gottbert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training with learning rate: 5e-05


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 5e-05: 0.16854016482830048
[[1067    7]
 [  49   11]]
Accuracy: 0.9506172839506173
Precision: 0.6111111111111112
Recall: 0.18333333333333332
F1 Score: 0.282051282051282


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at uklfr/gottbert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training with learning rate: 0.0001


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 0.0001: 0.1695026159286499
[[1065    9]
 [  45   15]]
Accuracy: 0.9523809523809523
Precision: 0.625
Recall: 0.25
F1 Score: 0.35714285714285715


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at uklfr/gottbert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training with learning rate: 0.0002


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 0.0002: 0.15718339383602142
[[1066    8]
 [  53    7]]
Accuracy: 0.9462081128747796
Precision: 0.4666666666666667
Recall: 0.11666666666666667
F1 Score: 0.18666666666666668


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at uklfr/gottbert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training with learning rate: 0.0003


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 0.0003: 0.15607497096061707
[[1062   12]
 [  49   11]]
Accuracy: 0.9462081128747796
Precision: 0.4782608695652174
Recall: 0.18333333333333332
F1 Score: 0.2650602409638554
Best Learning Rate: 0.0003 with loss: 0.15607497096061707
Best f1 Learning Rat: 0.0001 with highest f1: 0.35714285714285715


### Retrain the model with certain learning rate

In [8]:
tokenizer = AutoTokenizer.from_pretrained("uklfr/gottbert-base")
model = AutoModelForSequenceClassification.from_pretrained("uklfr/gottbert-base")


# Freeze all parameters
for param in model.parameters():
    param.requires_grad = False

# Unfreeze the last two layers of the RoBERTa model
# Access the layers as model.roberta.encoder.layer
for module in [model.roberta.encoder.layer[-2], model.roberta.encoder.layer[-1]]:
    for param in module.parameters():
        param.requires_grad = True
            
#-----------------------------------------------------------------------------------     

X_train=list(df['Segment Text'].loc[(df['policy_type'] == 'TRAIN')])
X_test=list(df['Segment Text'].loc[(df['policy_type'] == 'TEST')])

y_train=list(df[target_column].loc[(df['policy_type'] == 'TRAIN')])
y_test=list(df[target_column].loc[(df['policy_type'] == 'TEST')])


train_encodings = tokenizer(X_train, max_length=512, truncation=True, padding='max_length')
test_encodings = tokenizer(X_test, max_length=512, truncation=True, padding='max_length')

#-----------

label_mapping = {0: 0, 1: 1}
y_train_integers = [label_mapping[label] for label in y_train]
y_test_integers = [label_mapping[label] for label in y_test]


class TextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]  # Removed torch.tensor here
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = TextDataset(train_encodings, y_train_integers)
test_dataset = TextDataset(test_encodings, y_test_integers)

#--------------------------

training_args = TrainingArguments(
    output_dir='./results',
    learning_rate= best_f1_lr , # Set learning rate here
    num_train_epochs=epochs ,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    evaluation_strategy="steps",
    eval_steps=3000

)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

trainer.train()

predictions = trainer.predict(test_dataset).predictions
predictions_label = np.argmax(predictions, axis=1)

cm=confusion_matrix(y_test_integers, predictions_label)
print(cm)

accuracy = accuracy_score(y_test_integers, predictions_label)
precision, recall, f1, _ = precision_recall_fscore_support(y_test_integers, predictions_label, average='binary')
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

print(f"Best f1 Learning Rat: {best_f1_lr} ")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at uklfr/gottbert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


[[1133    7]
 [  45   18]]
Accuracy: 0.9567747298420615
Precision: 0.72
Recall: 0.2857142857142857
F1 Score: 0.409090909090909
Best f1 Learning Rat: 0.0001 


# Imbalance / PEFT /Cost sensitve learning

In [9]:
best_lr = None
lowest_eval_loss = float('inf')


best_f1_lr = None
highest_f1 = 0




for lr in learning_rates:
    
#--------------------------------------------------------------------------------    
    tokenizer = AutoTokenizer.from_pretrained("uklfr/gottbert-base")
    model = AutoModelForSequenceClassification.from_pretrained("uklfr/gottbert-base")


    # Freeze all parameters
    for param in model.parameters():
        param.requires_grad = False

    # Unfreeze the last two layers of the RoBERTa model
    # Access the layers as model.roberta.encoder.layer
    for module in [model.roberta.encoder.layer[-2], model.roberta.encoder.layer[-1]]:
        for param in module.parameters():
            param.requires_grad = True
#-----------------------------------------------------------------------------------           

    X_train = list(df['Segment Text'].loc[(df['policy_type'] == 'TRAIN')])
    y_train = list(df[target_column].loc[(df['policy_type'] == 'TRAIN')])


    # Split the data into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.20, random_state=0)


    # mapping labels

    label_mapping = {0: 0, 1: 1}
    y_train_integers = [label_mapping[label] for label in y_train]
    y_val_integers = [label_mapping[label] for label in y_val]

    train_encodings = tokenizer(X_train, max_length=512, truncation=True, padding='max_length')
    val_encodings = tokenizer(X_val, max_length=512, truncation=True, padding='max_length')

    #
    class TextDataset(torch.utils.data.Dataset):
        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
            item['labels'] = self.labels[idx]  # Removed torch.tensor here
            return item

        def __len__(self):
            return len(self.labels)


    train_dataset = TextDataset(train_encodings, y_train_integers)
    val_dataset = TextDataset(val_encodings, y_val_integers)
    
#---------------------------------------------------------------------------  


    from collections import Counter
    print(Counter(y_train))
    print(Counter(y_val))
    
        # Calculate class weights (example calculation, adjust as needed)

    total_samples = Counter(y_train)[0] + Counter(y_train)[1]
    class_weights = torch.tensor([total_samples /Counter(y_train)[0], total_samples / Counter(y_train)[1]], dtype=torch.float)

    print(total_samples /Counter(y_train)[0])
    print(total_samples / Counter(y_train)[1])

    # Ensure the model is on the correct device (e.g., GPU or CPU)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)  # Ensure model is on the correct device

    # Move class_weights to the same device as the model
    class_weights = class_weights.to(device)



    class CustomTrainer(Trainer):
        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.loss_fct = CrossEntropyLoss(weight=class_weights).to(device)

        def compute_loss(self, model, inputs, return_outputs=False):
            inputs = {k: v.to(device) for k, v in inputs.items()}
            labels = inputs.get("labels").to(device)

            outputs = model(**inputs)
            logits = outputs.logits

            loss = self.loss_fct(logits, labels)
            return (loss, outputs) if return_outputs else loss

    #-------------------------------------------------
    
    
    
    
    # Set up training arguments with the current learning rate
    training_args = TrainingArguments(
        output_dir='./results',
        learning_rate=lr,  # Set learning rate here
        num_train_epochs=epochs ,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=100,
        evaluation_strategy="steps",
        eval_steps=3000
    )

    # Set up the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )



    # Train the model with the current learning rate
    print(f"Training with learning rate: {lr}")
    trainer.train()
    
 # Evaluate the model
    eval_result = trainer.evaluate()
    print(f"Evaluation loss for learning rate {lr}: {eval_result['eval_loss']}")
    
    
    predictions = trainer.predict(val_dataset).predictions
    predictions_label = np.argmax(predictions, axis=1)

    cm=confusion_matrix( y_val_integers, predictions_label)
    print(cm)

    accuracy = accuracy_score( y_val_integers, predictions_label)
    precision, recall, f1, _ = precision_recall_fscore_support(y_val_integers, predictions_label, average='binary')
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")

    if eval_result['eval_loss'] < lowest_eval_loss:
        best_lr = lr
        lowest_eval_loss = eval_result['eval_loss']

    if f1 > highest_f1:
        best_f1_lr = lr
        highest_f1 = f1
        

print(f"Best Learning Rate: {best_lr} with loss: {lowest_eval_loss}")
print(f"Best f1 Learning Rat: {best_f1_lr} with highest f1: { highest_f1}")


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at uklfr/gottbert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Counter({0: 4370, 1: 164})
Counter({0: 1074, 1: 60})
1.037528604118993
27.646341463414632
Training with learning rate: 1e-05


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 1e-05: 0.19040130078792572
[[1074    0]
 [  60    0]]
Accuracy: 0.9470899470899471
Precision: 0.0
Recall: 0.0
F1 Score: 0.0


  _warn_prf(average, modifier, msg_start, len(result))
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at uklfr/gottbert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Counter({0: 4370, 1: 164})
Counter({0: 1074, 1: 60})
1.037528604118993
27.646341463414632
Training with learning rate: 2e-05


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 2e-05: 0.181021049618721
[[1074    0]
 [  60    0]]
Accuracy: 0.9470899470899471
Precision: 0.0
Recall: 0.0
F1 Score: 0.0


  _warn_prf(average, modifier, msg_start, len(result))
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at uklfr/gottbert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Counter({0: 4370, 1: 164})
Counter({0: 1074, 1: 60})
1.037528604118993
27.646341463414632
Training with learning rate: 3e-05


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 3e-05: 0.1750197410583496
[[1071    3]
 [  56    4]]
Accuracy: 0.9479717813051146
Precision: 0.5714285714285714
Recall: 0.06666666666666667
F1 Score: 0.11940298507462686


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at uklfr/gottbert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Counter({0: 4370, 1: 164})
Counter({0: 1074, 1: 60})
1.037528604118993
27.646341463414632
Training with learning rate: 5e-05


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 5e-05: 0.16854016482830048
[[1067    7]
 [  49   11]]
Accuracy: 0.9506172839506173
Precision: 0.6111111111111112
Recall: 0.18333333333333332
F1 Score: 0.282051282051282


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at uklfr/gottbert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Counter({0: 4370, 1: 164})
Counter({0: 1074, 1: 60})
1.037528604118993
27.646341463414632
Training with learning rate: 0.0001


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 0.0001: 0.1695026159286499
[[1065    9]
 [  45   15]]
Accuracy: 0.9523809523809523
Precision: 0.625
Recall: 0.25
F1 Score: 0.35714285714285715


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at uklfr/gottbert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Counter({0: 4370, 1: 164})
Counter({0: 1074, 1: 60})
1.037528604118993
27.646341463414632
Training with learning rate: 0.0002


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 0.0002: 0.15718339383602142
[[1066    8]
 [  53    7]]
Accuracy: 0.9462081128747796
Precision: 0.4666666666666667
Recall: 0.11666666666666667
F1 Score: 0.18666666666666668


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at uklfr/gottbert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Counter({0: 4370, 1: 164})
Counter({0: 1074, 1: 60})
1.037528604118993
27.646341463414632
Training with learning rate: 0.0003


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 0.0003: 0.15607497096061707
[[1062   12]
 [  49   11]]
Accuracy: 0.9462081128747796
Precision: 0.4782608695652174
Recall: 0.18333333333333332
F1 Score: 0.2650602409638554
Best Learning Rate: 0.0003 with loss: 0.15607497096061707
Best f1 Learning Rat: 0.0001 with highest f1: 0.35714285714285715


### Retrain the model with certain learning rate

In [10]:
tokenizer = AutoTokenizer.from_pretrained("uklfr/gottbert-base")
model = AutoModelForSequenceClassification.from_pretrained("uklfr/gottbert-base")


# Freeze all parameters
for param in model.parameters():
    param.requires_grad = False

# Unfreeze the last two layers of the RoBERTa model
# Access the layers as model.roberta.encoder.layer
for module in [model.roberta.encoder.layer[-2], model.roberta.encoder.layer[-1]]:
    for param in module.parameters():
        param.requires_grad = True
            
                       
#-----------------------------------------------------------------------------------     

X_train=list(df['Segment Text'].loc[(df['policy_type'] == 'TRAIN')])
X_test=list(df['Segment Text'].loc[(df['policy_type'] == 'TEST')])

y_train=list(df[target_column].loc[(df['policy_type'] == 'TRAIN')])
y_test=list(df[target_column].loc[(df['policy_type'] == 'TEST')])


train_encodings = tokenizer(X_train, max_length=512, truncation=True, padding='max_length')
test_encodings = tokenizer(X_test, max_length=512, truncation=True, padding='max_length')

#-----------

label_mapping = {0: 0, 1: 1}
y_train_integers = [label_mapping[label] for label in y_train]
y_test_integers = [label_mapping[label] for label in y_test]


class TextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]  # Removed torch.tensor here
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = TextDataset(train_encodings, y_train_integers)
test_dataset = TextDataset(test_encodings, y_test_integers)

#--------------------------


from collections import Counter
print(Counter(y_train))
print(Counter(y_test))


# Calculate class weights (example calculation, adjust as needed)

total_samples = Counter(y_train)[0] + Counter(y_train)[1]
class_weights = torch.tensor([total_samples /Counter(y_train)[0], total_samples / Counter(y_train)[1]], dtype=torch.float)

print(total_samples /Counter(y_train)[0])
print(total_samples / Counter(y_train)[1])

# Ensure the model is on the correct device (e.g., GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # Ensure model is on the correct device

# Move class_weights to the same device as the model
class_weights = class_weights.to(device)



class CustomTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_fct = CrossEntropyLoss(weight=class_weights).to(device)

    def compute_loss(self, model, inputs, return_outputs=False):
        inputs = {k: v.to(device) for k, v in inputs.items()}
        labels = inputs.get("labels").to(device)

        outputs = model(**inputs)
        logits = outputs.logits

        loss = self.loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

#-------------------------------------------------


training_args = TrainingArguments(
    learning_rate= best_f1_lr,
    output_dir='./results',
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    #logging_dir='./logs',
    logging_steps=100,
    evaluation_strategy="steps",
    eval_steps=3000

)

trainer = CustomTrainer(
    model=model,  # Your model
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)


trainer.train()


predictions = trainer.predict(test_dataset).predictions
predictions_label = np.argmax(predictions, axis=1)

cm=confusion_matrix(y_test_integers, predictions_label)
print(cm)

accuracy = accuracy_score(y_test_integers, predictions_label)
precision, recall, f1, _ = precision_recall_fscore_support(y_test_integers, predictions_label, average='binary')

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

print(f"Best f1 Learning Rat: {best_f1_lr}")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at uklfr/gottbert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Counter({0: 5444, 1: 224})
Counter({0: 1140, 1: 63})
1.0411462160176341
25.303571428571427


Step,Training Loss,Validation Loss


[[1127   13]
 [  43   20]]
Accuracy: 0.9534497090606816
Precision: 0.6060606060606061
Recall: 0.31746031746031744
F1 Score: 0.4166666666666667
Best f1 Learning Rat: 0.0001


In [24]:
target_column

'Collection Process_Collected on first-party website/app'

# Imbalance / AEDA

In [13]:

def aeda(text):
    """
    Apply AEDA augmentation to a given text.

    :param text: The input text string to be augmented.
    :return: Augmented sentence.
    """
    punctuation_marks = [".", ";", "?", ":", "!", ""]
    words = text.split()

    n_insertions = random.randint(1, max(1, len(words) // 3))
    insert_positions = np.random.choice(len(words), n_insertions, replace=False)

    for pos in sorted(insert_positions, reverse=True):
        words.insert(pos, random.choice(punctuation_marks))

    return ' '.join(words)

def balance_classes(df, column_name, text_column , modification_function=aeda):
    """
    Balances the classes in a DataFrame.

    Parameters:
    - df: pandas DataFrame to balance.
    - column_name: the name of the column to balance by (e.g., 'Third Party').
    - text_column: the name of the text column to modify (e.g., 'Segment Text').
    - modification_function: the function to apply to modify the text (default: aeda).

    Returns:
    - A balanced DataFrame.
    """
    # Count the instances of each class
    class_counts = df[column_name].value_counts()
    max_class = class_counts.idxmax()
    min_class = class_counts.idxmin()

    # Calculate the number of samples needed to balance the classes
    count_diff = class_counts[max_class] - class_counts[min_class]


    # Select rows of the minority class and duplicate them
    df_minority = df[df[column_name] == min_class]
    df_minority_augmented = df_minority.sample(count_diff, replace=True)

    # Apply the modification function to the text column
    df_minority_augmented[text_column] = df_minority_augmented[text_column].apply(modification_function)

    # Combine the original DataFrame with the augmented rows
    df_balanced = pd.concat([df, df_minority_augmented], ignore_index=True)

    return df_balanced



In [14]:

best_lr = None
lowest_eval_loss = float('inf')

best_f1_lr = None
highest_f1 = 0


for lr in learning_rates:
    
#--------------------------------------------------------------------------------    

    tokenizer = AutoTokenizer.from_pretrained("uklfr/gottbert-base")
    model = AutoModelForSequenceClassification.from_pretrained("uklfr/gottbert-base")


    # Freeze all parameters
    for param in model.parameters():
        param.requires_grad = False

    # Unfreeze the last two layers of the RoBERTa model
    # Access the layers as model.roberta.encoder.layer
    for module in [model.roberta.encoder.layer[-2], model.roberta.encoder.layer[-1]]:
        for param in module.parameters():
            param.requires_grad = True
#-----------------------------------------------------------------------------------  
    df_train = df[['Segment Text',target_column]].loc[df['policy_type'] == 'TRAIN']
    
    X = df_train['Segment Text']
    y = df_train[target_column]


    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_state=0)

    df_train = pd.concat([X_train, y_train], axis=1)
    print(df_train[target_column].value_counts())

    df_train_aug = balance_classes(df_train,target_column,'Segment Text' )
    print(df_train_aug[target_column].value_counts())

    X_train=list(df_train_aug['Segment Text'])
    y_train=list(df_train_aug[target_column])
    

    X_val=list(X_val)
    y_val=list( y_val)


    # mapping labels

    label_mapping = {0: 0, 1: 1}
    y_train_integers = [label_mapping[label] for label in y_train]
    y_val_integers = [label_mapping[label] for label in y_val]

    train_encodings = tokenizer(X_train, max_length=512, truncation=True, padding='max_length')
    val_encodings = tokenizer(X_val, max_length=512, truncation=True, padding='max_length')


    #
    class TextDataset(torch.utils.data.Dataset):
        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
            item['labels'] = self.labels[idx]  # Removed torch.tensor here
            return item

        def __len__(self):
            return len(self.labels)


    train_dataset = TextDataset(train_encodings, y_train_integers)
    val_dataset = TextDataset(val_encodings, y_val_integers)



#---------------------------------------------------------------------------  
    
    # Set up training arguments with the current learning rate
    training_args = TrainingArguments(
        output_dir='./results',
        learning_rate=lr,  # Set learning rate here
        num_train_epochs=epochs ,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=100,
        evaluation_strategy="steps",
        eval_steps=3000
    )

    # Set up the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )


    # Train the model with the current learning rate
    print(f"Training with learning rate: {lr}")
    trainer.train()
    
 # Evaluate the model
    eval_result = trainer.evaluate()
    print(f"Evaluation loss for learning rate {lr}: {eval_result['eval_loss']}")
    
    
    predictions = trainer.predict(val_dataset).predictions
    predictions_label = np.argmax(predictions, axis=1)

    cm=confusion_matrix( y_val_integers, predictions_label)
    print(cm)

    accuracy = accuracy_score( y_val_integers, predictions_label)
    precision, recall, f1, _ = precision_recall_fscore_support(y_val_integers, predictions_label, average='binary')
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")

    if eval_result['eval_loss'] < lowest_eval_loss:
        best_lr = lr
        lowest_eval_loss = eval_result['eval_loss']
            
    if f1 > highest_f1:
        best_f1_lr = lr
        highest_f1 = f1
        

print(f"Best Learning Rate: {best_lr} with loss: {lowest_eval_loss}")
print(f"Best f1 Learning Rat: {best_f1_lr} with highest f1: { highest_f1}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Collection Process
0    4136
1     398
Name: count, dtype: int64
Collection Process
0    4136
1    4136
Name: count, dtype: int64
Training with learning rate: 1e-05


Step,Training Loss,Validation Loss
3000,0.1283,0.371673


Evaluation loss for learning rate 1e-05: 0.3782637119293213
[[950  67]
 [ 53  64]]
Accuracy: 0.8941798941798942
Precision: 0.48854961832061067
Recall: 0.5470085470085471
F1 Score: 0.5161290322580645


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Collection Process
0    4136
1     398
Name: count, dtype: int64
Collection Process
0    4136
1    4136
Name: count, dtype: int64
Training with learning rate: 2e-05


Step,Training Loss,Validation Loss
3000,0.0881,0.385995


Evaluation loss for learning rate 2e-05: 0.39945146441459656
[[952  65]
 [ 49  68]]
Accuracy: 0.8994708994708994
Precision: 0.5112781954887218
Recall: 0.5811965811965812
F1 Score: 0.544


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Collection Process
0    4136
1     398
Name: count, dtype: int64
Collection Process
0    4136
1    4136
Name: count, dtype: int64
Training with learning rate: 3e-05


Step,Training Loss,Validation Loss
3000,0.0712,0.397506


Evaluation loss for learning rate 3e-05: 0.41345664858818054
[[953  64]
 [ 47  70]]
Accuracy: 0.9021164021164021
Precision: 0.5223880597014925
Recall: 0.5982905982905983
F1 Score: 0.5577689243027888


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Collection Process
0    4136
1     398
Name: count, dtype: int64
Collection Process
0    4136
1    4136
Name: count, dtype: int64
Training with learning rate: 5e-05


Step,Training Loss,Validation Loss
3000,0.0505,0.400942


Evaluation loss for learning rate 5e-05: 0.40742531418800354
[[964  53]
 [ 50  67]]
Accuracy: 0.9091710758377425
Precision: 0.5583333333333333
Recall: 0.5726495726495726
F1 Score: 0.5654008438818565


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Collection Process
0    4136
1     398
Name: count, dtype: int64
Collection Process
0    4136
1    4136
Name: count, dtype: int64
Training with learning rate: 0.0001


Step,Training Loss,Validation Loss
3000,0.0482,0.426697


Evaluation loss for learning rate 0.0001: 0.42695868015289307
[[964  53]
 [ 50  67]]
Accuracy: 0.9091710758377425
Precision: 0.5583333333333333
Recall: 0.5726495726495726
F1 Score: 0.5654008438818565


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Collection Process
0    4136
1     398
Name: count, dtype: int64
Collection Process
0    4136
1    4136
Name: count, dtype: int64
Training with learning rate: 0.0002


Step,Training Loss,Validation Loss
3000,0.0816,0.351737


Evaluation loss for learning rate 0.0002: 0.355380654335022
[[959  58]
 [ 51  66]]
Accuracy: 0.9038800705467372
Precision: 0.532258064516129
Recall: 0.5641025641025641
F1 Score: 0.5477178423236515


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Collection Process
0    4136
1     398
Name: count, dtype: int64
Collection Process
0    4136
1    4136
Name: count, dtype: int64
Training with learning rate: 0.0003


Step,Training Loss,Validation Loss
3000,0.092,0.323472


Evaluation loss for learning rate 0.0003: 0.3287490904331207
[[961  56]
 [ 52  65]]
Accuracy: 0.9047619047619048
Precision: 0.5371900826446281
Recall: 0.5555555555555556
F1 Score: 0.5462184873949579
Best Learning Rate: 0.0003 with loss: 0.3287490904331207
Best f1 Learning Rat: 5e-05 with highest f1: 0.5654008438818565


### Retrain the model with certain learning rate

In [15]:

tokenizer = AutoTokenizer.from_pretrained("bert-base-german-cased")
model = BertForSequenceClassification.from_pretrained("bert-base-german-cased")



# Freeze all parameters
for param in model.parameters():
    param.requires_grad = False

# Unfreeze the last two layers of the BERT model
# BERT uses an encoder structure, so you will need to access its layers accordingly
for module in [ model.bert.encoder.layer[-2], model.bert.encoder.layer[-1]]:
    for param in module.parameters():
        param.requires_grad = True

                       
#-----------------------------------------------------------------------------------     
df_train = df[['Segment Text',target_column]].loc[df['policy_type'] == 'TRAIN']
print(df_train[target_column].value_counts())

df_train_aug = balance_classes(df_train,target_column,'Segment Text' )
print(df_train_aug[target_column].value_counts())


X_train=list(df_train_aug['Segment Text'])
y_train=list(df_train_aug[target_column])

X_test=list(df['Segment Text'].loc[(df['policy_type'] == 'TEST')])
y_test=list(df[target_column].loc[(df['policy_type'] == 'TEST')])


train_encodings = tokenizer(X_train, max_length=512, truncation=True, padding='max_length')
test_encodings = tokenizer(X_test, max_length=512, truncation=True, padding='max_length')


label_mapping = {0: 0, 1: 1}
y_train_integers = [label_mapping[label] for label in y_train]
y_test_integers = [label_mapping[label] for label in y_test]


class TextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]  # Removed torch.tensor here
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = TextDataset(train_encodings, y_train_integers)
test_dataset = TextDataset(test_encodings, y_test_integers)


#------------------------
from collections import Counter
print(Counter(y_train))
print(Counter(y_test))

#--------------------------

training_args = TrainingArguments(
    output_dir='./results',
    learning_rate= best_f1_lr , # Set learning rate here
    num_train_epochs=epochs ,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    evaluation_strategy="steps",
    eval_steps=3000

)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

trainer.train()

predictions = trainer.predict(test_dataset).predictions
predictions_label = np.argmax(predictions, axis=1)

cm=confusion_matrix(y_test_integers, predictions_label)
print(cm)

accuracy = accuracy_score(y_test_integers, predictions_label)
precision, recall, f1, _ = precision_recall_fscore_support(y_test_integers, predictions_label, average='binary')
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

print(f"Best f1 Learning Rat: {best_f1_lr} ")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Collection Process
0    5153
1     515
Name: count, dtype: int64
Collection Process
0    5153
1    5153
Name: count, dtype: int64
Counter({0: 5153, 1: 5153})
Counter({0: 1063, 1: 140})


Step,Training Loss,Validation Loss
3000,0.0741,0.418247


[[1008   55]
 [  72   68]]
Accuracy: 0.8944305901911886
Precision: 0.5528455284552846
Recall: 0.4857142857142857
F1 Score: 0.5171102661596958
Best f1 Learning Rat: 5e-05 
