In [1]:

import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, BertForSequenceClassification
import torch
torch.cuda.empty_cache()
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
from sklearn.metrics import confusion_matrix
from torch.nn import CrossEntropyLoss
import random
import nlpaug.augmenter.word as naw
import nltk
from nltk.corpus import wordnet



from peft import LoraConfig, TaskType
from peft import get_peft_model


2024-01-28 20:38:48.703460: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-28 20:38:48.703567: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-28 20:38:48.703617: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-28 20:38:48.712418: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
df = pd.read_csv("de_4.0.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,policy_id,policy_type,segments,Segment Text,First Party,Third Party,Information Type,Purpose,Collection Process,...,Information Type_Location,Purpose_Advertising or marketing,Purpose_Analytics or research,Purpose_Essential service or feature,Purpose_Legal requirement,Purpose_Service operation and security,Collection Process_Collected on first-party website/app,Legal Basis for Collection_Legitimate interests of first or third party,Information Type_User online activities,Collection Process_Shared by first party with a third party
0,0,1,TRAIN,"{'attribute': [], 'category': [], 'segment_id'...",Budge Studios-Mobile Apps For Kids Budge Studi...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,TRAIN,"{'attribute': ['Choice Type (opt)'], 'category...",Wenn Sie die Plattformen nutzen oder mit uns o...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,1,TRAIN,"{'attribute': [], 'category': [], 'segment_id'...",Zurück nach oben 2. ESRB PRIVACY CERTIFIED Das...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,1,TRAIN,"{'attribute': [], 'category': [], 'segment_id'...",Zurück nach oben 3. Änderungen DIESER DATENSCH...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,1,TRAIN,"{'attribute': [], 'category': [], 'segment_id'...",Zurück nach oben 4. WIE HANDHABT BUDGE STUDIOS...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
df.columns[df.notnull().any()]


Index(['Unnamed: 0', 'policy_id', 'policy_type', 'segments', 'Segment Text',
       'First Party', 'Third Party', 'Information Type', 'Purpose',
       'Collection Process', 'Legal Basis for Collection',
       'Third-Party Entity', 'Information Type_Computer information',
       'Information Type_Contact information',
       'Information Type_Cookies and tracking elements',
       'Information Type_Demographic data', 'Information Type_Financial',
       'Information Type_Generic personal information',
       'Information Type_IP address and device IDs',
       'Information Type_Location', 'Purpose_Advertising or marketing',
       'Purpose_Analytics or research', 'Purpose_Essential service or feature',
       'Purpose_Legal requirement', 'Purpose_Service operation and security',
       'Collection Process_Collected on first-party website/app',
       'Legal Basis for Collection_Legitimate interests of first or third party',
       'Information Type_User online activities',
       'Col

In [3]:
target_column =  'First Party'
batch_size = 8
epochs = 3


# original

# Fine Tuning last 2 layers 

In [4]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-german-cased")
model = BertForSequenceClassification.from_pretrained("bert-base-german-cased")



# Freeze all parameters
for param in model.parameters():
    param.requires_grad = False

# Unfreeze the last two layers of the BERT model
# BERT uses an encoder structure, so you will need to access its layers accordingly
for module in [ model.bert.encoder.layer[-2], model.bert.encoder.layer[-1]]:
    for param in module.parameters():
        param.requires_grad = True

#---------------------------------------------------------------------------



X_train=list(df['Segment Text'].loc[(df['policy_type'] == 'TRAIN')])
X_test=list(df['Segment Text'].loc[(df['policy_type'] == 'TEST')])

y_train=list(df[target_column].loc[(df['policy_type'] == 'TRAIN')])
y_test=list(df[target_column].loc[(df['policy_type'] == 'TEST')])

train_encodings = tokenizer(X_train, max_length=512, truncation=True, padding='max_length')
test_encodings = tokenizer(X_test, max_length=512, truncation=True, padding='max_length')

label_mapping = {0: 0, 1: 1}
y_train_integers = [label_mapping[label] for label in y_train]
y_test_integers = [label_mapping[label] for label in y_test]


class TextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]  # Removed torch.tensor here
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = TextDataset(train_encodings, y_train_integers)
test_dataset = TextDataset(test_encodings, y_test_integers)

from collections import Counter
print(Counter(y_train))
print(Counter(y_test))


torch.cuda.empty_cache()  

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    #logging_dir='./logs',
    logging_steps=100,
    evaluation_strategy="steps",
    eval_steps=3000

)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

trainer.train()


predictions = trainer.predict(test_dataset).predictions
predictions_label = np.argmax(predictions, axis=1)

cm=confusion_matrix(y_test_integers, predictions_label)
print(cm)

accuracy = accuracy_score(y_test_integers, predictions_label)
precision, recall, f1, _ = precision_recall_fscore_support(y_test_integers, predictions_label, average='binary')
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Counter({0: 3862, 1: 1806})
Counter({0: 819, 1: 384})


Step,Training Loss,Validation Loss


[[713 106]
 [100 284]]
Accuracy: 0.828761429758936
Precision: 0.7282051282051282
Recall: 0.7395833333333334
F1 Score: 0.7338501291989664


# Imbalance / PEFT /Cost sensitve learning

In [5]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-german-cased")
model = BertForSequenceClassification.from_pretrained("bert-base-german-cased")



# Freeze all parameters
for param in model.parameters():
    param.requires_grad = False

# Unfreeze the last two layers of the BERT model
# BERT uses an encoder structure, so you will need to access its layers accordingly
for module in [ model.bert.encoder.layer[-2], model.bert.encoder.layer[-1]]:
    for param in module.parameters():
        param.requires_grad = True



X_train=list(df['Segment Text'].loc[(df['policy_type'] == 'TRAIN')])
X_test=list(df['Segment Text'].loc[(df['policy_type'] == 'TEST')])

y_train=list(df[target_column].loc[(df['policy_type'] == 'TRAIN')])
y_test=list(df[target_column].loc[(df['policy_type'] == 'TEST')])

train_encodings = tokenizer(X_train, max_length=512, truncation=True, padding='max_length')
test_encodings = tokenizer(X_test, max_length=512, truncation=True, padding='max_length')

label_mapping = {0: 0, 1: 1}
y_train_integers = [label_mapping[label] for label in y_train]
y_test_integers = [label_mapping[label] for label in y_test]


class TextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]  # Removed torch.tensor here
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = TextDataset(train_encodings, y_train_integers)
test_dataset = TextDataset(test_encodings, y_test_integers)

from collections import Counter
print(Counter(y_train))
print(Counter(y_test))


# Calculate class weights (example calculation, adjust as needed)

total_samples = Counter(y_train)[0] + Counter(y_train)[1]
class_weights = torch.tensor([total_samples /Counter(y_train)[0], total_samples / Counter(y_train)[1]], dtype=torch.float)

print(total_samples /Counter(y_train)[0])
print(total_samples / Counter(y_train)[1])

# Ensure the model is on the correct device (e.g., GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # Ensure model is on the correct device

# Move class_weights to the same device as the model
class_weights = class_weights.to(device)



class CustomTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_fct = CrossEntropyLoss(weight=class_weights).to(device)

    def compute_loss(self, model, inputs, return_outputs=False):
        inputs = {k: v.to(device) for k, v in inputs.items()}
        labels = inputs.get("labels").to(device)

        outputs = model(**inputs)
        logits = outputs.logits

        loss = self.loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss
    
#-------------------------------------------------
    
    
training_args = TrainingArguments(
    learning_rate=3e-5,
    output_dir='./results',
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    #logging_dir='./logs',
    logging_steps=100,
    evaluation_strategy="steps",
    eval_steps=3000

)

trainer = CustomTrainer(
    model=model,  # Your model
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Train the model
trainer.train()



#----------------------------------------------------------------------------
predictions = trainer.predict(test_dataset).predictions
predictions_label = np.argmax(predictions, axis=1)

cm=confusion_matrix(y_test_integers, predictions_label)
print(cm)

accuracy = accuracy_score(y_test_integers, predictions_label)
precision, recall, f1, _ = precision_recall_fscore_support(y_test_integers, predictions_label, average='binary')
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Counter({0: 3862, 1: 1806})
Counter({0: 819, 1: 384})
1.4676333505955463
3.1384274640088594


Step,Training Loss,Validation Loss


[[675 144]
 [ 75 309]]
Accuracy: 0.8179551122194514
Precision: 0.6821192052980133
Recall: 0.8046875
F1 Score: 0.7383512544802867


# Imbalance / AEDA

In [6]:

tokenizer = AutoTokenizer.from_pretrained("bert-base-german-cased")
model = BertForSequenceClassification.from_pretrained("bert-base-german-cased")



# Freeze all parameters
for param in model.parameters():
    param.requires_grad = False

# Unfreeze the last two layers of the BERT model
# BERT uses an encoder structure, so you will need to access its layers accordingly
for module in [ model.bert.encoder.layer[-2], model.bert.encoder.layer[-1]]:
    for param in module.parameters():
        param.requires_grad = True

        
#======================================


def aeda(text):
    """
    Apply AEDA augmentation to a given text.

    :param text: The input text string to be augmented.
    :return: Augmented sentence.
    """
    punctuation_marks = [".", ";", "?", ":", "!", ""]
    words = text.split()

    n_insertions = random.randint(1, max(1, len(words) // 3))
    insert_positions = np.random.choice(len(words), n_insertions, replace=False)

    for pos in sorted(insert_positions, reverse=True):
        words.insert(pos, random.choice(punctuation_marks))

    return ' '.join(words)



# def balance_classes(df, column_name, text_column):
    
#        # Count the instances of each class
#     class_counts = df[column_name].value_counts()
#     #max_class = class_counts.idxmax()
#     min_class = class_counts.idxmin()
#     aug_number = class_counts[min_class] 
    
#     print(aug_number)
    
#     # Calculate the number of samples needed to balance the classes
#     #count_diff = class_counts[max_class] - class_counts[min_class]

#     # Select rows of the minority class and duplicate them
#     df_minority = df[df[column_name] == min_class]
#     df_minority_augmented = df_minority.sample(aug_number, replace=True)
    
#     print(len(df_minority_augmented))

#     # Apply a random augmentation function to each row
#     augmentation_functions = [aeda]
#     df_minority_augmented[text_column] = df_minority_augmented[text_column].apply(
#         lambda x: random.choice(augmentation_functions)(x)
#     )

#     # Combine the original DataFrame with the augmented rows
#     df_balanced = pd.concat([df, df_minority_augmented], ignore_index=True)
#     return df_balanced

def balance_classes(df, column_name, text_column , modification_function=aeda):
    """
    Balances the classes in a DataFrame.

    Parameters:
    - df: pandas DataFrame to balance.
    - column_name: the name of the column to balance by (e.g., 'Third Party').
    - text_column: the name of the text column to modify (e.g., 'Segment Text').
    - modification_function: the function to apply to modify the text (default: aeda).

    Returns:
    - A balanced DataFrame.
    """
    # Count the instances of each class
    class_counts = df[column_name].value_counts()
    max_class = class_counts.idxmax()
    min_class = class_counts.idxmin()

    # Calculate the number of samples needed to balance the classes
    count_diff = class_counts[max_class] - class_counts[min_class]


    # Select rows of the minority class and duplicate them
    df_minority = df[df[column_name] == min_class]
    df_minority_augmented = df_minority.sample(count_diff, replace=True)

    # Apply the modification function to the text column
    df_minority_augmented[text_column] = df_minority_augmented[text_column].apply(modification_function)

    # Combine the original DataFrame with the augmented rows
    df_balanced = pd.concat([df, df_minority_augmented], ignore_index=True)

    return df_balanced

df_train = df[['Segment Text',target_column]].loc[df['policy_type'] == 'TRAIN']
print(df_train[target_column].value_counts())

df_train_aug = balance_classes(df_train,target_column,'Segment Text' )
print(df_train_aug[target_column].value_counts())


#_------------------------------------------------------------------------------------
X_train=list(df_train_aug['Segment Text'])
y_train=list(df_train_aug[target_column])

X_test=list(df['Segment Text'].loc[(df['policy_type'] == 'TEST')])
y_test=list(df[target_column].loc[(df['policy_type'] == 'TEST')])


train_encodings = tokenizer(X_train, max_length=512, truncation=True, padding='max_length')
test_encodings = tokenizer(X_test, max_length=512, truncation=True, padding='max_length')


label_mapping = {0: 0, 1: 1}
y_train_integers = [label_mapping[label] for label in y_train]
y_test_integers = [label_mapping[label] for label in y_test]


class TextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]  # Removed torch.tensor here
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = TextDataset(train_encodings, y_train_integers)
test_dataset = TextDataset(test_encodings, y_test_integers)


#------------------------
from collections import Counter
print(Counter(y_train))
print(Counter(y_test))




#-------------------------------------------

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    #logging_dir='./logs',
    logging_steps=100,
    evaluation_strategy="steps",
    eval_steps=5000

)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

trainer.train()

predictions = trainer.predict(test_dataset).predictions
predictions_label = np.argmax(predictions, axis=1)

cm=confusion_matrix(y_test_integers, predictions_label)
print(cm)


accuracy = accuracy_score(y_test_integers, predictions_label)
precision, recall, f1, _ = precision_recall_fscore_support(y_test_integers, predictions_label, average='binary')
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


First Party
0    3862
1    1806
Name: count, dtype: int64
First Party
0    3862
1    3862
Name: count, dtype: int64
Counter({0: 3862, 1: 3862})
Counter({0: 819, 1: 384})


Step,Training Loss,Validation Loss


[[697 122]
 [ 83 301]]
Accuracy: 0.829592684954281
Precision: 0.7115839243498818
Recall: 0.7838541666666666
F1 Score: 0.7459727385377942
