In [1]:

import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, XLNetForSequenceClassification
import torch
torch.cuda.empty_cache()
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
from sklearn.metrics import confusion_matrix
from torch.nn import CrossEntropyLoss
import random
import nlpaug.augmenter.word as naw
import nltk
from nltk.corpus import wordnet



from peft import LoraConfig, TaskType
from peft import get_peft_model


2024-01-25 12:25:43.718775: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-25 12:25:43.718841: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-25 12:25:43.718883: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-25 12:25:43.728862: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
df = pd.read_csv("en_5.0.csv")
df.columns[df.notnull().any()]

Index(['policy_id', 'policy_type', 'segments', 'Segment Text', 'First Party',
       'Third Party', 'Information Type', 'Purpose', 'Collection Process',
       'Legal Basis for Collection', 'Third-Party Entity',
       'Information Type_Computer information',
       'Information Type_Contact information',
       'Information Type_Cookies and tracking elements',
       'Information Type_Demographic data', 'Information Type_Financial',
       'Information Type_Generic personal information',
       'Information Type_IP address and device IDs',
       'Information Type_Location', 'Purpose_Advertising or marketing',
       'Purpose_Analytics or research', 'Purpose_Essential service or feature',
       'Purpose_Legal requirement', 'Purpose_Service operation and security',
       'Collection Process_Collected on first-party website/app',
       'Legal Basis for Collection_Legitimate interests of first or third party',
       'Information Type_User online activities',
       'Collection Proces

In [14]:
target_column =   'Legal Basis for Collection'
batch_size = 8
epochs = 3


# original

In [9]:
best_lr = None
lowest_eval_loss = float('inf')


best_f1_lr = None
highest_f1 = 0


learning_rates = [5e-5, 1e-4, 2e-4, 3e-4]

for lr in learning_rates:
    
#--------------------------------------------------------------------------------    

    tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")

    model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased")

 
#-----------------------------------------------------------------------------------           

    X_train = list(df['Segment Text'].loc[(df['policy_type'] == 'TRAIN')])
    y_train = list(df[target_column].loc[(df['policy_type'] == 'TRAIN')])


    # Split the data into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.20, random_state=0)


    # mapping labels

    label_mapping = {0: 0, 1: 1}
    y_train_integers = [label_mapping[label] for label in y_train]
    y_val_integers = [label_mapping[label] for label in y_val]

    train_encodings = tokenizer(X_train, max_length=512, truncation=True, padding='max_length')
    val_encodings = tokenizer(X_val, max_length=512, truncation=True, padding='max_length')

    #
    class TextDataset(torch.utils.data.Dataset):
        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
            item['labels'] = self.labels[idx]  # Removed torch.tensor here
            return item

        def __len__(self):
            return len(self.labels)


    train_dataset = TextDataset(train_encodings, y_train_integers)
    val_dataset = TextDataset(val_encodings, y_val_integers)
    
#---------------------------------------------------------------------------  
    
    # Set up training arguments with the current learning rate
    training_args = TrainingArguments(
        output_dir='./results',
        learning_rate=lr,  # Set learning rate here
        num_train_epochs=epochs ,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=100,
        evaluation_strategy="steps",
        eval_steps=3000
    )

    # Set up the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )


    # Train the model with the current learning rate
    print(f"Training with learning rate: {lr}")
    trainer.train()
    
 # Evaluate the model
    eval_result = trainer.evaluate()
    print(f"Evaluation loss for learning rate {lr}: {eval_result['eval_loss']}")
    
    
    predictions = trainer.predict(val_dataset).predictions
    predictions_label = np.argmax(predictions, axis=1)

    cm=confusion_matrix( y_val_integers, predictions_label)
    print(cm)

    accuracy = accuracy_score( y_val_integers, predictions_label)
    precision, recall, f1, _ = precision_recall_fscore_support(y_val_integers, predictions_label, average='binary')
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")

    if eval_result['eval_loss'] < lowest_eval_loss:
        best_lr = lr
        lowest_eval_loss = eval_result['eval_loss']

    if f1 > highest_f1:
        best_f1_lr = lr
        highest_f1 = f1
        

print(f"Best Learning Rate: {best_lr} with loss: {lowest_eval_loss}")
print(f"Best f1 Learning Rat: {best_f1_lr} with highest f1: { highest_f1}")


Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training with learning rate: 5e-05


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 5e-05: 0.4012907147407532
[[437  38]
 [ 21 152]]
Accuracy: 0.9089506172839507
Precision: 0.8
Recall: 0.8786127167630058
F1 Score: 0.8374655647382921


Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training with learning rate: 0.0001


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 0.0001: 0.4492437243461609
[[437  38]
 [ 78  95]]
Accuracy: 0.8209876543209876
Precision: 0.7142857142857143
Recall: 0.5491329479768786
F1 Score: 0.6209150326797386


Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training with learning rate: 0.0002


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 0.0002: 0.5816572308540344
[[475   0]
 [173   0]]
Accuracy: 0.7330246913580247
Precision: 0.0
Recall: 0.0
F1 Score: 0.0


  _warn_prf(average, modifier, msg_start, len(result))
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training with learning rate: 0.0003


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 0.0003: 0.5809800028800964
[[475   0]
 [173   0]]
Accuracy: 0.7330246913580247
Precision: 0.0
Recall: 0.0
F1 Score: 0.0
Best Learning Rate: 5e-05 with loss: 0.4012907147407532
Best f1 Learning Rat: 5e-05 with highest f1: 0.8374655647382921


  _warn_prf(average, modifier, msg_start, len(result))


### Retrain the model with certain learning rate


In [10]:
tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")

model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased")
            
#-----------------------------------------------------------------------------------     

X_train=list(df['Segment Text'].loc[(df['policy_type'] == 'TRAIN')])
X_test=list(df['Segment Text'].loc[(df['policy_type'] == 'TEST')])

y_train=list(df[target_column].loc[(df['policy_type'] == 'TRAIN')])
y_test=list(df[target_column].loc[(df['policy_type'] == 'TEST')])


train_encodings = tokenizer(X_train, max_length=512, truncation=True, padding='max_length')
test_encodings = tokenizer(X_test, max_length=512, truncation=True, padding='max_length')

#-----------

label_mapping = {0: 0, 1: 1}
y_train_integers = [label_mapping[label] for label in y_train]
y_test_integers = [label_mapping[label] for label in y_test]


class TextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]  # Removed torch.tensor here
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = TextDataset(train_encodings, y_train_integers)
test_dataset = TextDataset(test_encodings, y_test_integers)

#--------------------------

training_args = TrainingArguments(
    output_dir='./results',
    learning_rate= best_f1_lr , # Set learning rate here
    num_train_epochs=epochs ,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    evaluation_strategy="steps",
    eval_steps=3000

)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

trainer.train()


predictions = trainer.predict(test_dataset).predictions
predictions_label = np.argmax(predictions, axis=1)

cm=confusion_matrix(y_test_integers, predictions_label)
print(cm)

accuracy = accuracy_score(y_test_integers, predictions_label)
precision, recall, f1, _ = precision_recall_fscore_support(y_test_integers, predictions_label, average='binary')
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


[[529  36]
 [ 42 130]]
Accuracy: 0.8941655359565808
Precision: 0.7831325301204819
Recall: 0.7558139534883721
F1 Score: 0.7692307692307692


# Fine Tuning last 2 layers 

In [9]:

best_lr = None
lowest_eval_loss = float('inf')

best_f1_lr = None
highest_f1 = 0

learning_rates = [ 5e-5, 1e-4, 2e-4, 3e-4]

for lr in learning_rates:
    
#--------------------------------------------------------------------------------    

    tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")

    model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased")

    # Load pre-trained XLNet model
    # Freeze all parameters
    for param in model.parameters():
        param.requires_grad = False

    # Unfreeze the last two layers
    # Adapt this to the specific structure of XLNet
    for layer in [model.transformer.layer[-2], model.transformer.layer[-1]]:
        for param in layer.parameters():
            param.requires_grad = True
            
#-----------------------------------------------------------------------------------           

    X_train = list(df['Segment Text'].loc[(df['policy_type'] == 'TRAIN')])
    y_train = list(df[target_column].loc[(df['policy_type'] == 'TRAIN')])


    # Split the data into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.20, random_state=0)


    # mapping labels

    label_mapping = {0: 0, 1: 1}
    y_train_integers = [label_mapping[label] for label in y_train]
    y_val_integers = [label_mapping[label] for label in y_val]

    train_encodings = tokenizer(X_train, max_length=512, truncation=True, padding='max_length')
    val_encodings = tokenizer(X_val, max_length=512, truncation=True, padding='max_length')

    #
    class TextDataset(torch.utils.data.Dataset):
        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
            item['labels'] = self.labels[idx]  # Removed torch.tensor here
            return item

        def __len__(self):
            return len(self.labels)


    train_dataset = TextDataset(train_encodings, y_train_integers)
    val_dataset = TextDataset(val_encodings, y_val_integers)
    
#---------------------------------------------------------------------------  
    
    # Set up training arguments with the current learning rate
    training_args = TrainingArguments(
        output_dir='./results',
        learning_rate=lr,  # Set learning rate here
        num_train_epochs=epochs ,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=100,
        evaluation_strategy="steps",
        eval_steps=3000
    )

    # Set up the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )


    # Train the model with the current learning rate
    print(f"Training with learning rate: {lr}")
    trainer.train()
    
 # Evaluate the model
    eval_result = trainer.evaluate()
    print(f"Evaluation loss for learning rate {lr}: {eval_result['eval_loss']}")
    
    
    predictions = trainer.predict(val_dataset).predictions
    predictions_label = np.argmax(predictions, axis=1)

    cm=confusion_matrix( y_val_integers, predictions_label)
    print(cm)

    accuracy = accuracy_score( y_val_integers, predictions_label)
    precision, recall, f1, _ = precision_recall_fscore_support(y_val_integers, predictions_label, average='binary')
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")

    if eval_result['eval_loss'] < lowest_eval_loss:
        best_lr = lr
        lowest_eval_loss = eval_result['eval_loss']
        
    if f1 > highest_f1:
        best_f1_lr = lr
        highest_f1 = f1
        
print(f"Best Learning Rate: {best_lr} with loss: {lowest_eval_loss}")
print(f"Best f1 Learning Rat: {best_f1_lr} with highest f1: { highest_f1}")

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.weight', 'sequence_summary.summary.weight', 'logits_proj.bias', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training with learning rate: 5e-05


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 5e-05: 0.06161521375179291
[[617   5]
 [  6  20]]
Accuracy: 0.9830246913580247
Precision: 0.8
Recall: 0.7692307692307693
F1 Score: 0.7843137254901961


Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.weight', 'sequence_summary.summary.weight', 'logits_proj.bias', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training with learning rate: 0.0001


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 0.0001: 0.1437637358903885
[[622   0]
 [ 22   4]]
Accuracy: 0.9660493827160493
Precision: 1.0
Recall: 0.15384615384615385
F1 Score: 0.2666666666666667


Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.weight', 'sequence_summary.summary.weight', 'logits_proj.bias', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training with learning rate: 0.0002


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 0.0002: 0.06060611456632614
[[617   5]
 [  6  20]]
Accuracy: 0.9830246913580247
Precision: 0.8
Recall: 0.7692307692307693
F1 Score: 0.7843137254901961


Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.weight', 'sequence_summary.summary.weight', 'logits_proj.bias', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training with learning rate: 0.0003


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 0.0003: 0.06909492611885071
[[615   7]
 [  7  19]]
Accuracy: 0.9783950617283951
Precision: 0.7307692307692307
Recall: 0.7307692307692307
F1 Score: 0.7307692307692306
Best Learning Rate: 0.0002 with loss: 0.06060611456632614
Best f1 Learning Rat: 5e-05 with highest f1: 0.7843137254901961


### Retrain the model with certain learning rate

In [10]:

tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")

model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased")

# Load pre-trained XLNet model
# Freeze all parameters
for param in model.parameters():
    param.requires_grad = False

# Unfreeze the last two layers
# Adapt this to the specific structure of XLNet
for layer in [model.transformer.layer[-2], model.transformer.layer[-1]]:
    for param in layer.parameters():
        param.requires_grad = True
                       
#-----------------------------------------------------------------------------------     

X_train=list(df['Segment Text'].loc[(df['policy_type'] == 'TRAIN')])
X_test=list(df['Segment Text'].loc[(df['policy_type'] == 'TEST')])

y_train=list(df[target_column].loc[(df['policy_type'] == 'TRAIN')])
y_test=list(df[target_column].loc[(df['policy_type'] == 'TEST')])


train_encodings = tokenizer(X_train, max_length=512, truncation=True, padding='max_length')
test_encodings = tokenizer(X_test, max_length=512, truncation=True, padding='max_length')

#-----------

label_mapping = {0: 0, 1: 1}
y_train_integers = [label_mapping[label] for label in y_train]
y_test_integers = [label_mapping[label] for label in y_test]


class TextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]  # Removed torch.tensor here
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = TextDataset(train_encodings, y_train_integers)
test_dataset = TextDataset(test_encodings, y_test_integers)

#--------------------------

training_args = TrainingArguments(
    output_dir='./results',
    learning_rate= best_f1_lr , # Set learning rate here
    num_train_epochs=epochs ,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    evaluation_strategy="steps",
    eval_steps=3000

)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

trainer.train()

predictions = trainer.predict(test_dataset).predictions
predictions_label = np.argmax(predictions, axis=1)

cm=confusion_matrix(y_test_integers, predictions_label)
print(cm)

accuracy = accuracy_score(y_test_integers, predictions_label)
precision, recall, f1, _ = precision_recall_fscore_support(y_test_integers, predictions_label, average='binary')
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")


Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.weight', 'sequence_summary.summary.weight', 'logits_proj.bias', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


[[694  11]
 [ 14  18]]
Accuracy: 0.966078697421981
Precision: 0.6206896551724138
Recall: 0.5625
F1 Score: 0.5901639344262296


In [13]:
print(f"Best f1 Learning Rat: {best_f1_lr}")

Best f1 Learning Rat: 5e-05


In [None]:
[[694  11]
 [ 14  18]]
Accuracy: 0.966078697421981
Precision: 0.6206896551724138
Recall: 0.5625
F1 Score: 0.5901639344262296

# Imbalance / PEFT /Cost sensitve learning

In [15]:
best_lr = None
lowest_eval_loss = float('inf')


best_f1_lr = None
highest_f1 = 0


learning_rates = [5e-5, 1e-4, 2e-4, 3e-4]

for lr in learning_rates:
    
#--------------------------------------------------------------------------------    

    tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")

    model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased")

    # Load pre-trained XLNet model
    # Freeze all parameters
    for param in model.parameters():
        param.requires_grad = False

    # Unfreeze the last two layers
    # Adapt this to the specific structure of XLNet
    for layer in [model.transformer.layer[-2], model.transformer.layer[-1]]:
        for param in layer.parameters():
            param.requires_grad = True
            
#-----------------------------------------------------------------------------------           

    X_train = list(df['Segment Text'].loc[(df['policy_type'] == 'TRAIN')])
    y_train = list(df[target_column].loc[(df['policy_type'] == 'TRAIN')])


    # Split the data into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.20, random_state=0)


    # mapping labels

    label_mapping = {0: 0, 1: 1}
    y_train_integers = [label_mapping[label] for label in y_train]
    y_val_integers = [label_mapping[label] for label in y_val]

    train_encodings = tokenizer(X_train, max_length=512, truncation=True, padding='max_length')
    val_encodings = tokenizer(X_val, max_length=512, truncation=True, padding='max_length')

    #
    class TextDataset(torch.utils.data.Dataset):
        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
            item['labels'] = self.labels[idx]  # Removed torch.tensor here
            return item

        def __len__(self):
            return len(self.labels)


    train_dataset = TextDataset(train_encodings, y_train_integers)
    val_dataset = TextDataset(val_encodings, y_val_integers)
    
#---------------------------------------------------------------------------  


    from collections import Counter
    print(Counter(y_train))
    print(Counter(y_val))
    
        # Calculate class weights (example calculation, adjust as needed)

    total_samples = Counter(y_train)[0] + Counter(y_train)[1]
    class_weights = torch.tensor([total_samples /Counter(y_train)[0], total_samples / Counter(y_train)[1]], dtype=torch.float)

    print(total_samples /Counter(y_train)[0])
    print(total_samples / Counter(y_train)[1])

    # Ensure the model is on the correct device (e.g., GPU or CPU)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)  # Ensure model is on the correct device

    # Move class_weights to the same device as the model
    class_weights = class_weights.to(device)



    class CustomTrainer(Trainer):
        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.loss_fct = CrossEntropyLoss(weight=class_weights).to(device)

        def compute_loss(self, model, inputs, return_outputs=False):
            inputs = {k: v.to(device) for k, v in inputs.items()}
            labels = inputs.get("labels").to(device)

            outputs = model(**inputs)
            logits = outputs.logits

            loss = self.loss_fct(logits, labels)
            return (loss, outputs) if return_outputs else loss

    #-------------------------------------------------
    
    
    
    
    # Set up training arguments with the current learning rate
    training_args = TrainingArguments(
        output_dir='./results',
        learning_rate=lr,  # Set learning rate here
        num_train_epochs=epochs ,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=100,
        evaluation_strategy="steps",
        eval_steps=3000
    )

    # Set up the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )



    # Train the model with the current learning rate
    print(f"Training with learning rate: {lr}")
    trainer.train()
    
 # Evaluate the model
    eval_result = trainer.evaluate()
    print(f"Evaluation loss for learning rate {lr}: {eval_result['eval_loss']}")
    
    
    predictions = trainer.predict(val_dataset).predictions
    predictions_label = np.argmax(predictions, axis=1)

    cm=confusion_matrix( y_val_integers, predictions_label)
    print(cm)

    accuracy = accuracy_score( y_val_integers, predictions_label)
    precision, recall, f1, _ = precision_recall_fscore_support(y_val_integers, predictions_label, average='binary')
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")

    if eval_result['eval_loss'] < lowest_eval_loss:
        best_lr = lr
        lowest_eval_loss = eval_result['eval_loss']

    if f1 > highest_f1:
        best_f1_lr = lr
        highest_f1 = f1
        

print(f"Best Learning Rate: {best_lr} with loss: {lowest_eval_loss}")
print(f"Best f1 Learning Rat: {best_f1_lr} with highest f1: { highest_f1}")


Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.weight', 'sequence_summary.summary.weight', 'logits_proj.bias', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Counter({0: 2518, 1: 73})
Counter({0: 627, 1: 21})
1.028991262907069
35.49315068493151
Training with learning rate: 5e-05


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 5e-05: 0.131798654794693
[[623   4]
 [ 12   9]]
Accuracy: 0.9753086419753086
Precision: 0.6923076923076923
Recall: 0.42857142857142855
F1 Score: 0.5294117647058824


Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.weight', 'sequence_summary.summary.weight', 'logits_proj.bias', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Counter({0: 2518, 1: 73})
Counter({0: 627, 1: 21})
1.028991262907069
35.49315068493151
Training with learning rate: 0.0001


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 0.0001: 0.13624346256256104
[[620   7]
 [ 10  11]]
Accuracy: 0.9737654320987654
Precision: 0.6111111111111112
Recall: 0.5238095238095238
F1 Score: 0.5641025641025642


Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.weight', 'sequence_summary.summary.weight', 'logits_proj.bias', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Counter({0: 2518, 1: 73})
Counter({0: 627, 1: 21})
1.028991262907069
35.49315068493151
Training with learning rate: 0.0002


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 0.0002: 0.11377318948507309
[[620   7]
 [  9  12]]
Accuracy: 0.9753086419753086
Precision: 0.631578947368421
Recall: 0.5714285714285714
F1 Score: 0.6


Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.weight', 'sequence_summary.summary.weight', 'logits_proj.bias', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Counter({0: 2518, 1: 73})
Counter({0: 627, 1: 21})
1.028991262907069
35.49315068493151
Training with learning rate: 0.0003


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 0.0003: 0.10171730071306229
[[623   4]
 [ 10  11]]
Accuracy: 0.9783950617283951
Precision: 0.7333333333333333
Recall: 0.5238095238095238
F1 Score: 0.611111111111111
Best Learning Rate: 0.0003 with loss: 0.10171730071306229
Best f1 Learning Rat: 0.0003 with highest f1: 0.611111111111111


### Retrain the model with certain learning rate

In [16]:
tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")

model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased")

# Load pre-trained XLNet model
# Freeze all parameters
for param in model.parameters():
    param.requires_grad = False

# Unfreeze the last two layers
# Adapt this to the specific structure of XLNet
for layer in [model.transformer.layer[-2], model.transformer.layer[-1]]:
    for param in layer.parameters():
        param.requires_grad = True
                       
#-----------------------------------------------------------------------------------     

X_train=list(df['Segment Text'].loc[(df['policy_type'] == 'TRAIN')])
X_test=list(df['Segment Text'].loc[(df['policy_type'] == 'TEST')])

y_train=list(df[target_column].loc[(df['policy_type'] == 'TRAIN')])
y_test=list(df[target_column].loc[(df['policy_type'] == 'TEST')])


train_encodings = tokenizer(X_train, max_length=512, truncation=True, padding='max_length')
test_encodings = tokenizer(X_test, max_length=512, truncation=True, padding='max_length')

#-----------

label_mapping = {0: 0, 1: 1}
y_train_integers = [label_mapping[label] for label in y_train]
y_test_integers = [label_mapping[label] for label in y_test]


class TextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]  # Removed torch.tensor here
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = TextDataset(train_encodings, y_train_integers)
test_dataset = TextDataset(test_encodings, y_test_integers)

#--------------------------


from collections import Counter
print(Counter(y_train))
print(Counter(y_test))


# Calculate class weights (example calculation, adjust as needed)

total_samples = Counter(y_train)[0] + Counter(y_train)[1]
class_weights = torch.tensor([total_samples /Counter(y_train)[0], total_samples / Counter(y_train)[1]], dtype=torch.float)

print(total_samples /Counter(y_train)[0])
print(total_samples / Counter(y_train)[1])

# Ensure the model is on the correct device (e.g., GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # Ensure model is on the correct device

# Move class_weights to the same device as the model
class_weights = class_weights.to(device)



class CustomTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_fct = CrossEntropyLoss(weight=class_weights).to(device)

    def compute_loss(self, model, inputs, return_outputs=False):
        inputs = {k: v.to(device) for k, v in inputs.items()}
        labels = inputs.get("labels").to(device)

        outputs = model(**inputs)
        logits = outputs.logits

        loss = self.loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

#-------------------------------------------------


training_args = TrainingArguments(
    learning_rate= best_f1_lr,
    output_dir='./results',
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    #logging_dir='./logs',
    logging_steps=100,
    evaluation_strategy="steps",
    eval_steps=3000

)

trainer = CustomTrainer(
    model=model,  # Your model
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)


trainer.train()


predictions = trainer.predict(test_dataset).predictions
predictions_label = np.argmax(predictions, axis=1)

cm=confusion_matrix(y_test_integers, predictions_label)
print(cm)

accuracy = accuracy_score(y_test_integers, predictions_label)
precision, recall, f1, _ = precision_recall_fscore_support(y_test_integers, predictions_label, average='binary')

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

print(f"Best f1 Learning Rat: {best_f1_lr}")

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.weight', 'sequence_summary.summary.weight', 'logits_proj.bias', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Counter({0: 3145, 1: 94})
Counter({0: 691, 1: 46})
1.0298887122416533
34.45744680851064


Step,Training Loss,Validation Loss


[[687   4]
 [ 13  33]]
Accuracy: 0.9769335142469471
Precision: 0.8918918918918919
Recall: 0.717391304347826
F1 Score: 0.7951807228915664
Best f1 Learning Rat: 0.0003


# Imbalance / AEDA

In [4]:

def aeda(text):
    """
    Apply AEDA augmentation to a given text.

    :param text: The input text string to be augmented.
    :return: Augmented sentence.
    """
    punctuation_marks = [".", ";", "?", ":", "!", ""]
    words = text.split()

    n_insertions = random.randint(1, max(1, len(words) // 3))
    insert_positions = np.random.choice(len(words), n_insertions, replace=False)

    for pos in sorted(insert_positions, reverse=True):
        words.insert(pos, random.choice(punctuation_marks))

    return ' '.join(words)

def balance_classes(df, column_name, text_column , modification_function=aeda):
    """
    Balances the classes in a DataFrame.

    Parameters:
    - df: pandas DataFrame to balance.
    - column_name: the name of the column to balance by (e.g., 'Third Party').
    - text_column: the name of the text column to modify (e.g., 'Segment Text').
    - modification_function: the function to apply to modify the text (default: aeda).

    Returns:
    - A balanced DataFrame.
    """
    # Count the instances of each class
    class_counts = df[column_name].value_counts()
    max_class = class_counts.idxmax()
    min_class = class_counts.idxmin()

    # Calculate the number of samples needed to balance the classes
    count_diff = class_counts[max_class] - class_counts[min_class]


    # Select rows of the minority class and duplicate them
    df_minority = df[df[column_name] == min_class]
    df_minority_augmented = df_minority.sample(count_diff, replace=True)

    # Apply the modification function to the text column
    df_minority_augmented[text_column] = df_minority_augmented[text_column].apply(modification_function)

    # Combine the original DataFrame with the augmented rows
    df_balanced = pd.concat([df, df_minority_augmented], ignore_index=True)

    return df_balanced



In [6]:

best_lr = None
lowest_eval_loss = float('inf')

best_f1_lr = None
highest_f1 = 0

learning_rates = [5e-5, 1e-4, 2e-4, 3e-4]
for lr in learning_rates:
    
#--------------------------------------------------------------------------------    

    tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")

    model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased")

    # Load pre-trained XLNet model
    # Freeze all parameters
    for param in model.parameters():
        param.requires_grad = False

    # Unfreeze the last two layers
    # Adapt this to the specific structure of XLNet
    for layer in [model.transformer.layer[-2], model.transformer.layer[-1]]:
        for param in layer.parameters():
            param.requires_grad = True
            
#-----------------------------------------------------------------------------------  
    df_train = df[['Segment Text',target_column]].loc[df['policy_type'] == 'TRAIN']
    
    X = df_train['Segment Text']
    y = df_train[target_column]


    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_state=0)

    df_train = pd.concat([X_train, y_train], axis=1)
    print(df_train[target_column].value_counts())

    df_train_aug = balance_classes(df_train,target_column,'Segment Text' )
    print(df_train_aug[target_column].value_counts())

    X_train=list(df_train_aug['Segment Text'])
    y_train=list(df_train_aug[target_column])
    

    X_val=list(X_val)
    y_val=list( y_val)


    # mapping labels

    label_mapping = {0: 0, 1: 1}
    y_train_integers = [label_mapping[label] for label in y_train]
    y_val_integers = [label_mapping[label] for label in y_val]

    train_encodings = tokenizer(X_train, max_length=512, truncation=True, padding='max_length')
    val_encodings = tokenizer(X_val, max_length=512, truncation=True, padding='max_length')


    #
    class TextDataset(torch.utils.data.Dataset):
        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
            item['labels'] = self.labels[idx]  # Removed torch.tensor here
            return item

        def __len__(self):
            return len(self.labels)


    train_dataset = TextDataset(train_encodings, y_train_integers)
    val_dataset = TextDataset(val_encodings, y_val_integers)



#---------------------------------------------------------------------------  
    
    # Set up training arguments with the current learning rate
    training_args = TrainingArguments(
        output_dir='./results',
        learning_rate=lr,  # Set learning rate here
        num_train_epochs=epochs ,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=100,
        evaluation_strategy="steps",
        eval_steps=3000
    )

    # Set up the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )


    # Train the model with the current learning rate
    print(f"Training with learning rate: {lr}")
    trainer.train()
    
 # Evaluate the model
    eval_result = trainer.evaluate()
    print(f"Evaluation loss for learning rate {lr}: {eval_result['eval_loss']}")
    
    
    predictions = trainer.predict(val_dataset).predictions
    predictions_label = np.argmax(predictions, axis=1)

    cm=confusion_matrix( y_val_integers, predictions_label)
    print(cm)

    accuracy = accuracy_score( y_val_integers, predictions_label)
    precision, recall, f1, _ = precision_recall_fscore_support(y_val_integers, predictions_label, average='binary')
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")

    if eval_result['eval_loss'] < lowest_eval_loss:
        best_lr = lr
        lowest_eval_loss = eval_result['eval_loss']
            
    if f1 > highest_f1:
        best_f1_lr = lr
        highest_f1 = f1
        

print(f"Best Learning Rate: {best_lr} with loss: {lowest_eval_loss}")
print(f"Best f1 Learning Rat: {best_f1_lr} with highest f1: { highest_f1}")

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.weight', 'sequence_summary.summary.weight', 'logits_proj.bias', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Purpose
0    1899
1     692
Name: count, dtype: int64
Purpose
0    1899
1    1899
Name: count, dtype: int64
Training with learning rate: 5e-05


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 5e-05: 0.4174136817455292
[[432  43]
 [ 31 142]]
Accuracy: 0.8858024691358025
Precision: 0.7675675675675676
Recall: 0.8208092485549133
F1 Score: 0.7932960893854748


Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.weight', 'sequence_summary.summary.weight', 'logits_proj.bias', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Purpose
0    1899
1     692
Name: count, dtype: int64
Purpose
0    1899
1    1899
Name: count, dtype: int64
Training with learning rate: 0.0001


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 0.0001: 0.43371525406837463
[[436  39]
 [ 26 147]]
Accuracy: 0.8996913580246914
Precision: 0.7903225806451613
Recall: 0.8497109826589595
F1 Score: 0.8189415041782729


Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.weight', 'sequence_summary.summary.weight', 'logits_proj.bias', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Purpose
0    1899
1     692
Name: count, dtype: int64
Purpose
0    1899
1    1899
Name: count, dtype: int64
Training with learning rate: 0.0002


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 0.0002: 0.40075409412384033
[[441  34]
 [ 28 145]]
Accuracy: 0.904320987654321
Precision: 0.8100558659217877
Recall: 0.838150289017341
F1 Score: 0.8238636363636362


Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.weight', 'sequence_summary.summary.weight', 'logits_proj.bias', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Purpose
0    1899
1     692
Name: count, dtype: int64
Purpose
0    1899
1    1899
Name: count, dtype: int64
Training with learning rate: 0.0003


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 0.0003: 0.3943680226802826
[[439  36]
 [ 34 139]]
Accuracy: 0.8919753086419753
Precision: 0.7942857142857143
Recall: 0.8034682080924855
F1 Score: 0.7988505747126438
Best Learning Rate: 0.0003 with loss: 0.3943680226802826
Best f1 Learning Rat: 0.0002 with highest f1: 0.8238636363636362


### Retrain the model with certain learning rate

In [7]:

tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")

model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased")

# Load pre-trained XLNet model
# Freeze all parameters
for param in model.parameters():
    param.requires_grad = False

# Unfreeze the last two layers
# Adapt this to the specific structure of XLNet
for layer in [model.transformer.layer[-2], model.transformer.layer[-1]]:
    for param in layer.parameters():
        param.requires_grad = True
                       
#-----------------------------------------------------------------------------------     
df_train = df[['Segment Text',target_column]].loc[df['policy_type'] == 'TRAIN']
print(df_train[target_column].value_counts())

df_train_aug = balance_classes(df_train,target_column,'Segment Text' )
print(df_train_aug[target_column].value_counts())


X_train=list(df_train_aug['Segment Text'])
y_train=list(df_train_aug[target_column])

X_test=list(df['Segment Text'].loc[(df['policy_type'] == 'TEST')])
y_test=list(df[target_column].loc[(df['policy_type'] == 'TEST')])


train_encodings = tokenizer(X_train, max_length=512, truncation=True, padding='max_length')
test_encodings = tokenizer(X_test, max_length=512, truncation=True, padding='max_length')


label_mapping = {0: 0, 1: 1}
y_train_integers = [label_mapping[label] for label in y_train]
y_test_integers = [label_mapping[label] for label in y_test]


class TextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]  # Removed torch.tensor here
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = TextDataset(train_encodings, y_train_integers)
test_dataset = TextDataset(test_encodings, y_test_integers)


#------------------------
from collections import Counter
print(Counter(y_train))
print(Counter(y_test))

#--------------------------

training_args = TrainingArguments(
    output_dir='./results',
    learning_rate= best_f1_lr , # Set learning rate here
    num_train_epochs=epochs ,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    evaluation_strategy="steps",
    eval_steps=3000

)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

trainer.train()

predictions = trainer.predict(test_dataset).predictions
predictions_label = np.argmax(predictions, axis=1)

cm=confusion_matrix(y_test_integers, predictions_label)
print(cm)

accuracy = accuracy_score(y_test_integers, predictions_label)
precision, recall, f1, _ = precision_recall_fscore_support(y_test_integers, predictions_label, average='binary')
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")


Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.weight', 'sequence_summary.summary.weight', 'logits_proj.bias', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Purpose
0    2374
1     865
Name: count, dtype: int64
Purpose
0    2374
1    2374
Name: count, dtype: int64
Counter({0: 2374, 1: 2374})
Counter({0: 565, 1: 172})


Step,Training Loss,Validation Loss


[[530  35]
 [ 38 134]]
Accuracy: 0.9009497964721845
Precision: 0.7928994082840237
Recall: 0.7790697674418605
F1 Score: 0.7859237536656891
