In [1]:

import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, ErnieForSequenceClassification
import torch
torch.cuda.empty_cache()
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
from sklearn.metrics import confusion_matrix
from torch.nn import CrossEntropyLoss
import random
import nlpaug.augmenter.word as naw
import nltk
from nltk.corpus import wordnet



from peft import LoraConfig, TaskType
from peft import get_peft_model


In [2]:
df = pd.read_csv("en_5.0.csv")
df.columns[df.notnull().any()]

Index(['policy_id', 'policy_type', 'segments', 'Segment Text', 'First Party',
       'Third Party', 'Information Type', 'Purpose', 'Collection Process',
       'Legal Basis for Collection', 'Third-Party Entity',
       'Information Type_Computer information',
       'Information Type_Contact information',
       'Information Type_Cookies and tracking elements',
       'Information Type_Demographic data', 'Information Type_Financial',
       'Information Type_Generic personal information',
       'Information Type_IP address and device IDs',
       'Information Type_Location', 'Purpose_Advertising or marketing',
       'Purpose_Analytics or research', 'Purpose_Essential service or feature',
       'Purpose_Legal requirement', 'Purpose_Service operation and security',
       'Collection Process_Collected on first-party website/app',
       'Legal Basis for Collection_Legitimate interests of first or third party',
       'Information Type_User online activities',
       'Collection Proces

In [28]:
target_column =  'Collection Process_Collected on first-party website/app'
batch_size = 8
epochs = 3


# original

In [26]:
best_lr = None
lowest_eval_loss = float('inf')


best_f1_lr = None
highest_f1 = 0


learning_rates =[1e-5, 2e-5, 3e-5, 5e-5, 1e-4, 2e-4, 3e-4]

for lr in learning_rates:
    
#--------------------------------------------------------------------------------    
    tokenizer = AutoTokenizer.from_pretrained("nghuyong/ernie-2.0-base-en")

    model = ErnieForSequenceClassification.from_pretrained("nghuyong/ernie-2.0-base-en")

 
#-----------------------------------------------------------------------------------           

    X_train = list(df['Segment Text'].loc[(df['policy_type'] == 'TRAIN')])
    y_train = list(df[target_column].loc[(df['policy_type'] == 'TRAIN')])


    # Split the data into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.20, random_state=0)


    # mapping labels

    label_mapping = {0: 0, 1: 1}
    y_train_integers = [label_mapping[label] for label in y_train]
    y_val_integers = [label_mapping[label] for label in y_val]

    train_encodings = tokenizer(X_train, max_length=512, truncation=True, padding='max_length')
    val_encodings = tokenizer(X_val, max_length=512, truncation=True, padding='max_length')

    #
    class TextDataset(torch.utils.data.Dataset):
        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
            item['labels'] = self.labels[idx]  # Removed torch.tensor here
            return item

        def __len__(self):
            return len(self.labels)


    train_dataset = TextDataset(train_encodings, y_train_integers)
    val_dataset = TextDataset(val_encodings, y_val_integers)
    
#---------------------------------------------------------------------------  
    
    # Set up training arguments with the current learning rate
    training_args = TrainingArguments(
        output_dir='./results',
        learning_rate=lr,  # Set learning rate here
        num_train_epochs=epochs ,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=100,
        evaluation_strategy="steps",
        eval_steps=3000
    )

    # Set up the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )


    # Train the model with the current learning rate
    print(f"Training with learning rate: {lr}")
    trainer.train()
    
 # Evaluate the model
    eval_result = trainer.evaluate()
    print(f"Evaluation loss for learning rate {lr}: {eval_result['eval_loss']}")
    
    
    predictions = trainer.predict(val_dataset).predictions
    predictions_label = np.argmax(predictions, axis=1)

    cm=confusion_matrix( y_val_integers, predictions_label)
    print(cm)

    accuracy = accuracy_score( y_val_integers, predictions_label)
    precision, recall, f1, _ = precision_recall_fscore_support(y_val_integers, predictions_label, average='binary')
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")

    if eval_result['eval_loss'] < lowest_eval_loss:
        best_lr = lr
        lowest_eval_loss = eval_result['eval_loss']

    if f1 > highest_f1:
        best_f1_lr = lr
        highest_f1 = f1
        

print(f"Best Learning Rate: {best_lr} with loss: {lowest_eval_loss}")
print(f"Best f1 Learning Rat: {best_f1_lr} with highest f1: { highest_f1}")


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training with learning rate: 1e-05


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 1e-05: 0.062117695808410645
[[621   9]
 [  4  14]]
Accuracy: 0.9799382716049383
Precision: 0.6086956521739131
Recall: 0.7777777777777778
F1 Score: 0.6829268292682927


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training with learning rate: 2e-05


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 2e-05: 0.0657379999756813
[[618  12]
 [  2  16]]
Accuracy: 0.9783950617283951
Precision: 0.5714285714285714
Recall: 0.8888888888888888
F1 Score: 0.6956521739130435


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training with learning rate: 3e-05


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 3e-05: 0.05122268199920654
[[621   9]
 [  2  16]]
Accuracy: 0.9830246913580247
Precision: 0.64
Recall: 0.8888888888888888
F1 Score: 0.7441860465116279


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training with learning rate: 5e-05


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 5e-05: 0.11569096148014069
[[626   4]
 [ 10   8]]
Accuracy: 0.9783950617283951
Precision: 0.6666666666666666
Recall: 0.4444444444444444
F1 Score: 0.5333333333333333


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training with learning rate: 0.0001


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 0.0001: 0.1488148421049118
[[630   0]
 [ 18   0]]
Accuracy: 0.9722222222222222
Precision: 0.0
Recall: 0.0
F1 Score: 0.0


  _warn_prf(average, modifier, msg_start, len(result))
Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training with learning rate: 0.0002


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 0.0002: 0.1518235057592392
[[630   0]
 [ 18   0]]
Accuracy: 0.9722222222222222
Precision: 0.0
Recall: 0.0
F1 Score: 0.0


  _warn_prf(average, modifier, msg_start, len(result))
Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training with learning rate: 0.0003


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 0.0003: 0.1542862355709076
[[630   0]
 [ 18   0]]
Accuracy: 0.9722222222222222
Precision: 0.0
Recall: 0.0
F1 Score: 0.0
Best Learning Rate: 3e-05 with loss: 0.05122268199920654
Best f1 Learning Rat: 3e-05 with highest f1: 0.7441860465116279


  _warn_prf(average, modifier, msg_start, len(result))


### Retrain the model with certain learning rate


In [27]:
tokenizer = AutoTokenizer.from_pretrained("nghuyong/ernie-2.0-base-en")

model = ErnieForSequenceClassification.from_pretrained("nghuyong/ernie-2.0-base-en")
            
#-----------------------------------------------------------------------------------     

X_train=list(df['Segment Text'].loc[(df['policy_type'] == 'TRAIN')])
X_test=list(df['Segment Text'].loc[(df['policy_type'] == 'TEST')])

y_train=list(df[target_column].loc[(df['policy_type'] == 'TRAIN')])
y_test=list(df[target_column].loc[(df['policy_type'] == 'TEST')])


train_encodings = tokenizer(X_train, max_length=512, truncation=True, padding='max_length')
test_encodings = tokenizer(X_test, max_length=512, truncation=True, padding='max_length')

#-----------

label_mapping = {0: 0, 1: 1}
y_train_integers = [label_mapping[label] for label in y_train]
y_test_integers = [label_mapping[label] for label in y_test]


class TextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]  # Removed torch.tensor here
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = TextDataset(train_encodings, y_train_integers)
test_dataset = TextDataset(test_encodings, y_test_integers)

#--------------------------

training_args = TrainingArguments(
    output_dir='./results',
    learning_rate= best_f1_lr , # Set learning rate here
    num_train_epochs=epochs ,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    evaluation_strategy="steps",
    eval_steps=3000

)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

trainer.train()


predictions = trainer.predict(test_dataset).predictions
predictions_label = np.argmax(predictions, axis=1)

cm=confusion_matrix(y_test_integers, predictions_label)
print(cm)

accuracy = accuracy_score(y_test_integers, predictions_label)
precision, recall, f1, _ = precision_recall_fscore_support(y_test_integers, predictions_label, average='binary')
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

print(f"Best f1 Learning Rat: {best_f1_lr}")

Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


[[717   3]
 [  5  12]]
Accuracy: 0.989145183175034
Precision: 0.8
Recall: 0.7058823529411765
F1 Score: 0.7500000000000001
Best f1 Learning Rat: 3e-05


# Fine Tuning last 2 layers 

In [23]:

best_lr = None
lowest_eval_loss = float('inf')

best_f1_lr = None
highest_f1 = 0

learning_rates =[1e-5, 2e-5, 3e-5, 5e-5, 1e-4, 2e-4, 3e-4]# [5e-5, 1e-4, 2e-4, 3e-4][ 1e-5, 2e-5, 3e-5][1e-5, 2e-5, 3e-5, 5e-5, 1e-4, 2e-4, 3e-4]

for lr in learning_rates:
    
#--------------------------------------------------------------------------------    
    tokenizer = AutoTokenizer.from_pretrained("nghuyong/ernie-2.0-base-en")

    model = ErnieForSequenceClassification.from_pretrained("nghuyong/ernie-2.0-base-en")

    for param in model.parameters():
        param.requires_grad = False

    # Unfreeze the last two layers of the ErnieEncoder
    for param in model.ernie.encoder.layer[-2:].parameters():
        param.requires_grad = True
            
#-----------------------------------------------------------------------------------           

    X_train = list(df['Segment Text'].loc[(df['policy_type'] == 'TRAIN')])
    y_train = list(df[target_column].loc[(df['policy_type'] == 'TRAIN')])


    # Split the data into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.20, random_state=0)


    # mapping labels

    label_mapping = {0: 0, 1: 1}
    y_train_integers = [label_mapping[label] for label in y_train]
    y_val_integers = [label_mapping[label] for label in y_val]

    train_encodings = tokenizer(X_train, max_length=512, truncation=True, padding='max_length')
    val_encodings = tokenizer(X_val, max_length=512, truncation=True, padding='max_length')

    #
    class TextDataset(torch.utils.data.Dataset):
        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
            item['labels'] = self.labels[idx]  # Removed torch.tensor here
            return item

        def __len__(self):
            return len(self.labels)


    train_dataset = TextDataset(train_encodings, y_train_integers)
    val_dataset = TextDataset(val_encodings, y_val_integers)
    
#---------------------------------------------------------------------------  
    
    # Set up training arguments with the current learning rate
    training_args = TrainingArguments(
        output_dir='./results',
        learning_rate=lr,  # Set learning rate here
        num_train_epochs=epochs ,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=100,
        evaluation_strategy="steps",
        eval_steps=3000
    )

    # Set up the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )


    # Train the model with the current learning rate
    print(f"Training with learning rate: {lr}")
    trainer.train()
    
 # Evaluate the model
    eval_result = trainer.evaluate()
    print(f"Evaluation loss for learning rate {lr}: {eval_result['eval_loss']}")
    
    
    predictions = trainer.predict(val_dataset).predictions
    predictions_label = np.argmax(predictions, axis=1)

    cm=confusion_matrix( y_val_integers, predictions_label)
    print(cm)

    accuracy = accuracy_score( y_val_integers, predictions_label)
    precision, recall, f1, _ = precision_recall_fscore_support(y_val_integers, predictions_label, average='binary')
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")

    if eval_result['eval_loss'] < lowest_eval_loss:
        best_lr = lr
        lowest_eval_loss = eval_result['eval_loss']
        
    if f1 > highest_f1:
        best_f1_lr = lr
        highest_f1 = f1
        
print(f"Best Learning Rate: {best_lr} with loss: {lowest_eval_loss}")
print(f"Best f1 Learning Rat: {best_f1_lr} with highest f1: { highest_f1}")

Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training with learning rate: 1e-05


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 1e-05: 0.08479280024766922
[[601  16]
 [  7  24]]
Accuracy: 0.9645061728395061
Precision: 0.6
Recall: 0.7741935483870968
F1 Score: 0.676056338028169


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training with learning rate: 2e-05


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 2e-05: 0.10813398659229279
[[595  22]
 [  8  23]]
Accuracy: 0.9537037037037037
Precision: 0.5111111111111111
Recall: 0.7419354838709677
F1 Score: 0.6052631578947368


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training with learning rate: 3e-05


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 3e-05: 0.10972175747156143
[[597  20]
 [  6  25]]
Accuracy: 0.9598765432098766
Precision: 0.5555555555555556
Recall: 0.8064516129032258
F1 Score: 0.6578947368421053


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training with learning rate: 5e-05


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 5e-05: 0.11369504779577255
[[595  22]
 [  5  26]]
Accuracy: 0.9583333333333334
Precision: 0.5416666666666666
Recall: 0.8387096774193549
F1 Score: 0.6582278481012658


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training with learning rate: 0.0001


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 0.0001: 0.09745791554450989
[[595  22]
 [  4  27]]
Accuracy: 0.9598765432098766
Precision: 0.5510204081632653
Recall: 0.8709677419354839
F1 Score: 0.6749999999999999


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training with learning rate: 0.0002


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 0.0002: 0.0965372622013092
[[595  22]
 [  5  26]]
Accuracy: 0.9583333333333334
Precision: 0.5416666666666666
Recall: 0.8387096774193549
F1 Score: 0.6582278481012658


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training with learning rate: 0.0003


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 0.0003: 0.09088344126939774
[[594  23]
 [  6  25]]
Accuracy: 0.9552469135802469
Precision: 0.5208333333333334
Recall: 0.8064516129032258
F1 Score: 0.6329113924050633
Best Learning Rate: 1e-05 with loss: 0.08479280024766922
Best f1 Learning Rat: 1e-05 with highest f1: 0.676056338028169


### Retrain the model with certain learning rate

In [24]:
tokenizer = AutoTokenizer.from_pretrained("nghuyong/ernie-2.0-base-en")

model = ErnieForSequenceClassification.from_pretrained("nghuyong/ernie-2.0-base-en")

# Freeze all parameters
for param in model.parameters():
    param.requires_grad = False

# Unfreeze the last two layers of the ErnieEncoder
for param in model.ernie.encoder.layer[-2:].parameters():
    param.requires_grad = True
#-----------------------------------------------------------------------------------     

X_train=list(df['Segment Text'].loc[(df['policy_type'] == 'TRAIN')])
X_test=list(df['Segment Text'].loc[(df['policy_type'] == 'TEST')])

y_train=list(df[target_column].loc[(df['policy_type'] == 'TRAIN')])
y_test=list(df[target_column].loc[(df['policy_type'] == 'TEST')])


train_encodings = tokenizer(X_train, max_length=512, truncation=True, padding='max_length')
test_encodings = tokenizer(X_test, max_length=512, truncation=True, padding='max_length')

#-----------

label_mapping = {0: 0, 1: 1}
y_train_integers = [label_mapping[label] for label in y_train]
y_test_integers = [label_mapping[label] for label in y_test]


class TextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]  # Removed torch.tensor here
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = TextDataset(train_encodings, y_train_integers)
test_dataset = TextDataset(test_encodings, y_test_integers)

#--------------------------

training_args = TrainingArguments(
    output_dir='./results',
    learning_rate= best_f1_lr , # Set learning rate here
    num_train_epochs=epochs ,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    evaluation_strategy="steps",
    eval_steps=3000

)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

trainer.train()

predictions = trainer.predict(test_dataset).predictions
predictions_label = np.argmax(predictions, axis=1)

cm=confusion_matrix(y_test_integers, predictions_label)
print(cm)

accuracy = accuracy_score(y_test_integers, predictions_label)
precision, recall, f1, _ = precision_recall_fscore_support(y_test_integers, predictions_label, average='binary')
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

print(f"Best f1 Learning Rat: {best_f1_lr} ")

Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


[[709   9]
 [  5  14]]
Accuracy: 0.9810040705563093
Precision: 0.6086956521739131
Recall: 0.7368421052631579
F1 Score: 0.6666666666666666
Best f1 Learning Rat: 1e-05 


# Imbalance / PEFT /Cost sensitve learning

In [13]:
best_lr = None
lowest_eval_loss = float('inf')


best_f1_lr = None
highest_f1 = 0


learning_rates =[1e-5, 2e-5, 3e-5, 5e-5, 1e-4, 2e-4, 3e-4]

for lr in learning_rates:
    
#--------------------------------------------------------------------------------    
    tokenizer = AutoTokenizer.from_pretrained("nghuyong/ernie-2.0-base-en")

    model = ErnieForSequenceClassification.from_pretrained("nghuyong/ernie-2.0-base-en")

    # Freeze all parameters
    for param in model.parameters():
        param.requires_grad = False

    # Unfreeze the last two layers of the ErnieEncoder
    for param in model.ernie.encoder.layer[-2:].parameters():
        param.requires_grad = True
            
#-----------------------------------------------------------------------------------           

    X_train = list(df['Segment Text'].loc[(df['policy_type'] == 'TRAIN')])
    y_train = list(df[target_column].loc[(df['policy_type'] == 'TRAIN')])


    # Split the data into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.20, random_state=0)


    # mapping labels

    label_mapping = {0: 0, 1: 1}
    y_train_integers = [label_mapping[label] for label in y_train]
    y_val_integers = [label_mapping[label] for label in y_val]

    train_encodings = tokenizer(X_train, max_length=512, truncation=True, padding='max_length')
    val_encodings = tokenizer(X_val, max_length=512, truncation=True, padding='max_length')

    #
    class TextDataset(torch.utils.data.Dataset):
        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
            item['labels'] = self.labels[idx]  # Removed torch.tensor here
            return item

        def __len__(self):
            return len(self.labels)


    train_dataset = TextDataset(train_encodings, y_train_integers)
    val_dataset = TextDataset(val_encodings, y_val_integers)
    
#---------------------------------------------------------------------------  


    from collections import Counter
    print(Counter(y_train))
    print(Counter(y_val))
    
        # Calculate class weights (example calculation, adjust as needed)

    total_samples = Counter(y_train)[0] + Counter(y_train)[1]
    class_weights = torch.tensor([total_samples /Counter(y_train)[0], total_samples / Counter(y_train)[1]], dtype=torch.float)

    print(total_samples /Counter(y_train)[0])
    print(total_samples / Counter(y_train)[1])

    # Ensure the model is on the correct device (e.g., GPU or CPU)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)  # Ensure model is on the correct device

    # Move class_weights to the same device as the model
    class_weights = class_weights.to(device)



    class CustomTrainer(Trainer):
        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.loss_fct = CrossEntropyLoss(weight=class_weights).to(device)

        def compute_loss(self, model, inputs, return_outputs=False):
            inputs = {k: v.to(device) for k, v in inputs.items()}
            labels = inputs.get("labels").to(device)

            outputs = model(**inputs)
            logits = outputs.logits

            loss = self.loss_fct(logits, labels)
            return (loss, outputs) if return_outputs else loss

    #-------------------------------------------------
    
    
    
    
    # Set up training arguments with the current learning rate
    training_args = TrainingArguments(
        output_dir='./results',
        learning_rate=lr,  # Set learning rate here
        num_train_epochs=epochs ,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=100,
        evaluation_strategy="steps",
        eval_steps=3000
    )

    # Set up the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )



    # Train the model with the current learning rate
    print(f"Training with learning rate: {lr}")
    trainer.train()
    
 # Evaluate the model
    eval_result = trainer.evaluate()
    print(f"Evaluation loss for learning rate {lr}: {eval_result['eval_loss']}")
    
    
    predictions = trainer.predict(val_dataset).predictions
    predictions_label = np.argmax(predictions, axis=1)

    cm=confusion_matrix( y_val_integers, predictions_label)
    print(cm)

    accuracy = accuracy_score( y_val_integers, predictions_label)
    precision, recall, f1, _ = precision_recall_fscore_support(y_val_integers, predictions_label, average='binary')
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")

    if eval_result['eval_loss'] < lowest_eval_loss:
        best_lr = lr
        lowest_eval_loss = eval_result['eval_loss']

    if f1 > highest_f1:
        best_f1_lr = lr
        highest_f1 = f1
        

print(f"Best Learning Rate: {best_lr} with loss: {lowest_eval_loss}")
print(f"Best f1 Learning Rat: {best_f1_lr} with highest f1: { highest_f1}")


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Counter({0: 2558, 1: 33})
Counter({0: 640, 1: 8})
1.012900703674746
78.51515151515152
Training with learning rate: 1e-05


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 1e-05: 0.0627056136727333
[[640   0]
 [  8   0]]
Accuracy: 0.9876543209876543
Precision: 0.0
Recall: 0.0
F1 Score: 0.0


  _warn_prf(average, modifier, msg_start, len(result))
Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Counter({0: 2558, 1: 33})
Counter({0: 640, 1: 8})
1.012900703674746
78.51515151515152
Training with learning rate: 2e-05


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 2e-05: 0.027622675523161888
[[639   1]
 [  7   1]]
Accuracy: 0.9876543209876543
Precision: 0.5
Recall: 0.125
F1 Score: 0.2


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Counter({0: 2558, 1: 33})
Counter({0: 640, 1: 8})
1.012900703674746
78.51515151515152
Training with learning rate: 3e-05


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 3e-05: 0.029483383521437645
[[638   2]
 [  5   3]]
Accuracy: 0.9891975308641975
Precision: 0.6
Recall: 0.375
F1 Score: 0.4615384615384615


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Counter({0: 2558, 1: 33})
Counter({0: 640, 1: 8})
1.012900703674746
78.51515151515152
Training with learning rate: 5e-05


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 5e-05: 0.030799565836787224
[[639   1]
 [  5   3]]
Accuracy: 0.9907407407407407
Precision: 0.75
Recall: 0.375
F1 Score: 0.5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Counter({0: 2558, 1: 33})
Counter({0: 640, 1: 8})
1.012900703674746
78.51515151515152
Training with learning rate: 0.0001


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 0.0001: 0.04429016634821892
[[639   1]
 [  6   2]]
Accuracy: 0.9891975308641975
Precision: 0.6666666666666666
Recall: 0.25
F1 Score: 0.36363636363636365


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Counter({0: 2558, 1: 33})
Counter({0: 640, 1: 8})
1.012900703674746
78.51515151515152
Training with learning rate: 0.0002


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 0.0002: 0.027863502502441406
[[637   3]
 [  5   3]]
Accuracy: 0.9876543209876543
Precision: 0.5
Recall: 0.375
F1 Score: 0.42857142857142855


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Counter({0: 2558, 1: 33})
Counter({0: 640, 1: 8})
1.012900703674746
78.51515151515152
Training with learning rate: 0.0003


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 0.0003: 0.03434814140200615
[[640   0]
 [  8   0]]
Accuracy: 0.9876543209876543
Precision: 0.0
Recall: 0.0
F1 Score: 0.0
Best Learning Rate: 2e-05 with loss: 0.027622675523161888
Best f1 Learning Rat: 5e-05 with highest f1: 0.5


  _warn_prf(average, modifier, msg_start, len(result))


### Retrain the model with certain learning rate

In [14]:
tokenizer = AutoTokenizer.from_pretrained("nghuyong/ernie-2.0-base-en")

model = ErnieForSequenceClassification.from_pretrained("nghuyong/ernie-2.0-base-en")

# Freeze all parameters
for param in model.parameters():
    param.requires_grad = False

# Unfreeze the last two layers of the ErnieEncoder
for param in model.ernie.encoder.layer[-2:].parameters():
    param.requires_grad = True
                       
#-----------------------------------------------------------------------------------     

X_train=list(df['Segment Text'].loc[(df['policy_type'] == 'TRAIN')])
X_test=list(df['Segment Text'].loc[(df['policy_type'] == 'TEST')])

y_train=list(df[target_column].loc[(df['policy_type'] == 'TRAIN')])
y_test=list(df[target_column].loc[(df['policy_type'] == 'TEST')])


train_encodings = tokenizer(X_train, max_length=512, truncation=True, padding='max_length')
test_encodings = tokenizer(X_test, max_length=512, truncation=True, padding='max_length')

#-----------

label_mapping = {0: 0, 1: 1}
y_train_integers = [label_mapping[label] for label in y_train]
y_test_integers = [label_mapping[label] for label in y_test]


class TextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]  # Removed torch.tensor here
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = TextDataset(train_encodings, y_train_integers)
test_dataset = TextDataset(test_encodings, y_test_integers)

#--------------------------


from collections import Counter
print(Counter(y_train))
print(Counter(y_test))


# Calculate class weights (example calculation, adjust as needed)

total_samples = Counter(y_train)[0] + Counter(y_train)[1]
class_weights = torch.tensor([total_samples /Counter(y_train)[0], total_samples / Counter(y_train)[1]], dtype=torch.float)

print(total_samples /Counter(y_train)[0])
print(total_samples / Counter(y_train)[1])

# Ensure the model is on the correct device (e.g., GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # Ensure model is on the correct device

# Move class_weights to the same device as the model
class_weights = class_weights.to(device)



class CustomTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_fct = CrossEntropyLoss(weight=class_weights).to(device)

    def compute_loss(self, model, inputs, return_outputs=False):
        inputs = {k: v.to(device) for k, v in inputs.items()}
        labels = inputs.get("labels").to(device)

        outputs = model(**inputs)
        logits = outputs.logits

        loss = self.loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

#-------------------------------------------------


training_args = TrainingArguments(
    learning_rate= best_f1_lr,
    output_dir='./results',
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    #logging_dir='./logs',
    logging_steps=100,
    evaluation_strategy="steps",
    eval_steps=3000

)

trainer = CustomTrainer(
    model=model,  # Your model
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)


trainer.train()


predictions = trainer.predict(test_dataset).predictions
predictions_label = np.argmax(predictions, axis=1)

cm=confusion_matrix(y_test_integers, predictions_label)
print(cm)

accuracy = accuracy_score(y_test_integers, predictions_label)
precision, recall, f1, _ = precision_recall_fscore_support(y_test_integers, predictions_label, average='binary')

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

print(f"Best f1 Learning Rat: {best_f1_lr}")

Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Counter({0: 3198, 1: 41})
Counter({0: 715, 1: 22})
1.0128205128205128
79.0


Step,Training Loss,Validation Loss


[[712   3]
 [  5  17]]
Accuracy: 0.989145183175034
Precision: 0.85
Recall: 0.7727272727272727
F1 Score: 0.8095238095238095
Best f1 Learning Rat: 5e-05


# Imbalance / AEDA

In [29]:

def aeda(text):
    """
    Apply AEDA augmentation to a given text.

    :param text: The input text string to be augmented.
    :return: Augmented sentence.
    """
    punctuation_marks = [".", ";", "?", ":", "!", ""]
    words = text.split()

    n_insertions = random.randint(1, max(1, len(words) // 3))
    insert_positions = np.random.choice(len(words), n_insertions, replace=False)

    for pos in sorted(insert_positions, reverse=True):
        words.insert(pos, random.choice(punctuation_marks))

    return ' '.join(words)

def balance_classes(df, column_name, text_column , modification_function=aeda):
    """
    Balances the classes in a DataFrame.

    Parameters:
    - df: pandas DataFrame to balance.
    - column_name: the name of the column to balance by (e.g., 'Third Party').
    - text_column: the name of the text column to modify (e.g., 'Segment Text').
    - modification_function: the function to apply to modify the text (default: aeda).

    Returns:
    - A balanced DataFrame.
    """
    # Count the instances of each class
    class_counts = df[column_name].value_counts()
    max_class = class_counts.idxmax()
    min_class = class_counts.idxmin()

    # Calculate the number of samples needed to balance the classes
    count_diff = class_counts[max_class] - class_counts[min_class]


    # Select rows of the minority class and duplicate them
    df_minority = df[df[column_name] == min_class]
    df_minority_augmented = df_minority.sample(count_diff, replace=True)

    # Apply the modification function to the text column
    df_minority_augmented[text_column] = df_minority_augmented[text_column].apply(modification_function)

    # Combine the original DataFrame with the augmented rows
    df_balanced = pd.concat([df, df_minority_augmented], ignore_index=True)

    return df_balanced



In [30]:

best_lr = None
lowest_eval_loss = float('inf')

best_f1_lr = None
highest_f1 = 0

learning_rates = [1e-5, 2e-5, 3e-5, 5e-5, 1e-4, 2e-4, 3e-4]
for lr in learning_rates:
    
#--------------------------------------------------------------------------------    

    tokenizer = AutoTokenizer.from_pretrained("nghuyong/ernie-2.0-base-en")

    model = ErnieForSequenceClassification.from_pretrained("nghuyong/ernie-2.0-base-en")

    # Freeze all parameters
    for param in model.parameters():
        param.requires_grad = False

    # Unfreeze the last two layers of the ErnieEncoder
    for param in model.ernie.encoder.layer[-2:].parameters():
        param.requires_grad = True
            
#-----------------------------------------------------------------------------------  
    df_train = df[['Segment Text',target_column]].loc[df['policy_type'] == 'TRAIN']
    
    X = df_train['Segment Text']
    y = df_train[target_column]


    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_state=0)

    df_train = pd.concat([X_train, y_train], axis=1)
    print(df_train[target_column].value_counts())

    df_train_aug = balance_classes(df_train,target_column,'Segment Text' )
    print(df_train_aug[target_column].value_counts())

    X_train=list(df_train_aug['Segment Text'])
    y_train=list(df_train_aug[target_column])
    

    X_val=list(X_val)
    y_val=list( y_val)


    # mapping labels

    label_mapping = {0: 0, 1: 1}
    y_train_integers = [label_mapping[label] for label in y_train]
    y_val_integers = [label_mapping[label] for label in y_val]

    train_encodings = tokenizer(X_train, max_length=512, truncation=True, padding='max_length')
    val_encodings = tokenizer(X_val, max_length=512, truncation=True, padding='max_length')


    #
    class TextDataset(torch.utils.data.Dataset):
        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
            item['labels'] = self.labels[idx]  # Removed torch.tensor here
            return item

        def __len__(self):
            return len(self.labels)


    train_dataset = TextDataset(train_encodings, y_train_integers)
    val_dataset = TextDataset(val_encodings, y_val_integers)



#---------------------------------------------------------------------------  
    
    # Set up training arguments with the current learning rate
    training_args = TrainingArguments(
        output_dir='./results',
        learning_rate=lr,  # Set learning rate here
        num_train_epochs=epochs ,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=100,
        evaluation_strategy="steps",
        eval_steps=3000
    )

    # Set up the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )


    # Train the model with the current learning rate
    print(f"Training with learning rate: {lr}")
    trainer.train()
    
 # Evaluate the model
    eval_result = trainer.evaluate()
    print(f"Evaluation loss for learning rate {lr}: {eval_result['eval_loss']}")
    
    
    predictions = trainer.predict(val_dataset).predictions
    predictions_label = np.argmax(predictions, axis=1)

    cm=confusion_matrix( y_val_integers, predictions_label)
    print(cm)

    accuracy = accuracy_score( y_val_integers, predictions_label)
    precision, recall, f1, _ = precision_recall_fscore_support(y_val_integers, predictions_label, average='binary')
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")

    if eval_result['eval_loss'] < lowest_eval_loss:
        best_lr = lr
        lowest_eval_loss = eval_result['eval_loss']
            
    if f1 > highest_f1:
        best_f1_lr = lr
        highest_f1 = f1
        

print(f"Best Learning Rate: {best_lr} with loss: {lowest_eval_loss}")
print(f"Best f1 Learning Rat: {best_f1_lr} with highest f1: { highest_f1}")

Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Collection Process_Collected on first-party website/app
0    2372
1     219
Name: count, dtype: int64
Collection Process_Collected on first-party website/app
0    2372
1    2372
Name: count, dtype: int64
Training with learning rate: 1e-05


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 1e-05: 0.26696377992630005
[[580  13]
 [ 33  22]]
Accuracy: 0.9290123456790124
Precision: 0.6285714285714286
Recall: 0.4
F1 Score: 0.488888888888889


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Collection Process_Collected on first-party website/app
0    2372
1     219
Name: count, dtype: int64
Collection Process_Collected on first-party website/app
0    2372
1    2372
Name: count, dtype: int64
Training with learning rate: 2e-05


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 2e-05: 0.2481098175048828
[[578  15]
 [ 26  29]]
Accuracy: 0.9367283950617284
Precision: 0.6590909090909091
Recall: 0.5272727272727272
F1 Score: 0.5858585858585857


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Collection Process_Collected on first-party website/app
0    2372
1     219
Name: count, dtype: int64
Collection Process_Collected on first-party website/app
0    2372
1    2372
Name: count, dtype: int64
Training with learning rate: 3e-05


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 3e-05: 0.22485728561878204
[[580  13]
 [ 25  30]]
Accuracy: 0.941358024691358
Precision: 0.6976744186046512
Recall: 0.5454545454545454
F1 Score: 0.6122448979591837


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Collection Process_Collected on first-party website/app
0    2372
1     219
Name: count, dtype: int64
Collection Process_Collected on first-party website/app
0    2372
1    2372
Name: count, dtype: int64
Training with learning rate: 5e-05


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 5e-05: 0.21738296747207642
[[578  15]
 [ 29  26]]
Accuracy: 0.9320987654320988
Precision: 0.6341463414634146
Recall: 0.4727272727272727
F1 Score: 0.5416666666666666


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Collection Process_Collected on first-party website/app
0    2372
1     219
Name: count, dtype: int64
Collection Process_Collected on first-party website/app
0    2372
1    2372
Name: count, dtype: int64
Training with learning rate: 0.0001


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 0.0001: 0.21494674682617188
[[575  18]
 [ 29  26]]
Accuracy: 0.9274691358024691
Precision: 0.5909090909090909
Recall: 0.4727272727272727
F1 Score: 0.5252525252525252


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Collection Process_Collected on first-party website/app
0    2372
1     219
Name: count, dtype: int64
Collection Process_Collected on first-party website/app
0    2372
1    2372
Name: count, dtype: int64
Training with learning rate: 0.0002


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 0.0002: 0.2396884709596634
[[579  14]
 [ 40  15]]
Accuracy: 0.9166666666666666
Precision: 0.5172413793103449
Recall: 0.2727272727272727
F1 Score: 0.3571428571428571


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Collection Process_Collected on first-party website/app
0    2372
1     219
Name: count, dtype: int64
Collection Process_Collected on first-party website/app
0    2372
1    2372
Name: count, dtype: int64
Training with learning rate: 0.0003


Step,Training Loss,Validation Loss


Evaluation loss for learning rate 0.0003: 0.21375030279159546
[[573  20]
 [ 24  31]]
Accuracy: 0.9320987654320988
Precision: 0.6078431372549019
Recall: 0.5636363636363636
F1 Score: 0.5849056603773585
Best Learning Rate: 0.0003 with loss: 0.21375030279159546
Best f1 Learning Rat: 3e-05 with highest f1: 0.6122448979591837


### Retrain the model with certain learning rate

In [31]:
tokenizer = AutoTokenizer.from_pretrained("nghuyong/ernie-2.0-base-en")

model = ErnieForSequenceClassification.from_pretrained("nghuyong/ernie-2.0-base-en")

# Freeze all parameters
for param in model.parameters():
    param.requires_grad = False

# Unfreeze the last two layers of the ErnieEncoder
for param in model.ernie.encoder.layer[-2:].parameters():
    param.requires_grad = True
                       
#-----------------------------------------------------------------------------------     
df_train = df[['Segment Text',target_column]].loc[df['policy_type'] == 'TRAIN']
print(df_train[target_column].value_counts())

df_train_aug = balance_classes(df_train,target_column,'Segment Text' )
print(df_train_aug[target_column].value_counts())


X_train=list(df_train_aug['Segment Text'])
y_train=list(df_train_aug[target_column])

X_test=list(df['Segment Text'].loc[(df['policy_type'] == 'TEST')])
y_test=list(df[target_column].loc[(df['policy_type'] == 'TEST')])


train_encodings = tokenizer(X_train, max_length=512, truncation=True, padding='max_length')
test_encodings = tokenizer(X_test, max_length=512, truncation=True, padding='max_length')


label_mapping = {0: 0, 1: 1}
y_train_integers = [label_mapping[label] for label in y_train]
y_test_integers = [label_mapping[label] for label in y_test]


class TextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]  # Removed torch.tensor here
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = TextDataset(train_encodings, y_train_integers)
test_dataset = TextDataset(test_encodings, y_test_integers)


#------------------------
from collections import Counter
print(Counter(y_train))
print(Counter(y_test))

#--------------------------

training_args = TrainingArguments(
    output_dir='./results',
    learning_rate= best_f1_lr , # Set learning rate here
    num_train_epochs=epochs ,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    evaluation_strategy="steps",
    eval_steps=3000

)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

trainer.train()

predictions = trainer.predict(test_dataset).predictions
predictions_label = np.argmax(predictions, axis=1)

cm=confusion_matrix(y_test_integers, predictions_label)
print(cm)

accuracy = accuracy_score(y_test_integers, predictions_label)
precision, recall, f1, _ = precision_recall_fscore_support(y_test_integers, predictions_label, average='binary')
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

print(f"Best f1 Learning Rat: {best_f1_lr} ")

Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Collection Process_Collected on first-party website/app
0    2965
1     274
Name: count, dtype: int64
Collection Process_Collected on first-party website/app
1    2965
0    2965
Name: count, dtype: int64
Counter({1: 2965, 0: 2965})
Counter({0: 680, 1: 57})


Step,Training Loss,Validation Loss


[[664  16]
 [ 36  21]]
Accuracy: 0.9294436906377205
Precision: 0.5675675675675675
Recall: 0.3684210526315789
F1 Score: 0.44680851063829785
Best f1 Learning Rat: 3e-05 
