# Setting up

## installing libraries and importing dataset

In [14]:
# !pip install transformers
# !pip install torch
!pip install openpyxl



In [15]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertModel
import torch.nn as nn
import torch
from torch import nn, optim
import torch
from torch.optim.lr_scheduler import StepLR
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, label_ranking_average_precision_score
import pandas as pd
from tqdm.notebook import tqdm
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, label_ranking_average_precision_score
import pandas as pd

from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from torch.optim.lr_scheduler import OneCycleLR




In [16]:
df = pd.read_excel("/content/SECBERT_labelled_training_set.xlsx")
df = df[['text','E','S','G']]
df.head()

Unnamed: 0,text,E,S,G
0,Entergy continues to support national legislat...,1,0,0
1,"On August 16, 2012, the EPA published final re...",1,0,0
2,"We are subject to a variety of risks, includin...",1,1,1
3,Water Discharges. The Federal Water Pollution ...,1,0,0
4,There are increasing and rapidly evolving conc...,1,0,0


## Define Funtions an parameters

### Custom dataset class

In [26]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data[['E', 'S', 'G']]
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )

        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index].tolist(), dtype=torch.float)
        }




## Define Hyperparameters

In [27]:
LR = 3e-5
MAX_LEN = 512
BATCH_SIZE = 16
DROPOUT = 0.3
SCHEDULER_OPT_GAMMA = 0.999
MAX_EPOCHS = 50
MIN_DELTA_ES = 0.015
PATIENCE_ES = 7
GU_START = 2



### Define training funtion

In [28]:
def train_and_evaluate(model, train_loader, validation_loader,testing_loader, optimizer, scheduler, loss_fn,
                       num_epochs=MAX_EPOCHS,min_delta=MIN_DELTA_ES,patience=PATIENCE_ES):

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # scheduler = StepLR(optimizer, step_size=1, gamma=SCHEDULER_OPT_GAMMA)  # Adjust as needed

    best_score = float('inf')
    early_stopping_counter = 0

    # Move model to the specified device
    model = model.to(device)

    # Initialize logs
    train_logs = pd.DataFrame(columns=['Epoch', 'Loss'])
    columns = [ 'Epoch', 'Loss',
                'LRAP', 'micro avg Acc', 'micro avg F1', 'micro Rec', 'micro avg Prec',
                'E Acc', 'E F1', 'E Rec', 'E Prec',
                'S Acc', 'S F1', 'S Rec', 'S Prec',
                'G Acc', 'G F1', 'G Rec', 'G Prec'
        ]
    validation_logs = {col: [] for col in columns}

    best_loss = float('inf')
    best_model = None

    i = 0
    for epoch in tqdm(range(num_epochs), desc='Epoch Progress'):


        # Training
        for param_group in optimizer.param_groups:
            current_lr = param_group['lr']
            print('~'*70,f'\nEpoch {epoch+1} \n', f'Learning Rate: \t\t{round(current_lr, 9)}')

        i+=1
        model.train()
        total_loss = 0
        for batch in tqdm(train_loader, desc=f'Epoch {epoch+1}'):
            # Adjusting to the new batch structure
            inputs = batch['ids'].to(device)
            attention_mask = batch['mask'].to(device)
            labels = batch['targets'].to(device)

            optimizer.zero_grad()
            # Assuming your model takes these as inputs, adjust if your model's forward method differs
            outputs = model(inputs, attention_mask=attention_mask)
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            # scheduler.step()
        scheduler.step()





        avg_train_loss = total_loss / len(train_loader)
        new_row = pd.DataFrame({'Epoch': [epoch+1], 'Loss': [avg_train_loss]})
        train_logs = pd.concat([train_logs, new_row], ignore_index=True)


        # Validation
        model.eval()
        total_loss = 0
        all_predictions = []
        all_targets = []
        with torch.no_grad():
            for batch in validation_loader:
                inputs = batch['ids'].to(device)
                attention_mask = batch['mask'].to(device)
                labels = batch['targets'].to(device)


                outputs = model(inputs, attention_mask=attention_mask)
                loss = loss_fn(outputs, labels)
                total_loss += loss.item()

                all_predictions.append(outputs.detach().cpu().numpy())
                all_targets.append(labels.detach().cpu().numpy())

        avg_val_loss = total_loss / len(validation_loader)

        # Calculate metrics
        all_predictions = np.vstack(all_predictions)
        # Apply Sigmoid Funtion
        all_predictions = 1 / (1 + np.exp(-all_predictions))
        pred_E = all_predictions[:, 0].tolist()  # First column to list
        pred_S = all_predictions[:, 1].tolist()  # Second column to list
        pred_G = all_predictions[:, 2].tolist()  # Third column to list

        all_targets = np.vstack(all_targets)
        target_E = all_targets[:, 0].tolist()  # First column to list
        target_S = all_targets[:, 1].tolist()  # Second column to list
        target_G = all_targets[:, 2].tolist()  # Third column to list

        binary_pred_E = [1 if pred >= 0.5 else 0 for pred in pred_E]
        binary_pred_S = [1 if pred >= 0.5 else 0 for pred in pred_S]
        binary_pred_G = [1 if pred >= 0.5 else 0 for pred in pred_G]

        validation_logs['LRAP'].append(label_ranking_average_precision_score(all_targets, all_predictions))
        # validation_logs['LRAP'].append(i)

        def calculate_and_append_metrics(pred, target, prefix):
            acc = accuracy_score(target, pred)
            f1 = f1_score(target, pred,zero_division=0)
            rec = recall_score(target, pred,zero_division=0)
            prec = precision_score(target, pred,zero_division=0)

            validation_logs[f'{prefix} Acc'].append(acc)
            validation_logs[f'{prefix} F1'].append(f1)
            validation_logs[f'{prefix} Rec'].append(rec)
            validation_logs[f'{prefix} Prec'].append(prec)

        # Calculate and append metrics for E, S, G
        calculate_and_append_metrics(binary_pred_E, target_E, 'E')
        calculate_and_append_metrics(binary_pred_S, target_S, 'S')
        calculate_and_append_metrics(binary_pred_G, target_G, 'G')

        # Concatenate predictions and targets across all categories for micro-average calculation
        all_binary_preds = binary_pred_E + binary_pred_S + binary_pred_G
        all_targets_joined = target_E + target_S + target_G
        binary_predictions = np.where(all_predictions >= 0.5, 1, 0)

        # Calculate micro-averaged metrics
        micro_acc = accuracy_score(all_targets_joined, all_binary_preds)
        micro_f1 = f1_score(binary_predictions, all_targets, average='micro',zero_division=0)
        micro_rec = recall_score(binary_predictions, all_targets, average='micro',zero_division=0)
        micro_prec = precision_score(binary_predictions, all_targets, average='micro',zero_division=0)

        # Append micro-averaged metrics to the validation_logs dictionary
        validation_logs['micro avg Acc'].append(micro_acc)
        validation_logs['micro avg F1'].append(micro_f1)
        validation_logs['micro Rec'].append(micro_rec)
        validation_logs['micro avg Prec'].append(micro_prec)

        validation_logs['Loss'].append(avg_val_loss)
        validation_logs['Epoch'].append(i)


        print(f'\t TRAINING    avg loss:\t {avg_train_loss}')
        print(f'\t EVALUATION  avg loss:\t {avg_val_loss}\n')

        # print(all_predictions)
        # print(all_targets)


        # Early stopping
        score = avg_val_loss  # Or any other metric you're monitoring
        if best_score - score > min_delta:
            best_score = score
            best_model = model
            best_model_epoch = i
            early_stopping_counter = 0
        else:
            early_stopping_counter += 1

        if early_stopping_counter >= patience:
            print('-'*70, f"\nEarly stopping triggered. Stopping training at epoch {epoch+1}\n")
            break

    validation_logs = pd.DataFrame(validation_logs)
    torch.cuda.empty_cache()

    print('-'*70, f"\n\t\tBEST MODEL: \tEPOCH: {best_model_epoch}\n")

    display(validation_logs[validation_logs['Epoch']==best_model_epoch])

    del model

    return best_model, train_logs, validation_logs


def highlight_max(s):
    '''
    Highlight the maximum in a Series yellow.
    '''
    is_max = s == s.max()
    return ['background-color: green' if v else '' for v in is_max]

# Function to highlight the min value in each row
def highlight_min(s):
    '''
    Highlight the minimum in a Series lightgreen.
    '''
    is_min = s == s.min()
    return ['background-color: red' if v else '' for v in is_min]

In [29]:
def train_and_evaluate_GU(model, train_loader, validation_loader,testing_loader, optimizer, scheduler, loss_fn,
                       num_epochs=MAX_EPOCHS,min_delta=MIN_DELTA_ES,patience=PATIENCE_ES, gu_start=GU_START):

    print('Fine Tuning with: Early Stopping, Gradual Unfreezing')
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # scheduler = StepLR(optimizer, step_size=1, gamma=SCHEDULER_OPT_GAMMA)  # Adjust as needed


    best_score = float('inf')
    early_stopping_counter = 0

    # Move model to the specified device
    model = model.to(device)

    # Initialize logs
    train_logs = pd.DataFrame(columns=['Epoch', 'Loss'])
    columns = [ 'Epoch', 'Loss',
                'LRAP', 'micro avg Acc', 'micro avg F1', 'micro Rec', 'micro avg Prec',
                'E Acc', 'E F1', 'E Rec', 'E Prec',
                'S Acc', 'S F1', 'S Rec', 'S Prec',
                'G Acc', 'G F1', 'G Rec', 'G Prec'
        ]
    validation_logs = {col: [] for col in columns}

    best_loss = float('inf')
    best_model = None
    i = 0
    gu_start = GU_START-1


    # Gradual Unfreezing
    # Freeze all layers in the BERT model
    for param in model.bert.parameters():
        param.requires_grad = False

    # Epoch Loop
    for epoch in tqdm(range(num_epochs), desc='Epoch Progress'):

        # Training
        for param_group in optimizer.param_groups:
            current_lr = param_group['lr']
            print('~'*70,f'\nEpoch {epoch+1} ', f'\nLearning Rate: \t{round(current_lr, 9)}')


        # GU Logic
        if epoch >= gu_start:
            # Calculate the layer to unfreeze (12 layers in total, indexed from 0 to 11)
            layer_to_unfreeze = epoch -gu_start  # This will be 0 in the third epoch, 1 in the fourth, and so on.
            # Ensure the layer index is within the bounds of the model's layers
            if layer_to_unfreeze < 7:
                # Unfreeze the specified layer
                print('Layers froozen: \t', 12-1-layer_to_unfreeze)
                for param in model.bert.encoder.layer[layer_to_unfreeze].parameters():
                    param.requires_grad = True
            else:
                print('Layers froozen: \t 0 (NONE)')
        else:
          print('Layers froozen: \t 12 (ALL)')



        i+=1
        model.train()
        total_loss = 0


        for batch in tqdm(train_loader, desc=f'Epoch {epoch+1}'):
            # Adjusting to the new batch structure
            inputs = batch['ids'].to(device)
            attention_mask = batch['mask'].to(device)
            labels = batch['targets'].to(device)

            optimizer.zero_grad()
            # Assuming your model takes these as inputs, adjust if your model's forward method differs
            outputs = model(inputs, attention_mask=attention_mask)
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            scheduler.step()
        # scheduler.step()





        avg_train_loss = total_loss / len(train_loader)
        new_row = pd.DataFrame({'Epoch': [epoch+1], 'Loss': [avg_train_loss]})
        train_logs = pd.concat([train_logs, new_row], ignore_index=True)


        # Validation
        model.eval()
        total_loss = 0
        all_predictions = []
        all_targets = []
        with torch.no_grad():
            for batch in validation_loader:
                inputs = batch['ids'].to(device)
                attention_mask = batch['mask'].to(device)
                labels = batch['targets'].to(device)


                outputs = model(inputs, attention_mask=attention_mask)
                loss = loss_fn(outputs, labels)
                total_loss += loss.item()

                all_predictions.append(outputs.detach().cpu().numpy())
                all_targets.append(labels.detach().cpu().numpy())

        avg_val_loss = total_loss / len(validation_loader)

        # Calculate metrics
        all_predictions = np.vstack(all_predictions)
        # Apply Sigmoid Funtion
        all_predictions = 1 / (1 + np.exp(-all_predictions))
        pred_E = all_predictions[:, 0].tolist()  # First column to list
        pred_S = all_predictions[:, 1].tolist()  # Second column to list
        pred_G = all_predictions[:, 2].tolist()  # Third column to list

        all_targets = np.vstack(all_targets)
        target_E = all_targets[:, 0].tolist()  # First column to list
        target_S = all_targets[:, 1].tolist()  # Second column to list
        target_G = all_targets[:, 2].tolist()  # Third column to list

        binary_pred_E = [1 if pred >= 0.5 else 0 for pred in pred_E]
        binary_pred_S = [1 if pred >= 0.5 else 0 for pred in pred_S]
        binary_pred_G = [1 if pred >= 0.5 else 0 for pred in pred_G]

        validation_logs['LRAP'].append(label_ranking_average_precision_score(all_targets, all_predictions))
        # validation_logs['LRAP'].append(i)

        def calculate_and_append_metrics(pred, target, prefix):
            acc = accuracy_score(target, pred)
            f1 = f1_score(target, pred,zero_division=0)
            rec = recall_score(target, pred,zero_division=0)
            prec = precision_score(target, pred,zero_division=0)


            validation_logs[f'{prefix} Acc'].append(acc)
            validation_logs[f'{prefix} F1'].append(f1)
            validation_logs[f'{prefix} Rec'].append(rec)
            validation_logs[f'{prefix} Prec'].append(prec)

        # Calculate and append metrics for E, S, G
        calculate_and_append_metrics(binary_pred_E, target_E, 'E')
        calculate_and_append_metrics(binary_pred_S, target_S, 'S')
        calculate_and_append_metrics(binary_pred_G, target_G, 'G')

        # Concatenate predictions and targets across all categories for micro-average calculation
        all_binary_preds = binary_pred_E + binary_pred_S + binary_pred_G
        all_targets_joined = target_E + target_S + target_G
        binary_predictions = np.where(all_predictions >= 0.5, 1, 0)

        # Calculate micro-averaged metrics
        micro_acc = accuracy_score(all_targets_joined, all_binary_preds)
        micro_f1 = f1_score(binary_predictions, all_targets, average='micro',zero_division=0)
        micro_rec = recall_score(binary_predictions, all_targets, average='micro',zero_division=0)
        micro_prec = precision_score(binary_predictions, all_targets, average='micro',zero_division=0)

        # Append micro-averaged metrics to the validation_logs dictionary
        validation_logs['micro avg Acc'].append(micro_acc)
        validation_logs['micro avg F1'].append(micro_f1)
        validation_logs['micro Rec'].append(micro_rec)
        validation_logs['micro avg Prec'].append(micro_prec)

        validation_logs['Loss'].append(avg_val_loss)
        validation_logs['Epoch'].append(i)


        print(f'\t TRAINING    avg loss:\t {avg_train_loss}')
        print(f'\t EVALUATION  avg loss:\t {avg_val_loss}\n')

        # print(all_predictions)
        # print(all_targets)


        # Early stopping
        score = avg_val_loss  # Or any other metric you're monitoring
        if best_score - score > min_delta:
            best_score = score
            best_model = model
            best_model_epoch = i
            early_stopping_counter = 0
        else:
            early_stopping_counter += 1

        if early_stopping_counter >= patience:
            print('-'*70, f"\nEarly stopping triggered. Stopping training at epoch {epoch+1}\n")
            break

    validation_logs = pd.DataFrame(validation_logs)
    torch.cuda.empty_cache()

    print('-'*70, f"\n\t\tBEST MODEL: \tEPOCH: {best_model_epoch}\n")

    display(validation_logs[validation_logs['Epoch']==best_model_epoch])

    del model

    return best_model, train_logs, validation_logs



## Define funtion that tokenise the dataset and fine tune the model

In [37]:
def tokenize_and_fine_tune(selected_model,training_funtion, df = df):


  torch.cuda.empty_cache()
  print('SELECTED MODEL: \t', selected_model)

  tokenizer = BertTokenizer.from_pretrained(selected_model)
  class BERT_base(nn.Module):
      def __init__(self, n_classes):
          super(BERT_base, self).__init__()
          self.bert = BertModel.from_pretrained(selected_model)
          self.drop = nn.Dropout(p=DROPOUT)
          self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

      def forward(self, input_ids, attention_mask):
          _, pooled_output = self.bert(
              input_ids=input_ids,
              attention_mask=attention_mask,
              return_dict=False
          )
          output = self.drop(pooled_output)
          return self.out(output)
          # return torch.sigmoid(self.out(output)) # DO NOT USE IF USING BCE With logit

  # OTHER MODELS

  # Assuming model is an instance of BERTMultiLabelClassifier
  model_base = BERT_base(n_classes=3)  # Adjust n_classes based on your dataset
  # OTHER MODELS

  # Loss Function
  # loss_fn = nn.BCELoss()
  loss_fn = nn.BCEWithLogitsLoss()

  # Define the size of each split
  train_size = 0.7
  validation_test_size = 0.5

  train_dataset, temp_dataset = train_test_split(df, train_size=train_size, random_state=1984)
  val_dataset, test_dataset = train_test_split(temp_dataset, test_size=validation_test_size, random_state=1984)

  # Reset the index
  train_dataset = train_dataset.reset_index(drop=True)
  val_dataset = val_dataset.reset_index(drop=True)
  test_dataset = test_dataset.reset_index(drop=True)

  # Proceed to create datasets for DataLoader
  training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
  validation_set = CustomDataset(val_dataset, tokenizer, MAX_LEN)
  testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

  # DataLoader parameters
  train_params = {'batch_size': BATCH_SIZE, 'shuffle': True, 'num_workers': 0}
  # Create DataLoaders for each dataset
  training_loader = DataLoader(training_set, **train_params)
  validation_loader = DataLoader(validation_set, **train_params)
  testing_loader = DataLoader(testing_set, **train_params)

  print('Tokenization completed')

  optimizer = optim.AdamW(model_base.parameters(), lr=LR)
  # optimizer = optim.SGD(model_base.parameters(), lr=LR, momentum=0.9, nesterov=True)


  scheduler = OneCycleLR(optimizer, max_lr=9e-5, steps_per_epoch=len(training_loader), epochs=MAX_EPOCHS,
                          pct_start=0.1, anneal_strategy='linear',
                          div_factor=10, final_div_factor=10000)



  best_model, train_logs, validation_logs = training_funtion(model_base, training_loader, validation_loader, testing_loader, optimizer, scheduler, loss_fn, num_epochs=MAX_EPOCHS)
  print('-'*40, '\n\n')
  display(best_model)
  print('-'*40, '\n\n')
  display(train_logs.T.style.apply(highlight_max, axis=1).apply(highlight_min, axis=1))
  print('-'*40, '\n\n')
  display(validation_logs.T.style.apply(highlight_max, axis=1).apply(highlight_min, axis=1))


  torch.cuda.empty_cache()


  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  best_model.to(device)
  all_predictions = []



  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  best_model.to(device)
  all_predictions = []


  encoded_esg = tokenizer(test_dataset['text'].tolist(), padding=True, truncation=True, return_tensors="pt")

  input_ids = encoded_esg['input_ids'].to(device)
  attention_mask = encoded_esg['attention_mask'].to(device)

  encoded_esg = tokenizer(test_dataset['text'].tolist(), padding=True, return_tensors="pt")
  with torch.no_grad():
      outputs_esg = best_model(input_ids=input_ids, attention_mask=attention_mask)
      predictions = (outputs_esg.cpu().numpy())
      all_predictions.extend(predictions)
      probabilities = 1 / (1 + np.exp(-np.array(all_predictions)))
      all_predictions = np.where(probabilities >= 0.5, 1, 0)


    # Calculate metrics
  all_predictions = np.vstack(all_predictions)
  pred_E = all_predictions[:, 0].tolist()  # First column to list
  pred_S = all_predictions[:, 1].tolist()  # Second column to list
  pred_G = all_predictions[:, 2].tolist()  # Third column to list

  # Extract true labels from the test_dataset DataFrame
  true_E = test_dataset['E'].values
  true_S = test_dataset['S'].values
  true_G = test_dataset['G'].values
  true_labels = np.column_stack((true_E, true_S, true_G))



  # Calculate metrics for each label
  def calculate_metrics(true, pred):
      accuracy = accuracy_score(true, pred)
      f1 = f1_score(true, pred,zero_division=0)
      recall = recall_score(true, pred,zero_division=0)
      precision = precision_score(true, pred,zero_division=0)
      return accuracy, f1, precision, recall

  # Assuming you have true labels similar to how you obtained pred_E, pred_S, pred_G
  metrics_E = calculate_metrics(true_E, pred_E)
  metrics_S = calculate_metrics(true_S, pred_S)
  metrics_G = calculate_metrics(true_G, pred_G)

  # Create a DataFrame to display these metrics
  metrics_df = pd.DataFrame({
      'Label': ['E', 'S', 'G'],
      'Accuracy': [metrics_E[0], metrics_S[0], metrics_G[0]],
      'F1 Score': [metrics_E[1], metrics_S[1], metrics_G[1]],
      'Precision': [metrics_E[2], metrics_S[2], metrics_G[2]],
      'Recall': [metrics_E[3], metrics_S[3], metrics_G[3]],
  })

  # Flatten the arrays to compute micro-average across all labels
  true_labels_flattened = list(true_E) + list(true_S) + list(true_G)
  all_predictions_flattened = all_predictions.flatten()

  # Calculate micro-averaged metrics
  micro_accuracy = accuracy_score(true_labels_flattened, all_predictions_flattened)
  micro_f1 = f1_score(true_labels, all_predictions, average='micro')
  micro_precision = precision_score(true_labels, all_predictions, average='micro')
  micro_recall = recall_score(true_labels, all_predictions, average='micro')

  # Append the micro-averaged metrics to your DataFrame
  micro_avg_row = pd.DataFrame({
      'Label': ['Micro Avg'],
      'Accuracy': [micro_accuracy],
      'F1 Score': [micro_f1],
      'Precision': [micro_precision],
      'Recall': [micro_recall],
  })
  metrics_df_test = pd.concat([micro_avg_row, metrics_df], ignore_index=True)

  display(metrics_df_test)

  return best_model, train_logs, validation_logs

# Fine Tune and Evaluate Different Pre Trained Models

In [38]:
%%time


best_model_base, train_logs_base, validation_logs_base = tokenize_and_fine_tune('bert-base-uncased', train_and_evaluate)


SELECTED MODEL: 	 bert-base-uncased
Tokenization completed


Epoch Progress:   0%|          | 0/50 [00:00<?, ?it/s]

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
Epoch 1 
 Learning Rate: 		9e-06


Epoch 1:   0%|          | 0/88 [00:00<?, ?it/s]

	 TRAINING    avg loss:	 0.4425328570333394
	 EVALUATION  avg loss:	 0.37795377091357585

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
Epoch 2 
 Learning Rate: 		9.185e-06


Epoch 2:   0%|          | 0/88 [00:00<?, ?it/s]

	 TRAINING    avg loss:	 0.33395386910574004
	 EVALUATION  avg loss:	 0.26649340202933863

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
Epoch 3 
 Learning Rate: 		9.369e-06


Epoch 3:   0%|          | 0/88 [00:00<?, ?it/s]

	 TRAINING    avg loss:	 0.22826848094436256
	 EVALUATION  avg loss:	 0.2079857572128898

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
Epoch 4 
 Learning Rate: 		9.554e-06


Epoch 4:   0%|          | 0/88 [00:00<?, ?it/s]

	 TRAINING    avg loss:	 0.16433106328953395
	 EVALUATION  avg loss:	 0.16971781065589503

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
Epoch 5 
 Learning Rate: 		9.738e-06


Epoch 5:   0%|          | 0/88 [00:00<?, ?it/s]

	 TRAINING    avg loss:	 0.11845316606658426
	 EVALUATION  avg loss:	 0.16383375972509384

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
Epoch 6 
 Learning Rate: 		9.923e-06


Epoch 6:   0%|          | 0/88 [00:00<?, ?it/s]

	 TRAINING    avg loss:	 0.08785709946162322
	 EVALUATION  avg loss:	 0.16304620649469526

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
Epoch 7 
 Learning Rate: 		1.0107e-05


Epoch 7:   0%|          | 0/88 [00:00<?, ?it/s]

	 TRAINING    avg loss:	 0.06468549307266419
	 EVALUATION  avg loss:	 0.17304346945724988

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
Epoch 8 
 Learning Rate: 		1.0292e-05


Epoch 8:   0%|          | 0/88 [00:00<?, ?it/s]

	 TRAINING    avg loss:	 0.044130067277530376
	 EVALUATION  avg loss:	 0.19144596198671743

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
Epoch 9 
 Learning Rate: 		1.0476e-05


Epoch 9:   0%|          | 0/88 [00:00<?, ?it/s]

	 TRAINING    avg loss:	 0.036849150751632725
	 EVALUATION  avg loss:	 0.20365376966564278

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
Epoch 10 
 Learning Rate: 		1.0661e-05


Epoch 10:   0%|          | 0/88 [00:00<?, ?it/s]

	 TRAINING    avg loss:	 0.028024807572364807
	 EVALUATION  avg loss:	 0.2023585709302049

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
Epoch 11 
 Learning Rate: 		1.0845e-05


Epoch 11:   0%|          | 0/88 [00:00<?, ?it/s]

	 TRAINING    avg loss:	 0.02285678138617765
	 EVALUATION  avg loss:	 0.19309139957553462

---------------------------------------------------------------------- 
Early stopping triggered. Stopping training at epoch 11

---------------------------------------------------------------------- 
		BEST MODEL: 	EPOCH: 4



Unnamed: 0,Epoch,Loss,LRAP,micro avg Acc,micro avg F1,micro Rec,micro avg Prec,E Acc,E F1,E Rec,E Prec,S Acc,S F1,S Rec,S Prec,G Acc,G F1,G Rec,G Prec
3,4,0.169718,0.993333,0.942222,0.765766,0.867347,0.685484,0.97,0.903226,0.933333,0.875,0.913333,0.580645,0.439024,0.857143,0.943333,0.746269,0.657895,0.862069


---------------------------------------- 




BERT_base(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=

---------------------------------------- 




Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
Epoch,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0
Loss,0.442533,0.333954,0.228268,0.164331,0.118453,0.087857,0.064685,0.04413,0.036849,0.028025,0.022857


---------------------------------------- 




Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
Epoch,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0
Loss,0.377954,0.266493,0.207986,0.169718,0.163834,0.163046,0.173043,0.191446,0.203654,0.202359,0.193091
LRAP,0.928611,0.970556,0.982778,0.993333,0.99,0.989444,0.987778,0.982778,0.979444,0.984444,0.991667
micro avg Acc,0.862222,0.912222,0.941111,0.942222,0.943333,0.944444,0.945556,0.942222,0.936667,0.938889,0.933333
micro avg F1,0.0,0.553672,0.746411,0.765766,0.781116,0.780702,0.780269,0.767857,0.755365,0.746544,0.756098
micro Rec,0.0,0.924528,0.917647,0.867347,0.834862,0.855769,0.878788,0.86,0.807339,0.870968,0.762295
micro avg Prec,0.0,0.395161,0.629032,0.685484,0.733871,0.717742,0.701613,0.693548,0.709677,0.653226,0.75
E Acc,0.85,0.966667,0.97,0.97,0.976667,0.98,0.98,0.976667,0.97,0.98,0.98
E F1,0.0,0.880952,0.894118,0.903226,0.926316,0.93617,0.93617,0.926316,0.907216,0.934783,0.93617
E Rec,0.0,0.822222,0.844444,0.933333,0.977778,0.977778,0.977778,0.977778,0.977778,0.955556,0.977778


Unnamed: 0,Label,Accuracy,F1 Score,Precision,Recall
0,Micro Avg,0.724444,0.798611,0.771812,0.827338
1,E,0.98,0.93617,0.897959,0.977778
2,S,0.94,0.804348,0.860465,0.755102
3,G,0.886667,0.666667,0.596491,0.755556


CPU times: user 8min 16s, sys: 1.6 s, total: 8min 18s
Wall time: 8min 16s


In [39]:
%%time


best_model_Prosus, train_logs_Prosus, validation_logs_Prosus = tokenize_and_fine_tune('ProsusAI/finbert', train_and_evaluate)


SELECTED MODEL: 	 ProsusAI/finbert
Tokenization completed


Epoch Progress:   0%|          | 0/50 [00:00<?, ?it/s]

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
Epoch 1 
 Learning Rate: 		9e-06


Epoch 1:   0%|          | 0/88 [00:00<?, ?it/s]

	 TRAINING    avg loss:	 0.47672574289820413
	 EVALUATION  avg loss:	 0.39477363856215225

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
Epoch 2 
 Learning Rate: 		9.185e-06


Epoch 2:   0%|          | 0/88 [00:00<?, ?it/s]

	 TRAINING    avg loss:	 0.37286718003451824
	 EVALUATION  avg loss:	 0.2952939177814283

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
Epoch 3 
 Learning Rate: 		9.369e-06


Epoch 3:   0%|          | 0/88 [00:00<?, ?it/s]

	 TRAINING    avg loss:	 0.24587001820856874
	 EVALUATION  avg loss:	 0.20823064132740624

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
Epoch 4 
 Learning Rate: 		9.554e-06


Epoch 4:   0%|          | 0/88 [00:00<?, ?it/s]

	 TRAINING    avg loss:	 0.16883564621887423
	 EVALUATION  avg loss:	 0.18297137869031807

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
Epoch 5 
 Learning Rate: 		9.738e-06


Epoch 5:   0%|          | 0/88 [00:00<?, ?it/s]

	 TRAINING    avg loss:	 0.11823060325431553
	 EVALUATION  avg loss:	 0.17105697252248464

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
Epoch 6 
 Learning Rate: 		9.923e-06


Epoch 6:   0%|          | 0/88 [00:00<?, ?it/s]

	 TRAINING    avg loss:	 0.08461126435378735
	 EVALUATION  avg loss:	 0.18134312243445924

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
Epoch 7 
 Learning Rate: 		1.0107e-05


Epoch 7:   0%|          | 0/88 [00:00<?, ?it/s]

	 TRAINING    avg loss:	 0.05844289096156982
	 EVALUATION  avg loss:	 0.1893626271109832

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
Epoch 8 
 Learning Rate: 		1.0292e-05


Epoch 8:   0%|          | 0/88 [00:00<?, ?it/s]

	 TRAINING    avg loss:	 0.043655416356738315
	 EVALUATION  avg loss:	 0.19533660958864188

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
Epoch 9 
 Learning Rate: 		1.0476e-05


Epoch 9:   0%|          | 0/88 [00:00<?, ?it/s]

	 TRAINING    avg loss:	 0.034882103971374985
	 EVALUATION  avg loss:	 0.21094059355949102

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
Epoch 10 
 Learning Rate: 		1.0661e-05


Epoch 10:   0%|          | 0/88 [00:00<?, ?it/s]

	 TRAINING    avg loss:	 0.025876973279413174
	 EVALUATION  avg loss:	 0.21514989749381416

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
Epoch 11 
 Learning Rate: 		1.0845e-05


Epoch 11:   0%|          | 0/88 [00:00<?, ?it/s]

	 TRAINING    avg loss:	 0.020023786039514976
	 EVALUATION  avg loss:	 0.21730871155465903

---------------------------------------------------------------------- 
Early stopping triggered. Stopping training at epoch 11

---------------------------------------------------------------------- 
		BEST MODEL: 	EPOCH: 4



Unnamed: 0,Epoch,Loss,LRAP,micro avg Acc,micro avg F1,micro Rec,micro avg Prec,E Acc,E F1,E Rec,E Prec,S Acc,S F1,S Rec,S Prec,G Acc,G F1,G Rec,G Prec
3,4,0.182971,0.979444,0.942222,0.763636,0.875,0.677419,0.973333,0.914894,0.955556,0.877551,0.913333,0.566667,0.414634,0.894737,0.94,0.727273,0.631579,0.857143


---------------------------------------- 




BERT_base(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=

---------------------------------------- 




Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
Epoch,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0
Loss,0.476726,0.372867,0.24587,0.168836,0.118231,0.084611,0.058443,0.043655,0.034882,0.025877,0.020024


---------------------------------------- 




Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
Epoch,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0
Loss,0.394774,0.295294,0.208231,0.182971,0.171057,0.181343,0.189363,0.195337,0.210941,0.21515,0.217309
LRAP,0.886111,0.978333,0.985,0.979444,0.980556,0.980556,0.985,0.982778,0.983333,0.980556,0.978889
micro avg Acc,0.862222,0.9,0.937778,0.942222,0.94,0.934444,0.936667,0.936667,0.928889,0.932222,0.936667
micro avg F1,0.0,0.43038,0.733333,0.763636,0.767241,0.744589,0.746667,0.753247,0.735537,0.748971,0.755365
micro Rec,0.0,1.0,0.895349,0.875,0.824074,0.803738,0.831683,0.813084,0.754237,0.764706,0.807339
micro avg Prec,0.0,0.274194,0.620968,0.677419,0.717742,0.693548,0.677419,0.701613,0.717742,0.733871,0.709677
E Acc,0.85,0.93,0.976667,0.973333,0.976667,0.973333,0.976667,0.97,0.973333,0.97,0.97
E F1,0.0,0.695652,0.921348,0.914894,0.923077,0.913043,0.923077,0.903226,0.914894,0.903226,0.903226
E Rec,0.0,0.533333,0.911111,0.955556,0.933333,0.933333,0.933333,0.933333,0.955556,0.933333,0.933333


Unnamed: 0,Label,Accuracy,F1 Score,Precision,Recall
0,Micro Avg,0.745556,0.812261,0.868852,0.76259
1,E,0.98,0.933333,0.933333,0.933333
2,S,0.933333,0.767442,0.891892,0.673469
3,G,0.923333,0.729412,0.775,0.688889


CPU times: user 8min 17s, sys: 1.75 s, total: 8min 18s
Wall time: 8min 16s


In [40]:
%%time


best_model_Yi, train_logs_Yi, validation_logs_Yi = tokenize_and_fine_tune('yiyanghkust/finbert-pretrain', train_and_evaluate)


SELECTED MODEL: 	 yiyanghkust/finbert-pretrain
Tokenization completed


Epoch Progress:   0%|          | 0/50 [00:00<?, ?it/s]

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
Epoch 1 
 Learning Rate: 		9e-06


Epoch 1:   0%|          | 0/88 [00:00<?, ?it/s]

	 TRAINING    avg loss:	 0.43279228190129454
	 EVALUATION  avg loss:	 0.30758470924277054

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
Epoch 2 
 Learning Rate: 		9.185e-06


Epoch 2:   0%|          | 0/88 [00:00<?, ?it/s]

	 TRAINING    avg loss:	 0.2468855342065746
	 EVALUATION  avg loss:	 0.19680269101732656

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
Epoch 3 
 Learning Rate: 		9.369e-06


Epoch 3:   0%|          | 0/88 [00:00<?, ?it/s]

	 TRAINING    avg loss:	 0.16176386397670617
	 EVALUATION  avg loss:	 0.18068117452295204

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
Epoch 4 
 Learning Rate: 		9.554e-06


Epoch 4:   0%|          | 0/88 [00:00<?, ?it/s]

	 TRAINING    avg loss:	 0.11995525988326831
	 EVALUATION  avg loss:	 0.16356384871821655

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
Epoch 5 
 Learning Rate: 		9.738e-06


Epoch 5:   0%|          | 0/88 [00:00<?, ?it/s]

	 TRAINING    avg loss:	 0.08674745634198189
	 EVALUATION  avg loss:	 0.1868710429652741

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
Epoch 6 
 Learning Rate: 		9.923e-06


Epoch 6:   0%|          | 0/88 [00:00<?, ?it/s]

	 TRAINING    avg loss:	 0.07103574329944835
	 EVALUATION  avg loss:	 0.18576748139764132

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
Epoch 7 
 Learning Rate: 		1.0107e-05


Epoch 7:   0%|          | 0/88 [00:00<?, ?it/s]

	 TRAINING    avg loss:	 0.04802073247265071
	 EVALUATION  avg loss:	 0.210926211586124

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
Epoch 8 
 Learning Rate: 		1.0292e-05


Epoch 8:   0%|          | 0/88 [00:00<?, ?it/s]

	 TRAINING    avg loss:	 0.03250498477030884
	 EVALUATION  avg loss:	 0.20684017015523032

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
Epoch 9 
 Learning Rate: 		1.0476e-05


Epoch 9:   0%|          | 0/88 [00:00<?, ?it/s]

	 TRAINING    avg loss:	 0.02328217155511745
	 EVALUATION  avg loss:	 0.2096804877449023

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
Epoch 10 
 Learning Rate: 		1.0661e-05


Epoch 10:   0%|          | 0/88 [00:00<?, ?it/s]

	 TRAINING    avg loss:	 0.015737552899570965
	 EVALUATION  avg loss:	 0.22170541278625788

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
Epoch 11 
 Learning Rate: 		1.0845e-05


Epoch 11:   0%|          | 0/88 [00:00<?, ?it/s]

	 TRAINING    avg loss:	 0.012420696074100719
	 EVALUATION  avg loss:	 0.24189422103135208

---------------------------------------------------------------------- 
Early stopping triggered. Stopping training at epoch 11

---------------------------------------------------------------------- 
		BEST MODEL: 	EPOCH: 4



Unnamed: 0,Epoch,Loss,LRAP,micro avg Acc,micro avg F1,micro Rec,micro avg Prec,E Acc,E F1,E Rec,E Prec,S Acc,S F1,S Rec,S Prec,G Acc,G F1,G Rec,G Prec
3,4,0.163564,0.981667,0.942222,0.765766,0.867347,0.685484,0.986667,0.955556,0.955556,0.955556,0.896667,0.523077,0.414634,0.708333,0.943333,0.746269,0.657895,0.862069


---------------------------------------- 




BERT_base(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30873, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=

---------------------------------------- 




Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
Epoch,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0
Loss,0.432792,0.246886,0.161764,0.119955,0.086747,0.071036,0.048021,0.032505,0.023282,0.015738,0.012421


---------------------------------------- 




Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
Epoch,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0
Loss,0.307585,0.196803,0.180681,0.163564,0.186871,0.185767,0.210926,0.20684,0.20968,0.221705,0.241894
LRAP,0.953333,0.988333,0.985556,0.981667,0.986667,0.983333,0.977778,0.983333,0.984444,0.981111,0.977778
micro avg Acc,0.893333,0.934444,0.945556,0.942222,0.943333,0.935556,0.941111,0.941111,0.937778,0.935556,0.936667
micro avg F1,0.376623,0.714976,0.780269,0.765766,0.760563,0.756303,0.757991,0.753488,0.752212,0.743363,0.744395
micro Rec,0.966667,0.891566,0.878788,0.867347,0.910112,0.789474,0.873684,0.89011,0.833333,0.823529,0.838384
micro avg Prec,0.233871,0.596774,0.701613,0.685484,0.653226,0.725806,0.669355,0.653226,0.685484,0.677419,0.669355
E Acc,0.923333,0.976667,0.976667,0.986667,0.98,0.986667,0.983333,0.976667,0.976667,0.976667,0.98
E F1,0.666667,0.921348,0.924731,0.955556,0.931818,0.956522,0.946237,0.921348,0.923077,0.923077,0.933333
E Rec,0.511111,0.911111,0.955556,0.955556,0.911111,0.977778,0.977778,0.911111,0.933333,0.933333,0.933333


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unnamed: 0,Label,Accuracy,F1 Score,Precision,Recall
0,Micro Avg,0.743333,0.798479,0.846774,0.755396
1,E,0.983333,0.947368,0.9,1.0
2,S,0.913333,0.682927,0.848485,0.571429
3,G,0.926667,0.744186,0.780488,0.711111


CPU times: user 8min 16s, sys: 1.78 s, total: 8min 18s
Wall time: 8min 16s


In [41]:
%%time

"""SKIP GRADUAL UNFREEZING TEST AFTER NO IMPROVEMENTS IN MANY TESTS"""
# best_model_base, train_logs_base, validation_logs_base = tokenize_and_fine_tune('bert-base-uncased', train_and_evaluate_GU)


CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.25 µs


'SKIP GRADUAL UNFREEZING TEST AFTER NO IMPROVEMENTS IN MANY TESTS'

# Testing Existing Model yiyanghkust/finbert-esg on multi label classigication

In [42]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.nn.functional import softmax


# Define the size of each split
train_size = 0.7
validation_test_size = 0.5

train_dataset, temp_dataset = train_test_split(df, train_size=train_size, random_state=1984)
val_dataset, test_dataset = train_test_split(temp_dataset, test_size=validation_test_size, random_state=1984)

# Reset the index
train_dataset = train_dataset.reset_index(drop=True)
val_dataset = val_dataset.reset_index(drop=True)
test_dataset = test_dataset.reset_index(drop=True)


tokenizer_finbert_esg = AutoTokenizer.from_pretrained("yiyanghkust/finbert-esg")
model_finbert_esg = AutoModelForSequenceClassification.from_pretrained("yiyanghkust/finbert-esg")

encoded_finbert_esg = tokenizer_finbert_esg(test_dataset['text'].tolist(), padding=True, return_tensors="pt")
with torch.no_grad():
    outputs_finbert_esg = model_finbert_esg(**encoded_finbert_esg)
    probs_finbert_esg = softmax(outputs_finbert_esg.logits, dim=1).numpy()

print(model_finbert_esg.config.id2label)

true_labels = test_dataset[['E', 'S', 'G']].to_numpy()


# Map the index of the highest probability to the correct label
predictions_finbert_esg = np.argmax(probs_finbert_esg, axis=1)  # This will give you the index of the highest probability
# For simplicity, let's assume the indices [None, E, S, G] correspond to [0, 1, 2, 3] respectively

# Convert predictions to match the format of your true_labels
# This step is crucial for comparing using the criteria we discussed
predictions_mapped = np.zeros_like(true_labels)
for i, pred in enumerate(predictions_finbert_esg):
    if pred == 0:  # None
        continue  # All labels remain 0
    else:
        predictions_mapped[i, pred-1] = 1  # Subtract 1 to match the true_labels indexing


from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Loop through each label to calculate metrics
for i, label in enumerate(['E', 'S', 'G']):
    accuracy = accuracy_score(true_labels[:, i], predictions_mapped[:, i])
    recall = recall_score(true_labels[:, i], predictions_mapped[:, i])
    precision = precision_score(true_labels[:, i], predictions_mapped[:, i])
    f1 = f1_score(true_labels[:, i], predictions_mapped[:, i])

    print(f"Label: {label} (FinBERT-ESG)")
    print(f"Accuracy: {accuracy}")
    print(f"Recall: {recall}")
    print(f"Precision: {precision}")
    print(f"F1 Score: {f1}\n")

# Calculate average metrics
average_accuracy = np.mean(accuracy)
average_recall = np.mean(recall)
average_precision = np.mean(precision)
average_f1 = np.mean(f1)

# Print average metrics
print(f"Average Accuracy: {average_accuracy}")
print(f"Average Recall: {average_recall}")
print(f"Average Precision: {average_precision}")
print(f"Average F1 Score: {average_f1}")


{0: 'None', 1: 'Environmental', 2: 'Social', 3: 'Governance'}
Label: E (FinBERT-ESG)
Accuracy: 0.96
Recall: 0.8222222222222222
Precision: 0.9024390243902439
F1 Score: 0.8604651162790697

Label: S (FinBERT-ESG)
Accuracy: 0.8466666666666667
Recall: 0.6938775510204082
Precision: 0.5230769230769231
F1 Score: 0.5964912280701755

Label: G (FinBERT-ESG)
Accuracy: 0.89
Recall: 0.7111111111111111
Precision: 0.6153846153846154
F1 Score: 0.6597938144329897

Average Accuracy: 0.89
Average Recall: 0.7111111111111111
Average Precision: 0.6153846153846154
Average F1 Score: 0.6597938144329897
