<a href="https://colab.research.google.com/github/Fabchirajoul/ESG_MODELS/blob/main/ESG_Governance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## INSTALLING THE LIBRARIES

In [1]:
pip install transformers



In [2]:
pip install accelerate -U



In [3]:
pip install datasets



In [4]:
import random

# For our testing dataset

def random_sample_testing(input_file, output_file, sample_percentage=0.85):
    with open(input_file, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Extract header and content separately
    header = lines[0]
    content = lines[1:]

    num_lines = len(content)
    sample_size = int(num_lines * sample_percentage)

    sampled_lines = random.sample(content, sample_size)

    # Add the "TEXT" header back at the beginning
    sampled_lines = [header] + sampled_lines

    with open(output_file, 'w', encoding='utf-8') as file:
        file.writelines(sampled_lines)

# Example usage
input_document = "/content/governance_metric_proper.csv"
output_sample = "/content/training_dataset_governance.csv"
random_sample_testing(input_document, output_sample, sample_percentage=0.85)



# For our training dataset

def random_sample_testing(input_file, output_file, sample_percentage=0.1):
    with open(input_file, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Extract header and content separately
    header = lines[0]
    content = lines[1:]

    num_lines = len(content)
    sample_size = int(num_lines * sample_percentage)

    sampled_lines = random.sample(content, sample_size)

    # Add the "TEXT" header back at the beginning
    sampled_lines = [header] + sampled_lines

    with open(output_file, 'w', encoding='utf-8') as file:
        file.writelines(sampled_lines)

# Example usage
input_document = "/content/governance_metric_proper.csv"
output_sample = "/content/evaluating_dataset_governance.csv"
random_sample_testing(input_document, output_sample, sample_percentage=0.1)


# For our testing dataset

def random_sample_testing(input_file, output_file, sample_percentage=0.05):
    with open(input_file, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Extract header and content separately
    header = lines[0]
    content = lines[1:]

    num_lines = len(content)
    sample_size = int(num_lines * sample_percentage)

    sampled_lines = random.sample(content, sample_size)

    # Add the "TEXT" header back at the beginning
    sampled_lines = [header] + sampled_lines

    with open(output_file, 'w', encoding='utf-8') as file:
        file.writelines(sampled_lines)

# Example usage
input_document = "/content/governance_metric_proper.csv"
output_sample = "/content/testing_dataset_goevrnment.csv"
random_sample_testing(input_document, output_sample, sample_percentage=0.05)

## IMPORTING THE NECESSARY LIBARIES

In [21]:
import random
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_scheduler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
from datasets import load_dataset
import torch

## INVOKING SPLIT DATA AS A DATASET DICTIONARY

In [6]:
data_files = {"train": "/content/training_dataset_governance.csv",
              "validation": "/content/evaluating_dataset_governance.csv",
              "testing": "/content/testing_dataset_goevrnment.csv"}
df_governantal = load_dataset("csv", data_files=data_files)
df_governantal

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating testing split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Company', 'Indicator', 'Context', 'Metric', 'Metric Unit', 'Latest Year'],
        num_rows: 1534
    })
    validation: Dataset({
        features: ['Company', 'Indicator', 'Context', 'Metric', 'Metric Unit', 'Latest Year'],
        num_rows: 180
    })
    testing: Dataset({
        features: ['Company', 'Indicator', 'Context', 'Metric', 'Metric Unit', 'Latest Year'],
        num_rows: 90
    })
})

## DEFINING THE SOCIAL KEYWORDS AS PER THE UN SUSTAINABLE DEVELOPMENT GOALS

In [7]:
governance_keywords = [
"Board of Directors",
"Corporate Governance",
                        "Executive Compensation",
                        "Shareholder Rights",
                        "Audit Committee",
                        "Transparency",
                        "Disclosure",
                        "Ethics",
                        "Anti-Corruption",
                        "Code of Conduct",
                        "Whistleblower Protection",
                        "Risk Management",
                        "Compliance",
                        "Legal Compliance",
                        "Regulatory Compliance",
                        "Data Privacy",
                        "Cybersecurity",
                        "Diversity and Inclusion",
                        "Gender Diversity",
                        "Employee Relations",
                        "Human Rights",
                        "Labor Practices",
                        "Health and Safety",
                        "Supply Chain Management",
                        "Stakeholder Engagement",
                        "Community Relations",
                        "Philanthropy",
                        "Political Contributions",
                        "Lobbying",
                        "Sustainable Development Goals (SDGs)",
                        "Climate Change Governance",
                        "Carbon Footprint",
                        "Energy Efficiency",
                        "Renewable Energy",
                        "Water Management",
                        "Waste Management",
                        "Biodiversity",
                        "Environmental Impact Assessment",
                        "Pollution Control",
                        "Emissions Reduction",
                        "Sustainable Sourcing",
                        "Sustainable Packaging",
                        "Circular Economy",
                        "Product Responsibility",
                        "Fair Competition",
                        "Intellectual Property Rights",
                        "Customer Satisfaction",
                        "Quality Management",
                        "Supply Chain Ethics",
                        "Responsible Investment"

]

## CLASSIFYING THE INDICATOR PREDICTED ROW AS 1 OR 0

In [8]:
def label_government_data(row):
    label = 0
    for keyword in governance_keywords:
        if keyword in row["Indicator"].lower():
            label = 1
            break
    return {"input": row, "label": label, "Metric": row["Metric"], "Metric Unit": row["Metric Unit"]}


## INVOKING THE TOKENIZER AND PRE-TRAINED MODEL FROM HUGGING FACE

In [15]:
checkpoint_government = "ESGBERT/GovRoBERTa-governance"
tokenizer_government = AutoTokenizer.from_pretrained(checkpoint_government)

model_government = AutoModelForSequenceClassification.from_pretrained(checkpoint_government)

## TOKENIZING THE DATA (CONVERTING TO MACHINE UNDERSTANDING LANGUAGE)

In [10]:
def tokenize_government_data(row):
    inputs_government = tokenizer_government(
        row["Company"],
        row["Indicator"],
        row["Context"],
        str(row["Latest Year"]),
        return_tensors="pt",
        padding="max_length",
        max_length=512,
        truncation=True  # Add truncation
    )

    # Flatten the input tensor
    inputs_government = {key: value.squeeze(0) for key, value in inputs_government.items()}

    return inputs_government

## MAPPING THE GOVERNMENT LABELS WITH THE GOVERNMENT DATASET DICTIONARY

In [11]:
# Apply label_data function to each row in the dataset
label_government_data =df_governantal.map(label_government_data)

# Apply tokenize_data function to each row in the dataset
tokenized_government_data = label_government_data.map(tokenize_government_data)


data_collator_environmental = DataCollatorWithPadding(tokenizer=tokenize_government_data)

Map:   0%|          | 0/1534 [00:00<?, ? examples/s]

Map:   0%|          | 0/180 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/1534 [00:00<?, ? examples/s]

Map:   0%|          | 0/180 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

## PREPARING FOR DATALOADER

In [13]:
tokenized_government_data.set_format("torch")

## DEFINING EPOCH, BATCH SIZE AND SCHEDULER

In [16]:
# Check the expected input size from the model's configuration
lr_government=5e-5
num_epochs_government = 10
batch_size_government = 10
optimizer_government = torch.optim.AdamW(model_government.parameters(), lr=lr_government)



# Define the scheduler
num_training_steps_government = len(tokenized_government_data["train"]) * num_epochs_government
lr_scheduler_government = get_scheduler(
    "linear",
    optimizer=optimizer_government,
    num_warmup_steps=0,
    num_training_steps=num_training_steps_government
)

## PUSHING MODEL TO DEVICE

In [18]:
device_government = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model_government.to(device_government)
print("========================================================================================")
print("Our available device which our model is going to be trained on is: \n\n", device_government)
print("========================================================================================")
# Making sure our model is ready to be  trained
print("Our Training Model architecture is: \n\n", model_government.train())
print("========================================================================================")

Our available device which our model is going to be trained on is: 

 cuda
Our Training Model architecture is: 

 RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=2)
      (position_embeddings): Embedding(514, 768, padding_idx=2)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (out

In [27]:
training_args_government = TrainingArguments(
    accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True},
    adafactor=False,
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-08,
    auto_find_batch_size=False,
    bf16=False,
    bf16_full_eval=False,
    data_seed=None,
    dataloader_drop_last=False,
    dataloader_num_workers=0,
    dataloader_persistent_workers=False,
    dataloader_pin_memory=True,
    dataloader_prefetch_factor=None,
    ddp_backend=None,
    ddp_broadcast_buffers=None,
    ddp_bucket_cap_mb=None,
    ddp_find_unused_parameters=None,
    ddp_timeout=1800,
    debug=[],
    deepspeed=None,
    disable_tqdm=False,
    dispatch_batches=None,
    do_eval=True,
    do_predict=False,
    do_train=False,
    eval_accumulation_steps=None,
    eval_delay=0,
    eval_steps=None,
    evaluation_strategy="epoch",
    fp16=False,
    fp16_backend="auto",
    fp16_full_eval=False,
    fp16_opt_level="O1",
    fsdp=[],
    fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},
    fsdp_min_num_params=0,
    fsdp_transformer_layer_cls_to_wrap=None,
    full_determinism=False,
    gradient_accumulation_steps=1,
    gradient_checkpointing=False,
    gradient_checkpointing_kwargs=None,
    greater_is_better=True,
    group_by_length=False,
    half_precision_backend="auto",
    hub_always_push=False,
    hub_model_id=None,
    hub_private_repo=False,
    hub_strategy="every_save",
    hub_token="YOUR_HUB_TOKEN_HERE",
    ignore_data_skip=False,
    include_inputs_for_metrics=False,
    include_num_input_tokens_seen=False,
    include_tokens_per_second=False,
    jit_mode_eval=False,
    label_names=None,  # Remove or set to None
    label_smoothing_factor=0.0,
    learning_rate=lr_government,
    length_column_name="length",
    load_best_model_at_end=True,
    local_rank=0,
    log_level="passive",
    log_level_replica="warning",
    logging_dir="GOVERNANCE_Model_Saved",
    logging_first_step=False,
    logging_nan_inf_filter=True,
    logging_steps=100,
    logging_strategy="steps",
    lr_scheduler_kwargs={},
    lr_scheduler_type="linear",
    max_grad_norm=1.0,
    max_steps=-1,
    metric_for_best_model="accuracy",
    mp_parameters="",
    neftune_noise_alpha=None,
    no_cuda=False,
    num_train_epochs=num_epochs_government,
    optim="adamw_torch",
    optim_args=None,
    output_dir="GOVERNANCE",
    overwrite_output_dir=True,
    past_index=-1,
    per_device_eval_batch_size=batch_size_government,
    per_device_train_batch_size=batch_size_government,
    prediction_loss_only=False,
    push_to_hub=False,
    push_to_hub_model_id=None,
    push_to_hub_organization=None,
    # push_to_hub_token="PUSH_TO_HUB_TOKEN",
    ray_scope="last",
    remove_unused_columns=True,
    report_to=['tensorboard'],
    resume_from_checkpoint=None,
    run_name="GOVERNANCE",
    save_on_each_node=False,
    save_only_model=False,
    save_safetensors=True,
    save_steps=500,
    save_strategy="epoch",
    save_total_limit=None,
    seed=42,
    skip_memory_metrics=True,
    split_batches=None,
    tf32=None,
    torch_compile=False,
    torch_compile_backend=None,
    torch_compile_mode=None,
    torchdynamo=None,
    tpu_metrics_debug=False,
    tpu_num_cores=None,
    use_cpu=False,
    use_ipex=False,
    use_legacy_prediction_loop=False,
    use_mps_device=False,
    warmup_ratio=0.0,
    warmup_steps=0,
    weight_decay=0.01,
)


## DEFINING THE LOSS FUNCTION

In [23]:
def compute_metrics_government(pred_government):
    labels_government = pred_government.label_ids
    preds_government = pred_government.predictions.argmax(-1)

    accuracy_government = accuracy_score(labels_government, preds_government)
    precision_government = precision_score(labels_government.flatten(), preds_government.flatten(), average='macro', zero_division=0)
    f1_government = f1_score(labels_government.flatten(), preds_government.flatten(), average='macro')
    return {"accuracy": accuracy_government.item(), "precision": precision_government, "f1": f1_government}

    return {
        'eval_accuracy': accuracy_government,
        'precision': precision_government,
        'f1': f1_government
    }

## INSTANTIATING THE TRAINER

In [34]:

trainer_government = Trainer(
    model=model_government,
    args=training_args_government,
    train_dataset=tokenized_government_data["train"],
    eval_dataset=tokenized_government_data["validation"],
    tokenizer=tokenizer_government,
    compute_metrics=compute_metrics_government
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


## CHECKING THE SHAPE OF THE BATCH INPUT TO THE MODEL

In [36]:
# Before training
print("Before training")
for batch in trainer_government.get_train_dataloader():
    print("Shape of inputs:", batch['input_ids'].shape)  # Print shape of input_ids
    break  # Only print the shape of the first batch

Before training
Shape of inputs: torch.Size([10, 512])


## TRAINING THE GOVERNMENT MODEL

In [37]:
# @title Default title text
trainer_government.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,F1
1,0.0003,1e-06,1.0,1.0,1.0
2,0.0,0.0,1.0,1.0,1.0
3,0.0,0.0,1.0,1.0,1.0
4,0.0,0.0,1.0,1.0,1.0
5,0.0,0.0,1.0,1.0,1.0
6,0.0,0.0,1.0,1.0,1.0
7,0.0,0.0,1.0,1.0,1.0
8,0.0,0.0,1.0,1.0,1.0
9,0.0,0.0,1.0,1.0,1.0
10,0.0,0.0,1.0,1.0,1.0


TrainOutput(global_step=1540, training_loss=1.9273316196596206e-05, metrics={'train_runtime': 1426.9753, 'train_samples_per_second': 10.75, 'train_steps_per_second': 1.079, 'total_flos': 4036123589222400.0, 'train_loss': 1.9273316196596206e-05, 'epoch': 10.0})

## VALIDATING THE MODEL ON THE ALLOCATED VAIDATION DATASET

In [38]:
# Load the validation dataset
validation_data_files = {"validation": "/content/evaluating_dataset_governance.csv"}
validation_dataset_government = load_dataset("csv", data_files=validation_data_files)["validation"]

Generating validation split: 0 examples [00:00, ? examples/s]

In [39]:
def evaluate_batch(batch):
    batch = {key: [value] if not isinstance(value, list) else value for key, value in batch.items()}  # Ensure batch has consistent format

    # Concatenate input fields into single sequences or text pairs
    inputs_government = [f"{company} {indicator} {context} {latest_year}" for company, indicator, context, latest_year in zip(batch["Company"], batch["Indicator"], batch["Context"], batch["Latest Year"])]

    inputs_government = tokenizer_government(
        inputs_government,
        return_tensors="pt",
        padding=True,
        truncation=True
    )
    inputs_government = {key: value.to(device_government) for key, value in inputs_government.items()}  # Move inputs to the same device as the model
    with torch.no_grad():  # Ensure gradients are not calculated during inference
        outputs_government = model_government(**inputs_government)
    predicted_labels_government = torch.sigmoid(outputs_government.logits) > 0.5  # Apply threshold for binary classification
    predicted_metrics_government = []
    for idx, (predicted_label_government, keyword) in enumerate(zip(predicted_labels_government, batch["Indicator"])):
        # Check each element of predicted_label tensor individually
        if predicted_label_government[0].item() and any(keyword.lower() in governance_keyword.lower() for governance_keyword in governance_keywords):  # Convert tensor to boolean value
            predicted_metrics_government.append({"Predicted Metric": batch["Metric"][idx], "Predicted Metric unit": batch["Metric Unit"][idx], "Predicted Keyword": keyword})
        else:
            predicted_metrics_government.append({"Predicted Metric": batch["Metric"][idx], "Predicted Metric unit": batch["Metric Unit"][idx], "Predicted Keyword": None})
    return predicted_metrics_government


In [40]:
# Populate predicted_results
predicted_results_government = []
for batch in validation_dataset_government:
    batch_results = evaluate_batch(batch)
    predicted_results_government.extend(batch_results)

for result in predicted_results_government:
    print("Metric:", result["Predicted Metric"])
    print("Metric unit:", result["Predicted Metric unit"])
    print("Keyword:", result["Predicted Keyword"])
    # Print accuracy, precision, and f1 score if available
    if "accuracy" in result:
        print("Accuracy:", result["accuracy"])
    if "precision" in result:
        print("Precision:", result["precision"])
    if "f1" in result:
        print("F1 score:", result["f1"])
    print()  # Add a blank line for better readability between results


Metric: 97.0
Metric unit: %
Keyword: None

Metric: 5.0
Metric unit: billion
Keyword: Gender

Metric: 12.8
Metric unit: %
Keyword: Board

Metric: 3.0
Metric unit: million
Keyword: None

Metric: 2.8
Metric unit: years
Keyword: Gender

Metric: 8.0
Metric unit: billion
Keyword: Compliance

Metric: 40.0
Metric unit: years
Keyword: Executive

Metric: 25.0
Metric unit: years
Keyword: Executive

Metric: 7.0
Metric unit: billion
Keyword: None

Metric: 100.0
Metric unit: %
Keyword: None

Metric: 6124.0
Metric unit: %
Keyword: Executive

Metric: 1.0
Metric unit: Rand
Keyword: Compliance

Metric: 447.0
Metric unit: employees
Keyword: None

Metric: 7.0
Metric unit: %
Keyword: Compliance

Metric: 0.0
Metric unit: employees
Keyword: None

Metric: 87.0
Metric unit: Rand
Keyword: None

Metric: 49.0
Metric unit: %
Keyword: None

Metric: 63.0
Metric unit: hours
Keyword: None

Metric: 3.0
Metric unit: million
Keyword: None

Metric: 1.0
Metric unit: billion
Keyword: Board

Metric: 63.0
Metric unit: %
Keywo