<a href="https://colab.research.google.com/github/Fabchirajoul/ESG_MODELS/blob/main/ESG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install transformers



In [2]:
pip install datasets



In [3]:
pip install accelerate -U



In [4]:
import random

# For our testing dataset

def random_sample_testing(input_file, output_file, sample_percentage=0.2):
    with open(input_file, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Extract header and content separately
    header = lines[0]
    content = lines[1:]

    num_lines = len(content)
    sample_size = int(num_lines * sample_percentage)

    sampled_lines = random.sample(content, sample_size)

    # Add the "TEXT" header back at the beginning
    sampled_lines = [header] + sampled_lines

    with open(output_file, 'w', encoding='utf-8') as file:
        file.writelines(sampled_lines)

# Example usage
input_document = "/content/governance_metric_proper.csv"
output_sample = "/content/training_dataset_governance.csv"
random_sample_testing(input_document, output_sample, sample_percentage=0.9)



# For our training dataset

def random_sample_testing(input_file, output_file, sample_percentage=0.1):
    with open(input_file, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Extract header and content separately
    header = lines[0]
    content = lines[1:]

    num_lines = len(content)
    sample_size = int(num_lines * sample_percentage)

    sampled_lines = random.sample(content, sample_size)

    # Add the "TEXT" header back at the beginning
    sampled_lines = [header] + sampled_lines

    with open(output_file, 'w', encoding='utf-8') as file:
        file.writelines(sampled_lines)

# Example usage
input_document = "/content/governance_metric_proper.csv"
output_sample = "/content/evaluating_dataset_governance.csv"
random_sample_testing(input_document, output_sample, sample_percentage=0.1)

In [5]:
import random
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_scheduler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader
from datasets import load_dataset
import torch

In [6]:
data_files = {"train": "/content/training_dataset_governance.csv",
              "validation": "/content/evaluating_dataset_governance.csv"}
df_governance = load_dataset("csv", data_files=data_files)
df_governance

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Company', 'Indicator', 'Context', 'Metric', 'Metric Unit', 'Latest Year'],
        num_rows: 1624
    })
    validation: Dataset({
        features: ['Company', 'Indicator', 'Context', 'Metric', 'Metric Unit', 'Latest Year'],
        num_rows: 180
    })
})

In [7]:
governance_keywords = [
"Board of Directors",
"Corporate Governance",
                        "Executive Compensation",
                        "Shareholder Rights",
                        "Audit Committee",
                        "Transparency",
                        "Disclosure",
                        "Ethics",
                        "Anti-Corruption",
                        "Code of Conduct",
                        "Whistleblower Protection",
                        "Risk Management",
                        "Compliance",
                        "Legal Compliance",
                        "Regulatory Compliance",
                        "Data Privacy",
                        "Cybersecurity",
                        "Diversity and Inclusion",
                        "Gender Diversity",
                        "Employee Relations",
                        "Human Rights",
                        "Labor Practices",
                        "Health and Safety",
                        "Supply Chain Management",
                        "Stakeholder Engagement",
                        "Community Relations",
                        "Philanthropy",
                        "Political Contributions",
                        "Lobbying",
                        "Sustainable Development Goals (SDGs)",
                        "Climate Change Governance",
                        "Carbon Footprint",
                        "Energy Efficiency",
                        "Renewable Energy",
                        "Water Management",
                        "Waste Management",
                        "Biodiversity",
                        "Environmental Impact Assessment",
                        "Pollution Control",
                        "Emissions Reduction",
                        "Sustainable Sourcing",
                        "Sustainable Packaging",
                        "Circular Economy",
                        "Product Responsibility",
                        "Fair Competition",
                        "Intellectual Property Rights",
                        "Customer Satisfaction",
                        "Quality Management",
                        "Supply Chain Ethics",
                        "Responsible Investment"

]

In [8]:
def label_data(row):
    label = 0
    for keyword in governance_keywords:
        if keyword in row["Indicator"].lower():
            label = 1
            break
    return {"input": row, "label": label, "Metric": row["Metric"], "Metric Unit": row["Metric Unit"]}


In [9]:
# Step 3: Tokenize the data
tokenizer = AutoTokenizer.from_pretrained("ESGBERT/GovRoBERTa-governance")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [10]:

# Apply label_data function to each row in the dataset
labeled_data = df_governance.map(label_data)
labeled_data


Map:   0%|          | 0/1624 [00:00<?, ? examples/s]

Map:   0%|          | 0/180 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Company', 'Indicator', 'Context', 'Metric', 'Metric Unit', 'Latest Year', 'input', 'label'],
        num_rows: 1624
    })
    validation: Dataset({
        features: ['Company', 'Indicator', 'Context', 'Metric', 'Metric Unit', 'Latest Year', 'input', 'label'],
        num_rows: 180
    })
})

In [11]:
def tokenize_data(row):
    inputs = tokenizer(
        row["Company"],
        row["Indicator"],
        row["Context"],
        str(row["Latest Year"]),
        return_tensors="pt",
        padding="max_length",
        max_length=512,
        truncation=True  # Add truncation
    )

    # Flatten the input tensor
    inputs = {key: value.squeeze(0) for key, value in inputs.items()}

    return inputs


In [12]:
# Apply label_data function to each row in the dataset
labeled_data = df_governance.map(label_data)

# Apply tokenize_data function to each row in the dataset
tokenized_data = labeled_data.map(tokenize_data)


data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
tokenized_data.set_format("torch")
tokenized_data.column_names

Map:   0%|          | 0/1624 [00:00<?, ? examples/s]

Map:   0%|          | 0/180 [00:00<?, ? examples/s]

{'train': ['Company',
  'Indicator',
  'Context',
  'Metric',
  'Metric Unit',
  'Latest Year',
  'input',
  'label',
  'input_ids',
  'attention_mask',
  'labels'],
 'validation': ['Company',
  'Indicator',
  'Context',
  'Metric',
  'Metric Unit',
  'Latest Year',
  'input',
  'label',
  'input_ids',
  'attention_mask',
  'labels']}

In [13]:
tokenized_data.set_format("torch")

In [14]:
# Step 4: Define the model
model_governance = AutoModelForSequenceClassification.from_pretrained("ESGBERT/GovRoBERTa-governance")

In [15]:

# Check the expected input size from the model's configuration
lr=5e-5
num_epochs = 10
batch_size = 10
optimizer = torch.optim.AdamW(model_governance.parameters(), lr=lr)

In [16]:


# Define the scheduler
num_training_steps = len(tokenized_data["train"]) * num_epochs
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

In [17]:
device_governance = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model_governance.to(device_governance)
print("========================================================================================")
print("Our available device which our model is going to be trained on is: \n\n", device_governance)
print("========================================================================================")
# Making sure our model is ready to be  trained
print("Our Training Model architecture is: \n\n", model_governance.train())
print("========================================================================================")

Our available device which our model is going to be trained on is: 

 cuda
Our Training Model architecture is: 

 RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=2)
      (position_embeddings): Embedding(514, 768, padding_idx=2)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (out

In [18]:
# Step 5: Fine-tune the model
training_args = TrainingArguments(
    output_dir="Governance",
    overwrite_output_dir=True,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=lr,
    weight_decay=0.01,
    logging_dir="Governance_Model_Saved",
    logging_steps=100,
    save_steps=500,
    evaluation_strategy="epoch",
    save_strategy= "epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",  # Adjust metric as needed for your task
)

In [19]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    accuracy = accuracy_score(labels, preds)
    # precision = precision_score(labels, preds, average='binary', zero_division=)
    # recall = recall_score(labels, preds, average='binary', zero_division='warn')
    # f1 = f1_score(labels, preds, average='binary', zero_division='warn')
    precision = precision_score(labels.flatten(), preds.flatten(), average='macro', zero_division=0)
    f1 = f1_score(labels.flatten(), preds.flatten(), average='macro')
    return {"accuracy": accuracy.item(), "precision": precision, "f1": f1}

    return {
        'eval_accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }


    print("Type of predictions:", (pred.predictions))
    print("Predictions:", pred.predictions)

In [20]:
# Define Trainer
print("Before creating Trainer instance")
trainer = Trainer(
    model=model_governance,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)
print("Trainer instance created")


Before creating Trainer instance
Trainer instance created


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [21]:
# Inspect the labels
# print(df_governance["train"]["label"][:10])  # Print the first few labels

In [22]:
# Before training
print("Before training")
for batch in trainer.get_train_dataloader():
    print("Shape of inputs:", batch['input_ids'].shape)  # Print shape of input_ids
    break  # Only print the shape of the first batch

Before training
Shape of inputs: torch.Size([10, 512])


In [23]:
# Train the model
print("Before training")
trainer.train()
print("Training completed")

Before training


Epoch,Training Loss,Validation Loss,Accuracy,Precision,F1
1,0.0074,5e-06,1.0,1.0,1.0
2,0.0,2e-06,1.0,1.0,1.0
3,0.0,1e-06,1.0,1.0,1.0
4,0.0,1e-06,1.0,1.0,1.0
5,0.0,1e-06,1.0,1.0,1.0
6,0.0,1e-06,1.0,1.0,1.0
7,0.0,1e-06,1.0,1.0,1.0
8,0.0,0.0,1.0,1.0,1.0
9,0.0,0.0,1.0,1.0,1.0
10,0.0,0.0,1.0,1.0,1.0


Training completed


In [24]:


# Load the validation dataset
# validation_dataset = load_dataset("csv", data_files="/content/evaluating_dataset_governance.csv")["train"]

In [25]:
# Load the validation dataset
validation_data_files = {"validation": "/content/evaluating_dataset_governance.csv"}
validation_dataset = load_dataset("csv", data_files=validation_data_files)["validation"]

Generating validation split: 0 examples [00:00, ? examples/s]

In [26]:
def evaluate_batch(batch):
    batch = {key: [value] if not isinstance(value, list) else value for key, value in batch.items()}  # Ensure batch has consistent format

    # Concatenate input fields into single sequences or text pairs
    inputs = [f"{company} {indicator} {context} {latest_year}" for company, indicator, context, latest_year in zip(batch["Company"], batch["Indicator"], batch["Context"], batch["Latest Year"])]

    inputs = tokenizer(
        inputs,
        return_tensors="pt",
        padding=True,
        truncation=True
    )
    inputs = {key: value.to(device_governance) for key, value in inputs.items()}  # Move inputs to the same device as the model
    with torch.no_grad():  # Ensure gradients are not calculated during inference
        outputs = model_governance(**inputs)
    predicted_labels = torch.sigmoid(outputs.logits) > 0.5  # Apply threshold for binary classification
    predicted_metrics = []
    for idx, (predicted_label, keyword) in enumerate(zip(predicted_labels, batch["Indicator"])):
        # Check each element of predicted_label tensor individually
        if predicted_label[0].item() and any(keyword.lower() in governance_keyword.lower() for governance_keyword in governance_keywords):  # Convert tensor to boolean value
            predicted_metrics.append({"Predicted Metric": batch["Metric"][idx], "Predicted Metric unit": batch["Metric Unit"][idx], "Predicted Keyword": keyword})
        else:
            predicted_metrics.append({"Predicted Metric": batch["Metric"][idx], "Predicted Metric unit": batch["Metric Unit"][idx], "Predicted Keyword": None})
    return predicted_metrics


In [27]:
# Populate predicted_results
predicted_results = []
for batch in validation_dataset:
    batch_results = evaluate_batch(batch)
    predicted_results.extend(batch_results)

for result in predicted_results:
    print("Metric:", result["Predicted Metric"])
    print("Metric unit:", result["Predicted Metric unit"])
    print("Keyword:", result["Predicted Keyword"])
    # Print accuracy, precision, and f1 score if available
    if "accuracy" in result:
        print("Accuracy:", result["accuracy"])
    if "precision" in result:
        print("Precision:", result["precision"])
    if "f1" in result:
        print("F1 score:", result["f1"])
    print()  # Add a blank line for better readability between results


Metric: 6.0
Metric unit: million
Keyword: Board

Metric: 41.8
Metric unit: %
Keyword: Diversity

Metric: 61.0
Metric unit: %
Keyword: Gender

Metric: 7.0
Metric unit: Employees
Keyword: Compliance

Metric: 17.0
Metric unit: %
Keyword: Compliance

Metric: 602.0
Metric unit: Number
Keyword: None

Metric: 12.0
Metric unit: Number
Keyword: None

Metric: 18.0
Metric unit: %
Keyword: Gender

Metric: 7.0
Metric unit: billion
Keyword: Gender

Metric: 0.0
Metric unit: Employees
Keyword: None

Metric: 117.0
Metric unit: Employees
Keyword: None

Metric: 565.0
Metric unit: employees
Keyword: None

Metric: 1.0
Metric unit: million
Keyword: Executive

Metric: 52.0
Metric unit: years
Keyword: Executive

Metric: 197.0
Metric unit: employees
Keyword: None

Metric: 616.0
Metric unit: hours
Keyword: Compliance

Metric: 2.0
Metric unit: years
Keyword: Corporate governance

Metric: 50.0
Metric unit: years
Keyword: None

Metric: 5.0
Metric unit: billion
Keyword: None

Metric: 812.0
Metric unit: employees
Ke