<a href="https://colab.research.google.com/github/Fabchirajoul/ESG_MODELS/blob/main/ESG_Social.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install transformers



In [None]:
pip install accelerate -U



In [None]:
pip install datasets



## SPLITTING THE DATASET INTO TRAINING (90%) AND VALIDATION (10%)

In [None]:
import random


def _ensure_newline(line):
    """Return ``line`` terminated by a newline so reordered rows never merge."""
    return line if line.endswith("\n") else line + "\n"


def _read_header_and_rows(input_file):
    """Read ``input_file`` and return ``(header_line, data_lines)``.

    NOTE(review): the file is handled line by line, which assumes no quoted
    CSV field contains an embedded newline — confirm for this dataset.

    Raises:
        ValueError: if the file contains no lines at all (no header).
    """
    with open(input_file, 'r', encoding='utf-8') as file:
        lines = [_ensure_newline(line) for line in file]
    if not lines:
        raise ValueError(f"{input_file} is empty; expected a header line")
    return lines[0], lines[1:]


def random_sample_testing(input_file, output_file, sample_percentage=0.1, seed=None):
    """Write the header plus a random sample of the data lines to ``output_file``.

    Kept for backward compatibility. Two independent calls on the same input
    draw overlapping samples, so use ``split_train_validation`` to build a
    train/validation pair.

    Args:
        input_file: path of the source CSV (first line is the header).
        output_file: path of the CSV to write.
        sample_percentage: fraction of data lines to keep, truncated to an int.
        seed: optional seed for a reproducible sample; ``None`` uses the
            module-level ``random`` state.
    """
    header, content = _read_header_and_rows(input_file)
    sample_size = int(len(content) * sample_percentage)

    rng = random if seed is None else random.Random(seed)
    sampled_lines = rng.sample(content, sample_size)

    with open(output_file, 'w', encoding='utf-8') as file:
        file.writelines([header] + sampled_lines)


def split_train_validation(input_file, train_file, validation_file,
                           train_fraction=0.9, seed=42):
    """Split a CSV into disjoint training and validation files.

    The data lines are shuffled once and cut at ``train_fraction``, so no row
    can appear in both outputs. Both outputs start with the header line.

    Args:
        input_file: path of the source CSV (first line is the header).
        train_file: path of the training CSV to write.
        validation_file: path of the validation CSV to write.
        train_fraction: share of data lines that go to the training file.
        seed: seed of the shuffle, so the split is reproducible.

    Returns:
        ``(n_train, n_validation)`` — number of data lines written to each file.

    Raises:
        ValueError: if ``train_fraction`` is outside ``[0, 1]`` or the input
            file is empty.
    """
    if not 0.0 <= train_fraction <= 1.0:
        raise ValueError("train_fraction must be between 0 and 1")

    header, content = _read_header_and_rows(input_file)
    shuffled = list(content)
    random.Random(seed).shuffle(shuffled)

    cut = int(len(shuffled) * train_fraction)
    for path, rows in ((train_file, shuffled[:cut]), (validation_file, shuffled[cut:])):
        with open(path, 'w', encoding='utf-8') as file:
            file.writelines([header] + rows)
    return cut, len(shuffled) - cut


# In a notebook kernel __name__ is "__main__", so this runs when the cell is
# executed; the guard only keeps the helpers importable without touching /content.
if __name__ == "__main__":
    input_document = "/content/social_metric_proper.csv"
    training_output = "/content/training_dataset_social.csv"
    validation_output = "/content/evaluating_dataset_social.csv"
    # 90% training / 10% validation, with no row shared between the two files.
    split_train_validation(input_document, training_output, validation_output,
                           train_fraction=0.9, seed=42)

## IMPORTING THE NECESSARY LIBRARIES

In [None]:
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_scheduler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader
from datasets import load_dataset
import torch

## INVOKING SPLIT DATA AS A DATASET DICTIONARY

In [None]:
# Map each split name to its CSV file, then load both splits into one
# DatasetDict; the bare name on the last line displays it.
data_files = dict(
    train="/content/training_dataset_social.csv",
    validation="/content/evaluating_dataset_social.csv",
)
df_social = load_dataset("csv", data_files=data_files)
df_social

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Company', 'Indicator', 'Context', 'Metric', 'Metric Unit', 'Latest Year'],
        num_rows: 597
    })
    validation: Dataset({
        features: ['Company', 'Indicator', 'Context', 'Metric', 'Metric Unit', 'Latest Year'],
        num_rows: 66
    })
})

## DEFINING THE SOCIAL KEYWORDS AS PER THE UN SUSTAINABLE DEVELOPMENT GOALS

In [None]:
# Keywords used to decide whether an indicator counts as "social".
# NOTE: entries are mixed-case (e.g. "Age" vs "mental well-being"), so any code
# matching against this list must normalise case on both sides.
# NOTE(review): when used as plain substrings, short entries such as "Age" or
# "Gap" can also match inside longer words — confirm that is intended.
social_keywords = [
    "Age",
    "Culture",
    "Race",
    "Access to",
    "Accessibility",
    "Accident",
    "Accountability",
    "Awareness",
    "Behaviour",
    "Charity",
    "Civil",
    "Code of conduct",
    "Community",
    "Consumer protection",
    "Cyber security",
    "Data privacy",
    "Data protection",
    "Data security",
    "Demographic",
    "Disability",
    "Discrimination",
    "Diversity",
    "Donation",
    "Education",
    "Emotion",
    "Employee benefit",
    "Employee development",
    "Employment benefit",
    "Empower",
    "Equal",
    "ESG",
    "Ethics",
    "Ethnic",
    "Fairness",
    "Family",
    "Female",
    "Financial protection",
    "Gap",
    "Gender",
    "Health",
    "Human",
    "Inclusion",
    "Information security",
    "Injury",
    "Leave",
    "LGBT",
    "mental well-being",
    "Parity",
    "Pay equity",
    "Peace",
    "Pension benefit",
    "Philanthropy",
    "Poverty",
    "Privacy",
    "Product quality",
    "Product safety",
    "Promotion",
    "Quality of life",
    "Religion",
    "Respectful",
    "Respecting",
    "Retirement benefit",
    "Safety",
    "Salary",
    "Social",
    "Society",
    "Supply chain transparency",
    "Supportive",
    "Talent",
    "Volunteer",
    "Wage",
    "Welfare",
    "Well-being",
    "Wellbeing",
    "Wellness",
    "Women",
    "Workforce",
    "Working conditions"
]

## LABELLING EACH ROW AS 1 OR 0 BASED ON ITS INDICATOR

In [None]:
def label_social_data(row):
    """Label a row by whether its "Indicator" text mentions a social keyword.

    The match is case-insensitive: the indicator and every entry of
    ``social_keywords`` are lower-cased before the substring test. Comparing
    the capitalised keywords against a lower-cased indicator would miss
    nearly every row.

    Args:
        row: mapping with at least the keys "Indicator", "Metric" and
            "Metric Unit". A missing (None) indicator is treated as empty text.

    Returns:
        dict with keys "input" (the row itself), "label" (1 if any keyword is
        a substring of the indicator, otherwise 0), "Metric" and "Metric Unit".
    """
    indicator_text = str(row["Indicator"] or "").lower()
    label = int(any(keyword.lower() in indicator_text for keyword in social_keywords))
    return {"input": row, "label": label, "Metric": row["Metric"], "Metric Unit": row["Metric Unit"]}

## INVOKING THE TOKENIZER AND PRE-TRAINED MODEL FROM HUGGING FACE

In [None]:
# The tokenizer and the sequence-classification model are loaded from the
# same Hugging Face checkpoint.
social_checkpoint = "ESGBERT/SocRoBERTa-social"
tokenizer_social = AutoTokenizer.from_pretrained(social_checkpoint)
model_social = AutoModelForSequenceClassification.from_pretrained(social_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


## TOKENIZING THE DATA (CONVERTING TEXT INTO TOKEN IDS THE MODEL CAN READ)

In [None]:
def tokenize_social_data(row):
    """Tokenize one dataset row into fixed-length model inputs.

    The four text fields are joined into a single space-separated string,
    the same format ``evaluate_batch_social`` builds at inference time.
    Passing them as four positional arguments would instead fill the
    tokenizer's ``text``, ``text_pair``, ``text_target`` and
    ``text_pair_target`` slots, so "Context" and "Latest Year" would be
    encoded as target labels rather than as model input.

    Args:
        row: mapping with the keys "Company", "Indicator", "Context" and
            "Latest Year".

    Returns:
        dict of the tokenizer outputs with the leading batch dimension
        squeezed away (padded/truncated to 512 tokens).
    """
    text = f'{row["Company"]} {row["Indicator"]} {row["Context"]} {row["Latest Year"]}'
    encoded = tokenizer_social(
        text,
        return_tensors="pt",
        padding="max_length",
        max_length=512,
        truncation=True,
    )

    # The tokenizer returns tensors of shape (1, ...); drop the batch dimension.
    return {key: value.squeeze(0) for key, value in encoded.items()}

## MAPPING THE SOCIAL LABELS WITH THE SOCIAL DATASET DICTIONARY

In [None]:
# Apply label_data function to each row in the dataset
# Label every row, then tokenize the labelled rows.
# The labelled dataset gets its own name: rebinding `label_social_data` would
# overwrite the labelling function and make this cell fail when re-run.
labeled_social_data = df_social.map(label_social_data)
tokenized_social_data = labeled_social_data.map(tokenize_social_data)

# The collator must receive the tokenizer object, not the tokenize function.
data_collator_social = DataCollatorWithPadding(tokenizer=tokenizer_social)
tokenized_social_data.set_format("torch")
# tokenized_social_data .column_names

Map:   0%|          | 0/597 [00:00<?, ? examples/s]

Map:   0%|          | 0/66 [00:00<?, ? examples/s]

Map:   0%|          | 0/597 [00:00<?, ? examples/s]

Map:   0%|          | 0/66 [00:00<?, ? examples/s]

## PREPARING FOR DATALOADER

In [None]:
# Have the dataset return torch tensors when indexed (the mapping cell issues the same call).
tokenized_social_data.set_format("torch")

## DEFINING EPOCH, BATCH SIZE AND SCHEDULER

In [None]:
# Training hyper-parameters.
lr_social = 5e-5
num_epochs_social = 10
batch_size_social = 10
optimizer_social = torch.optim.AdamW(model_social.parameters(), lr=lr_social)

# One optimizer step consumes one batch, so the schedule length is
# (batches per epoch) x (epochs), not (examples) x (epochs).
# Ceiling division keeps the final, partially filled batch.
# NOTE(review): assumes a single device and no gradient accumulation.
steps_per_epoch_social = -(-len(tokenized_social_data["train"]) // batch_size_social)
num_training_steps_social = steps_per_epoch_social * num_epochs_social

# Linear decay without warm-up.
# NOTE(review): the Trainer cell is not given optimizer_social / lr_scheduler,
# so they only take effect if passed to it explicitly — confirm intent.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer_social,
    num_warmup_steps=0,
    num_training_steps=num_training_steps_social
)

## PUSHING MODEL TO DEVICE

In [None]:
# Pick the GPU when one is available, otherwise fall back to the CPU, and
# move the model there.
device_social = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_social.to(device_social)

separator_social = "========================================================================================"
print(separator_social)
print("Our available device which our model is going to be trained on is: \n\n", device_social)
print(separator_social)
# model_social.train() switches the model to training mode and returns it,
# so the print below also shows the architecture.
print("Our Training Model architecture is: \n\n", model_social.train())
print(separator_social)

Our available device which our model is going to be trained on is: 

 cuda
Our Training Model architecture is: 

 RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=2)
      (position_embeddings): Embedding(514, 768, padding_idx=2)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (out

## FINE TUNING

In [None]:
# Step 5: fine-tuning configuration.
# Settings are collected in a dict and unpacked into TrainingArguments.
# Evaluation and checkpointing both happen once per epoch, and the checkpoint
# with the best "accuracy" is reloaded when training ends.
training_kwargs_social = {
    "output_dir": "SOCIAL",
    "overwrite_output_dir": True,
    "num_train_epochs": num_epochs_social,
    "per_device_train_batch_size": batch_size_social,
    "per_device_eval_batch_size": batch_size_social,
    "learning_rate": lr_social,
    "weight_decay": 0.01,
    "logging_dir": "Social_Model_Saved",
    "logging_steps": 100,
    "save_steps": 500,
    "evaluation_strategy": "epoch",
    "save_strategy": "epoch",
    "load_best_model_at_end": True,
    "metric_for_best_model": "accuracy",  # Adjust metric as needed for your task
}
training_args_social = TrainingArguments(**training_kwargs_social)

## DEFINING THE EVALUATION METRICS

In [None]:
def compute_metrics_social(pred_social):
    """Compute the evaluation metrics reported by the Trainer.

    Args:
        pred_social: object exposing ``label_ids`` (true labels) and
            ``predictions`` (per-class scores); the predicted class is the
            arg-max over the last axis.

    Returns:
        dict of plain Python floats with the keys "accuracy", "precision",
        "recall" and "f1". Precision, recall and F1 are macro-averaged, and
        undefined cases (a class that is never predicted) count as 0.
    """
    labels_social = pred_social.label_ids
    preds_social = pred_social.predictions.argmax(-1)

    y_true = labels_social.flatten()
    y_pred = preds_social.flatten()

    # float() accepts both Python floats and numpy scalars, whereas .item()
    # only exists on the latter.
    return {
        "accuracy": float(accuracy_score(labels_social, preds_social)),
        "precision": float(precision_score(y_true, y_pred, average='macro', zero_division=0)),
        "recall": float(recall_score(y_true, y_pred, average='macro', zero_division=0)),
        "f1": float(f1_score(y_true, y_pred, average='macro', zero_division=0)),
    }

## INSTANTIATING THE TRAINER

In [None]:
# Build the Trainer from the model, the training arguments, both dataset
# splits, the tokenizer and the metric function.
trainer = Trainer(
    model=model_social,
    args=training_args_social,
    train_dataset=tokenized_social_data["train"],
    eval_dataset=tokenized_social_data["validation"],
    tokenizer=tokenizer_social,
    compute_metrics=compute_metrics_social
)

# The former `dataloader_config = DataLoaderConfiguration(...)` statement was
# dropped: the import cell does not provide that name (NameError on a fresh
# kernel) and the resulting object was never passed to anything.


In [None]:
# Sanity check before training: show the input_ids shape of the first
# training batch only.
print("Before training")
batch = next(iter(trainer.get_train_dataloader()), None)
if batch is not None:
    print("Shape of inputs:", batch['input_ids'].shape)

Before training
Shape of inputs: torch.Size([10, 512])


In [None]:

# Run fine-tuning; the value returned by train() is displayed as the cell result.
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,Precision,F1
1,No log,0.000851,1.0,1.0,1.0
2,0.141800,0.00013,1.0,1.0,1.0
3,0.141800,6.3e-05,1.0,1.0,1.0
4,0.000200,4.4e-05,1.0,1.0,1.0
5,0.000100,3.4e-05,1.0,1.0,1.0
6,0.000100,2.9e-05,1.0,1.0,1.0
7,0.000100,2.5e-05,1.0,1.0,1.0
8,0.000100,2.3e-05,1.0,1.0,1.0
9,0.000000,2.2e-05,1.0,1.0,1.0
10,0.000000,2.1e-05,1.0,1.0,1.0


Checkpoint destination directory SOCIAL/checkpoint-60 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory SOCIAL/checkpoint-120 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory SOCIAL/checkpoint-180 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory SOCIAL/checkpoint-240 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory SOCIAL/checkpoint-300 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory SOCIAL/checkpoint-360 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory SOCIAL/checkpoint-420 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpo

TrainOutput(global_step=600, training_loss=0.023696231240561853, metrics={'train_runtime': 631.5991, 'train_samples_per_second': 9.452, 'train_steps_per_second': 0.95, 'total_flos': 1570773000499200.0, 'train_loss': 0.023696231240561853, 'epoch': 10.0})

## EVALUATING THE MODEL ON THE ALLOCATED VALIDATION DATASET

In [None]:
# Reload the validation CSV on its own and keep just that split as a Dataset.
validation_data_files_social = {"validation": "/content/evaluating_dataset_social.csv"}
validation_splits_social = load_dataset("csv", data_files=validation_data_files_social)
validation_dataset_social = validation_splits_social["validation"]




def evaluate_batch_social(batch):
    """Classify rows with the fine-tuned model and cross-check them against ``social_keywords``.

    A row counts as "social" only when the model predicts class 1 (the label
    assigned to keyword rows during labelling) AND one of ``social_keywords``
    occurs, case-insensitively, in the row's "Indicator" text.

    Side effect: switches ``model_social`` to evaluation mode so dropout does
    not perturb the predictions.

    Args:
        batch: mapping with the keys "Company", "Indicator", "Context",
            "Latest Year", "Metric" and "Metric Unit". Each value may be a
            scalar (one row) or a list (several rows).

    Returns:
        One dict per row with the keys "Predicted Metric",
        "Predicted Metric unit", "predicted_label_social" and
        "Predicted Keyword" (the first matching keyword, or None).
        NOTE: "predicted_label_social" keeps the notebook's inverted encoding
        (0 = social keyword detected, 1 = not detected) because the display
        cell interprets the value that way.
    """
    # Wrap scalar values so single rows and real batches share one code path.
    batch = {key: value if isinstance(value, list) else [value] for key, value in batch.items()}

    # One space-separated string per row.
    texts = [
        f"{company} {indicator} {context} {latest_year}"
        for company, indicator, context, latest_year in zip(
            batch["Company"], batch["Indicator"], batch["Context"], batch["Latest Year"]
        )
    ]

    encoded = tokenizer_social(
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True
    )
    # Inputs must live on the same device as the model.
    encoded = {key: value.to(device_social) for key, value in encoded.items()}

    model_social.eval()
    with torch.no_grad():  # no gradients needed for inference
        logits = model_social(**encoded).logits
    # The predicted class is the arg-max over the class logits.
    # NOTE(review): assumes a two-logit classification head — confirm.
    predicted_classes = logits.argmax(dim=-1).tolist()

    predicted_metrics_social = []
    for idx, (predicted_class, indicator) in enumerate(zip(predicted_classes, batch["Indicator"])):
        indicator_text = str(indicator or "").lower()
        matched_keyword = next(
            (keyword for keyword in social_keywords if keyword.lower() in indicator_text),
            None,
        )
        is_social = predicted_class == 1 and matched_keyword is not None
        predicted_metrics_social.append({
            "Predicted Metric": batch["Metric"][idx],
            "Predicted Metric unit": batch["Metric Unit"][idx],
            "predicted_label_social": 0 if is_social else 1,
            "Predicted Keyword": matched_keyword if is_social else None,
        })
    return predicted_metrics_social


## DISPLAYING THE RESULTS

In [None]:
# @title Default title text
# Run every validation row through evaluate_batch_social and flatten the
# per-row result lists into one list.
predicted_results_social = [
    row_result
    for row in validation_dataset_social
    for row_result in evaluate_batch_social(row)
]

for result in predicted_results_social:
    # A truthy label prints the "No" message and a falsy one the "Yes" message.
    if result["predicted_label_social"]:
        presence_line = ("Is there a social keyword Present ?:", "0 for No")
    else:
        presence_line = ("Is there a social keyword Present ?::", "1 for Yes")

    print("Metric:", result["Predicted Metric"])
    print("Metric unit:", result["Predicted Metric unit"])
    print(*presence_line)
    print("Keyword:", result["Predicted Keyword"])
    print()  # blank line between results

Metric: 50.0
Metric unit: years
Is there a social keyword Present ?: 0 for No
Keyword: None

Metric: 20.0
Metric unit: years
Is there a social keyword Present ?:: 1 for Yes
Keyword: Community

Metric: 200.0
Metric unit: employees
Is there a social keyword Present ?: 0 for No
Keyword: None

Metric: 4.0
Metric unit: years
Is there a social keyword Present ?: 0 for No
Keyword: None

Metric: 28.0
Metric unit: years
Is there a social keyword Present ?: 0 for No
Keyword: None

Metric: 6.0
Metric unit: Employees
Is there a social keyword Present ?: 0 for No
Keyword: None

Metric: 35.0
Metric unit: employees
Is there a social keyword Present ?: 0 for No
Keyword: None

Metric: 592.0
Metric unit: employees
Is there a social keyword Present ?: 0 for No
Keyword: None

Metric: 2022.0
Metric unit: Employees
Is there a social keyword Present ?: 0 for No
Keyword: None

Metric: 36.0
Metric unit: hours
Is there a social keyword Present ?: 0 for No
Keyword: None

Metric: 69.0
Metric unit: years
Is there 