# Imports

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

import time

from sklearn.metrics import classification_report, accuracy_score, precision_score, f1_score, recall_score, precision_recall_fscore_support, roc_auc_score, confusion_matrix

In [5]:
df = pd.read_csv("CW_NDC_SDG_linkages.csv")

# Preprocess

In [10]:
from datasets import Dataset
from transformers import Trainer, TrainingArguments
from transformers import EvalPrediction

import torch
import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
df_tf = df.copy()

In [12]:
df_tf = df_tf.drop(columns=['Country', 'ISO_code3', 'Document_slug', 'Status', 'Sector', 'Climate_response', 'Type_of_information', 'Unnamed: 10', 'Target'])

### **Class integer not mapped directly to goal number

In [14]:
mapping_goal = {value: index for index, value in enumerate(df_tf['Goal'].unique())}
mapping_goal

{'Goal 7 - Affordable and Clean Energy': 0,
 'Goal 12 - Responsible Consumption and Production ': 1,
 'Goal 4 - Quality Education': 2,
 'Goal 13 - Climate Action': 3,
 'Goal 1 - No Poverty ': 4,
 'Goal 15 - Life on Land': 5,
 'Goal 2 - Zero Hunger': 6,
 'Goal 17 - Partnerships for the Goal': 7,
 'Goal 8 - Decent Work and Economic Growth': 8,
 'Goal 9 - Industry, Innovation and Infrastructure': 9,
 'Goal 11 - Sustainable Cities and Communities ': 10,
 'Goal 3 - Good Health and Well-being': 11,
 'Goal 6 - Clean Water and Sanitation': 12,
 'Goal 14 - Life Below Water': 13,
 'Goal 5 - Gender Equality': 14,
 'Goal 16 - Peace, Justice and Strong Institutions': 15,
 'Goal 10 - Reduced Inequalities': 16,
 nan: 17}

In [15]:
df_tf['Goal'] = df_tf['Goal'].map(mapping_goal)

In [17]:
df_tf.rename(columns={'Goal': 'label'}, inplace=True)

In [18]:
df_tf

Unnamed: 0,label,INDC Text
0,0,Promoting economic development and sustainable...
1,1,Promoting economic development and sustainable...
2,2,Raising awareness for people of Afghanistan on...
3,1,Raising awareness for people of Afghanistan on...
4,3,Raising awareness for people of Afghanistan on...
...,...,...
24507,10,"In 2023, the Infrastructure Sector Adaptation ..."
24508,10,"In 2025, Municipal Plans for Territorial Organ..."
24509,10,Expansion and adaptation of the road network (...
24510,10,Adaptation to climate change in urban environm...


# Metrics

In [19]:
def compute_metrics(pred: EvalPrediction):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    
    accuracy = accuracy_score(labels, preds)
    
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted') # Feel free to change to macro, micro, etc.
    
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

In [20]:
def tokenize_function(examples):
    return tokenizer(examples["INDC Text"], padding="max_length", truncation=True)

# Train

In [48]:
from transformers import BertTokenizer, BertForSequenceClassification

In [22]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [50]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=18) # len(mapping_goal)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [51]:
dataset = Dataset.from_pandas(df_tf[['INDC Text', 'label']])

split_datasets = dataset.train_test_split(test_size=0.2)
train_dataset = split_datasets['train'].map(tokenize_function, batched=True)
eval_dataset = split_datasets['test'].map(tokenize_function, batched=True)

train_dataset.set_format('torch')
eval_dataset.set_format('torch')

Map: 100%|██████████| 19609/19609 [00:11<00:00, 1746.85 examples/s]
Map: 100%|██████████| 4903/4903 [00:02<00:00, 1784.20 examples/s]


In [55]:
training_args = TrainingArguments(
    output_dir="./results_BERT_new",
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs_BERT_new",
    logging_steps=10,
    evaluation_strategy="epoch",  # Metrics at end of epoch
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics
)

time_start = time.time()
trainer.train()

time_end = time.time()
print(f"Duration: {time_end - time_start} seconds") 

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0574,1.290137,0.64328,0.641283,0.64328,0.635259
2,0.6476,1.1648,0.633082,0.641085,0.633082,0.627233
3,0.977,1.105707,0.656129,0.656728,0.656129,0.653094
4,0.7929,1.138387,0.654701,0.6529,0.654701,0.651073
5,0.4933,1.196001,0.654089,0.653321,0.654089,0.649945
6,0.6189,1.248963,0.649398,0.653585,0.649398,0.64709
7,0.5033,1.255291,0.649194,0.650419,0.649194,0.644006
8,0.5919,1.251025,0.644707,0.645202,0.644707,0.641719
9,0.5699,1.302709,0.646951,0.644352,0.646951,0.644399
10,0.3562,1.384808,0.643688,0.639736,0.643688,0.640528


  _warn_prf(average, modifier, msg_start, len(result))


Duration: 5018.279499292374 seconds


In [56]:
trainer.evaluate(eval_dataset)

{'eval_loss': 1.3848077058792114,
 'eval_accuracy': 0.6436875382418927,
 'eval_precision': 0.6397364264545716,
 'eval_recall': 0.6436875382418927,
 'eval_f1': 0.640527805226724,
 'eval_runtime': 35.7051,
 'eval_samples_per_second': 137.319,
 'eval_steps_per_second': 17.168,
 'epoch': 10.0}

In [57]:
model_path = "./results_BERT_new/saved_model"
tokenizer.save_pretrained(model_path)
model.save_pretrained(model_path)