# Imports

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import time

from sklearn.metrics import classification_report, accuracy_score, precision_score, f1_score, recall_score, precision_recall_fscore_support, roc_auc_score, confusion_matrix

from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification, DefaultDataCollator
from datasets import Dataset
import torch
from torch.utils.data import DataLoader

from tqdm.auto import tqdm

In [6]:
model_path = "./results_BERT_new/saved_model"
tokenizer_BERT = AutoTokenizer.from_pretrained(model_path)
model_BERT = AutoModelForSequenceClassification.from_pretrained(model_path)

# Data

Let's make an example dataset. We will make a dataframe with 10 rows of random climate related text.

In [4]:
INDC_Text = [
    "Climate change is leading to more frequent and severe weather events.",
    "Rising global temperatures are causing polar ice to melt at an alarming rate.",
    "Deforestation is a major contributor to the increase in greenhouse gases.",
    "Ocean acidification is harming marine ecosystems and biodiversity.",
    "Increased carbon emissions are driving global warming and climate change.",
    "The loss of biodiversity is exacerbated by changing climate conditions.",
    "Extreme weather events, such as hurricanes and wildfires, are becoming more common.",
    "Rising sea levels are threatening coastal communities and habitats.",
    "The climate crisis is impacting food security and agricultural productivity.",
    "Air pollution from burning fossil fuels is a significant health risk and environmental problem."
]

df = pd.DataFrame(INDC_Text, columns=["INDC Text"])

# Predict

Helper function for handling predictions

In [9]:
def transformer_predict(model, tokenizer, df, text_column_name="INDC Text"):
    def tokenize_function(examples):
        return tokenizer(examples[text_column_name], padding="max_length", truncation=True, max_length=512)

    dataset = Dataset.from_pandas(df)
    dataset = dataset.map(tokenize_function, batched=True)

    data_collator = DefaultDataCollator()
    dataloader = DataLoader(dataset, collate_fn=data_collator, batch_size=1)
    
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    
    model.eval()
    predictions = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            batch = {k: v.to(device) for k, v in batch.items() if k in tokenizer.model_input_names}
            outputs = model(**batch)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1)
            predictions.extend(preds.cpu().numpy())
    
    return predictions

In [12]:
output_BERT = transformer_predict(model_BERT, tokenizer_BERT, df)

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

**VERY IMPORTANT** The original fine-tuned BERT that I trained has a specific mapping for each class integer. They do not correspond with the goal number, and I will define the mapping here

In [20]:
mapping_goal_class = {'Goal 7 - Affordable and Clean Energy': 0,
 'Goal 12 - Responsible Consumption and Production ': 1,
 'Goal 4 - Quality Education': 2,
 'Goal 13 - Climate Action': 3,
 'Goal 1 - No Poverty ': 4,
 'Goal 15 - Life on Land': 5,
 'Goal 2 - Zero Hunger': 6,
 'Goal 17 - Partnerships for the Goal': 7,
 'Goal 8 - Decent Work and Economic Growth': 8,
 'Goal 9 - Industry, Innovation and Infrastructure': 9,
 'Goal 11 - Sustainable Cities and Communities ': 10,
 'Goal 3 - Good Health and Well-being': 11,
 'Goal 6 - Clean Water and Sanitation': 12,
 'Goal 14 - Life Below Water': 13,
 'Goal 5 - Gender Equality': 14,
 'Goal 16 - Peace, Justice and Strong Institutions': 15,
 'Goal 10 - Reduced Inequalities': 16,
 'missing_text': 17}

In [19]:
mapping_class_goal = {value: key for key, value in mapping_goal_class.items()}
mapping_class_goal

{0: 'Goal 7 - Affordable and Clean Energy',
 1: 'Goal 12 - Responsible Consumption and Production ',
 2: 'Goal 4 - Quality Education',
 3: 'Goal 13 - Climate Action',
 4: 'Goal 1 - No Poverty ',
 5: 'Goal 15 - Life on Land',
 6: 'Goal 2 - Zero Hunger',
 7: 'Goal 17 - Partnerships for the Goal',
 8: 'Goal 8 - Decent Work and Economic Growth',
 9: 'Goal 9 - Industry, Innovation and Infrastructure',
 10: 'Goal 11 - Sustainable Cities and Communities ',
 11: 'Goal 3 - Good Health and Well-being',
 12: 'Goal 6 - Clean Water and Sanitation',
 13: 'Goal 14 - Life Below Water',
 14: 'Goal 5 - Gender Equality',
 15: 'Goal 16 - Peace, Justice and Strong Institutions',
 16: 'Goal 10 - Reduced Inequalities',
 17: None}

# Eval

In [23]:
output_BERT_mapped = output_BERT.replace(mapping_class_goal)
output_BERT_mapped

eplaced_array = [reversed_mapping[value] for value in original_array]

AttributeError: 'list' object has no attribute 'replace'

In [13]:
df_after = df.copy()
df_after['BERT_Output'] = output_BERT

In [15]:
df_after

Unnamed: 0,INDC Text,BERT_Output
0,Climate change is leading to more frequent and...,11
1,Rising global temperatures are causing polar i...,13
2,Deforestation is a major contributor to the in...,5
3,Ocean acidification is harming marine ecosyste...,13
4,Increased carbon emissions are driving global ...,3
5,The loss of biodiversity is exacerbated by cha...,5
6,"Extreme weather events, such as hurricanes and...",3
7,Rising sea levels are threatening coastal comm...,13
8,The climate crisis is impacting food security ...,6
9,Air pollution from burning fossil fuels is a s...,11
