# Training the Sustainability Objective Detection Model

## === Setup ===

### Importing Libraries

In [None]:
import os
import sys
import time
import numpy
import pandas
import plotly.io as pio
import plotly.graph_objects as go
import sklearn.metrics
import sklearn.cluster
import sklearn.ensemble
import sklearn.model_selection
import sklearn.feature_extraction

# Add the relative "../source" directory to the import search path before
# importing the two modules below (presumably project-local modules that
# live in that directory — confirm).
sys.path.append("../source")
import data_preprocessing
import transformer_model

# Set pandas' row, column and column-width display options to None so that
# displayed frames are not truncated.
for _display_option in ("display.max_rows", "display.max_columns", "display.max_colwidth"):
    pandas.set_option(_display_option, None)
# Make "iframe" the default Plotly renderer for figures shown in this notebook.
pio.renderers.default = "iframe"

### Loading the Dataset

In [None]:
# Read the raw dataset and discard every row that has no text block.
df = pandas.read_csv("../datasets/sustainability_goals.csv").dropna(subset=["Text Blocks"])

# Report the table shape and the sum of the "Goal" column, then preview the first rows.
print("Dataset Size:", df.shape)
print("The Number of Goals:", df["Goal"].sum())
df.head()

## === Data Preprocessing ===

In [None]:
# Keywords passed to the filter below as `keep_only_keywords`.
sustainability_keywords = [
    # Environment-related terms
    "green", "environment", "carbon", "footprint", "co2", "emission", "pollution",
    "recycle", "waste", "plant", "energy", "renewable", "water", "electricity",
    # People-related terms
    "diversity", "employee", "women", "female", "human", "inclusion", "health",
    "safety", "security",
    # Currently disabled: "goal", "sustainable", "zero", "right"
]

data_preprocessor = data_preprocessing.DataPreprocessing()

# Clean the text blocks first, then filter them by size range and keyword list.
df = data_preprocessor.clean_text_blocks(df, "Text Blocks", level="minimal")
df = data_preprocessor.filter_text_blocks(
    df,
    "Text Blocks",
    keep_only_size=(0, 300),
    keep_only_keywords=sustainability_keywords,
)

print("Dataset Size:", df.shape)
df.head()

In [None]:
# Length (via len) of every text block whose "Goal" value is not null.
tb_len = df.loc[df["Goal"].notnull(), "Text Blocks"].map(len)

# Plot the distribution of those lengths as a histogram.
fig = go.Figure()
fig.add_trace(go.Histogram(x=tb_len, nbinsx=1000))
fig.show()

## === Splitting the Dataset ===

In [None]:
# df["labels"] = df["Goal"].notnull().apply(lambda x: 1 if x else 0)

# Keep only the text and label columns, renamed to "text" / "labels",
# with a single row per distinct text.
df = (
    df.rename(columns={"Text Blocks": "text", "Goal": "labels"})[["text", "labels"]]
    .drop_duplicates(subset=["text"])
)

# Hold out 20% of the rows for testing, stratified on the label column,
# with a fixed random_state so the split is repeatable.
df_train, df_test = sklearn.model_selection.train_test_split(
    df, test_size=0.2, stratify=df["labels"], random_state=7
)

# Report the size and label distribution of each split.
print("Train Set Size:", df_train.shape)
print(df_train["labels"].value_counts())
print("Test Set Size:", df_test.shape)
print(df_test["labels"].value_counts())

## === Training and Testing the Model ===

In [None]:
# Configure the model wrapper for the "climatebert/environmental-claims" checkpoint.
# NOTE(review): save=False here, yet the error-analysis cell loads from
# "../models/climatebert/environmental-claims" — confirm a saved model already exists there.
model = transformer_model.TransformerModel(
    name="climatebert/environmental-claims",
    epochs=3,
    learning_rate=1e-5,
    batch_size=16,
    weight_decay=0.01,
    save=False,
    save_to="../models",
)

# Fit on the training split; the test split is passed as the second argument
# (presumably used for evaluation — confirm in TransformerModel.fit).
model.fit(df_train, df_test)

## === Analyzing the Model Error ===

In [None]:
# Rebuild the model wrapper from the saved model directory and obtain its inference pipeline.
model = transformer_model.TransformerModel(name="climatebert/environmental-claims", load_from="../models/climatebert/environmental-claims")
pipe = model.load_pipeline()

# Run the pipeline over every test text.
predictions = pipe(df_test["text"].tolist())

# df_test comes out of train_test_split as a selection of the full frame.
# Take an explicit copy before adding a column so the assignment does not
# raise SettingWithCopyWarning or depend on pandas' copy/view behaviour.
df_test = df_test.copy()

# assumes each prediction is a sequence of per-class results with the goal
# class at index 1 — TODO confirm against TransformerModel.load_pipeline
df_test["Goal Score"] = [prediction[1]["score"] for prediction in predictions]

# Show the 20 test texts with the highest goal score.
df_test.sort_values("Goal Score", ascending=False).head(20)