# Fine-Tuning the Sustainability Goal Detection Model

## === Setup ===

### Importing Libraries

In [None]:
import sys
import pandas
import plotly.io as pio
import plotly.graph_objects as go
import sklearn.model_selection

sys.path.append("../source")
import data_preprocessing
import transformer_model

# Lift pandas' display limits (rows, columns, column width) by setting each
# option to None, so DataFrames are rendered without truncation.
for display_option in ("display.max_rows", "display.max_columns", "display.max_colwidth"):
    pandas.set_option(display_option, None)

# Use plotly's "iframe" renderer as the default for every figure shown below.
pio.renderers.default = "iframe"

### Loading the Dataset

In [None]:
# Read the raw dataset, discard rows that lack either the text or the label,
# and keep a single row per distinct text block.
df = (
    pandas.read_csv("../datasets/sustainability_goals.csv", low_memory=False)
    .dropna(subset=["Text Blocks", "Goal"])
    .drop_duplicates(subset=["Text Blocks"])
)

print("Dataset Size:", df.shape)
# NOTE(review): summing the column only counts goals if "Goal" is a 0/1 flag —
# confirm against the CSV.
print("The Number of Goals:", df["Goal"].sum())
df.head()

## === Data Preprocessing ===

In [None]:
# Class names handed to the classifier; index 0 is "Not Goal", index 1 is "Goal".
target_values = ["Not Goal", "Goal"]

# Work on copies of the raw columns under the names "text" and "labels".
df["text"] = df["Text Blocks"].copy()
df["labels"] = df["Goal"].copy()

# Keywords passed to the text-block filter below, grouped by theme.
sustainability_keywords = [
    # Environment
    "green", "environment", "carbon", "footprint", "co2", "emission", "pollution",
    "recycle", "waste", "plant", "energy", "renewable", "water", "electricity",
    # People
    "diversity", "employee", "women", "female", "human", "inclusion", "health",
    "safety", "security",
    # Currently disabled keywords, kept for experimentation:
    # "goal", "sustainable", "zero", "right"
]

data_preprocessor = data_preprocessing.DataPreprocessing()
# Optional cleaning step, currently disabled:
# df = data_preprocessor.clean_text_blocks(df, "text", level="essential")
# NOTE(review): presumably keeps only text blocks whose size falls within
# (0, 300) and that contain one of the keywords — confirm in data_preprocessing.
df = data_preprocessor.filter_text_blocks(
    df,
    "text",
    keep_only_size=(0, 300),
    keep_only_keywords=sustainability_keywords,
)

print("Dataset Size:", df.shape)
df.head()

In [None]:
# Character length of every text block that has a label.
tb_len = df.loc[df["labels"].notnull(), "text"].apply(len)

# Plot the distribution of those lengths.
length_histogram = go.Histogram(x=tb_len, nbinsx=1000)
fig = go.Figure(data=[length_histogram])
fig.show()

## === Splitting the Dataset ===

In [None]:
# Hold out 20% of the rows for testing. The split is stratified on the label
# column and uses a fixed seed so that it is reproducible.
df_train, df_test = sklearn.model_selection.train_test_split(
    df, test_size=0.2, stratify=df["labels"], random_state=7
)

# Report the size and label distribution of each split.
for split_name, split_frame in (("Train", df_train), ("Test", df_test)):
    print(f"{split_name} Set Size:", split_frame.shape)
    print(split_frame["labels"].value_counts())

## === Training and Testing the Model ===

In [None]:
# Settings for the fine-tuning run, gathered in one place for easy tweaking.
training_config = {
    "name": "distilroberta-base",
    "epochs": 3,
    "learning_rate": 5e-5,
    "batch_size": 16,
    "weight_decay": 0.01,
    "save": True,
    "save_to": "../models/goal-detection",
}

model = transformer_model.TextClassification(target_values, **training_config)
# NOTE(review): the second argument is presumably used for evaluation during
# training — confirm in transformer_model.
model.fit(df_train, df_test)

## === Inference ===

In [None]:
# Load the classifier from the saved model directory instead of reusing the
# in-memory one from the training cell.
model = transformer_model.TextClassification(
    target_values,
    name="distilroberta-base",
    load_from="../models/goal-detection/distilroberta-base",
)

# Score every test text and attach the "Goal" column of the predictions to the
# test set (assigned by position via .values).
test_texts = df_test["text"].tolist()
predictions = model.predict(test_texts)
df_test["Goal Score"] = predictions["Goal"].values

# Show the 20 test rows with the highest goal score.
df_test.sort_values("Goal Score", ascending=False).head(20)