# Deployment

## === Setup ===

### Importing Libraries

In [None]:
import os
import sys
import pathlib 
import urllib3
import datetime
import minio
import pandas

sys.path.append("../source")
import document
import data_preprocessing
import transformer_model

# Lift pandas' display limits (None = unlimited) so that previewed frames are
# shown with all rows, all columns and untruncated cell contents.
pandas.options.display.max_rows = None
pandas.options.display.max_columns = None
pandas.options.display.max_colwidth = None

### Connecting to Minio

In [None]:
# NOTE(review): the body of this cell is not shown here. Later cells call
# `client.list_objects(...)` and `client.fget_object(...)`, so presumably the
# Minio client bound to the name `client` is created in this cell -- confirm.
...

### Setting up the Data Preprocessor

In [None]:
# Sustainability-related vocabulary, listed one term per line.
# NOTE(review): nothing in the visible cells reads this list -- presumably it is
# meant for keyword-based filtering of text blocks; confirm before relying on it.
sustainability_keywords = [
    # Environment-related terms
    "green",
    "environment",
    "carbon",
    "footprint",
    "co2",
    "emission",
    "pollution",
    "recycle",
    "waste",
    "plant",
    "energy",
    "renewable",
    "water",
    "electricity",
    # People-related terms
    "diversity",
    "employee",
    "women",
    "female",
    "human",
    "inclusion",
    "health",
    "safety",
    "security",
    # Candidates currently left out: "goal", "sustainable", "zero", "right"
]

# Preprocessing helper from the project's `data_preprocessing` module; its
# `filter_text_blocks` method is called on every document in the processing loop.
data_preprocessor = data_preprocessing.DataPreprocessing()

### Loading Our Trained Models

In [None]:
# Labels handed to the classifier; the "Goal" label is the column that is later
# read from the model's prediction output.
target_values = ["Not Goal", "Goal"]

# Text classifier built on "climatebert/environmental-claims", loaded from the
# weights stored under ../models/goal-detection.
goal_detection_model = transformer_model.TextClassification(
    target_values,
    name="climatebert/environmental-claims",
    load_from="../models/goal-detection/climatebert/environmental-claims",
)

## === Processing New Sustainability Reports ===

In [None]:
# Score every PDF/HTML report stored under the "questions_app/firm/" prefix of
# the "esg-next-gen" bucket and write one CSV per report (columns: File,
# Text Blocks, Goal Score, plus whatever the preprocessor keeps) under
# ../extracted-sustainability-objectives, mirroring the object path.
# Reports whose CSV already exists are skipped, so the cell can be re-run to
# resume an interrupted job.
objects = client.list_objects("esg-next-gen", prefix="questions_app/firm/", recursive=True)
processed_count = 0  # number of reports actually written during this run
for obj in objects:

    source_file_path = obj.object_name
    # NOTE(review): the last three characters are swapped for "csv", so
    # "x.pdf" -> "x.csv" but "x.html" -> "x.hcsv". The naming is kept as-is on
    # purpose: changing it would make already-extracted HTML reports look
    # unprocessed and produce duplicate rows in the aggregation step.
    result_file_path = os.path.join("../extracted-sustainability-objectives", obj.object_name[:-3] + "csv")
    result_folder_path = os.path.dirname(result_file_path)

    # Already extracted in a previous run.
    if os.path.exists(result_file_path):
        continue

    # Only PDF and HTML documents are supported (extension check is case-insensitive).
    _, extension = os.path.splitext(source_file_path.lower())
    extension = extension.strip(".")
    if extension not in ["pdf", "html"]:
        continue

    try:
        # The download lives inside the try block so that a single failed
        # transfer skips this report instead of aborting the whole run.
        client.fget_object("esg-next-gen", source_file_path, "temp_file")

        doc = document.Document("temp_file")
        doc.content_type = extension
        content = doc.read_local_file()
        parsed_content = doc.parse_content(content)
        text_blocks = doc.segment_text(parsed_content)
        tdf = pandas.DataFrame({"File": source_file_path, "Text Blocks": text_blocks})

        # Work on a scratch copy of the text so the original blocks are exported untouched.
        tdf["text"] = tdf["Text Blocks"].copy()
        #tdf = data_preprocessor.clean_text_blocks(tdf, "text", level="essential")
        tdf = data_preprocessor.filter_text_blocks(tdf, "text", keep_only_size=(0, 300))
        predictions = goal_detection_model.predict(tdf["text"].tolist())
        tdf["Goal Score"] = predictions["Goal"].values
        tdf = tdf.drop(["text"], axis=1)

    except Exception as error:
        # `Exception` (not a bare except) keeps KeyboardInterrupt/SystemExit
        # working, so the long-running loop can still be interrupted; the
        # error itself is reported instead of being silently discarded.
        print(f"Cannot process {obj.object_name}: {error!r}")
        continue

    output_dir = pathlib.Path(result_folder_path)
    output_dir.mkdir(parents=True, exist_ok=True)
    tdf.to_csv(result_file_path, index=False)

    # Count written reports only; skipped and failed objects are not included.
    processed_count += 1
    if processed_count % 10 == 0:
        print(f"{processed_count} documents have been processed.")

## === Post-Processing the Results ===

In [None]:
# Gather every per-report result file found under
# ../extracted-sustainability-objectives into a single frame `df_all`, tagging
# each row with the name of the folder that contained the file ("Company").
result_frames = []
for root, _dirs, files in os.walk("../extracted-sustainability-objectives"):
    for file_name in files:
        file_path = os.path.join(root, file_name)
        # Match on "csv" without the dot: the extraction step names outputs by
        # replacing the last three characters of the source name, so HTML
        # reports end in "hcsv".
        if not file_path.endswith("csv"):
            continue
        rdf = pandas.read_csv(file_path)
        # basename() is separator-agnostic, unlike splitting on "/".
        rdf["Company"] = os.path.basename(root)
        result_frames.append(rdf)

if result_frames:
    # One concat over all frames instead of re-copying the accumulated frame
    # after every file.
    df_all = pandas.concat(result_frames)
else:
    # No results yet: fall back to an empty frame so the summary below and the
    # later filtering cell still run instead of failing on `None`.
    # NOTE(review): the column list mirrors what the extraction cell writes,
    # assuming the preprocessor adds no extra columns -- confirm.
    print("No result files found in ../extracted-sustainability-objectives.")
    df_all = pandas.DataFrame(columns=["File", "Text Blocks", "Goal Score", "Company"])
    df_all = df_all.astype({"Goal Score": "float64"})

# Summary: (rows, columns), then distinct companies and distinct source files.
print(df_all.shape)
print(df_all["Company"].nunique())
print(df_all["File"].nunique())
df_all.head()

In [None]:
# Keep the text blocks whose "Goal Score" is at least 0.5 and rank them from
# the highest to the lowest score.
df_goals = df_all[df_all["Goal Score"].ge(0.5)].sort_values(by="Goal Score", ascending=False)
# df_goals.to_csv("../datasets/extracted_sustainability_objectives.csv", index=False)

# Summary: (rows, columns), then distinct companies and distinct source files.
print(df_goals.shape)
print(df_goals["Company"].nunique())
print(df_goals["File"].nunique())
df_goals.head()