# Dataset Construction

## === Setup ===

### Importing Libraries

In [None]:
import os
import io
import sys
import hashlib
import urllib3
import datetime
import urllib.parse

import minio
import pandas

sys.path.append("../source")
import document

# Lift pandas' display limits so DataFrames render in full
# (no row, column, or column-width truncation).
pandas.options.display.max_rows = None
pandas.options.display.max_columns = None
pandas.options.display.max_colwidth = None

### Connecting to Minio

In [None]:
# NOTE(review): placeholder only -- this binds `client` to Ellipsis. Later cells
# call `client.fget_object(...)`, so it must be replaced with a real object-store
# client before running (presumably a `minio.Minio` instance -- TODO confirm).
client = ...

### Downloading Annotations

In [None]:
# Local folder that receives the downloaded annotation sheet.
dataset_folder = "../datasets"
# makedirs(exist_ok=True) avoids the check-then-create race of
# `if not exists: mkdir` and also creates any missing parent folders.
os.makedirs(dataset_folder, exist_ok=True)

# Download "master_edits.csv" from the "esg-claims" bucket to a local CSV file.
file_path = os.path.join(dataset_folder, "annotations.csv")
client.fget_object("esg-claims", "master_edits.csv", file_path)
adf = pandas.read_csv(file_path)
# Drop the row labelled 0 (presumably a non-data row in the sheet -- TODO confirm).
adf = adf.drop(0)
# Keep only rows that have both an "admin_link" and a "Goal" value.
adf = adf.dropna(subset=["admin_link", "Goal"])
print(adf.shape)
adf.head(2)

## === Extracting and Labeling Text Blocks from URLs of Sustainability Reports ===

In [None]:
# Every URL's labeled text blocks are cached in this folder, in a file named
# after the SHA-1 hex digest of the URL.
result_folder = "../training-text-blocks"
os.makedirs(result_folder, exist_ok=True)

urls = adf["admin_link"].unique()
# File names already present (results as well as empty failure markers) are
# skipped below, so re-running this cell only processes the remaining URLs.
processed_urls = set(os.listdir(result_folder))
total = len(urls)

for i, url in enumerate(urls):

    # Progress report every 30 URLs.
    if i % 30 == 0:
        print(f"Processing URL {i} / {total} ({int(100 * i / total)}%): {url}")

    file_name = hashlib.sha1(url.encode("utf-8")).hexdigest()
    if file_name in processed_urls:
        continue
    result_path = os.path.join(result_folder, file_name)

    try:
        # Download, parse, segment and label the document behind this URL
        # using the goals annotated for it.
        annotated_goals = adf[adf["admin_link"] == url]["Goal"].unique()
        doc = document.Document(url, annotations=annotated_goals)
        content = doc.request_url()
        parsed_content = doc.parse_content(content)
        text_blocks = doc.segment_text(parsed_content)
        labeled_text_blocks = doc.label_text_blocks(text_blocks)
        labeled_text_blocks.to_csv(result_path, index=False)
    except Exception:
        # Best effort: leave an empty marker file so a URL that cannot be
        # processed is not retried on the next run. `Exception` (rather than a
        # bare `except`) lets KeyboardInterrupt/SystemExit propagate, so
        # interrupting the run stops the loop instead of marking the URL failed.
        with open(result_path, "w"):
            pass

## === Post-Processing Text Blocks ===

In [None]:
# Zero-byte files are the markers written for URLs that could not be processed;
# only non-empty result files are considered here.
results = [f for f in os.listdir(result_folder) if os.path.getsize(os.path.join(result_folder, f)) > 0]
print(f"{len(results)} URLs are stored.")

# Collect the per-URL frames and concatenate once at the end; calling
# pandas.concat inside the loop re-copies the accumulated data every iteration.
frames = []
for i, file_name in enumerate(results):

    # Progress report every 30 files.
    if i % 30 == 0:
        print(f"Processing URL {i} / {len(results)} ({int(100 * i / len(results))}%)")

    rdf = pandas.read_csv(os.path.join(result_folder, file_name))
    # Skip files without rows and files in which no text block is labeled as a goal.
    if rdf.empty or rdf["Goal"].sum() <= 0:
        continue

    frames.append(rdf)

# Fail with a clear message instead of an AttributeError on `None` below.
if not frames:
    raise ValueError(f"No text blocks with labeled goals were found in {result_folder}; nothing to write.")

dataset = pandas.concat(frames)

print("Dataset Size:", dataset.shape)
print("The Number of Goals:", dataset["Goal"].sum())
dataset_path = os.path.join(dataset_folder, "sustainability_goals.csv")
dataset.to_csv(dataset_path, index=False)