# E=1, S=2, G=3, None (non-ESG)=0

# ESGBERT

## Environment

In [None]:
# pandas is imported here because this is the first cell that calls pd.*;
# without it a top-to-bottom run on a fresh kernel fails with NameError.
import pandas as pd

# Load the ESGBERT environmental CSV, keeping only the "text" and "env" columns.
df = pd.read_csv("hf://datasets/ESGBERT/environmental_2k/environmental_2k.csv", usecols=["text", "env"])

# Keep only the rows flagged env == 1 and renumber them from 0.
df = df[df["env"] == 1].reset_index(drop=True)

# Draw a reproducible sample of 500 rows without replacement (fixed seed).
sample_df = df.sample(n=500, random_state=42, replace=False).reset_index(drop=True)

# After the filter every "env" value is already 1, which is the code this file
# uses for the environmental class (E=1), so no value remapping is required.
sample_df_environment = sample_df.rename(columns={"env": "label"})
sample_df_environment

## Social

In [None]:
import pandas as pd

# Read the ESGBERT social CSV, restricted to the "text" and "soc" columns.
df = pd.read_csv(
    "hf://datasets/ESGBERT/social_2k/social_2k.csv",
    usecols=["text", "soc"],
)

# Retain the rows where soc == 1, then renumber the index from 0.
df = df.loc[df["soc"] == 1].reset_index(drop=True)

# Seeded sample of 500 rows, drawn without replacement and re-indexed.
sample_df_s = df.sample(n=500, replace=False, random_state=42).reset_index(drop=True)

# Recode the flag value 1 as 2, the code used for the social class (S=2).
sample_df_s["soc"] = sample_df_s["soc"].replace({1: 2})

# Expose the recoded column under the common name "label".
sample_df_social = sample_df_s.rename(columns={"soc": "label"})
sample_df_social

## Governance

In [None]:
# Read the ESGBERT governance CSV, restricted to the "text" and "gov" columns.
df = pd.read_csv(
    "hf://datasets/ESGBERT/governance_2k/governance_2k.csv",
    usecols=["text", "gov"],
)

# Retain the rows where gov == 1, then renumber the index from 0.
df = df.loc[df["gov"] == 1].reset_index(drop=True)

# Seeded sample of 500 rows, drawn without replacement and re-indexed.
sample_df = df.sample(n=500, replace=False, random_state=42).reset_index(drop=True)

# Recode the flag value 1 as 3, the code used for the governance class (G=3).
sample_df["gov"] = sample_df["gov"].replace({1: 3})

# Expose the recoded column under the common name "label".
sample_df_governance = sample_df.rename(columns={"gov": "label"})
sample_df_governance

## Concatenate ESGBERT Dataset

In [None]:
# Stack the three ESGBERT samples (environment, social, governance) into a
# single frame with a fresh 0..N-1 index.
df_esgbert_dataset = pd.concat(
    objs=[
        sample_df_environment,
        sample_df_social,
        sample_df_governance,
    ],
    ignore_index=True,
)
df_esgbert_dataset


---

# ESG-Miner

In [None]:
# Load the ESG-Miner gold-standard corpus, keeping only the headline text and
# its ESG category.
df = pd.read_excel("gold_standard_corpus.xlsx", usecols=["headline", "esg_category"])

# Only the last expression of a cell is echoed, so the class distribution has
# to be printed explicitly or its result is silently discarded.
print(df["esg_category"].value_counts())

# Number of rows whose headline repeats an earlier row (shown as cell output).
df.duplicated(subset=["headline"]).sum()

In [62]:
# Rename the corpus columns in place to the "text" / "label" names used for
# the final dataset.
df.rename(
    columns={
        "headline": "text",
        "esg_category": "label",
    },
    inplace=True,
)

In [None]:
# Show how many rows fall under each label after the rename.
df.loc[:, "label"].value_counts()

## Non-ESG

In [None]:
# Select the rows labelled "non-esg".
df_non_esg = df[df["label"] == "non-esg"]

# Draw a reproducible sample of 1000 rows without replacement and renumber it.
miner_sample_non = df_non_esg.sample(n=1000, random_state=42, replace=False).reset_index(drop=True)

# Every sampled row carries the label "non-esg" because of the filter above, so
# the numeric code is written straight into the label column. A frame-wide
# replace() would also rewrite any text cell equal to "non-esg" and, depending
# on the pandas version, may leave the label column as object dtype.
miner_sample_non["label"] = 0
miner_sample_non

## E

In [None]:
# Select the rows labelled "environmental".
df_e = df[df["label"] == "environmental"]

# Draw a reproducible sample of 500 rows without replacement and renumber it.
miner_sample_e = df_e.sample(n=500, random_state=42, replace=False).reset_index(drop=True)

# Every sampled row carries the label "environmental" because of the filter
# above, so the numeric code is written straight into the label column. A
# frame-wide replace() would also rewrite any text cell equal to
# "environmental" and, depending on the pandas version, may leave the label
# column as object dtype.
miner_sample_e["label"] = 1
miner_sample_e

## S


In [None]:
# Select the rows labelled "social".
df_s = df[df["label"] == "social"]

# Draw a reproducible sample of 500 rows without replacement and renumber it.
miner_sample_s = df_s.sample(n=500, random_state=42, replace=False).reset_index(drop=True)

# Every sampled row carries the label "social" because of the filter above, so
# the numeric code is written straight into the label column. A frame-wide
# replace() would also rewrite any text cell equal to "social" and, depending
# on the pandas version, may leave the label column as object dtype.
miner_sample_s["label"] = 2
miner_sample_s

## G

In [None]:
# Select the rows labelled "governance".
df_g = df[df["label"] == "governance"]

# Draw a reproducible sample of 500 rows without replacement and renumber it.
miner_sample_g = df_g.sample(n=500, random_state=42, replace=False).reset_index(drop=True)

# Every sampled row carries the label "governance" because of the filter
# above, so the numeric code is written straight into the label column. A
# frame-wide replace() would also rewrite any text cell equal to "governance"
# and, depending on the pandas version, may leave the label column as object
# dtype.
miner_sample_g["label"] = 3
miner_sample_g

## Concatenate ESG Miner

In [None]:
# Stack the four ESG-Miner samples (non-ESG, E, S, G) into one frame with a
# fresh 0..N-1 index.
df_miner_dataset = pd.concat(
    objs=[
        miner_sample_non,
        miner_sample_e,
        miner_sample_s,
        miner_sample_g,
    ],
    ignore_index=True,
)

# Show the number of rows per label in the combined ESG-Miner sample.
df_miner_dataset.loc[:, "label"].value_counts()

# Final ESG Training Dataset

In [None]:
# Combine the ESG-Miner and ESGBERT samples into the final training set.
# concat is a top-level pandas function, not a DataFrame method, so it has to
# be called as pd.concat; df.concat raises AttributeError.
# ignore_index=True gives the combined frame a unique 0..N-1 index, matching
# the other concatenations in this file.
esg_train_dataset = pd.concat([df_miner_dataset, df_esgbert_dataset], ignore_index=True)

# Write the dataset to an Excel workbook without the index column.
esg_train_dataset.to_excel("ESG_classification_training_dataset.xlsx", index=False)