In [5]:
import pandas as pd
import os
from tqdm.auto import tqdm

# Merge GenAI and NLP Results

## Sentiment

In [None]:
def merge_nlp_genai_data(input_folder_nlp, input_folder_genai, output_folder):
    """Merge per-file NLP and GenAI results side by side into ``output_folder``.

    For every numbered workbook in ``input_folder_nlp`` (e.g. ``1.xlsx``), the
    workbook with the same file name is read from ``input_folder_genai``. The
    two frames are concatenated column-wise and written to ``output_folder``
    under that same file name.

    Args:
        input_folder_nlp: Folder holding the NLP result workbooks. Its file
            names drive the iteration (processed in ascending numeric order).
        input_folder_genai: Folder holding the GenAI result workbooks; it must
            contain a file for every name processed from ``input_folder_nlp``.
        output_folder: Destination folder; created if it does not exist yet.

    Raises:
        FileNotFoundError: If the matching GenAI workbook is missing.
        KeyError: If a column that is dropped ("sentence", "number") is absent.
    """
    # Keep only entries whose stem (text before the first dot) parses as an
    # integer. Stray entries such as ".DS_Store", ".ipynb_checkpoints" or
    # Excel "~$1.xlsx" lock files would otherwise raise ValueError in the
    # numeric sort key.
    numbered = []
    for entry in os.listdir(input_folder_nlp):
        try:
            numbered.append((int(entry.split('.')[0]), entry))
        except ValueError:
            continue
    numbered.sort(key=lambda pair: pair[0])
    file_list = [entry for _, entry in numbered]

    # Writing fails when the destination folder is missing, so create it up front.
    os.makedirs(output_folder, exist_ok=True)

    # Iterate every file by number
    for file_name in tqdm(file_list, desc="衝鴨!!!"):
        df_nlp = pd.read_excel(os.path.join(input_folder_nlp, file_name))
        df_nlp = df_nlp.drop(columns=["sentence"])

        # "number" and "sentence" are dropped from the GenAI side so they are
        # not duplicated in the merged frame ("number" is kept from the NLP side).
        df_genai = pd.read_excel(os.path.join(input_folder_genai, file_name))
        df_genai = df_genai.drop(columns=["number", "sentence"])

        # Rows are paired by index position. If the two workbooks differ in
        # length, the outer join of pd.concat pads the shorter one with NaN.
        # NOTE(review): assumes both workbooks list the same rows in the same
        # order - confirm against the upstream pipeline.
        df = pd.concat([df_nlp, df_genai], axis=1)
        df.to_excel(os.path.join(output_folder, file_name), index=False)

In [None]:
# Sentiment: merge the per-file NLP and GenAI results into one folder.
folder_nlp = "Sentiment_result_NLP"
folder_genai = "Sentiment_result_genAI"
sentiment_output_folder = "Sentiment_ALL"

merge_nlp_genai_data(
    input_folder_nlp=folder_nlp,
    input_folder_genai=folder_genai,
    output_folder=sentiment_output_folder,
)

## ESG

In [None]:
def merge_nlp_genai_data_esg(input_folder_nlp, input_folder_genai, output_folder):
    """Merge per-file ESG results from the NLP and GenAI folders.

    The ESG workbooks are processed exactly like the sentiment ones (same
    columns dropped, same column-wise concatenation, same output naming), so
    this delegates to ``merge_nlp_genai_data`` instead of carrying a second
    copy of its body. The separate name is kept for existing callers.

    Args:
        input_folder_nlp: Folder holding the ESG NLP result workbooks.
        input_folder_genai: Folder holding the ESG GenAI result workbooks.
        output_folder: Folder that receives the merged workbooks.
    """
    merge_nlp_genai_data(input_folder_nlp, input_folder_genai, output_folder)

In [None]:
# ESG: merge the per-file NLP and GenAI results into one folder.
folder_nlp = "ESG_result_NLP"
folder_genai = "ESG_result_genAI"
output_folder = "ESG_ALL"

merge_nlp_genai_data_esg(
    input_folder_nlp=folder_nlp,
    input_folder_genai=folder_genai,
    output_folder=output_folder,
)

# For Metric

## Sentiment

In [None]:
def agg_to_one(variable, input_folder):
    """Stack every numbered workbook of ``input_folder`` into one workbook.

    Each workbook is read, its "number" column is dropped, and all frames are
    concatenated row-wise in ascending numeric file order. The result is
    written to ``{variable}_Metric.xlsx``, a path relative to the current
    working directory.

    Args:
        variable: Prefix of the output file name (e.g. "Sentiment", "ESG").
        input_folder: Folder holding the numbered workbooks to aggregate.

    Raises:
        ValueError: If the folder holds no numbered workbook (pd.concat has
            nothing to concatenate).
        KeyError: If a workbook lacks the "number" column.
    """
    # Keep only entries whose stem (text before the first dot) parses as an
    # integer. Stray entries such as ".DS_Store", ".ipynb_checkpoints" or
    # Excel "~$1.xlsx" lock files would otherwise raise ValueError in the
    # numeric sort key.
    numbered = []
    for entry in os.listdir(input_folder):
        try:
            numbered.append((int(entry.split('.')[0]), entry))
        except ValueError:
            continue
    numbered.sort(key=lambda pair: pair[0])
    file_list = [entry for _, entry in numbered]

    # Iterate every file by number
    df_list = []
    for file_name in tqdm(file_list, desc="衝鴨!!!"):
        df = pd.read_excel(os.path.join(input_folder, file_name))
        df_list.append(df.drop(columns=["number"]))

    df_agg = pd.concat(df_list, axis=0)
    df_agg.to_excel(f"{variable}_Metric.xlsx", index=False)


In [None]:
# Sentiment: stack all merged workbooks into Sentiment_Metric.xlsx
sentiment_folder = "Sentiment_ALL"
agg_to_one(variable="Sentiment", input_folder=sentiment_folder)

In [None]:
# ESG: stack all merged workbooks into ESG_Metric.xlsx
esg_folder = "ESG_ALL"
agg_to_one(variable="ESG", input_folder=esg_folder)

# For Variable

## Sentiment

In [None]:
from collections import defaultdict

def calcuate__sentiment_variable(input_folder):
    """Compute per-file net tone and sentiment power for every model column.

    For each numbered workbook in ``input_folder`` the "number" and
    "researcher_sentiment" columns are dropped; every remaining column is
    treated as one model's labels. With ``pos`` the count of cells equal to 1,
    ``neg`` the count of cells equal to 2 and ``n`` the number of rows:

        net tone        = (pos - neg) / n
        sentiment power = (pos + neg) / n

    One row per workbook (ascending numeric file order) and one column per
    model are written to "Net_Tone.xlsx" and "Sentiment_Power.xlsx" in the
    current working directory.

    NOTE(review): 1 / 2 are presumably the positive / negative label codes -
    confirm against the labelling scheme.

    Args:
        input_folder: Folder holding the numbered, merged sentiment workbooks.

    Raises:
        KeyError: If a workbook lacks "number" or "researcher_sentiment".
    """
    # Keep only entries whose stem (text before the first dot) parses as an
    # integer. Stray entries such as ".DS_Store", ".ipynb_checkpoints" or
    # Excel "~$1.xlsx" lock files would otherwise raise ValueError in the
    # numeric sort key.
    numbered = []
    for entry in os.listdir(input_folder):
        try:
            numbered.append((int(entry.split('.')[0]), entry))
        except ValueError:
            continue
    numbered.sort(key=lambda pair: pair[0])
    file_list = [entry for _, entry in numbered]

    # One record (model -> score) per file keeps row i tied to file i even if
    # a workbook is missing a model column: the gap becomes NaN instead of
    # producing per-model lists of unequal length.
    net_tone_records = []
    power_records = []

    # Iterate every file by number
    for file_name in tqdm(file_list, desc="衝鴨!!!"):
        df = pd.read_excel(os.path.join(input_folder, file_name))
        df = df.drop(columns=["number", "researcher_sentiment"])
        total_count = len(df)

        net_tone_row = {}
        power_row = {}
        for model in df.columns:
            positive_count = (df[model] == 1).sum()
            negative_count = (df[model] == 2).sum()

            net_tone_row[model] = (positive_count - negative_count) / total_count
            power_row[model] = (positive_count + negative_count) / total_count

        net_tone_records.append(net_tone_row)
        power_records.append(power_row)

    df_net_tone = pd.DataFrame(net_tone_records)
    df_power = pd.DataFrame(power_records)

    df_net_tone.to_excel("Net_Tone.xlsx", index=False)
    df_power.to_excel("Sentiment_Power.xlsx", index=False)

In [None]:
# Sentiment: derive Net_Tone.xlsx and Sentiment_Power.xlsx from the merged workbooks.
sentiment_folder = "Sentiment_ALL"

calcuate__sentiment_variable(input_folder=sentiment_folder)

# Metric 計算

In [None]:
import evaluate

# Accuracy is loaded on its own so it can be computed without the extra
# `average` argument that is passed to the combined evaluator.
metrics_1 = evaluate.load("accuracy")

# Precision, recall and F1 are bundled into one combined evaluator.
metrics_2 = evaluate.combine(
    [evaluate.load(metric_name) for metric_name in ("precision", "recall", "f1")]
)

In [None]:
# Load the aggregated tables written by agg_to_one.
df_sentiment = pd.read_excel("Sentiment_Metric.xlsx")
df_esg = pd.read_excel("ESG_Metric.xlsx")

In [11]:
def metric_calculate(preds, labels):
    """Score ``preds`` against ``labels`` with the module-level evaluators.

    Args:
        preds: Predicted labels.
        labels: Reference labels.

    Returns:
        A dict merging the result of ``metrics_2.compute`` (called with
        ``average="macro"``) with the result of ``metrics_1.compute``; on a
        key clash the ``metrics_1`` entry wins.
    """
    macro_scores = metrics_2.compute(
        predictions=preds,
        references=labels,
        average="macro",
    )
    accuracy_scores = metrics_1.compute(predictions=preds, references=labels)

    # Later mapping wins on duplicate keys, matching `macro | accuracy`.
    return {**macro_scores, **accuracy_scores}

In [84]:
def metric_run(df_variable):
    """Evaluate every model column of ``df_variable`` against its first column.

    The first column is used as the reference labels; each remaining column
    is scored with ``metric_calculate``.

    Args:
        df_variable: DataFrame whose first column holds the reference labels
            and whose other columns each hold one model's predictions.

    Returns:
        A DataFrame with rows "Accuracy", "Precision", "Recall", "F1-Score"
        and one column per model.
    """
    reference = df_variable[df_variable.columns[0]]
    model_names = df_variable.columns[1:]

    per_model_scores = [
        metric_calculate(df_variable[name], reference)
        for name in tqdm(model_names, desc="我會走")
    ]

    # Row label in the output table -> key in the metric_calculate result.
    row_to_key = {
        "Accuracy": "accuracy",
        "Precision": "precision",
        "Recall": "recall",
        "F1-Score": "f1",
    }
    table = [
        [scores[key] for scores in per_model_scores]
        for key in row_to_key.values()
    ]

    return pd.DataFrame(table, columns=model_names, index=list(row_to_key))
    

In [None]:
# Display the ESG metric table (rows: metrics, columns: models).
metric_run(df_variable=df_esg)