In [12]:
import numpy as np
import pandas as pd
from collections import Counter

In [13]:
# Paths of the two submissions that are compared against each other.
pred_file = './submission_bw/sub_8_7_1136.csv'
gt_file = './submission/baseline_submission_8_4_18.csv'

# Predicted answers: the 'Answer' column of the submission under evaluation.
pred = pd.read_csv(pred_file)['Answer']

# Reference answers: the 'Answer' column of the file used as ground truth.
# `df` is kept bound to the reference frame, as it was after the original cell ran.
df = pd.read_csv(gt_file)
gt = df['Answer']

In [14]:
def calculate_f1_score(true_sentence, predicted_sentence, sum_mode=True):
    """Compute character-level precision, recall and F1 between two sentences.

    All whitespace is removed from both sentences before their characters
    are counted, so only non-whitespace characters contribute to the score.

    Args:
        true_sentence: Reference text. ``None`` and NaN (what ``pd.read_csv``
            produces for an empty cell) are treated as an empty sentence;
            any other non-string value is converted with ``str()``.
        predicted_sentence: Predicted text, normalized the same way.
        sum_mode: If True, character multiplicity is taken into account
            (multiset overlap). If False, only the set of distinct
            characters is compared.

    Returns:
        A ``(precision, recall, f1_score)`` tuple. Each value is 0 when its
        denominator would be zero.
    """

    def _strip_whitespace(text):
        # NaN is the only float that is not equal to itself, so this check
        # detects missing CSV cells without needing an extra import.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        return ''.join(str(text).split())

    # Remove whitespace
    true_sentence = _strip_whitespace(true_sentence)
    predicted_sentence = _strip_whitespace(predicted_sentence)

    true_counter = Counter(true_sentence)
    predicted_counter = Counter(predicted_sentence)

    # Counter intersection keeps, per character, the smaller of the two counts.
    overlap = true_counter & predicted_counter

    if sum_mode:
        # Take into account how many times each character occurs
        true_positive = sum(overlap.values())
        predicted_positive = sum(predicted_counter.values())
        actual_positive = sum(true_counter.values())
    else:
        # Focus only on whether each distinct character is present
        true_positive = len(overlap)
        predicted_positive = len(predicted_counter)
        actual_positive = len(true_counter)

    # Compute the F1 score, guarding every division against a zero denominator
    precision = true_positive / predicted_positive if predicted_positive > 0 else 0
    recall = true_positive / actual_positive if actual_positive > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return precision, recall, f1_score

def calculate_average_f1_score(true_sentences, predicted_sentences, sum_mode=True):
    """Average the per-pair precision, recall and F1 over two aligned sequences.

    Each reference sentence is scored against the prediction at the same
    position using ``calculate_f1_score``; the three metrics are then
    averaged over all pairs.

    Args:
        true_sentences: Iterable of reference sentences.
        predicted_sentences: Iterable of predicted sentences, aligned with
            ``true_sentences``.
        sum_mode: Forwarded to ``calculate_f1_score`` for every pair.

    Returns:
        A dict with the keys ``'average_precision'``, ``'average_recall'``
        and ``'average_f1_score'``. All three are 0.0 when both inputs are
        empty.

    Raises:
        ValueError: If the two inputs do not contain the same number of
            sentences.
    """
    # Materialize both inputs so any iterable (list, pandas Series,
    # generator) is accepted and the lengths can be compared up front.
    true_list = list(true_sentences)
    predicted_list = list(predicted_sentences)

    # zip() would silently drop the unmatched tail while the divisor stayed
    # at len(true_sentences), producing a misleading average.
    if len(true_list) != len(predicted_list):
        raise ValueError(
            "true_sentences and predicted_sentences must have the same length "
            f"(got {len(true_list)} and {len(predicted_list)})"
        )

    pair_count = len(true_list)
    if pair_count == 0:
        # Nothing to average; avoid dividing by zero.
        return {
            'average_precision': 0.0,
            'average_recall': 0.0,
            'average_f1_score': 0.0
        }

    total_precision = 0
    total_recall = 0
    total_f1_score = 0

    for true_sentence, predicted_sentence in zip(true_list, predicted_list):
        precision, recall, f1_score = calculate_f1_score(
            true_sentence, predicted_sentence, sum_mode=sum_mode
        )
        total_precision += precision
        total_recall += recall
        total_f1_score += f1_score

    return {
        'average_precision': total_precision / pair_count,
        'average_recall': total_recall / pair_count,
        'average_f1_score': total_f1_score / pair_count
    }

# Average the per-row character-level scores, using `gt` as the reference
# sentences and `pred` as the predicted sentences.
# NOTE(review): `gt` is read from a file named baseline_submission_*.csv, so
# this appears to measure agreement with another submission rather than with
# labelled answers — confirm that is intended.
result = calculate_average_f1_score(gt, pred)
print(result)

{'average_precision': 0.6548559590113365, 'average_recall': 0.58574722020826, 'average_f1_score': 0.562767827184241}


In [15]:
import pandas as pd

def clean_and_save_csv(input_file_path, output_file_path):
    """Replace line breaks in the 'Answer' column with spaces and save a new CSV.

    Args:
        input_file_path: Path of the CSV file to read. It must contain an
            'Answer' column holding text.
        output_file_path: Path the cleaned CSV is written to. The file is
            encoded as UTF-8 with a BOM and written without the index.

    Returns:
        None. A confirmation message containing ``output_file_path`` is
        printed after the file has been written.
    """
    # Load the CSV file
    df = pd.read_csv(input_file_path)

    # Replace every line break in 'Answer' with a single space. CRLF and a
    # bare CR are matched as well as LF so that no stray '\r' is left in
    # the cell; `regex=True` is passed explicitly because the default of
    # that parameter differs between pandas versions.
    df['Answer'] = df['Answer'].str.replace(r'\r\n|\r|\n', ' ', regex=True)

    # Save the modified data to a new CSV file
    df.to_csv(output_file_path, encoding='UTF-8-sig', index=False)

    print(f"파일이 성공적으로 저장되었습니다: {output_file_path}")

# Usage example
# NOTE(review): the input path below is an absolute path inside one user's
# home directory, while the other paths in this notebook are relative;
# it will not resolve on another machine — consider a relative path.
input_file_path = '/home/MMI24byungwan/21_3_Workspace/FINANCIAL_INFORMATION/submission_bw/sub_8_8_1713.csv'  # path of the original CSV file
output_file_path = './submission_bw/sub_cleaned.csv'    # path of the CSV file to save

clean_and_save_csv(input_file_path, output_file_path)


파일이 성공적으로 저장되었습니다: ./submission_bw/sub_cleaned.csv
