# Library

In [None]:
import os
import pandas as pd
import numpy as np

from datasets import Dataset
from dotenv import load_dotenv
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Traine, pipeline

# Labeling

In [7]:
data_path = r"C:\Users\james\J_Data_Lab\Project-FXCast\crawler\data\naver_finance_news_2020_prep_s7.csv"
df = pd.read_csv(data_path)

# summary 컬럼만 추출 + 중복 제거, 공백 제거
df_labeled = df[["summary"]].dropna().drop_duplicates()
df_labeled["summary"] = df_labeled["summary"].str.strip()

# 감정 라벨 컬럼
df_labeled["label"] = ""

# 저장
save_path = r"C:\Users\james\J_Data_Lab\Project-FXCast\crawler\data\labeling_2020.csv"
df_labeled.to_csv(save_path, index=False, encoding="utf-8-sig")
print(f"{len(df_labeled)}개 문장 저장 완료: {save_path}")

370개 문장 저장 완료: C:\Users\james\J_Data_Lab\Project-FXCast\crawler\data\labeling_2020.csv


# Fine Tuning

* 환율 전용 튜닝
- 긍정	시장 안정, 환율 하락, 외국인 투자 확대, 호재 반응	하락, 안정, 진정, 강세, 회복, 개선, 순매수
- 부정	시장 불안, 환율 급등, 지정학적 리스크, 악재 반응	급등, 불안, 출렁, 위기, 약세, 순매도
- 중립	방향성 없거나 팩트만 전달, 보합

In [1]:
data_path = r"C:\Users\james\J_Data_Lab\Project-FXCast\crawler\data\labeled_2020_s1.csv"
df = pd.read_csv(data_path)
label_map = {"부정": 0, "중립": 1, "긍정": 2}
df["label"] = df["label"].map(label_map)
dataset = Dataset.from_pandas(df[["summary", "label"]])

In [2]:
# https://huggingface.co/snunlp/KR-FinBert-SC 
model_name = "snunlp/KR-FinBERT-SC"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

In [3]:
def tokenize(batch):
    return tokenizer(batch["summary"], padding="max_length", truncation=True, max_length=128)

dataset = dataset.map(tokenize, batched=True)
dataset = dataset.train_test_split(test_size=0.1)

Map:   0%|          | 0/207 [00:00<?, ? examples/s]

In [4]:
training_args = TrainingArguments(
    output_dir="./finbert-finetuned",
    per_device_train_batch_size=8,
    num_train_epochs=4,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)

trainer.train()






Epoch,Training Loss,Validation Loss
1,0.9115,0.7066
2,0.6332,0.502824
3,0.3,0.293566
4,0.2076,0.282096


TrainOutput(global_step=96, training_loss=0.6265187462170919, metrics={'train_runtime': 21.096, 'train_samples_per_second': 35.267, 'train_steps_per_second': 4.551, 'total_flos': 48939095697408.0, 'train_loss': 0.6265187462170919, 'epoch': 4.0})

In [5]:
trainer.save_model("./finbert-finetuned")

# test
pipe = pipeline("text-classification", model="./finbert-finetuned", tokenizer=tokenizer)
pipe("환율 급등 우려로 시장 불안이 커지고 있다.")

Device set to use cuda:0


[{'label': 'negative', 'score': 0.9823349714279175}]

In [11]:
# hugging face upload
load_dotenv(dotenv_path=".env")
token = os.getenv("HF_TOKEN")

model.push_to_hub("DataWizardd/finbert-sentiment-ko", token=token)
tokenizer.push_to_hub("DataWizardd/finbert-sentiment-ko", token=token)

model.safetensors:   0%|          | 0.00/406M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/DataWizardd/finbert-sentiment-ko/commit/1f6b07752b808ea06dce142bb79c2663536b6534', commit_message='Upload tokenizer', commit_description='', oid='1f6b07752b808ea06dce142bb79c2663536b6534', pr_url=None, repo_url=RepoUrl('https://huggingface.co/DataWizardd/finbert-sentiment-ko', endpoint='https://huggingface.co', repo_type='model', repo_id='DataWizardd/finbert-sentiment-ko'), pr_revision=None, pr_num=None)

# Classification Report

In [17]:
data_path = r"C:\Users\james\J_Data_Lab\Project-FXCast\crawler\data\labeled_2020_s1.csv"
df = pd.read_csv(data_path)

# 라벨 매핑
label_map = {"부정": 0, "중립": 1, "긍정": 2}
df["label"] = df["label"].map(label_map)

df_clean = df[df["label"].isin([0, 1, 2])].copy()
df_clean["label"] = df_clean["label"].astype(int)

In [18]:
train_df, test_df = train_test_split(
    df_clean,
    test_size=0.2,
    stratify=df_clean["label"],
    random_state=42
)

In [19]:
# 데이터셋 변환
train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_df.reset_index(drop=True))

# 토크나이저 로드 (KR-FinBERT-SC)
tokenizer = AutoTokenizer.from_pretrained("snunlp/KR-FinBERT-SC")

# 토크나이징 함수
def tokenize(batch):
    return tokenizer(batch["summary"], padding="max_length", truncation=True, max_length=128)

train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

# 필요한 컬럼만 학습용으로 설정
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

Map:   0%|          | 0/165 [00:00<?, ? examples/s]

Map:   0%|          | 0/42 [00:00<?, ? examples/s]

In [22]:
# 예측
preds = trainer.predict(test_dataset)

# 결과 분석
y_true = preds.label_ids
y_pred = np.argmax(preds.predictions, axis=1)

print(classification_report(y_true, y_pred, target_names=["부정", "중립", "긍정"]))

              precision    recall  f1-score   support

          부정       0.89      1.00      0.94        17
          중립       1.00      0.82      0.90        11
          긍정       0.93      0.93      0.93        14

    accuracy                           0.93        42
   macro avg       0.94      0.92      0.92        42
weighted avg       0.93      0.93      0.93        42

