In [None]:
import functools
import inspect

import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from sklearn.metrics import accuracy_score, f1_score
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

# Load the KoBERT model and tokenizer
# NOTE(review): the monologg/kobert checkpoint is usually paired with its own
# SentencePiece-based tokenizer; confirm that BertTokenizer splits Korean text
# as intended for this checkpoint -- TODO verify.
model_name = "monologg/kobert"  # KoBERT model name on the Hugging Face Hub
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=3)  # three sentiment classes

# Load the data
train_data = pd.read_csv('finance_data.csv')

# Label mapping (label name -> class id) and its inverse (class id -> label name)
label_mapping = {'neutral': 0, 'positive': 1, 'negative': 2}
reverse_label_mapping = {v: k for k, v in label_mapping.items()}
mapped_labels = train_data['labels'].map(label_mapping)
# Series.map yields NaN for any value missing from the mapping, which would
# turn the column into floats and only fail much later during training.
# Fail early with the offending values instead.
if mapped_labels.isna().any():
    unknown_labels = train_data.loc[mapped_labels.isna(), 'labels'].unique().tolist()
    raise ValueError(f"Unmapped values in 'labels' column: {unknown_labels}")
train_data['labels'] = mapped_labels.astype('int64')

# Convert to a Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_data)

# Fixed seed so the train/validation/test partition is the same on every run
SPLIT_SEED = 42

# Split into training data and a temporary hold-out set (70% train, 30% hold-out)
train_test_data = train_dataset.train_test_split(test_size=0.3, seed=SPLIT_SEED)

# Split the hold-out set again into test and validation halves (15% / 15% overall)
test_validation_data = train_test_data['test'].train_test_split(test_size=0.5, seed=SPLIT_SEED)

# Assemble the final dataset
dataset = DatasetDict({
    'train': train_test_data['train'],
    'test': test_validation_data['test'],
    'validation': test_validation_data['train']
})

# Tokenization function
def tokenize_function(examples):
    """Tokenize the ``kor_sentence`` field of *examples* with the module-level tokenizer.

    Sequences are truncated and padded to a fixed length of 128 tokens.

    Args:
        examples: Mapping that provides the text under the ``kor_sentence`` key.

    Returns:
        Whatever the tokenizer call returns for those sentences.
    """
    sentences = examples['kor_sentence']
    encoding_options = {"padding": "max_length", "truncation": True, "max_length": 128}
    return tokenizer(sentences, **encoding_options)

# Tokenize every split of the dataset in batches
tokenized_datasets = dataset.map(function=tokenize_function, batched=True)
# Have the tokenized splits return PyTorch tensors
tokenized_datasets.set_format(type="torch")

# Evaluation metric function
def compute_metrics(pred):
    """Compute accuracy and weighted F1 from a prediction object.

    Args:
        pred: Object exposing ``predictions`` (scores whose last axis is
            arg-maxed to get the predicted class) and ``label_ids`` (the
            reference labels).

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    predicted_classes = pred.predictions.argmax(-1)
    true_classes = pred.label_ids
    return {
        "accuracy": accuracy_score(true_classes, predicted_classes),
        "f1": f1_score(true_classes, predicted_classes, average='weighted'),
    }

# Trainer configuration
# Recent transformers releases name the per-epoch evaluation setting
# `eval_strategy` and deprecate the older `evaluation_strategy` spelling.
# Pick whichever keyword the installed TrainingArguments accepts so the script
# runs on both old and new versions.
eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./kobert_results",
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    **{eval_strategy_arg: "epoch"},  # evaluate on the validation split after every epoch
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Save the tokenizer and the fine-tuned model to the same directory
model_save_path = "kobert_sentiment_analysis_model"
tokenizer.save_pretrained(model_save_path)
model.save_pretrained(model_save_path)

# Evaluate on the validation split (the trainer's default eval_dataset)
eval_results = trainer.evaluate()
print(f"Evaluation results (Validation): {eval_results}")
print(f"Validation Accuracy: {eval_results['eval_accuracy']:.4f}")
print(f"Validation F1 Score: {eval_results['eval_f1']:.4f}")

# Evaluate on the held-out test split
test_results = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
print(f"Evaluation results (Test): {test_results}")
print(f"Test Accuracy: {test_results['eval_accuracy']:.4f}")
print(f"Test F1 Score: {test_results['eval_f1']:.4f}")

# Prediction function
@functools.lru_cache(maxsize=1)
def _load_predictor(model_path):
    """Load the tokenizer and model saved at *model_path* and place the model on a device.

    The result is memoized for the most recently used path, so repeated
    predictions against the same checkpoint do not reload it from disk.
    If the files at *model_path* are overwritten, call
    ``_load_predictor.cache_clear()`` to pick up the new weights.

    Returns:
        Tuple ``(tokenizer, model, device)``.
    """
    tokenizer = BertTokenizer.from_pretrained(model_path)
    model = BertForSequenceClassification.from_pretrained(model_path)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()  # inference only: make sure dropout is disabled
    return tokenizer, model, device


def predict_review(sentence, model_path):
    """Predict the sentiment label name for a single sentence.

    Args:
        sentence: Text to classify.
        model_path: Directory (or model id) holding the saved tokenizer and model.

    Returns:
        The label name looked up in the module-level ``reverse_label_mapping``
        for the highest-scoring class.
    """
    # Loading is delegated to a cached helper; doing it here would reload the
    # checkpoint on every call, which is very slow when called once per row.
    tokenizer, model, device = _load_predictor(model_path)

    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=128)
    inputs = {key: val.to(device) for key, val in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    prediction = torch.argmax(outputs.logits, dim=1).item()
    return reverse_label_mapping[prediction]

# Predict a label name for every sentence in the full dataset, then convert
# the predicted names back to their numeric class ids
train_data['predict_labels'] = (
    train_data['kor_sentence']
    .apply(lambda sentence: predict_review(sentence, model_save_path))
    .map(label_mapping)
)

# Score the predictions over the full dataset
# NOTE: train_data is the complete CSV, so these scores include rows the model
# was trained on.
accuracy = accuracy_score(y_true=train_data['labels'], y_pred=train_data['predict_labels'])
f1 = f1_score(y_true=train_data['labels'], y_pred=train_data['predict_labels'], average='weighted')
print(f"Accuracy on all data: {accuracy:.4f}")
print(f"F1 Score on all data: {f1:.4f}")

# Save the results
train_data.to_csv('finance_data_with_predictions.csv', index=False)