In [None]:
# Install the notebook's dependencies into the running kernel's environment.
# scikit-learn provides sklearn.metrics (imported below); accelerate is
# required by transformers' PyTorch Trainer.
%pip install transformers datasets pandas torch scikit-learn accelerate

In [1]:
import pandas as pd
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score
from datasets import Dataset, DatasetDict

# Load the raw data. The code in this notebook reads the 'labels' and
# 'kor_sentence' columns of this CSV.
data = pd.read_csv('finance_data.csv')

# Map the string sentiment labels to integer class ids, and keep the inverse
# mapping so predicted ids can be turned back into strings later.
label_mapping = {'neutral': 0, 'positive': 1, 'negative': 2}
reverse_label_mapping = {v: k for k, v in label_mapping.items()}
mapped_labels = data['labels'].map(label_mapping)

# Series.map() silently yields NaN for any value that is not a key of
# label_mapping (typos, different casing, missing values, or a cell that is
# re-run after the column was already converted). NaN cannot serve as a class
# id, so fail right here with a message that names the offending values.
unmapped_rows = mapped_labels.isna()
if unmapped_rows.any():
    unknown_values = sorted(data.loc[unmapped_rows, 'labels'].astype(str).unique())
    raise ValueError(
        f"{int(unmapped_rows.sum())} row(s) have labels outside "
        f"{sorted(label_mapping)}: {unknown_values}"
    )
data['labels'] = mapped_labels.astype('int64')

# Convert the DataFrame to a Hugging Face Dataset.
dataset = Dataset.from_pandas(data)

# Train/test split (80/20). A fixed seed keeps the held-out set identical
# across runs so that evaluation numbers are comparable.
dataset_split = dataset.train_test_split(test_size=0.2, seed=42)
dataset = DatasetDict({
    'train': dataset_split['train'],
    'test': dataset_split['test']
})

# Load the RoBERTa tokenizer and the sequence-classification model.
# NOTE(review): the text column is named 'kor_sentence'; "roberta-base" is, to
# my knowledge, an English checkpoint — confirm it is the intended choice, a
# Korean or multilingual checkpoint may fit this data better.
model_name = "roberta-base"  # "roberta-large" can be used instead
tokenizer = RobertaTokenizer.from_pretrained(model_name)
# Size the classification head from label_mapping instead of a separate
# hard-coded 3, and store the id <-> name mappings in the model config so the
# two can never drift apart.
model = RobertaForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_mapping),
    id2label=reverse_label_mapping,
    label2id=label_mapping,
)


def tokenize_function(examples):
    """Tokenize the 'kor_sentence' field of a batch of examples.

    Every sequence is truncated and padded to exactly 128 tokens.

    Args:
        examples: A batch mapping that contains a 'kor_sentence' entry.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(examples['kor_sentence'], padding="max_length", truncation=True, max_length=128)


# Tokenize both splits and have them return torch tensors.
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets.set_format("torch")

# Trainer configuration.
def _accuracy_metric(eval_pred):
    """Compute accuracy from an evaluation prediction object.

    The predicted class of each row is the argmax over the last axis of
    ``eval_pred.predictions``; it is compared against ``eval_pred.label_ids``.
    """
    predicted_ids = eval_pred.predictions.argmax(axis=-1)
    return {'accuracy': accuracy_score(eval_pred.label_ids, predicted_ids)}


# NOTE(review): newer transformers releases appear to rename
# `evaluation_strategy` to `eval_strategy` — confirm against the installed
# version before upgrading.
training_config = dict(
    output_dir="./results",
    evaluation_strategy="epoch",  # evaluate once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)
training_args = TrainingArguments(**training_config)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=_accuracy_metric,
)

# Fine-tune the model on the training split.
trainer.train()

# Evaluate on the held-out test split and report the metrics.
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

# Prediction helper
def predict_review(sentence, model, tokenizer):
    """Predict the class id of a single sentence.

    Args:
        sentence: The text to classify.
        model: A torch module whose forward output exposes ``.logits``.
        tokenizer: A callable that turns ``sentence`` into a mapping of
            tensors accepted by ``model`` as keyword arguments.

    Returns:
        The index of the highest logit, as a Python int.

    Side effects:
        Moves ``model`` to CUDA when available (CPU otherwise) and leaves it
        in evaluation mode.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    # Inference must not depend on whatever mode the caller left the model in:
    # in training mode dropout stays active and predictions become random.
    model.eval()

    # Tokenize the input and move every tensor to the model's device.
    encoded = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=128)
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Forward pass without gradient tracking.
    with torch.no_grad():
        logits = model(**encoded).logits

    # .item() requires a single prediction, i.e. exactly one input sentence.
    return torch.argmax(logits, dim=1).item()

# Predict a class id for every row of the full data set, one sentence at a time.
predicted_ids = pd.Series(
    [predict_review(text, model, tokenizer) for text in data['kor_sentence']],
    index=data.index,
)
# Store the predictions as label strings.
data['predict_labels'] = predicted_ids.map(reverse_label_mapping)

# Accuracy of the predicted ids against the integer labels.
# NOTE(review): this covers every row of `data`, including the rows the model
# was trained on, so it is not a held-out estimate; the held-out figure is the
# one printed from trainer.evaluate().
accuracy = accuracy_score(data['labels'], predicted_ids)
print(f"Accuracy: {accuracy}")

# Save the data together with the predictions.
data.to_csv('finance_data_with_roberta_predictions.csv', index=False)


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3876 [00:00<?, ? examples/s]

Map:   0%|          | 0/970 [00:00<?, ? examples/s]



  0%|          | 0/729 [00:00<?, ?it/s]

  0%|          | 0/61 [00:00<?, ?it/s]

{'eval_loss': 0.8948796987533569, 'eval_accuracy': 0.5814432989690722, 'eval_runtime': 52.0183, 'eval_samples_per_second': 18.647, 'eval_steps_per_second': 1.173, 'epoch': 1.0}


  0%|          | 0/61 [00:00<?, ?it/s]

{'eval_loss': 0.8284378051757812, 'eval_accuracy': 0.6453608247422681, 'eval_runtime': 52.1748, 'eval_samples_per_second': 18.591, 'eval_steps_per_second': 1.169, 'epoch': 2.0}
{'loss': 0.884, 'grad_norm': 7.112246513366699, 'learning_rate': 6.282578875171468e-06, 'epoch': 2.06}


  0%|          | 0/61 [00:00<?, ?it/s]

{'eval_loss': 0.8056624531745911, 'eval_accuracy': 0.6628865979381443, 'eval_runtime': 52.142, 'eval_samples_per_second': 18.603, 'eval_steps_per_second': 1.17, 'epoch': 3.0}
{'train_runtime': 2674.4872, 'train_samples_per_second': 4.348, 'train_steps_per_second': 0.273, 'train_loss': 0.8587180557564943, 'epoch': 3.0}


  0%|          | 0/61 [00:00<?, ?it/s]

Evaluation results: {'eval_loss': 0.8056624531745911, 'eval_accuracy': 0.6628865979381443, 'eval_runtime': 52.1245, 'eval_samples_per_second': 18.609, 'eval_steps_per_second': 1.17, 'epoch': 3.0}
Accuracy: 0.6735451919108543
