In [1]:
pip install datasets

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
pip install transformers

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
pip install transformers[torch]

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
pip install accelerate -U

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
import numpy as np
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

# 데이터 로딩
train_data = pd.read_csv('C:/Users/PKNU/Desktop/Sentiment Analysis/finance_data.csv')

# 라벨 맵핑
label_mapping = {'neutral': 0, 'positive': 1, 'negative': 2}
train_data['labels'] = train_data['labels'].map(label_mapping)

# 학습 데이터와 테스트 데이터 분리
train_df, test_df = train_test_split(train_data, test_size=0.1, random_state=42)

# Dataset으로 변환
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)
dataset = DatasetDict({"train": train_dataset, "test": test_dataset})

# KoBERT 토크나이저 및 모델 로드
model_name = "monologg/kobert"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=3)

# 토크나이징 함수
def tokenize_function(examples):
    return tokenizer(examples['kor_sentence'], padding="max_length", truncation=True, max_length=128)

# 데이터셋 토크나이징
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['kor_sentence'])
tokenized_datasets.set_format("torch")

# 트레이너 설정
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

# 모델 학습
trainer.train()

# 모델 평가
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

# 예측 함수
def predict_review(sentence, model, tokenizer):
    # 모델을 GPU로 이동 (만약 GPU가 있다면)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    
    # 입력 데이터 토크나이징
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=128)
    
    # 입력 데이터도 GPU로 이동
    inputs = {key: val.to(device) for key, val in inputs.items()}
    
    # 모델을 통한 예측
    with torch.no_grad():
        outputs = model(**inputs)
    
    # 예측 결과
    prediction = torch.argmax(outputs.logits, dim=1).item()

    if prediction == 0:
        sentiment = "중립"
    elif prediction == 1:
        sentiment = "긍정"
    else:
        sentiment = "부정"
    
    return sentiment

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/4361 [00:00<?, ? examples/s]

Map:   0%|          | 0/485 [00:00<?, ? examples/s]



  0%|          | 0/819 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.8280672430992126, 'eval_runtime': 10.5379, 'eval_samples_per_second': 46.025, 'eval_steps_per_second': 2.942, 'epoch': 1.0}
{'loss': 0.8655, 'grad_norm': 2.9435789585113525, 'learning_rate': 7.78998778998779e-06, 'epoch': 1.83}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.7882125973701477, 'eval_runtime': 10.5396, 'eval_samples_per_second': 46.017, 'eval_steps_per_second': 2.941, 'epoch': 2.0}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.7674531936645508, 'eval_runtime': 10.5556, 'eval_samples_per_second': 45.947, 'eval_steps_per_second': 2.937, 'epoch': 3.0}
{'train_runtime': 2435.3813, 'train_samples_per_second': 5.372, 'train_steps_per_second': 0.336, 'train_loss': 0.8430338243570665, 'epoch': 3.0}


  0%|          | 0/31 [00:00<?, ?it/s]

Evaluation results: {'eval_loss': 0.7674531936645508, 'eval_runtime': 10.5827, 'eval_samples_per_second': 45.83, 'eval_steps_per_second': 2.929, 'epoch': 3.0}
