In [6]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

# 데이터 로딩
train_data = pd.read_csv('finance_data.csv')

# 라벨 맵핑
label_mapping = {'neutral': 0, 'positive': 1, 'negative': 2}
train_data['labels'] = train_data['labels'].map(label_mapping)

# Dataset으로 변환
train_dataset = Dataset.from_pandas(train_data)

# train/test 분리
train_test_data = train_dataset.train_test_split(test_size=0.2)
dataset = DatasetDict({
    'train': train_test_data['train'],
    'test': train_test_data['test']
})

# KoBERT 토크나이저 및 모델 로드
model_name = "bert-base-multilingual-cased"  # 다국적 언어 모델 사용
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=3)  # 3가지 감정 분류를 위한 모델 설정

# 토크나이징 함수
def tokenize_function(examples):
    return tokenizer(examples['kor_sentence'], padding="max_length", truncation=True, max_length=128)

# 데이터셋 토크나이징
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets.set_format("torch")

# 트레이너 설정
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

# 모델 학습
trainer.train()

# 모델 평가
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

# 예측 함수
def predict_review(sentence, model, tokenizer):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    
    # 입력 데이터 토크나이징
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=128)
    inputs = {key: val.to(device) for key, val in inputs.items()}
    
    # 모델을 통한 예측
    with torch.no_grad():
        outputs = model(**inputs)
    
    # 예측 결과
    prediction = torch.argmax(outputs.logits, dim=1).item()
    
    if prediction == 0:
        sentiment = "neutral"
    elif prediction == 1:
        sentiment = "positive"
    else:
        sentiment = "negative"
    
    return sentiment

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3876 [00:00<?, ? examples/s]

Map:   0%|          | 0/970 [00:00<?, ? examples/s]



  0%|          | 0/729 [00:00<?, ?it/s]

  0%|          | 0/61 [00:00<?, ?it/s]

{'eval_loss': 0.6523424983024597, 'eval_runtime': 48.3975, 'eval_samples_per_second': 20.042, 'eval_steps_per_second': 1.26, 'epoch': 1.0}


  0%|          | 0/61 [00:00<?, ?it/s]

{'eval_loss': 0.5333667397499084, 'eval_runtime': 48.4557, 'eval_samples_per_second': 20.018, 'eval_steps_per_second': 1.259, 'epoch': 2.0}
{'loss': 0.5505, 'grad_norm': 12.788064002990723, 'learning_rate': 6.282578875171468e-06, 'epoch': 2.06}


  0%|          | 0/61 [00:00<?, ?it/s]

{'eval_loss': 0.584376871585846, 'eval_runtime': 48.4763, 'eval_samples_per_second': 20.01, 'eval_steps_per_second': 1.258, 'epoch': 3.0}
{'train_runtime': 3401.8498, 'train_samples_per_second': 3.418, 'train_steps_per_second': 0.214, 'train_loss': 0.4761166984652295, 'epoch': 3.0}


  0%|          | 0/61 [00:00<?, ?it/s]

Evaluation results: {'eval_loss': 0.584376871585846, 'eval_runtime': 48.5923, 'eval_samples_per_second': 19.962, 'eval_steps_per_second': 1.255, 'epoch': 3.0}


In [7]:
# 예측 테스트
test_sentence = "This company reported great earnings this quarter."
predicted_sentiment = predict_review(test_sentence, model, tokenizer)
print(f"Predicted sentiment: {predicted_sentiment}")  

Predicted sentiment: positive


In [11]:
# 예측 테스트
test_sentence = "이 회사 별로야."
predicted_sentiment = predict_review(test_sentence, model, tokenizer)
print(f"Predicted sentiment: {predicted_sentiment}") 

Predicted sentiment: neutral


In [12]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score
from datasets import Dataset, DatasetDict

# 데이터 로딩
train_data = pd.read_csv('finance_data.csv')

# 라벨 맵핑
label_mapping = {'neutral': 0, 'positive': 1, 'negative': 2}
reverse_label_mapping = {v: k for k, v in label_mapping.items()}
train_data['labels'] = train_data['labels'].map(label_mapping)

# Dataset으로 변환
train_dataset = Dataset.from_pandas(train_data)

# train/test 분리
train_test_data = train_dataset.train_test_split(test_size=0.2)
dataset = DatasetDict({
    'train': train_test_data['train'],
    'test': train_test_data['test']
})

# KoBERT 토크나이저 및 모델 로드
model_name = "bert-base-multilingual-cased"  # 다국적 언어 모델 사용
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=3)  # 3가지 감정 분류를 위한 모델 설정

# 토크나이징 함수
def tokenize_function(examples):
    return tokenizer(examples['kor_sentence'], padding="max_length", truncation=True, max_length=128)

# 데이터셋 토크나이징
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets.set_format("torch")

# 트레이너 설정
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

# 모델 학습
trainer.train()

# 모델 평가
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

# 예측 함수
def predict_review(sentence, model, tokenizer):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    
    # 입력 데이터 토크나이징
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=128)
    inputs = {key: val.to(device) for key, val in inputs.items()}
    
    # 모델을 통한 예측
    with torch.no_grad():
        outputs = model(**inputs)
    
    # 예측 결과
    prediction = torch.argmax(outputs.logits, dim=1).item()
    
    return prediction

# 전체 데이터에 대한 예측 수행
train_data['predict_labels'] = train_data['kor_sentence'].apply(lambda x: predict_review(x, model, tokenizer))

# 라벨 역매핑
train_data['predict_labels'] = train_data['predict_labels'].map(reverse_label_mapping)

# 정확도 계산
accuracy = accuracy_score(train_data['labels'], train_data['predict_labels'].map(label_mapping))
print(f"Accuracy: {accuracy}")

# 결과 저장
train_data.to_csv('finance_data_with_predictions.csv', index=False)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3876 [00:00<?, ? examples/s]

Map:   0%|          | 0/970 [00:00<?, ? examples/s]



  0%|          | 0/729 [00:00<?, ?it/s]

  0%|          | 0/61 [00:00<?, ?it/s]

{'eval_loss': 0.6523424983024597, 'eval_runtime': 102.6965, 'eval_samples_per_second': 9.445, 'eval_steps_per_second': 0.594, 'epoch': 1.0}


  0%|          | 0/61 [00:00<?, ?it/s]

{'eval_loss': 0.5333667397499084, 'eval_runtime': 107.0276, 'eval_samples_per_second': 9.063, 'eval_steps_per_second': 0.57, 'epoch': 2.0}
{'loss': 0.5505, 'grad_norm': 12.788064002990723, 'learning_rate': 6.282578875171468e-06, 'epoch': 2.06}


  0%|          | 0/61 [00:00<?, ?it/s]

{'eval_loss': 0.584376871585846, 'eval_runtime': 106.1787, 'eval_samples_per_second': 9.136, 'eval_steps_per_second': 0.575, 'epoch': 3.0}
{'train_runtime': 2923.621, 'train_samples_per_second': 3.977, 'train_steps_per_second': 0.249, 'train_loss': 0.4761166984652295, 'epoch': 3.0}


  0%|          | 0/61 [00:00<?, ?it/s]

Evaluation results: {'eval_loss': 0.584376871585846, 'eval_runtime': 106.731, 'eval_samples_per_second': 9.088, 'eval_steps_per_second': 0.572, 'epoch': 3.0}
Accuracy: 0.9011555922410235
