In [None]:
# Mount Google Drive at /content/drive; the checkpoint and CSV paths used
# later in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaForSequenceClassification, RobertaTokenizer
import pandas as pd
import numpy as np

# Lookup table from the classifier's predicted class index to the emotion name.
# (Despite the name, the keys are integer ids and the values are emotion strings.)
emotion_to_label = dict(enumerate((
    'anger',
    'disgust',
    'fear',
    'sadness',
    'contentment',
    'excitement',
    'awe',
    'amusement',
)))

In [None]:
# Check whether CUDA is available and fall back to the CPU otherwise
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [None]:
# Load the RoBERTa sequence classifier (one output per emotion) and its tokenizer
model = RobertaForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=len(emotion_to_label),
)
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Move the model to the selected device and switch it to evaluation mode
model = model.to(device).eval()

# Path of the model file whose weights are loaded in the next step
model_path = "/content/drive/MyDrive/앤트 공유 문서함/0-1.감정인식/Model/0ver2_SOTA_ED_model_comp_0.8973.pt"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# Load the saved state_dict of the model.
# map_location=device remaps the stored tensors onto the device chosen earlier,
# so a checkpoint whose tensors were saved from a CUDA device still loads when
# this notebook is running on the CPU fallback.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a
# trusted source.
state_dict = torch.load(model_path, map_location=device)

# Apply the state_dict to the model (kept as the last expression so the
# key-matching result is displayed as the cell output)
model.load_state_dict(state_dict)

<All keys matched successfully>

In [None]:
# Read the CSV file into a pandas DataFrame
test_data = pd.read_csv("/content/drive/MyDrive/앤트 공유 문서함/0-2-2.main/dataset/T5_T2_v2.csv")

In [None]:
# Dataset class definition
class EmotionDataset(Dataset):
    """Map-style dataset that tokenizes the ``original_text`` column row by row.

    Args:
        data: Anything ``pd.DataFrame`` accepts; it must provide an
            ``original_text`` column.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')``.
        max_length: Length passed to the tokenizer for padding/truncation.
    """

    def __init__(self, data, tokenizer, max_length=128):
        self.max_length = max_length
        self.tokenizer = tokenizer
        self.data = pd.DataFrame(data)

    def __len__(self):
        """Return the number of rows in the wrapped table."""
        row_count = len(self.data)
        return row_count

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and return its ``input_ids`` and ``attention_mask``.

        The cell value is formatted into a string first, so non-string values
        are tokenized by their text form.
        """
        text = f"{self.data['original_text'].iloc[idx]}"

        # NOTE(review): with return_tensors='pt' a HuggingFace tokenizer presumably
        # returns tensors with a leading batch axis of size 1 - confirm before
        # feeding this dataset to a DataLoader.
        encoded = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        return {key: encoded[key] for key in ('input_ids', 'attention_mask')}

# Wrap the loaded table so each row is tokenized on access (default max_length of 128)
test_dataset = EmotionDataset(test_data, tokenizer)

In [None]:
# 3. Run prediction
def predict_label(inputs):
    """Return the emotion name the model predicts for one tokenized example.

    Args:
        inputs: Mapping of model keyword arguments (e.g. ``input_ids`` and
            ``attention_mask``) to tensors. It must hold exactly one example,
            because the predicted index is extracted with ``.item()``, which
            only converts single-element tensors.

    Returns:
        The ``emotion_to_label`` entry for the highest-scoring logit.

    Uses the module-level ``model``, ``device`` and ``emotion_to_label``.
    """
    # Build a fresh dict instead of overwriting entries of the caller's mapping,
    # and move every tensor value (not just two hard-coded keys) to the device.
    batch = {
        name: value.to(device) if torch.is_tensor(value) else value
        for name, value in inputs.items()
    }
    with torch.no_grad():
        outputs = model(**batch)
    predicted_label_id = outputs.logits.argmax(dim=1).item()
    return emotion_to_label[predicted_label_id]

In [None]:
# Classify every row one at a time, in dataset order
predictions = [
    predict_label(test_dataset[row_idx])
    for row_idx in range(len(test_dataset))
]

In [None]:
# Print the full list of predicted emotion names (one entry per dataset row)
print(predictions)

['disgust', 'disgust', 'anger', 'anger', 'fear', 'disgust', 'anger', 'disgust', 'anger', 'disgust', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'disgust', 'anger', 'anger', 'disgust', 'anger', 'anger', 'anger', 'anger', 'disgust', 'anger', 'disgust', 'anger', 'anger', 'anger', 'disgust', 'disgust', 'fear', 'anger', 'disgust', 'anger', 'anger', 'sadness', 'disgust', 'anger', 'disgust', 'disgust', 'anger', 'disgust', 'anger', 'disgust', 'anger', 'fear', 'anger', 'anger', 'anger', 'sadness', 'anger', 'disgust', 'anger', 'disgust', 'anger', 'fear', 'anger', 'anger', 'disgust', 'anger', 'anger', 'anger', 'disgust', 'anger', 'anger', 'anger', 'anger', 'fear', 'anger', 'disgust', 'anger', 'disgust', 'anger', 'disgust', 'anger', 'anger', 'disgust', 'anger', 'anger', 'disgust', 'fear', 'anger', 'anger', 'anger', 'anger', 'disgust', 'disgust', 'anger', 'disgust', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'disgust', 'anger', 'ange

In [None]:
# Store each row's predicted emotion name in the 'emotion' column
test_data['emotion'] = predictions

In [None]:
# Build the prompt as "<utterance>, <emotion>".
# (The reverse ordering, emotion + ', ' + utterance, was tried earlier and dropped.)
# NOTE(review): the emotion was predicted from 'original_text' while the prompt
# uses 'utterance' - confirm the CSV provides both columns and that this is intended.
test_data['prompt'] = test_data['utterance'] + ', ' + test_data['emotion']

In [None]:
# Save the table, including the 'emotion' and 'prompt' columns added above,
# to Drive; index=False leaves the DataFrame index out of the file.
test_data.to_csv('/content/drive/MyDrive/앤트 공유 문서함/1-1.감정라벨append/E1_감정추가_v2.csv', index=False)

In [None]:
# Accuracy check (disabled; kept for reference)
# from sklearn.metrics import accuracy_score, classification_report

# test_data2 = pd.read_csv("/content/drive/MyDrive/앤트 공유 문서함/main/dataset/main_data.csv")
# test_labels = test_data2['emotion']
# test_accuracy = accuracy_score(test_labels, predictions)

# # Print the test results
# print(f'Test Accuracy: {test_accuracy:.4f}')
# # Print the classification report for the entire test set
# print('\nClassification Report for the Entire Test Set:')
# report = classification_report(test_labels, predictions, target_names=emotion_to_label.values())
# print(report)

# # Compute the test accuracy per emotion
# emotion_accuracy = {}
# emotion_classification_reports = {}

# for emotion, label in emotion_to_label.items():
#     emotion_indices = [i for i, l in enumerate(test_labels) if l == label]
#     emotion_labels = [test_labels[i] for i in emotion_indices]
#     emotion_predictions = [predictions[i] for i in emotion_indices]
#     emotion_accuracy[emotion] = accuracy_score(emotion_labels, emotion_predictions)

# # Print the test accuracy for each emotion
# for emotion, acc in emotion_accuracy.items():
#     print(f'Test Accuracy ({emotion}): {acc:.4f}')
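# # Print the classification report per emotion
# # (note: emotion_classification_reports is never filled above, so this loop prints nothing as written)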
# for emotion, report in emotion_classification_reports.items():
#     print(f'\nClassification Report for {emotion}:\n{report}')