In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the /content/drive/MyDrive/... paths used below resolve.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaForSequenceClassification, RobertaTokenizer
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import numpy as np

# Maps each integer class id (0-7) to its emotion name.
emotion_to_label = dict(enumerate([
    'anger',
    'disgust',
    'fear',
    'sadness',
    'contentment',
    'excitement',
    'awe',
    'amusement',
]))

In [None]:
# Pick the compute device: use the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [None]:
# Load the RoBERTa sequence classifier (one output per emotion label) and its tokenizer.
model = RobertaForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=len(emotion_to_label),
)
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Move the model to the selected device and switch it to evaluation mode.
model.to(device).eval()

# Path of the fine-tuned checkpoint file to load.
model_path = "/content/drive/MyDrive/앤트 공유 문서함/0-1.감정인식/Model/0ver2_SOTA_ED_model_comp_0.8973.pt"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# Load the saved state_dict.
# map_location remaps the stored tensors onto the device selected above, so a
# checkpoint written from a GPU session still loads when this notebook falls
# back to the CPU.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a
# trusted location (consider weights_only=True if the installed torch supports it).
state_dict = torch.load(model_path, map_location=device)

# Apply the state_dict to the model.
model.load_state_dict(state_dict)

<All keys matched successfully>

In [None]:
# Read the input CSV file into a pandas DataFrame.
test_data = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/앤트 공유 문서함/0-2-2.main/dataset/T5_T2_v2.csv",
)

In [None]:
# Dataset class definition
class EmotionDataset(Dataset):
    """Dataset that tokenizes the 'original_text' column one row at a time.

    Args:
        data: Anything accepted by ``pd.DataFrame``; must provide an
            'original_text' column.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')``.
        max_length: Length passed to the tokenizer for padding/truncation.
    """

    def __init__(self, data, tokenizer, max_length=128):
        self.data = pd.DataFrame(data)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        # Number of rows in the wrapped frame.
        return self.data.shape[0]

    def __getitem__(self, idx):
        """Return the tokenizer's 'input_ids' and 'attention_mask' for row ``idx``."""
        column = self.data['original_text']
        raw_value = column.iloc[idx]
        # Format the cell as text (non-string cells are formatted, not rejected).
        text = f"{raw_value}"

        tokenizer_kwargs = dict(
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer(text, **tokenizer_kwargs)

        # Keep only the two fields the model call needs, in this order.
        return {key: encoded[key] for key in ('input_ids', 'attention_mask')}

# Wrap the loaded frame for tokenization; max_length keeps the class default.
test_dataset = EmotionDataset(data=test_data, tokenizer=tokenizer)

In [None]:
import torch.nn.functional as F

# 3. Run prediction
def predict_label(inputs, verbose=False):
    """Predict the most probable emotion for a single tokenized example.

    Args:
        inputs: Mapping of model keyword arguments (e.g. 'input_ids' and
            'attention_mask'). Tensor values are copied to ``device``; the
            caller's mapping is left unmodified.
        verbose: When True, print the probability tensor and the returned
            values (the previous unconditional debug output). Defaults to
            False so looping over a dataset does not flood the cell output.

    Returns:
        Tuple ``(predicted_label, percentage)``: the emotion name looked up in
        ``emotion_to_label`` and its softmax probability as a Python float.

    Raises:
        ValueError: If the model output does not contain exactly one example,
            since a single label/probability pair is returned.
    """
    # Build a device-side copy rather than writing tensors back into `inputs`.
    model_inputs = {
        key: value.to(device) if torch.is_tensor(value) else value
        for key, value in inputs.items()
    }

    with torch.no_grad():
        logits = model(**model_inputs).logits

    if logits.shape[0] != 1:
        raise ValueError(
            f"predict_label expects exactly one example, got a batch of {logits.shape[0]}"
        )

    # Softmax over the class dimension, then take the highest-probability class.
    probabilities = F.softmax(logits, dim=1)
    probability, predicted_label_id = torch.max(probabilities, dim=1)
    percentage = probability.item()
    predicted_label = emotion_to_label[predicted_label_id.item()]

    if verbose:
        print(probabilities)
        print("Returned values:", predicted_label, percentage)

    return predicted_label, percentage

In [None]:
# Run the classifier on every example. Indexing by position avoids relying on
# __getitem__ raising IndexError to end iteration.
prediction_results = [predict_label(test_dataset[i]) for i in range(len(test_dataset))]

# Split the (label, probability) pairs into two parallel tuples; an empty
# dataset yields two empty tuples instead of an unpacking error.
predictions, percentages = zip(*prediction_results) if prediction_results else ((), ())

tensor([[5.3999e-02, 8.9731e-01, 6.5864e-03, 3.6002e-02, 1.5372e-03, 7.1077e-04,
         3.1082e-03, 7.4891e-04]], device='cuda:0')
Returned values: disgust 0.8973080515861511
tensor([[9.9874e-03, 9.8114e-01, 3.1636e-03, 2.9726e-03, 6.2785e-04, 3.7359e-04,
         1.3559e-03, 3.8108e-04]], device='cuda:0')
Returned values: disgust 0.9811378717422485
tensor([[7.5761e-01, 2.3120e-01, 4.3541e-03, 1.9592e-03, 1.4641e-03, 4.7841e-04,
         2.2495e-03, 6.9119e-04]], device='cuda:0')
Returned values: anger 0.7576056718826294
tensor([[0.5210, 0.4267, 0.0429, 0.0016, 0.0028, 0.0006, 0.0033, 0.0010]],
       device='cuda:0')
Returned values: anger 0.5209718346595764
tensor([[1.4691e-02, 6.0772e-02, 9.1805e-01, 1.1110e-03, 1.1046e-03, 5.7123e-04,
         2.8144e-03, 8.9074e-04]], device='cuda:0')
Returned values: fear 0.9180456399917603
tensor([[8.5224e-02, 9.0322e-01, 4.1831e-03, 3.5186e-03, 7.7925e-04, 3.4379e-04,
         1.9359e-03, 7.9180e-04]], device='cuda:0')
Returned values: disgus

In [None]:
# Show the predicted labels and their probabilities, one tuple per line.
print(predictions, percentages, sep='\n')

('disgust', 'disgust', 'anger', 'anger', 'fear', 'disgust', 'anger', 'disgust', 'anger', 'disgust', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'disgust', 'anger', 'anger', 'disgust', 'anger', 'anger', 'anger', 'anger', 'disgust', 'anger', 'disgust', 'anger', 'anger', 'anger', 'disgust', 'disgust', 'fear', 'anger', 'disgust', 'anger', 'anger', 'sadness', 'disgust', 'anger', 'disgust', 'disgust', 'anger', 'disgust', 'anger', 'disgust', 'anger', 'fear', 'anger', 'anger', 'anger', 'sadness', 'anger', 'disgust', 'anger', 'disgust', 'anger', 'fear', 'anger', 'anger', 'disgust', 'anger', 'anger', 'anger', 'disgust', 'anger', 'anger', 'anger', 'anger', 'fear', 'anger', 'disgust', 'anger', 'disgust', 'anger', 'disgust', 'anger', 'anger', 'disgust', 'anger', 'anger', 'disgust', 'fear', 'anger', 'anger', 'anger', 'anger', 'disgust', 'disgust', 'anger', 'disgust', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'disgust', 'anger', 'ange

In [None]:
# Attach the predicted emotion and its probability as new columns.
test_data['emotion'], test_data['percentage'] = predictions, percentages

확률에 따라 수식어 추가

In [None]:
# Emotion noun -> adjective used in the prompt text.
emotion_to_adj = {
    'anger':'angry', 'disgust':'disgusted', 'fear':'fearful', 'sadness':'sad', 'contentment':'content', 'excitement':'excited', 'awe':'awed', 'amusement':'amused'
    }

# (minimum probability, intensifier) pairs, checked from highest to lowest.
# Probabilities below the last threshold get no intensifier.
INTENSITY_PREFIXES = (
    (0.95, 'extremely'),
    (0.85, 'completely'),
    (0.8, 'really'),
    (0.75, 'very'),
)


def build_prompt(emotion, percentage, utterance):
    """Build the prompt text "[intensifier ]<adjective>, <utterance>".

    Args:
        emotion: Emotion name; must be a key of ``emotion_to_adj``.
        percentage: Predicted probability; converted with ``float``.
        utterance: Text appended after the comma.

    Returns:
        The prompt string.

    Raises:
        KeyError: If ``emotion`` has no adjective mapping, instead of silently
            writing "None" into the prompt.
    """
    percent = float(percentage)
    # First threshold the probability reaches wins; '' when none is reached.
    prefix = next(
        (word for threshold, word in INTENSITY_PREFIXES if percent >= threshold),
        '',
    )
    emo = emotion_to_adj[emotion]

    if prefix == '':
        return f"{emo}, {utterance}"
    return f"{prefix} {emo}, {utterance}"


# Build the whole column positionally in one assignment. Unlike row-by-row
# .at writes, this works with a non-unique index and still creates the
# 'prompt' column when the frame is empty.
test_data['prompt'] = [
    build_prompt(emotion, percentage, utterance)
    for emotion, percentage, utterance in zip(
        test_data['emotion'], test_data['percentage'], test_data['utterance']
    )
]

In [None]:
# Save the labelled frame (without the index column) to the shared Drive folder.
test_data.to_csv(
    path_or_buf='/content/drive/MyDrive/앤트 공유 문서함/1-1.감정라벨append/E2_확률수식어추가_v2.csv',
    index=False,
)