# **감정 분석**

In [None]:
!pip install gluonnlp pandas tqdm
!pip install mxnet
!pip install sentencepiece

!pip install transformers
!pip install torch
!pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'

Collecting gluonnlp
  Using cached gluonnlp-0.10.0-cp310-cp310-linux_x86_64.whl
Installing collected packages: gluonnlp
Successfully installed gluonnlp-0.10.0
Collecting mxnet
  Using cached mxnet-1.9.1-py3-none-manylinux2014_x86_64.whl (49.1 MB)
Collecting graphviz<0.9.0,>=0.8.1 (from mxnet)
  Downloading graphviz-0.8.4-py2.py3-none-any.whl (16 kB)
Installing collected packages: graphviz, mxnet
  Attempting uninstall: graphviz
    Found existing installation: graphviz 0.20.3
    Uninstalling graphviz-0.20.3:
      Successfully uninstalled graphviz-0.20.3
Successfully installed graphviz-0.8.4 mxnet-1.9.1
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-

In [None]:
!pip install "numpy<1.24" # import gluonnlp as nlp 오류 해결 위함



In [None]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook
import pandas as pd
from sklearn.model_selection import train_test_split
from kobert_tokenizer import KoBERTTokenizer
from transformers import BertModel,AutoConfig
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Show which device was selected.
print(device)

cpu


## **하이퍼파라미터 설정**

In [None]:
# Maximum sequence length, forwarded to BERTSentenceTransform as max_seq_length.
max_len = 100
# Number of samples per DataLoader batch.
batch_size = 32
# Fraction of the total number of training steps used for learning-rate warmup.
warmup_ratio = 0.1
# Number of passes over the training data.
num_epochs = 13
# Maximum gradient norm passed to clip_grad_norm_.
max_grad_norm = 1
# Training progress is printed every `log_interval` batches.
log_interval = 200
# Learning rate handed to the AdamW optimizer (3e-5 is noted as an alternative).
learning_rate =  5e-5 # 3e-5

In [None]:
from tqdm import tqdm, tqdm_notebook
from glob import glob
import gc
import os

In [None]:
class BERTDataset(Dataset):
    """Dataset pairing BERT-transformed sentences with integer labels.

    Args:
        dataset: Sequence of records; each record is indexed with
            ``sent_idx`` and ``label_idx``.
        sent_idx: Position of the sentence inside a record.
        label_idx: Position of the label inside a record.
        bert_tokenizer: Tokenizer forwarded to ``BERTSentenceTransform``.
        vocab: Vocabulary forwarded to ``BERTSentenceTransform``.
        max_len: Forwarded as ``max_seq_length``.
        pad: Forwarded as ``pad``.
        pair: Forwarded as ``pair``.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, vocab, max_len, pad, pair):
        # Transform that turns one sentence into the model's input features.
        to_features = nlp.data.BERTSentenceTransform(
            bert_tokenizer,
            max_seq_length=max_len,
            vocab=vocab,
            pad=pad,
            pair=pair,
        )

        # First pass: transform every sentence.
        self.sentences = []
        for record in dataset:
            self.sentences.append(to_features([record[sent_idx]]))

        # Second pass: convert every label to a 32-bit integer.
        self.labels = []
        for record in dataset:
            self.labels.append(np.int32(record[label_idx]))

    def __getitem__(self, i):
        """Return the features of sample ``i`` with its label appended."""
        features = self.sentences[i]
        target = self.labels[i]
        return features + (target,)

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)


## **BERT Classifier**

In [None]:
# Fetch the configuration of the pretrained 'skt/kobert-base-v1' checkpoint and display it.
config=AutoConfig.from_pretrained('skt/kobert-base-v1')
config

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/535 [00:00<?, ?B/s]

BertConfig {
  "_name_or_path": "skt/kobert-base-v1",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "author": "Heewon Jeon(madjakarta@gmail.com)",
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "kobert_version": 1.0,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.41.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 8002
}

In [None]:
class BERTClassifier(nn.Module):
    """Emotion classifier: a BERT encoder followed by a small feed-forward head.

    The pooled BERT output goes through dropout, layer normalisation and a
    two-layer linear classifier that produces ``num_classes`` logits.

    Args:
        bert: Encoder called with ``input_ids``, ``token_type_ids`` and
            ``attention_mask``; must return a 2-tuple whose second element is
            the pooled output of width ``hidden_size``.
        hidden_size: Width of the pooled encoder output.
        num_classes: Number of output classes.
        dr_rate: Dropout probability applied to the pooled output. ``None``
            disables this dropout.
        params: Unused; kept for interface compatibility.
    """

    def __init__(self,
                 bert,
                 hidden_size=768,
                 num_classes=7,
                 dr_rate=None,
                 params=None):
        super().__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        # Classification head: fixed dropout followed by two linear layers.
        self.classifier = nn.Sequential(
            nn.Dropout(p=0.2),
            nn.Linear(in_features=hidden_size, out_features=512),
            nn.Linear(in_features=512, out_features=num_classes),
        )

        # Normalise over the encoder width (follows `hidden_size`, not a fixed 768).
        self.layer_norm = nn.LayerNorm(hidden_size)

        # nn.Dropout rejects p=None, so the default dr_rate maps to a pass-through layer.
        self.dropout = nn.Dropout(p=dr_rate) if dr_rate is not None else nn.Identity()

    def gen_attention_mask(self, token_ids, valid_length):
        """Build a float mask with ones on the first ``valid_length[i]`` positions of row ``i``.

        Args:
            token_ids: 2-D tensor of token ids (rows are sequences).
            valid_length: Number of real (non-padding) tokens for each row.

        Returns:
            Float tensor shaped like ``token_ids``, on the same device.
        """
        positions = torch.arange(token_ids.size(1), device=token_ids.device)
        lengths = torch.as_tensor(valid_length, device=token_ids.device).reshape(-1, 1)
        # Broadcasting compares every position index against its row's valid length.
        return (positions.unsqueeze(0) < lengths).float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return the class logits for a batch.

        Args:
            token_ids: Token-id tensor fed to the encoder as ``input_ids``.
            valid_length: Per-row count of non-padding tokens.
            segment_ids: Segment ids fed to the encoder as ``token_type_ids``.
        """
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        # Only the pooled output (second tuple element) is used.
        _, pooler = self.bert(
            input_ids=token_ids,
            token_type_ids=segment_ids.long(),
            attention_mask=attention_mask,
        )

        pooled_output = self.dropout(pooler)
        normalized_output = self.layer_norm(pooled_output)
        return self.classifier(normalized_output)


## **Tokenizer & Model**

In [None]:
# Load the KoBERT tokenizer from the 'skt/kobert-base-v1' checkpoint.
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
# return_dict=False because BERTClassifier.forward unpacks the encoder output as a tuple.
bertmodel = BertModel.from_pretrained('skt/kobert-base-v1', return_dict=False)
# Build the gluonnlp vocabulary from the tokenizer's SentencePiece file, with '[PAD]' as padding token.
vocab = nlp.vocab.BERTVocab.from_sentencepiece(tokenizer.vocab_file, padding_token='[PAD]')
# Tokenizing callable handed to BERTDataset.
tok = tokenizer.tokenize

tokenizer_config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/371k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/244 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.


pytorch_model.bin:   0%|          | 0.00/369M [00:00<?, ?B/s]

In [None]:
# Model structure: instantiate the classifier so the notebook displays its layers.
BERTClassifier(bertmodel, dr_rate=0.5)

BERTClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(8002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise

## **데이터 전처리**
- [데이터1] 감성 대화 말뭉치: https://aihub.or.kr/aihubdata/data/view.do?currMenu=115&topMenu=100&aihubDataSe=realm&dataSetSn=86
- [데이터2] 감정 분류를 위한 대화 음성 데이터셋: https://www.aihub.or.kr/aihubdata/data/view.do?currMenu=115&topMenu=100&dataSetSn=263

In [None]:
# Read the three cp949-encoded CSV files from Google Drive.
data=pd.read_csv('/content/drive/MyDrive/졸프/4차년도.csv',encoding='cp949')
data1=pd.read_csv('/content/drive/MyDrive/졸프/5차년도.csv', encoding='cp949')
data2=pd.read_csv('/content/drive/MyDrive/졸프/5차년도_2차.csv',encoding='cp949')
# Preview the first rows of data2.
data2.head()

Unnamed: 0,wav_id,발화문,상황,1번 감정,1번 감정세기,2번 감정,2번 감정세기,3번 감정,3번 감정세기,4번 감정,4번감정세기,5번 감정,5번 감정세기,나이,성별
0,5f4141e29dd513131eacee2f,헐! 나 이벤트에 당첨 됐어.,happiness,angry,2,surprise,2,happiness,2,happiness,2,happiness,2,48,female
1,5f4141f59dd513131eacee30,내가 좋아하는 인플루언서가 이벤트를 하더라고. 그래서 그냥 신청 한번 해봤지.,happiness,neutral,0,happiness,2,happiness,2,happiness,2,happiness,2,48,female
2,5f4142119dd513131eacee31,"한 명 뽑는 거였는데, 그게 바로 내가 된 거야.",happiness,angry,2,happiness,2,happiness,2,happiness,2,happiness,2,48,female
3,5f4142279dd513131eacee32,"당연히 마음에 드는 선물이니깐, 이벤트에 내가 신청 한번 해본 거지. 비싼 거야. ...",happiness,angry,2,happiness,2,happiness,2,happiness,2,happiness,1,48,female
4,5f3c9ed98a3c1005aa97c4bd,에피타이저 정말 좋아해. 그 것도 괜찮은 생각인 것 같애.,neutral,happiness,2,happiness,1,happiness,2,happiness,1,happiness,1,48,female


In [None]:
# Concatenate the three data frames into one.
df=pd.concat([data,data2,data1])
# Drop every row that contains a missing value.
df.dropna(inplace=True)
# Preview the first rows of the merged frame.
df.head()

Unnamed: 0,wav_id,발화문,상황,1번 감정,1번 감정세기,2번 감정,2번 감정세기,3번 감정,3번 감정세기,4번 감정,4번감정세기,5번 감정,5번 감정세기,나이,성별
0,5e258fd1305bcf3ad153a6a4,"어, 청소 니가 대신 해 줘!",anger,Neutral,0,Angry,1,Neutral,0,Neutral,0,Angry,1,27,male
1,5e258fe2305bcf3ad153a6a5,둘 다 청소 하기 싫어. 귀찮아.,anger,Neutral,0,Angry,1,Neutral,0,Neutral,0,Angry,1,27,male
2,5e258ff5305bcf3ad153a6a6,둘 다 하기 싫어서 화내.,anger,Angry,1,Angry,1,Neutral,0,Angry,1,Angry,1,27,male
3,5e25902f305bcf3ad153a6a9,그럼 방세는 어떡해.,anger,Sadness,1,Sadness,1,Sadness,1,Sadness,1,Sadness,1,27,male
4,5e27f90b5807b852d9e0157b,권태긴줄 알았는데 다른 사람이 생겼나보더라고.,sad,Sadness,1,Sadness,1,Sadness,1,Sadness,2,Sadness,1,32,male


In [None]:
# Integer class id for every emotion name that appears in the '상황' column.
# Spelling variants ('angry'/'anger', 'sadness'/'sad') share one id.
emotion_to_id = {
    "fear": 0,
    "surprise": 1,
    "angry": 2,
    "anger": 2,
    "sadness": 3,
    "sad": 3,
    "neutral": 4,
    "happiness": 5,
    "disgust": 6,
}

# Keep only the emotion ('상황') and utterance ('발화문') columns.
df = df.loc[:, ['상황', '발화문']].copy()

# Rename the columns.
df.columns = ['label', 'data']

# Fail with a clear message if a name has no class id (int() would fail on it anyway).
unknown_labels = set(df['label'].unique()) - set(emotion_to_id)
if unknown_labels:
    raise ValueError(
        "Unmapped emotion labels in '상황': {}".format(sorted(map(str, unknown_labels))))

# Encode the names as integers. The result must be assigned back: astype() returns a
# new Series and leaves the column untouched otherwise.
df['label'] = df['label'].map(emotion_to_id).astype(int)

# Show the class distribution.
df['label'].value_counts()

label
3    14000
2    11635
6     4660
5     4548
0     4131
4     3262
1     1755
Name: count, dtype: int64

In [None]:
import pandas as pd

# Load the training and validation CSV files of the second corpus.
data4 = pd.read_csv('/content/drive/MyDrive/졸프/감성대화말뭉치_감정만_Training.csv')
data5 = pd.read_csv('/content/drive/MyDrive/졸프/감성대화말뭉치_감정만_Validation.csv')

# Only these three emotions are taken from the second corpus; the values are the
# class ids already used for `df` (불안 -> 0, 당황 -> 1, 기쁨 -> 5).
EMOTION_CORPUS_LABEL_IDS = {'불안': 0, '당황': 1, '기쁨': 5}


def _encode_emotion_corpus(frame):
    """Return the (label, data) rows of `frame` for the three selected emotions.

    Keeps the '감정_대분류' and '사람문장1' columns, drops rows with missing values,
    renames the columns to 'label' / 'data', discards every other emotion and
    encodes the remaining labels as integers.
    """
    selected = frame.loc[:, ['감정_대분류', '사람문장1']].dropna()
    selected = selected.set_axis(['label', 'data'], axis=1)
    kept = selected[selected['label'].isin(list(EMOTION_CORPUS_LABEL_IDS))]
    kept = kept.assign(label=kept['label'].map(EMOTION_CORPUS_LABEL_IDS).astype(int))
    # Same row order as before: class 5 first, then class 1, then class 0.
    return pd.concat([kept[kept['label'] == class_id] for class_id in (5, 1, 0)])


data4 = _encode_emotion_corpus(data4)
data5 = _encode_emotion_corpus(data5)

# Append the second corpus to the frame built in the previous cells. Concatenating
# only data4 and data5 would discard `df`, and with it classes 2, 3, 4 and 6.
df = pd.concat([df, data4, data5])

# Drop rows with missing values.
df.dropna(inplace=True)

# Use one integer dtype for the labels of the merged frame.
df['label'] = df['label'].astype(int)

# Inspect the result.
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 71567 entries, 0 to 6622
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   71567 non-null  object
 1   data    71567 non-null  object
dtypes: object(2)
memory usage: 1.6+ MB


In [None]:
df.head()

Unnamed: 0,label,data
0,2,"어, 청소 니가 대신 해 줘!"
1,2,둘 다 청소 하기 싫어. 귀찮아.
2,2,둘 다 하기 싫어서 화내.
3,2,그럼 방세는 어떡해.
4,3,권태긴줄 알았는데 다른 사람이 생겼나보더라고.


In [None]:
df['label'].value_counts()

label
0    14564
3    14000
5    11887
2    11635
1    11559
6     4660
4     3262
Name: count, dtype: int64

## **Train, Test 데이터 분리**

In [None]:
# Pair every utterance with its label; the label travels as a string and is converted
# back to an integer (np.int32) inside BERTDataset.
samples = [[text, str(label)] for text, label in zip(df['data'], df['label'])]

# Hold out 20% of the samples for testing (fixed seed for a reproducible split).
train_records, test_records = train_test_split(samples, test_size=0.2, random_state=4)
train_set_data = BERTDataset(train_records, 0, 1, tok, vocab, max_len, True, False)
test_set_data = BERTDataset(test_records, 0, 1, tok, vocab, max_len, True, False)

# Reshuffle the training samples every epoch so batches differ between epochs;
# the test loader keeps a fixed order.
train_dataloader = torch.utils.data.DataLoader(
    train_set_data, batch_size=batch_size, num_workers=2, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(
    test_set_data, batch_size=batch_size, num_workers=2)


## **모델 학습**

In [None]:
from sklearn.metrics import f1_score
import torch
import numpy as np

# Score one batch of predictions. Despite the name, the value is a macro F1 score.
def calc_accuracy(X, Y):
    """Return the macro-averaged F1 score of the arg-max predictions in X against Y.

    Args:
        X: 2-D tensor of per-class scores; the prediction is the arg-max along dim 1.
        Y: Tensor of true class ids.
    """
    predicted = torch.max(X, 1).indices
    y_true = Y.data.cpu().numpy()
    y_pred = predicted.data.cpu().numpy()
    # Macro averaging gives every class the same weight regardless of its size.
    return f1_score(y_true, y_pred, average='macro')

# Classify one sentence and print the predicted emotion.
def predict(sentence):
    """Print the emotion predicted for `sentence`, prefixed with '>>'.

    Uses the notebook-level `model`, `tok`, `vocab`, `max_len`, `batch_size`
    and `device`.

    Args:
        sentence: Text to classify.
    """
    # Emotion names indexed by class id (0..6).
    emotion_names = ("불안", "당황", "분노", "슬픔", "일상", "행복", "혐오")

    # Wrap the sentence in the dataset format; '0' is a placeholder label.
    dataset = [[sentence, '0']]
    test = BERTDataset(dataset, 0, 1, tok, vocab, max_len, True, False)

    # A single sentence does not need worker processes.
    loader = torch.utils.data.DataLoader(test, batch_size=batch_size, num_workers=0)

    model.eval()
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        for token_ids, valid_length, segment_ids, _ in loader:
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)

            out = model(token_ids, valid_length, segment_ids)

            # Translate each row's arg-max class id into its emotion name.
            test_eval = [emotion_names[class_id] for class_id in out.argmax(dim=1).tolist()]

            # Print the emotion predicted for the first row of the batch.
            print(">>" + test_eval[0])


In [None]:
# `tqdm.tqdm_notebook` is deprecated (it emits a warning on every call); use its replacement.
from tqdm.notebook import tqdm as notebook_tqdm

model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)

# Parameters whose name contains one of these fragments get no weight decay: biases and
# LayerNorm weights, both BERT's ('LayerNorm.weight') and the classifier head's
# ('layer_norm.weight').
no_decay = ['bias', 'LayerNorm.weight', 'layer_norm.weight']
optimizer_grouped_parameters = [
    # Parameters that receive weight decay.
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    # Parameters excluded from weight decay.
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)

# Cross-entropy loss over the class logits.
loss_fn = nn.CrossEntropyLoss()

# Total number of optimizer steps, and how many of them are warmup steps.
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

# Cosine learning-rate schedule with warmup.
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

for e in range(num_epochs):
    # Running sums of the per-batch macro F1 scores returned by calc_accuracy.
    train_acc = 0.0
    test_acc = 0.0

    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(notebook_tqdm(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        # Clip the gradient norm to guard against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # the schedule advances once per batch
        train_acc += calc_accuracy(out, label)
        # Report progress every `log_interval` batches.
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train f1score {}".format(e+1, batch_id+1, loss.item(), train_acc / (batch_id+1)))
    # Mean of the per-batch scores over the epoch.
    print("epoch {} train f1score {}".format(e+1, train_acc / (batch_id+1)))

    model.eval()
    # Evaluation needs no gradients; no_grad avoids building the autograd graph.
    with torch.no_grad():
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(notebook_tqdm(test_dataloader)):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            test_acc += calc_accuracy(out, label)
    print("epoch {} test f1score {}".format(e+1, test_acc / (batch_id+1)))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/1790 [00:00<?, ?it/s]

epoch 1 batch id 1 loss 2.0029711723327637 train f1score 0.04329004329004329
epoch 1 batch id 201 loss 1.8623249530792236 train f1score 0.11529941170413877
epoch 1 batch id 401 loss 1.2135348320007324 train f1score 0.196685942640878
epoch 1 batch id 601 loss 1.1101484298706055 train f1score 0.28837196073089005
epoch 1 batch id 801 loss 0.7200247645378113 train f1score 0.37648070078247015
epoch 1 batch id 1001 loss 0.5604816675186157 train f1score 0.44438755849606504
epoch 1 batch id 1201 loss 0.6017944812774658 train f1score 0.4945043173770872
epoch 1 batch id 1401 loss 0.3855975270271301 train f1score 0.533332684212963
epoch 1 batch id 1601 loss 0.5233376622200012 train f1score 0.562460599511014
epoch 1 train f1score 0.5861966626808196


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/448 [00:00<?, ?it/s]

epoch 1 test f1score 0.7925156845153744


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/1790 [00:00<?, ?it/s]

epoch 2 batch id 1 loss 0.47371506690979004 train f1score 0.8671368547418968
epoch 2 batch id 201 loss 0.7326175570487976 train f1score 0.7825443186008556
epoch 2 batch id 401 loss 0.26615479588508606 train f1score 0.7913548375361161
epoch 2 batch id 601 loss 0.6192241311073303 train f1score 0.7888653286505226
epoch 2 batch id 801 loss 0.4875358045101166 train f1score 0.791948412104024
epoch 2 batch id 1001 loss 0.4011088013648987 train f1score 0.7954183023096315
epoch 2 batch id 1201 loss 0.3035007417201996 train f1score 0.7991858716657028
epoch 2 batch id 1401 loss 0.21598273515701294 train f1score 0.8030720300331307
epoch 2 batch id 1601 loss 0.2836942672729492 train f1score 0.8053125159764349
epoch 2 train f1score 0.8087900570584464


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/448 [00:00<?, ?it/s]

epoch 2 test f1score 0.8193176748415155


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/1790 [00:00<?, ?it/s]

epoch 3 batch id 1 loss 0.3831387162208557 train f1score 0.8891723356009071
epoch 3 batch id 201 loss 0.6371927857398987 train f1score 0.8397187260168808
epoch 3 batch id 401 loss 0.13140244781970978 train f1score 0.8378706494393742
epoch 3 batch id 601 loss 0.45010843873023987 train f1score 0.8398382866477797
epoch 3 batch id 801 loss 0.2798300087451935 train f1score 0.8435956124042109
epoch 3 batch id 1001 loss 0.1932736486196518 train f1score 0.8469288269462129
epoch 3 batch id 1201 loss 0.18517838418483734 train f1score 0.8501730056412209
epoch 3 batch id 1401 loss 0.3119201958179474 train f1score 0.8521399586799213
epoch 3 batch id 1601 loss 0.3378233015537262 train f1score 0.8535815869773632
epoch 3 train f1score 0.855098727451276


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/448 [00:00<?, ?it/s]

epoch 3 test f1score 0.8254195902596824


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/1790 [00:00<?, ?it/s]

epoch 4 batch id 1 loss 0.32001441717147827 train f1score 0.8720069846120266
epoch 4 batch id 201 loss 0.5171864032745361 train f1score 0.8783322449226888
epoch 4 batch id 401 loss 0.16545088589191437 train f1score 0.877171249580646
epoch 4 batch id 601 loss 0.3112057149410248 train f1score 0.8738831979174653
epoch 4 batch id 801 loss 0.13513332605361938 train f1score 0.8740889609723177
epoch 4 batch id 1001 loss 0.40539640188217163 train f1score 0.8773242918829237
epoch 4 batch id 1201 loss 0.207651287317276 train f1score 0.8809815107777037
epoch 4 batch id 1401 loss 0.12983976304531097 train f1score 0.8818729702095024
epoch 4 batch id 1601 loss 0.3517492413520813 train f1score 0.8832942087411544
epoch 4 train f1score 0.8845456655223921


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/448 [00:00<?, ?it/s]

epoch 4 test f1score 0.8321793044933422


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/1790 [00:00<?, ?it/s]

epoch 5 batch id 1 loss 0.14956319332122803 train f1score 0.9639097744360903
epoch 5 batch id 201 loss 0.49328821897506714 train f1score 0.9040026539270895
epoch 5 batch id 401 loss 0.07075829803943634 train f1score 0.903191138137446
epoch 5 batch id 601 loss 0.3577564060688019 train f1score 0.900706402992068
epoch 5 batch id 801 loss 0.414898157119751 train f1score 0.903872474188239
epoch 5 batch id 1001 loss 0.34565672278404236 train f1score 0.9046160088198635
epoch 5 batch id 1201 loss 0.06416124850511551 train f1score 0.9078821587734283
epoch 5 batch id 1401 loss 0.11910392343997955 train f1score 0.909258546700081
epoch 5 batch id 1601 loss 0.09227369725704193 train f1score 0.9098682324478069
epoch 5 train f1score 0.910863138359026


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/448 [00:00<?, ?it/s]

epoch 5 test f1score 0.8261536324328643


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/1790 [00:00<?, ?it/s]

epoch 6 batch id 1 loss 0.10027054697275162 train f1score 0.9639097744360903
epoch 6 batch id 201 loss 0.6008450388908386 train f1score 0.9249231374311861
epoch 6 batch id 401 loss 0.13277433812618256 train f1score 0.9251955836251301
epoch 6 batch id 601 loss 0.3809897005558014 train f1score 0.9243373101857243
epoch 6 batch id 801 loss 0.1313321739435196 train f1score 0.925341487818021
epoch 6 batch id 1001 loss 0.18530476093292236 train f1score 0.9270300036361374
epoch 6 batch id 1201 loss 0.13680732250213623 train f1score 0.9292189903781418
epoch 6 batch id 1401 loss 0.08365465700626373 train f1score 0.9301846957215902
epoch 6 batch id 1601 loss 0.20012587308883667 train f1score 0.9309759274299569
epoch 6 train f1score 0.9319951815183889


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/448 [00:00<?, ?it/s]

epoch 6 test f1score 0.8317652209243037


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/1790 [00:00<?, ?it/s]

epoch 7 batch id 1 loss 0.2151501476764679 train f1score 0.9135654261704682
epoch 7 batch id 201 loss 0.2957622706890106 train f1score 0.9382167520149168
epoch 7 batch id 401 loss 0.0077731977216899395 train f1score 0.942820525066711
epoch 7 batch id 601 loss 0.10110605508089066 train f1score 0.9434546711950911
epoch 7 batch id 801 loss 0.08038271218538284 train f1score 0.9466615556025072
epoch 7 batch id 1001 loss 0.23765815794467926 train f1score 0.9480655171071976
epoch 7 batch id 1201 loss 0.0866222232580185 train f1score 0.9484587082680769
epoch 7 batch id 1401 loss 0.04458791762590408 train f1score 0.9487570828825592
epoch 7 batch id 1601 loss 0.15806710720062256 train f1score 0.9498336350168799
epoch 7 train f1score 0.9510554585722303


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/448 [00:00<?, ?it/s]

epoch 7 test f1score 0.8368714781146822


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/1790 [00:00<?, ?it/s]

epoch 8 batch id 1 loss 0.06370927393436432 train f1score 0.9365079365079365
epoch 8 batch id 201 loss 0.6269487142562866 train f1score 0.961119343112781
epoch 8 batch id 401 loss 0.00889631174504757 train f1score 0.9607542237243913
epoch 8 batch id 601 loss 0.014344207011163235 train f1score 0.9590861099687024
epoch 8 batch id 801 loss 0.013007227331399918 train f1score 0.9605116039350549
epoch 8 batch id 1001 loss 0.11432822793722153 train f1score 0.9605991486296882
epoch 8 batch id 1201 loss 0.07363511621952057 train f1score 0.9614632392225178
epoch 8 batch id 1401 loss 0.0031125762034207582 train f1score 0.9624552246352138
epoch 8 batch id 1601 loss 0.03957551717758179 train f1score 0.9633610813189183
epoch 8 train f1score 0.964195240098659


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/448 [00:00<?, ?it/s]

epoch 8 test f1score 0.8439487226001081


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/1790 [00:00<?, ?it/s]

epoch 9 batch id 1 loss 0.005701968912035227 train f1score 1.0
epoch 9 batch id 201 loss 0.24330486357212067 train f1score 0.9704989278190397
epoch 9 batch id 401 loss 0.0011344325030222535 train f1score 0.9730216872764379
epoch 9 batch id 601 loss 0.0015593719435855746 train f1score 0.9715635290021624
epoch 9 batch id 801 loss 0.003360828384757042 train f1score 0.97291718370768
epoch 9 batch id 1001 loss 0.013697957620024681 train f1score 0.9732009551025879
epoch 9 batch id 1201 loss 0.05153030902147293 train f1score 0.9739954505825255
epoch 9 batch id 1401 loss 0.0028472074773162603 train f1score 0.9739248307024334
epoch 9 batch id 1601 loss 0.0028963186778128147 train f1score 0.9741579171144306
epoch 9 train f1score 0.974829448044332


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/448 [00:00<?, ?it/s]

epoch 9 test f1score 0.8475785949371215


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/1790 [00:00<?, ?it/s]

epoch 10 batch id 1 loss 0.0008131836075335741 train f1score 1.0
epoch 10 batch id 201 loss 0.19857266545295715 train f1score 0.9756828717361355
epoch 10 batch id 401 loss 0.000666339707095176 train f1score 0.9782032142866378
epoch 10 batch id 601 loss 0.0005997594562359154 train f1score 0.9778984745831212
epoch 10 batch id 801 loss 0.002082766266539693 train f1score 0.9787115154262003
epoch 10 batch id 1001 loss 0.03540614992380142 train f1score 0.9795332549442275
epoch 10 batch id 1201 loss 0.008363127708435059 train f1score 0.9801604849716833
epoch 10 batch id 1401 loss 0.0022276239469647408 train f1score 0.9806140294807509
epoch 10 batch id 1601 loss 0.008298428729176521 train f1score 0.9807631350905824
epoch 10 train f1score 0.9807681584823682


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/448 [00:00<?, ?it/s]

epoch 10 test f1score 0.8476748806259555


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/1790 [00:00<?, ?it/s]

epoch 11 batch id 1 loss 0.0004342506581451744 train f1score 1.0
epoch 11 batch id 201 loss 0.2287234663963318 train f1score 0.9817913292970513
epoch 11 batch id 401 loss 0.0006577866151928902 train f1score 0.9834285293735194
epoch 11 batch id 601 loss 0.00039274661685340106 train f1score 0.9841711859428987
epoch 11 batch id 801 loss 0.0008686878136359155 train f1score 0.9855148459025964
epoch 11 batch id 1001 loss 0.00829550065100193 train f1score 0.9862496944163338
epoch 11 batch id 1201 loss 0.011238166131079197 train f1score 0.9862330964649173
epoch 11 batch id 1401 loss 0.0012090462259948254 train f1score 0.9863614354375885
epoch 11 batch id 1601 loss 0.0011749870609492064 train f1score 0.9863784960113414
epoch 11 train f1score 0.9864964157746736


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/448 [00:00<?, ?it/s]

epoch 11 test f1score 0.8487369644365944


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/1790 [00:00<?, ?it/s]

epoch 12 batch id 1 loss 0.0006569374818354845 train f1score 1.0
epoch 12 batch id 201 loss 0.12946702539920807 train f1score 0.9885537475101371
epoch 12 batch id 401 loss 0.000372207083273679 train f1score 0.9897060883488498
epoch 12 batch id 601 loss 0.00048163271276280284 train f1score 0.9892994749633973
epoch 12 batch id 801 loss 0.0033326942939311266 train f1score 0.9900493037781493
epoch 12 batch id 1001 loss 0.000797980756033212 train f1score 0.9904664607920796
epoch 12 batch id 1201 loss 0.02428790181875229 train f1score 0.9901655199297709
epoch 12 batch id 1401 loss 0.0006637457991018891 train f1score 0.9903484229640147
epoch 12 batch id 1601 loss 0.0008167517953552306 train f1score 0.9903718896465684
epoch 12 train f1score 0.9904891171591401


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/448 [00:00<?, ?it/s]

epoch 12 test f1score 0.8482611549897848


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/1790 [00:00<?, ?it/s]

epoch 13 batch id 1 loss 0.0003264096158090979 train f1score 1.0
epoch 13 batch id 201 loss 0.0031906699296087027 train f1score 0.9896806978596612
epoch 13 batch id 401 loss 0.00034347345354035497 train f1score 0.9910507351723553
epoch 13 batch id 601 loss 0.0005031866021454334 train f1score 0.9901480436681445
epoch 13 batch id 801 loss 0.0008425556588917971 train f1score 0.9908074816563495
epoch 13 batch id 1001 loss 0.00040082604391500354 train f1score 0.9910260118531973
epoch 13 batch id 1201 loss 0.010460461489856243 train f1score 0.9908790961784834
epoch 13 batch id 1401 loss 0.0009324779384769499 train f1score 0.9911807323470786
epoch 13 batch id 1601 loss 0.001083514653146267 train f1score 0.9912459530340655
epoch 13 train f1score 0.9913899679274986


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/448 [00:00<?, ?it/s]

epoch 13 test f1score 0.8487581235036586


In [None]:
# Save the model.
# The model object itself (not just a state_dict) is passed to torch.save,
# and the file is written under the mounted Google Drive path.
torch.save(model, '/content/drive/MyDrive/졸프/best_model.pt')

In [None]:
# Load the saved model.
# map_location remaps storages that were saved from a CUDA device, so the
# checkpoint also loads on a CPU-only runtime; without it torch.load raises
# "Attempting to deserialize object on a CUDA device" when no GPU is present.
my_model = torch.load(
    '/content/drive/MyDrive/졸프/best_model.pt',
    map_location=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
)

RuntimeError: Attempting to deserialize object on a CUDA device but torch.cuda.is_available() is False. If you are running on a CPU-only machine, please use torch.load with map_location=torch.device('cpu') to map your storages to the CPU.

In [None]:
import torch

# Path to the saved model file
model_path = '/content/drive/MyDrive/졸프/best_model.pt'

# Load the model onto the CPU (works even when the checkpoint was saved from a GPU)
my_model = torch.load(model_path, map_location=torch.device('cpu'))

# Move the model back to the GPU when one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
my_model.to(device)

# Switch the model to evaluation mode
my_model.eval()


BERTClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(8002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise

## **Test**
- 일기를 입력하여, Top3 감정 추출하기

In [None]:
# Install the NanumGothic font -> the runtime must be restarted after installation
!apt-get update -qq
!apt-get install -qq fonts-nanum
!fc-cache -fv
# Remove matplotlib's cache directory (presumably so the newly installed font
# is discovered after the restart -- confirm)
!rm -rf ~/.cache/matplotlib

In [None]:
import numpy as np
import torch
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
from collections import Counter
import nltk

# NanumGothic font configuration
font_path = '/usr/share/fonts/truetype/nanum/NanumGothic.ttf'  # path to the font file
font_prop = fm.FontProperties(fname=font_path)  # font properties built from that file
plt.rc('font', family=font_prop.get_name())  # set the font family matplotlib should use
plt.rcParams['axes.unicode_minus'] = False  # keep the minus sign from rendering as a broken glyph

# Emotion label definitions; list position is used as the class index below
# (index 4 is "일상", which analyze_emotions treats as neutral)
emotion_labels = ["불안", "당황", "분노", "슬픔", "일상", "행복", "혐오"]

# Softmax function
def softmax(x):
    """Return the row-wise softmax of a 2-D array.

    Args:
        x: Array-like of shape (n_samples, n_classes) holding raw scores.

    Returns:
        numpy array of the same shape in which every row sums to 1.
    """
    x = np.asarray(x)
    # Shift each row by its OWN maximum. Shifting by the global maximum is
    # mathematically equivalent, but a row lying far below that maximum would
    # underflow to all zeros and produce 0/0 = NaN.
    shifted = x - np.max(x, axis=1, keepdims=True)
    e_x = np.exp(shifted)
    return e_x / e_x.sum(axis=1, keepdims=True)

# Per-sentence emotion prediction
def predict(sentence):
    """Return the class-probability vector the model assigns to one sentence.

    The sentence is wrapped in a one-item ``BERTDataset`` together with a
    dummy label ``'0'``, passed through ``my_model`` and the resulting logits
    are turned into probabilities with ``softmax``.

    Args:
        sentence: Text of a single sentence to classify.

    Returns:
        1-D numpy array of probabilities for the first (only) sample in the
        batch. Presumably the positions line up with ``emotion_labels`` --
        confirm against the label encoding used for training.
    """
    dataset = [[sentence, '0']]  # the dummy label only satisfies the dataset format
    test = BERTDataset(dataset, 0, 1, tok, vocab, max_len, True, False)
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=1, num_workers=0)
    my_model.eval()  # evaluation mode

    # Inference only: skip autograd bookkeeping so no graph is built or kept.
    with torch.no_grad():
        for token_ids, valid_length, segment_ids, _label in test_dataloader:
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            # valid_length is handed to the model as produced by the loader.
            out = my_model(token_ids, valid_length, segment_ids)
            logits = out.detach().cpu().numpy()
            probabilities = softmax(logits)
            # The loader holds exactly one batch of one sentence.
            return probabilities[0]

# Extract and summarise the emotions of a whole diary entry
def analyze_emotions(diary):
    """Return the three most frequent non-neutral emotions in a diary entry.

    The diary is split into sentences with ``nltk.sent_tokenize``; each
    sentence is classified with ``predict`` and its highest-probability class
    is tallied. Sentences whose top class is index 4 ("일상" in
    ``emotion_labels``) are treated as neutral and ignored.

    Args:
        diary: Full diary text.

    Returns:
        Dict mapping emotion label to its percentage among the non-neutral
        sentences, limited to the three largest entries. Empty when every
        sentence is classified as neutral.
    """
    neutral_index = 4  # position of "일상" in emotion_labels

    # Top class index of every sentence, evaluated lazily in sentence order.
    dominant = (np.argmax(predict(text)) for text in nltk.sent_tokenize(diary))
    tally = Counter(idx for idx in dominant if idx != neutral_index)

    # Denominator: number of sentences that were not neutral.
    total = sum(tally.values())
    shares = {
        emotion_labels[idx]: (hits / total) * 100
        for idx, hits in tally.items()
    }

    ranked = Counter(shares).most_common(3)
    return dict(ranked)

# Visualise emotion percentages
def plot_top_emotions(emotion_percentages):
    """Draw a horizontal bar chart of emotion percentages and show it.

    Args:
        emotion_percentages: Mapping of emotion label to percentage value,
            such as the dict returned by ``analyze_emotions``.
    """
    labels = list(emotion_percentages)
    values = [emotion_percentages[name] for name in labels]
    palette = ['#FF9999', '#66B2FF', '#99FF99']

    plt.figure(figsize=(6, 4))
    bar_container = plt.barh(labels, values, color=palette)

    # Axis label, title and a fixed 0-100 percentage scale.
    plt.xlabel('백분율')
    plt.title('오늘의 대표 감정')
    plt.xlim(0, 100)

    # Annotate every bar with its value, vertically centred at the bar's end.
    for rect in bar_container:
        pct = rect.get_width()
        y_mid = rect.get_y() + rect.get_height() / 2
        plt.text(pct, y_mid, f'{pct:.1f}%', va='center', fontproperties=font_prop)

    plt.show()


In [None]:
# Test with a generated diary entry
diaries = '''
오늘부터 새벽에 산책을 시작했습니다. 해가 뜨기 전 하늘이 정말 예뻤어요, 그 순간이 특별히 기억에 남았습니다.
낮에는 친구가 병원에 있다는 소식을 듣고 병문안도 다녀왔어요. 평소에 건강에 좋던 친구가 아프다니 마음이 참 무겁더군요.
아침에는 기분이 상쾌하고 좋았지만, 오후에는 내일 있을 건강검진 때문에 걱정이 많아졌어요.
'''

# Compute the top-3 non-neutral emotions for the entry and plot them
top_3_emotions = analyze_emotions(diaries)
plot_top_emotions(top_3_emotions)