In [2]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [1]:
!pip install git+https://github.com/monologg/KoBERT-transformers.git

Collecting git+https://github.com/monologg/KoBERT-transformers.git
  Cloning https://github.com/monologg/KoBERT-transformers.git to /tmp/pip-req-build-xuwj0mmo
  Running command git clone --filter=blob:none --quiet https://github.com/monologg/KoBERT-transformers.git /tmp/pip-req-build-xuwj0mmo
  Resolved https://github.com/monologg/KoBERT-transformers.git to commit 4e0a00e5e4884848fe4daeccf3698a28ebcfe449
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.1.0->kobert-transformers==0.5.1)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.1.0->kobert-transformers==0.5.1)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=

In [3]:
import os
import json
import torch
import torch.nn as nn
import numpy as np
import random
import argparse
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup
from kobert_transformers import get_kobert_model, get_tokenizer
from tqdm import tqdm
import re

def preprocess_text(text):
    # 불필요한 공백과 특수 문자를 제거하는 전처리 함수
    text = re.sub(r'\s+', ' ', text)  # 다중 공백을 단일 공백으로 변환
    text = re.sub(r'[^\w\s]', '', text)  # 특수 문자 제거
    return text.strip()



In [4]:
#오류생겨서 다시 한 것

class EmotionDataset(Dataset):
    def __init__(self, data_dir, tokenizer, max_len=512, sample_ratio=1.0):  # sample_ratio 추가
        self.data = []
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.sample_ratio = sample_ratio
        self.load_data(data_dir)

    def load_data(self, data_dir):
        for root, dirs, files in os.walk(data_dir):
            if files:
                num_files_to_sample = max(1, int(len(files) * self.sample_ratio))  # 샘플링 비율 적용
                sampled_files = random.sample(files, num_files_to_sample)  # 샘플링된 파일 목록
                for file in sampled_files:
                    if file.endswith(".json"):
                        file_path = os.path.join(root, file)
                        with open(file_path, 'r', encoding='utf-8') as f:
                            try:
                                data = json.load(f)
                                listener_behavior = data["info"]["listener_behavior"]
                                for utterance in data["utterances"]:
                                    if utterance["role"] == "listener" and utterance["listener_empathy"]:
                                        text = preprocess_text(utterance["text"])
                                        try:
                                            emotion_label = self.label_from_info(data["info"])
                                            empathy_label = self.label_from_empathy(utterance["listener_empathy"])
                                            self.data.append((text, emotion_label, empathy_label))
                                        except Exception as e:
                                            print(f"Error processing file {file_path} in folder {root}: {e}")
                            except json.JSONDecodeError as e:
                                print(f"Error decoding JSON from file {file_path}: {e}")

    def label_from_info(self, info):
        emotion = info.get("speaker_emotion", "")
        label_map = {"기쁨": 0, "당황": 1, "분노": 2, "불안": 3, "상처": 4, "슬픔": 5}
        if emotion in label_map:
            return label_map[emotion]
        else:
            raise ValueError(f"Unknown emotion: {emotion}")

    def label_from_empathy(self, empathy_list):
        empathy = empathy_list[0]
        label_map = {"동조": 0, "격려": 1, "조언": 2, "위로": 3}
        if empathy in label_map:
            return label_map[empathy]
        else:
            raise ValueError(f"Unknown empathy type: {empathy}")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text, emotion_label, empathy_label = self.data[idx]
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'text': text,
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'emotion_label': torch.tensor(emotion_label, dtype=torch.long),
            'empathy_label': torch.tensor(empathy_label, dtype=torch.long)
        }

In [5]:
class CustomBERTModel(nn.Module):
    def __init__(self, model_name, num_emotion_labels, num_empathy_labels):
        super(CustomBERTModel, self).__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.emotion_classifier = nn.Linear(self.bert.config.hidden_size, num_emotion_labels)
        self.empathy_classifier = nn.Linear(self.bert.config.hidden_size, num_empathy_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]
        emotion_logits = self.emotion_classifier(pooled_output)
        empathy_logits = self.empathy_classifier(pooled_output)
        return emotion_logits, empathy_logits

    def save_pretrained(self, save_directory):
        if not os.path.exists(save_directory):
            os.makedirs(save_directory)
        self.bert.save_pretrained(save_directory)
        torch.save(self.emotion_classifier.state_dict(), os.path.join(save_directory, 'emotion_classifier.pth'))
        torch.save(self.empathy_classifier.state_dict(), os.path.join(save_directory, 'empathy_classifier.pth'))


In [6]:
def _get_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument('--seed', type=int, default=7)
    parser.add_argument('--save_dir', type=str, default='checkpoints')
    parser.add_argument('--train_data_dir', type=str, default='/content/drive/MyDrive/하영사용데이터/TL_감정분석_전체데이터')
    parser.add_argument('--val_data_dir', type=str, default='/content/drive/MyDrive/하영사용데이터/VL_감정분석_전체데이터')
    parser.add_argument('--num_workers', type=int, default=5)
    parser.add_argument('--batch_size', type=int, default=30)
    parser.add_argument('--lr', type=float, default=2e-05)
    parser.add_argument('--num_epochs', type=int, default=5)
    parser.add_argument('--log_every', type=int, default=20)
    parser.add_argument('--gradient_clip_val', type=float, default=1.0)
    parser.add_argument('--accumulate_grad_batches', type=int, default=1)
    parser.add_argument('--save_every', type=int, default=10_000)
    parser.add_argument('--pretrain_tokenizer_name', type=str, default='monologg/kobert')
    return parser


In [7]:

def validate(model, val_loader, device):
    model.eval()
    total_emotion_loss = 0
    total_empathy_loss = 0
    correct_emotion = 0
    correct_empathy = 0
    total = 0
    criterion = nn.CrossEntropyLoss()

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            emotion_labels = batch['emotion_label'].to(device)
            empathy_labels = batch['empathy_label'].to(device)

            emotion_logits, empathy_logits = model(input_ids, attention_mask)
            emotion_loss = criterion(emotion_logits, emotion_labels)
            empathy_loss = criterion(empathy_logits, empathy_labels)

            total_emotion_loss += emotion_loss.item()
            total_empathy_loss += empathy_loss.item()

            _, emotion_predicted = torch.max(emotion_logits, 1)
            _, empathy_predicted = torch.max(empathy_logits, 1)

            total += emotion_labels.size(0)
            correct_emotion += (emotion_predicted == emotion_labels).sum().item()
            correct_empathy += (empathy_predicted == empathy_labels).sum().item()

    avg_emotion_loss = total_emotion_loss / len(val_loader)
    avg_empathy_loss = total_empathy_loss / len(val_loader)
    emotion_accuracy = correct_emotion / total
    empathy_accuracy = correct_empathy / total
    return avg_emotion_loss, avg_empathy_loss, emotion_accuracy, empathy_accuracy

In [8]:
def main():
    """

    Returns:

    """
    parser = _get_parser()
    args = parser.parse_args(args=[])

    torch.manual_seed(args.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(args.seed)
    random.seed(args.seed)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    tokenizer = BertTokenizer.from_pretrained('monologg/kobert')
    model = CustomBERTModel('monologg/kobert', num_emotion_labels=6, num_empathy_labels=4)
    model = nn.DataParallel(model).to(device)

    trainset = EmotionDataset(args.train_data_dir, tokenizer, sample_ratio=0.7, max_len=512)  # 70% 데이터 샘플링, max_len=512
    valset = EmotionDataset(args.val_data_dir, tokenizer, sample_ratio=0.5, max_len=512)  # 50% 데이터 샘플링, max_len=512

    print(f"Number of training samples: {len(trainset)}")
    print(f"Number of validation samples: {len(valset)}")

    train_loader = DataLoader(trainset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers)
    val_loader = DataLoader(valset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers)

    optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr, weight_decay=1e-4)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * args.num_epochs)

    emotion_criterion = nn.CrossEntropyLoss()
    empathy_criterion = nn.CrossEntropyLoss()

    for epoch in range(args.num_epochs):
        model.train()
        total_emotion_loss = 0
        total_empathy_loss = 0
        for step, batch in enumerate(tqdm(train_loader, desc=f"Epoch {epoch+1}/{args.num_epochs}", ncols=100)):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            emotion_labels = batch['emotion_label'].to(device)
            empathy_labels = batch['empathy_label'].to(device)

            optimizer.zero_grad()
            emotion_logits, empathy_logits = model(input_ids, attention_mask)
            emotion_loss = emotion_criterion(emotion_logits, emotion_labels)
            empathy_loss = empathy_criterion(empathy_logits, empathy_labels)

            loss = emotion_loss + empathy_loss

            loss.backward()
            optimizer.step()
            scheduler.step()

            total_emotion_loss += emotion_loss.item()
            total_empathy_loss += empathy_loss.item()

        avg_train_emotion_loss = total_emotion_loss / len(train_loader)
        avg_train_empathy_loss = total_empathy_loss / len(train_loader)
        avg_val_emotion_loss, avg_val_empathy_loss, emotion_accuracy, empathy_accuracy = validate(model, val_loader, device)

        print(f"Epoch: {epoch + 1}/{args.num_epochs}, Average Train Emotion Loss: {avg_train_emotion_loss}, Average Train Empathy Loss: {avg_train_empathy_loss}")
        print(f"Validation Emotion Loss: {avg_val_emotion_loss}, Validation Empathy Loss: {avg_val_empathy_loss}")
        print(f"Validation Emotion Accuracy: {emotion_accuracy}, Validation Empathy Accuracy: {empathy_accuracy}")

    return model, tokenizer

if __name__ == '__main__':
    model, tokenizer = main()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/77.8k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/426 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/369M [00:00<?, ?B/s]

Number of training samples: 101718
Number of validation samples: 9455


Epoch 1/5: 100%|████████████████████████████████████████████████| 3391/3391 [30:54<00:00,  1.83it/s]


Epoch: 1/5, Average Train Emotion Loss: 1.7811191473271566, Average Train Empathy Loss: 1.344970862902712
Validation Emotion Loss: 1.7833953345123725, Validation Empathy Loss: 1.3597885317440275
Validation Emotion Accuracy: 0.212374405076679, Validation Empathy Accuracy: 0.34881015335801163


Epoch 2/5: 100%|████████████████████████████████████████████████| 3391/3391 [30:52<00:00,  1.83it/s]


Epoch: 2/5, Average Train Emotion Loss: 1.7704285381818303, Average Train Empathy Loss: 1.2817295885514945
Validation Emotion Loss: 1.7480746005909353, Validation Empathy Loss: 1.238507944571821
Validation Emotion Accuracy: 0.2612374405076679, Validation Empathy Accuracy: 0.4359598096245373


Epoch 3/5: 100%|████████████████████████████████████████████████| 3391/3391 [30:52<00:00,  1.83it/s]


Epoch: 3/5, Average Train Emotion Loss: 1.7519462203177967, Average Train Empathy Loss: 1.2313549593991848
Validation Emotion Loss: 1.7441755759565136, Validation Empathy Loss: 1.2258597740644142
Validation Emotion Accuracy: 0.26409307244844, Validation Empathy Accuracy: 0.4376520359598096


Epoch 4/5: 100%|████████████████████████████████████████████████| 3391/3391 [30:52<00:00,  1.83it/s]


Epoch: 4/5, Average Train Emotion Loss: 1.7387137814139026, Average Train Empathy Loss: 1.1995011894304257
Validation Emotion Loss: 1.7215516303159013, Validation Empathy Loss: 1.1881029090926618
Validation Emotion Accuracy: 0.2735060814383924, Validation Empathy Accuracy: 0.46134320465362244


Epoch 5/5: 100%|████████████████████████████████████████████████| 3391/3391 [30:52<00:00,  1.83it/s]


Epoch: 5/5, Average Train Emotion Loss: 1.7284792855720328, Average Train Empathy Loss: 1.1805651063800948
Validation Emotion Loss: 1.716185264195068, Validation Empathy Loss: 1.1788195156598393
Validation Emotion Accuracy: 0.2790058170280275, Validation Empathy Accuracy: 0.4683236382866208


In [9]:
# 학습 후 모델 저장
model.module.save_pretrained('/content/drive/MyDrive/하영사용데이터/모델저장')
tokenizer.save_pretrained('/content/drive/MyDrive/하영사용데이터/모델저장')

('/content/drive/MyDrive/하영사용데이터/모델저장/tokenizer_config.json',
 '/content/drive/MyDrive/하영사용데이터/모델저장/special_tokens_map.json',
 '/content/drive/MyDrive/하영사용데이터/모델저장/vocab.txt',
 '/content/drive/MyDrive/하영사용데이터/모델저장/added_tokens.json')

In [None]:
# 모델 학습 이후에 모델을 저장하는 코드

import os

def save_model(model, tokenizer, save_directory):
    if not os.path.exists(save_directory):
        os.makedirs(save_directory)

    # 학습된 모델의 state_dict 저장
    torch.save(model.module.state_dict(), os.path.join(save_directory, 'model_weights.pth'))

    # 토크나이저 저장
    tokenizer.save_pretrained(save_directory)

# 저장 경로 지정
save_directory = '/content/drive/MyDrive/하영사용데이터/모델저장'

# 모델과 토크나이저 저장
save_model(model, tokenizer, save_directory)


In [None]:
# 학습된 모델 인스턴스 (model)와 토크나이저 (tokenizer)를 받아 저장하는 함수
def save_model(model, tokenizer, save_directory):
    if not os.path.exists(save_directory):
        os.makedirs(save_directory)

    # CustomBERTModel의 save_pretrained 메서드를 호출
    model.module.save_pretrained(save_directory)

    # 토크나이저 저장
    tokenizer.save_pretrained(save_directory)

# 저장 경로 지정
save_directory = '/content/drive/MyDrive/하영사용데이터/모델저장'

# 모델과 토크나이저 저장
save_model(model, tokenizer, save_directory)

AttributeError: 'CustomBERTModel' object has no attribute 'save_pretrained'