### AI 기반 응급 상황 분류 시스템


In [1]:
# 필요한 라이브러리 설치
!pip install transformers
!pip install sentencepiece
!pip install konlpy

# 라이브러리 임포트
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import ElectraTokenizer, ElectraForSequenceClassification
from transformers import PreTrainedTokenizerFast, BartForConditionalGeneration
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from konlpy.tag import Okt

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting JPype1>=0.7.0 (from konlpy)
  Downloading JPype1-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m79.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading JPype1-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (488 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m488.6/488.6 kB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: JPype1, konlpy
Successfully installed JPype1-1.5.0 konlpy-0.6.0


In [3]:
import pandas as pd
from urllib.parse import quote

def _load_label_frames(base_url, files, verbose=False):
    """Download each label JSON in `files` from `base_url` into a DataFrame.

    File names follow the pattern `<prefix>_<region>_<type>.json`; the region
    and type parts are parsed from the name and stored in the `region` and
    `type` columns of the corresponding frame.

    Args:
        base_url: URL prefix the (percent-encoded) file name is appended to.
        files: iterable of JSON file names.
        verbose: when True, print each file name before it is downloaded.

    Returns:
        A list with one DataFrame per file, in the order of `files`.
    """
    frames = []
    for file in files:
        if verbose:
            print(file)
        # The file names contain Hangul, so they are percent-encoded for the URL.
        frame = pd.read_json(base_url + quote(file))
        name_parts = file.split('_')
        frame['region'] = name_parts[1]
        frame['type'] = name_parts[2].replace('.json', '')
        frames.append(frame)
    return frames


TRAIN_BASE_URL = "https://media.githubusercontent.com/media/1000century/ml/master/train_label/"
VALID_BASE_URL = "https://media.githubusercontent.com/media/1000century/ml/master/valid_label/"

# Training label files: one per (region, dispatch type) pair.
train_files = [
    'TL_광주_구급.json', 'TL_광주_구조.json', 'TL_광주_기타.json', 'TL_광주_화재.json',
    'TL_서울_구급.json', 'TL_서울_구조.json', 'TL_서울_기타.json', 'TL_서울_화재.json',
    'TL_인천_구급.json', 'TL_인천_구조.json', 'TL_인천_기타.json', 'TL_인천_화재.json'
]
# Validation files mirror the training names with a `VL_` prefix.
valid_files = [file.replace('TL_', 'VL_') for file in train_files]

# Only the training download logs file names, as in the original cell.
train_dfs = _load_label_frames(TRAIN_BASE_URL, train_files, verbose=True)
valid_dfs = _load_label_frames(VALID_BASE_URL, valid_files)

# Concatenate the per-file frames into one frame per split.
df_train = pd.concat(train_dfs, ignore_index=True)
df_valid = pd.concat(valid_dfs, ignore_index=True)

print("\nTrain DataFrame Shape:", df_train.shape)
print("\nValid DataFrame Shape:", df_valid.shape)

def process_dialogue(dialogue_list):
    """Flatten the utterances of one call into a single speaker-tagged string.

    Each utterance is a mapping with the keys 'speaker', 'startAt' and 'text'.
    Distinct speaker ids are sorted and numbered from 0, utterances are ordered
    by 'startAt', and every utterance is rendered as "[SPEAKER_<n>] <text>".

    Args:
        dialogue_list: list of utterance mappings.

    Returns:
        The rendered utterances joined by single spaces ("" for an empty list).
    """
    speaker_ids = sorted({turn['speaker'] for turn in dialogue_list})

    tag_of = {}
    for position, speaker_id in enumerate(speaker_ids):
        tag_of[speaker_id] = f"[SPEAKER_{position}]"

    # Sort a copy so the caller's list keeps its original order.
    ordered_turns = list(dialogue_list)
    ordered_turns.sort(key=lambda turn: turn['startAt'])

    return " ".join(
        f"{tag_of[turn['speaker']]} {turn['text']}" for turn in ordered_turns
    )
# Build the speaker-tagged transcript column `txt` for both splits from the
# raw `utterances` lists, then show one transcript as the cell output.
df_train['txt'] = df_train['utterances'].apply(process_dialogue)
df_valid['txt'] = df_valid['utterances'].apply(process_dialogue)
df_train['txt'].iloc[3]


TL_광주_구급.json
TL_광주_구조.json
TL_광주_기타.json
TL_광주_화재.json
TL_서울_구급.json
TL_서울_구조.json
TL_서울_기타.json
TL_서울_화재.json
TL_인천_구급.json
TL_인천_구조.json
TL_인천_기타.json
TL_인천_화재.json

Train DataFrame Shape: (127178, 19)

Valid DataFrame Shape: (15897, 19)


'[SPEAKER_0] 네, 119입니다. [SPEAKER_1] 예, 여보세요? [SPEAKER_0] 예, 119입니다. 말씀하세요. [SPEAKER_1] 저, 허리가 너무 아파서 그러는데. [SPEAKER_0] 네. [SPEAKER_0] 언제부터 아프셨어요? 언제부터 아프셨어요? [SPEAKER_1] 어떡해야. [SPEAKER_1] 저, 아까 낮에 병원, 병원 잠깐 갔다왔는데 그때는 괜찮았는데. [SPEAKER_0] 예. [SPEAKER_1] 그런 말하지마 그럼 빼주잖아. [SPEAKER_1] 어, 아무튼 지금 차에 탈 수가 없, 아, 차에 탈 수가 없을 만큼 아파가지고. [SPEAKER_0] 주소가 어디세요? [SPEAKER_1] 여기, 어, 월남동 호반. [SPEAKER_0] 호반베르디움 2차예요? [SPEAKER_1] 예, 2차. [SPEAKER_1] ***[개인정보] [SPEAKER_0] 잠시만요. [SPEAKER_1] 예전에도 한번. [SPEAKER_1] 아예 마비돼가지고 하, 한 번 갔었거든요, 병원에요. [SPEAKER_0] 저희 갈 건데 문 열어줄 수 있죠? [SPEAKER_1] 네, 지금 근데 또 마비될까봐 지금 그 전 상태라서. [SPEAKER_1] 지금 전화 드린거거든요. [SPEAKER_0] 그런데 저희 구급차 출발할겁니다. 문 열어주세요. [SPEAKER_1] 지금 문 열어드릴순 있어요. [SPEAKER_1] 네.'

0. 데이터 불러오기 및 미리보기

In [None]:
# Save both splits as CSV files
df_train.to_csv("train_data.csv", index=False, encoding='utf-8-sig')
df_valid.to_csv("valid_data.csv", index=False, encoding='utf-8-sig')

print("Train data saved to 'train_data.csv'")
print("Valid data saved to 'valid_data.csv'")


# Reload the CSV files (this rebinds df_train / df_valid).
# NOTE(review): after the CSV round trip, list-valued columns such as
# `utterances` and `symptom` appear to come back as plain strings (see the
# quoted values in the cell outputs) — confirm before parsing them downstream.
df_train = pd.read_csv("train_data.csv")
df_valid = pd.read_csv("valid_data.csv")

# Check the shape of each split
print("Train DataFrame Shape:", df_train.shape)
print("Valid DataFrame Shape:", df_valid.shape)

# Preview the first rows of each split
print("\nTrain Data Sample:")
print(df_train.head())
print("\nValid Data Sample:")
print(df_valid.head())


display(df_train.head())

Train data saved to 'train_data.csv'
Valid data saved to 'valid_data.csv'
Train DataFrame Shape: (127178, 20)
Valid DataFrame Shape: (15897, 20)

Train Data Sample:
                        _id  \
0  64f6b358446b19d68e3366b0   
1  64f6b358446b19d68e336723   
2  64f6b358446b19d68e3366ba   
3  64f6b358446b19d68e3366b8   
4  64f6b358446b19d68e3366e6   

                                           audioPath              recordId  \
0  20230904/Gwangju/2021/08/24/1010/converted_202...  7c2de956e154e60e5273   
1  20230904/Gwangju/2021/08/11/1006/converted_202...  aa713b74d306a37e5eab   
2  20230904/Gwangju/2021/01/15/1005/converted_202...  3810367b6ab329939131   
3  20230904/Gwangju/2021/08/11/1004/converted_202...  f579cee6bd57b0c82894   
4  20230904/Gwangju/2021/08/15/1005/converted_202...  e99f28a2f2cdbe8a5b29   

   status  startAt    endAt  \
0      12        0  37600.0   
1      12        0  54800.0   
2      12        0  46720.0   
3      12        0  59640.0   
4      12        0  7256

Unnamed: 0,_id,audioPath,recordId,status,startAt,endAt,utterances,mediaType,gender,address,disasterLarge,disasterMedium,urgencyLevel,sentiment,symptom,triage,source_file,region,type,txt
0,64f6b358446b19d68e3366b0,20230904/Gwangju/2021/08/24/1010/converted_202...,7c2de956e154e60e5273,12,0,37600.0,"[{'id': '37604774', 'startAt': 928, 'endAt': 2...",Mobile,M,광주광역시 광산구 오선동,구급,질병(중증 외),중,불안/걱정,"['기타통증', '기타', '전신쇠약', '어지러움']",준응급증상,/64f6b358446b19d68e3366b0_20210824111610.json,광주,구급,"[SPEAKER_0] 119입니다. [SPEAKER_1] 예, 엊그저, 엊그저께 코..."
1,64f6b358446b19d68e336723,20230904/Gwangju/2021/08/11/1006/converted_202...,aa713b74d306a37e5eab,12,0,54800.0,"[{'id': '996710f4', 'startAt': 30, 'endAt': 38...",Mobile,M,광주광역시 북구 운암동,구급,질병(중증 외),중,불안/걱정,"['고열', '오심']",준응급증상,/64f6b358446b19d68e336723_20210811164345.json,광주,구급,"[SPEAKER_0] 네, 어차피 도 화재로 나가면은 그게 화재 남아있으면. [SP..."
2,64f6b358446b19d68e3366ba,20230904/Gwangju/2021/01/15/1005/converted_202...,3810367b6ab329939131,12,0,46720.0,"[{'id': 'wavesurfer_rd92h0fum9o', 'startAt': 2...",Mobile,M,광주광역시 남구 월산동,구급,질병(중증 외),하,당황/난처,[],,/64f6b358446b19d68e3366ba_20210115015643.json,광주,구급,"[SPEAKER_1] 예. [SPEAKER_0] 예, 광주입니다. 여보세요? [SP..."
3,64f6b358446b19d68e3366b8,20230904/Gwangju/2021/08/11/1004/converted_202...,f579cee6bd57b0c82894,12,0,59640.0,"[{'id': 'wavesurfer_c5rq1in05lo', 'startAt': 1...",Mobile,F,광주광역시 동구 월남동,구급,질병(중증 외),중,불안/걱정,['요통'],준응급증상,/64f6b358446b19d68e3366b8_20210811231050.json,광주,구급,"[SPEAKER_0] 네, 119입니다. [SPEAKER_1] 예, 여보세요? [S..."
4,64f6b358446b19d68e3366e6,20230904/Gwangju/2021/08/15/1005/converted_202...,e99f28a2f2cdbe8a5b29,12,0,72560.0,"[{'id': 'wavesurfer_1cfrkolsas8', 'startAt': 2...",Mobile,M,광주광역시 북구 양산동,구급,질병(중증 외),상,불안/걱정,"['두통', '복통', '기타', '변비', '어지러움']",응급증상,/64f6b358446b19d68e3366e6_20210815015445.json,광주,구급,[SPEAKER_0] 아 예 119입니다. [SPEAKER_1] 예 여기 양산동이요...


1. EDA

In [None]:
# Count the missing values per column in each split
for split_name, split_frame in (("Train", df_train), ("Valid", df_valid)):
    print(f"\nMissing Values in {split_name} Data:")
    print(split_frame.isnull().sum())


Missing Values in Train Data:
_id                   0
audioPath             0
recordId              0
status                0
startAt               0
endAt                 0
utterances            0
mediaType             0
gender                0
address               0
disasterLarge         0
disasterMedium        0
urgencyLevel          0
sentiment             0
symptom               0
triage            44081
source_file           0
region                0
type                  0
txt                   0
dtype: int64

Missing Values in Valid Data:
_id                  0
audioPath            0
recordId             0
status               0
startAt              0
endAt                0
utterances           0
mediaType            0
gender               0
address              0
disasterLarge        0
disasterMedium       0
urgencyLevel         0
sentiment            0
symptom              0
triage            5555
source_file          0
region               0
type                 0
txt     

In [None]:
# Fraction of training rows whose `triage` value is missing
missing_ratio = df_train['triage'].isnull().mean()
print(f"'triage' 컬럼의 결측치 비율: {missing_ratio:.2%}")

'triage' 컬럼의 결측치 비율: 34.66%


In [None]:
# Break the rows with a missing `triage` down by dispatch `type`
missing_by_type = df_train[df_train['triage'].isnull()]['type'].value_counts()
print(missing_by_type)

type
구급    17082
구조    14425
화재     8360
기타     4214
Name: count, dtype: int64


In [None]:
# Unique values of the categorical columns.
# The label in each message is generated from the column name, so the printed
# heading always matches the column whose values are shown.
categorical_columns = ['type', 'disasterMedium', 'urgencyLevel', 'sentiment', 'symptom', 'triage']

print()
for column in categorical_columns:
    print(f"Unique Values in '{column}' column (Train Data):", df_train[column].unique())


Unique Values in 'region' column (Train Data): ['구급' '구조' '기타' '화재']
Unique Values in 'type' column (Train Data): ['질병(중증 외)' '부상' '사고' '약물중독' '기타구급' '질병(중증)' '심정지' '임산부' '안전사고' '기타구조'
 '대물사고' '자살' '기타' '일반화재' '기타화재' '산불']
Unique Values in 'urgencyLevel' column (Train Data): ['중' '하' '상']
Unique Values in 'sentiment' column (Train Data): ['불안/걱정' '당황/난처' '기타부정' '중립']
Unique Values in 'sentiment' column (Train Data): ["['기타통증', '기타', '전신쇠약', '어지러움']" "['고열', '오심']" '[]' ...
 "['두통', '기타통증', '찰과상', '타박상', '가슴불편감']" "['두통', '기타통증', '비출혈', '어지러움']"
 "['흉통', '기타통증', '호흡곤란', '찰과상', '기타']"]
Unique Values in 'sentiment' column (Train Data): ['준응급증상' nan '응급증상' '잠재응급증상' '대상외' '사망추정' '사망']


In [None]:
# Value counts of the label columns in the training split.
# Each entry pairs the printed heading with the column it describes; the
# symptom counts get their own "Symptom" heading instead of reusing
# "Disaster Type".
distribution_columns = [
    ("Urgency Level", 'urgencyLevel'),   # urgency level
    ("Sentiment", 'sentiment'),          # caller sentiment
    ("Disaster Type", 'type'),           # dispatch / disaster type
    ("Symptom", 'symptom'),              # reported symptom combination
]

for heading, column in distribution_columns:
    print(f"\n{heading} Distribution (Train Data):")
    print(df_train[column].value_counts())


Urgency Level Distribution (Train Data):
urgencyLevel
하    45044
중    42835
상    39299
Name: count, dtype: int64

Sentiment Distribution (Train Data):
sentiment
불안/걱정    68153
당황/난처    47773
중립        8388
기타부정      2864
Name: count, dtype: int64

Disaster Type Distribution (Train Data):
type
구급    96053
구조    18289
화재     8537
기타     4299
Name: count, dtype: int64

Disaster Type Distribution (Train Data):
symptom
[]                                    48105
['기타통증']                               7728
['기타']                                 5318
['복통']                                 4505
['전신쇠약']                               3098
                                      ...  
['실신', '열상', '찰과상', '타박상', '어지러움']        1
['두통', '그 밖의출혈', '찰과상', '타박상']            1
['복통', '기타통증', '그밖의통증기타', '구토']           1
['기타', '기침', '오심', '구토']                  1
['호흡곤란', '가슴불편감', '오심', '구토']             1
Name: count, Length: 3328, dtype: int64


2. 데이터 전처리

제거컬럼\
1) _id, audioPath, recordId, status, startAt, endAt, source_file - 데이터의 식별자나 메타데이터로서 모델 학습에 직접적으로 사용되지 않는다.\
2) mediaType - 접수 채널 정보로, 신고가 전화, 모바일, 챗봇 등 어떤 채널을 통해 접수되었는지 나타내어 활용할 수 있으나 결측치가 많고 현 모델에는 적합하지 않을 것으로 판단.\
3) disasterLarge - type라벨과 중복



In [None]:
# Columns removed before modelling: identifiers / recording metadata, the
# reception channel, address and region, and `disasterLarge`.
columns_to_drop = [
    "_id", "audioPath", "recordId", "status", "startAt", "endAt",
    "mediaType", "address", "disasterLarge",
    "source_file", "region"]

# Drop the columns from both splits; errors='ignore' skips names that are absent
df_train = df_train.drop(columns=columns_to_drop, errors='ignore')
df_valid = df_valid.drop(columns=columns_to_drop, errors='ignore')


In [None]:
# Replace missing `triage` values with the placeholder category 'Unknown'
df_train['triage'] = df_train['triage'].fillna('Unknown') # replace with 'Unknown'
df_valid['triage'] = df_valid['triage'].fillna('Unknown')

In [None]:
# Integer-encode the two classification targets (`type` and `disasterMedium`).
# Each encoder is fitted on the training split only and then applied to the
# validation split.
from sklearn.preprocessing import LabelEncoder

# Label encoder for `type`
type_label_encoder = LabelEncoder()
df_train['type_label'] = type_label_encoder.fit_transform(df_train['type'])
df_valid['type_label'] = type_label_encoder.transform(df_valid['type'])

num_type_labels = len(type_label_encoder.classes_)
print("Number of type labels:", num_type_labels)
print("Type label mapping:", dict(zip(type_label_encoder.classes_, range(num_type_labels))))

# Label encoder for `disasterMedium`
disaster_label_encoder = LabelEncoder()
df_train['disaster_label'] = disaster_label_encoder.fit_transform(df_train['disasterMedium'])
df_valid['disaster_label'] = disaster_label_encoder.transform(df_valid['disasterMedium'])

num_disaster_labels = len(disaster_label_encoder.classes_)
print("Number of disaster labels:", num_disaster_labels)
print("Disaster label mapping:", dict(zip(disaster_label_encoder.classes_, range(num_disaster_labels))))

Number of type labels: 4
Type label mapping: {'구급': 0, '구조': 1, '기타': 2, '화재': 3}
Number of disaster labels: 16
Disaster label mapping: {'기타': 0, '기타구급': 1, '기타구조': 2, '기타화재': 3, '대물사고': 4, '부상': 5, '사고': 6, '산불': 7, '심정지': 8, '안전사고': 9, '약물중독': 10, '일반화재': 11, '임산부': 12, '자살': 13, '질병(중증 외)': 14, '질병(중증)': 15}


In [None]:
import joblib

# Persist the two label encoders fitted during preprocessing so predicted
# class indices can be mapped back to label names later.
# NOTE(review): the target is a hard-coded Google Drive path; this presumably
# requires Drive to be mounted at /content/drive first — confirm.
joblib.dump(type_label_encoder, '/content/drive/MyDrive/멋쟁이사자처럼/실전프로젝트1/type_label_encoder.pkl')
joblib.dump(disaster_label_encoder, '/content/drive/MyDrive/멋쟁이사자처럼/실전프로젝트1/disaster_label_encoder.pkl')

In [None]:
# Drop rows whose `disasterMedium` label is missing.
# NOTE(review): this runs after the label-encoding cell has already called
# fit_transform / transform on `disasterMedium`, so it cannot protect the
# encoders from missing values; the missing-value report above shows none.
df_train = df_train.dropna(subset=['disasterMedium'])
df_valid = df_valid.dropna(subset=['disasterMedium'])

3. 데이터셋 및 데이터로더 수정

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer of the 'beomi/KcELECTRA-base' checkpoint, requesting the
# non-fast implementation (use_fast=False).
# NOTE(review): the same call is repeated in the dataset cell further down.
tokenizer = AutoTokenizer.from_pretrained('beomi/KcELECTRA-base', use_fast=False)

In [None]:
# 필요한 라이브러리 임포트
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer, ElectraModel
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
import random

# Fix the random seeds (for reproducibility)
def set_seed(seed):
    """Seed Python's `random`, NumPy and PyTorch (CPU and all CUDA devices).

    Args:
        seed: integer seed handed to every generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

# Seed every generator before any model or loader is created
set_seed(42)

# Select the device: CUDA when available, otherwise CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the tokenizer (non-fast implementation)
tokenizer = AutoTokenizer.from_pretrained('beomi/KcELECTRA-base', use_fast=False)

# Maximum sequence length in tokens; longer transcripts are truncated by the dataset
MAX_LEN = 256

# Dataset that tokenizes call transcripts for the two classification heads
class EmergencyDataset(Dataset):
    """Map-style dataset yielding tokenized transcripts with both labels.

    Args:
        texts: array-like with a `.tolist()` method holding the transcripts.
        type_labels: array-like (`.tolist()`) of integer `type` labels.
        disaster_labels: array-like (`.tolist()`) of integer disaster labels.
        tokenizer: object exposing `encode_plus`, called with
            `return_tensors='pt'` and indexed by 'input_ids' / 'attention_mask'.
        max_len: length every sequence is padded / truncated to.

    Raises:
        ValueError: if the three inputs do not have the same length.
    """

    def __init__(self, texts, type_labels, disaster_labels, tokenizer, max_len):
        self.texts = texts.tolist()
        self.type_labels = type_labels.tolist()
        self.disaster_labels = disaster_labels.tolist()
        # Fail early instead of raising IndexError (or silently ignoring
        # surplus labels) in the middle of an epoch.
        if not (len(self.texts) == len(self.type_labels) == len(self.disaster_labels)):
            raise ValueError(
                "texts, type_labels and disaster_labels must have the same length "
                f"(got {len(self.texts)}, {len(self.type_labels)}, {len(self.disaster_labels)})"
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample `idx` and return its tensors.

        Returns:
            dict with 'input_ids' and 'attention_mask' (the tokenizer output
            with its leading batch axis removed) and the scalar long tensors
            'type_labels' and 'disaster_labels'.
        """
        text = str(self.texts[idx])
        type_label = self.type_labels[idx]
        disaster_label = self.disaster_labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            # squeeze(0) removes only the batch axis added by return_tensors='pt';
            # a bare squeeze() would also drop the sequence axis when it has size 1.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'type_labels': torch.tensor(type_label, dtype=torch.long),
            'disaster_labels': torch.tensor(disaster_label, dtype=torch.long)
        }

# Data preparation (assumes df_train and df_valid already exist).
# Both must be pandas DataFrames containing the columns 'txt', 'type_label'
# and 'disaster_label'.

# Build the datasets
train_dataset = EmergencyDataset(
    df_train['txt'], df_train['type_label'], df_train['disaster_label'], tokenizer, MAX_LEN
)
valid_dataset = EmergencyDataset(
    df_valid['txt'], df_valid['type_label'], df_valid['disaster_label'], tokenizer, MAX_LEN
)

# Build the data loaders (only the training loader is shuffled)
BATCH_SIZE = 8

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

4.  커스텀 모델 클래스 정의

In [None]:
import torch.nn as nn
from transformers import ElectraModel

class MultiTaskElectraModel(nn.Module):
    """ELECTRA encoder shared by two linear classification heads.

    Args:
        num_type_labels: number of classes of the `type` head.
        num_disaster_labels: number of classes of the disaster head.
        pretrained_name: checkpoint passed to `ElectraModel.from_pretrained`
            (defaults to the checkpoint previously hard-coded here).
        dropout_prob: dropout probability applied to the first-token state
            before both heads (defaults to the previously hard-coded 0.1).
    """

    def __init__(self, num_type_labels, num_disaster_labels,
                 pretrained_name='beomi/KcELECTRA-base', dropout_prob=0.1):
        super().__init__()
        # Attribute names are kept unchanged so saved state dicts still load.
        self.electra = ElectraModel.from_pretrained(pretrained_name)
        self.dropout = nn.Dropout(dropout_prob)
        hidden_size = self.electra.config.hidden_size
        self.classifier_type = nn.Linear(hidden_size, num_type_labels)
        self.classifier_disaster = nn.Linear(hidden_size, num_disaster_labels)

    def forward(self, input_ids, attention_mask):
        """Return `(logits_type, logits_disaster)` for a batch.

        Both heads read the same representation: the first-position ([CLS])
        vector of the encoder's first output, after dropout.
        """
        outputs = self.electra(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        cls_state = outputs[0][:, 0]  # output at the [CLS] position
        cls_state = self.dropout(cls_state)

        logits_type = self.classifier_type(cls_state)
        logits_disaster = self.classifier_disaster(cls_state)

        return logits_type, logits_disaster

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# Select the device: CUDA when available, otherwise CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build the two-head model with the class counts taken from the label
# encoders and move it to the device (the module repr is the cell output).
model = MultiTaskElectraModel(num_type_labels, num_disaster_labels)
model.to(device)

MultiTaskElectraModel(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(30000, 768, padding_idx=3)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-11): 12 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((7

5. 손실 함수 및 옵티마이저 설정


In [None]:
# Loss functions: one cross-entropy criterion per classification head
criterion_type = nn.CrossEntropyLoss()
criterion_disaster = nn.CrossEntropyLoss()

# Optimizer over all model parameters (encoder and both heads)
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

6. 훈련 루프 수정


In [None]:
# Training and evaluation helpers
def train_epoch(model, data_loader, criterion_type, criterion_disaster, optimizer, device):
    """Run one training pass over `data_loader` and report its metrics.

    For every batch the two head losses are summed, back-propagated with the
    gradient norm clipped to 1.0, and the optimizer is stepped.

    Returns:
        (avg_loss, acc_type, acc_disaster, f1_type, f1_disaster) where
        `avg_loss` is the mean summed loss per batch, the accuracies are
        correct predictions divided by the dataset size (double tensors), and
        the F1 scores are weighted averages from `f1_score`.
    """
    model.train()
    running_loss = 0
    hits_type = 0
    hits_disaster = 0

    targets_type, guesses_type = [], []
    targets_disaster, guesses_disaster = [], []

    for batch in tqdm(data_loader, desc="Training"):
        optimizer.zero_grad()
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        y_type = batch['type_labels'].to(device)
        y_disaster = batch['disaster_labels'].to(device)

        out_type, out_disaster = model(input_ids=ids, attention_mask=mask)

        # Total loss: unweighted sum of the two head losses
        batch_loss = criterion_type(out_type, y_type) + criterion_disaster(out_disaster, y_disaster)

        guess_type = torch.max(out_type, dim=1).indices
        guess_disaster = torch.max(out_disaster, dim=1).indices

        hits_type += (guess_type == y_type).sum()
        hits_disaster += (guess_disaster == y_disaster).sum()
        running_loss += batch_loss.item()

        # Keep the true and predicted labels for the epoch-level F1 scores
        targets_type.extend(y_type.cpu().numpy())
        guesses_type.extend(guess_type.cpu().numpy())
        targets_disaster.extend(y_disaster.cpu().numpy())
        guesses_disaster.extend(guess_disaster.cpu().numpy())

        batch_loss.backward()
        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

    sample_count = len(data_loader.dataset)
    avg_loss = running_loss / len(data_loader)
    acc_type = hits_type.double() / sample_count
    acc_disaster = hits_disaster.double() / sample_count

    return (
        avg_loss,
        acc_type,
        acc_disaster,
        f1_score(targets_type, guesses_type, average='weighted'),
        f1_score(targets_disaster, guesses_disaster, average='weighted'),
    )


평가 함수

In [None]:
def eval_model(model, data_loader, criterion_type, criterion_disaster, device):
    """Evaluate `model` on `data_loader` without updating any weights.

    The model is put in eval mode and every batch is run under
    `torch.no_grad()`.

    Returns:
        (avg_loss, acc_type, acc_disaster, f1_type, f1_disaster) where
        `avg_loss` is the mean summed loss per batch, the accuracies are
        correct predictions divided by the dataset size (double tensors), and
        the F1 scores are weighted averages from `f1_score`.
    """
    model.eval()
    running_loss = 0
    hits_type = 0
    hits_disaster = 0

    targets_type, guesses_type = [], []
    targets_disaster, guesses_disaster = [], []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Validation"):
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            y_type = batch['type_labels'].to(device)
            y_disaster = batch['disaster_labels'].to(device)

            out_type, out_disaster = model(input_ids=ids, attention_mask=mask)

            # Total loss: unweighted sum of the two head losses
            batch_loss = criterion_type(out_type, y_type) + criterion_disaster(out_disaster, y_disaster)

            guess_type = torch.max(out_type, dim=1).indices
            guess_disaster = torch.max(out_disaster, dim=1).indices

            hits_type += (guess_type == y_type).sum()
            hits_disaster += (guess_disaster == y_disaster).sum()
            running_loss += batch_loss.item()

            # Keep the true and predicted labels for the F1 scores
            targets_type.extend(y_type.cpu().numpy())
            guesses_type.extend(guess_type.cpu().numpy())
            targets_disaster.extend(y_disaster.cpu().numpy())
            guesses_disaster.extend(guess_disaster.cpu().numpy())

    sample_count = len(data_loader.dataset)
    avg_loss = running_loss / len(data_loader)
    acc_type = hits_type.double() / sample_count
    acc_disaster = hits_disaster.double() / sample_count

    return (
        avg_loss,
        acc_type,
        acc_disaster,
        f1_score(targets_type, guesses_type, average='weighted'),
        f1_score(targets_disaster, guesses_disaster, average='weighted'),
    )

훈련 루프

In [None]:
# Optimizer and learning-rate scheduler
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

# ReduceLROnPlateau halves the learning rate when the validation loss has not
# improved for `patience` epochs.
# NOTE(review): `verbose` is reported as deprecated in recent PyTorch
# releases — confirm against the installed version.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2, verbose=True)

# Maximum number of epochs
EPOCHS = 15

# Per-epoch history (consumed by the plotting cell)
train_losses = []
train_accs_type = []
train_accs_disaster = []
train_f1s_type = []
train_f1s_disaster = []

val_losses = []
val_accs_type = []
val_accs_disaster = []
val_f1s_type = []
val_f1s_disaster = []

# Early stopping on the validation loss.
# NOTE(review): `EarlyStopping` is neither defined nor imported in the visible
# part of this notebook; it must be in scope before this cell runs. It is
# presumably the helper that checkpoints the best model to `path` — confirm.
early_stopping = EarlyStopping(patience=3, verbose=True, path='best_model.pt')

for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch + 1}/{EPOCHS}")
    train_loss, train_acc_type, train_acc_disaster, train_f1_type, train_f1_disaster = train_epoch(
        model, train_loader, criterion_type, criterion_disaster, optimizer, device
    )
    print(f"Train loss: {train_loss:.4f}, Type accuracy: {train_acc_type:.4f}, "
          f"Disaster accuracy: {train_acc_disaster:.4f}, Type F1-score: {train_f1_type:.4f}, "
          f"Disaster F1-score: {train_f1_disaster:.4f}")

    val_loss, val_acc_type, val_acc_disaster, val_f1_type, val_f1_disaster = eval_model(
        model, valid_loader, criterion_type, criterion_disaster, device
    )
    print(f"Validation loss: {val_loss:.4f}, Type accuracy: {val_acc_type:.4f}, "
          f"Disaster accuracy: {val_acc_disaster:.4f}, Type F1-score: {val_f1_type:.4f}, "
          f"Disaster F1-score: {val_f1_disaster:.4f}")

    # Record the history BEFORE the early-stopping check: appending after the
    # `break` silently dropped the metrics of the epoch that triggered the stop.
    train_losses.append(train_loss)
    train_accs_type.append(train_acc_type.item())
    train_accs_disaster.append(train_acc_disaster.item())
    train_f1s_type.append(train_f1_type)
    train_f1s_disaster.append(train_f1_disaster)

    val_losses.append(val_loss)
    val_accs_type.append(val_acc_type.item())
    val_accs_disaster.append(val_acc_disaster.item())
    val_f1s_type.append(val_f1_type)
    val_f1s_disaster.append(val_f1_disaster)

    # Scheduler step (driven by the validation loss)
    scheduler.step(val_loss)

    # Early-stopping check
    early_stopping(val_loss, model)

    if early_stopping.early_stop:
        print("Early stopping")
        break


Epoch 1/15


Training: 100%|██████████| 15898/15898 [55:36<00:00,  4.77it/s]


Train loss: 1.2000, Type accuracy: 0.9180, Disaster accuracy: 0.6571, Type F1-score: 0.9136, Disaster F1-score: 0.6314


Validation: 100%|██████████| 1988/1988 [02:10<00:00, 15.23it/s]


Validation loss: 1.2244, Type accuracy: 0.9161, Disaster accuracy: 0.6550, Type F1-score: 0.9125, Disaster F1-score: 0.6222
Validation loss decreased (inf --> 1.224424).  Saving model...

Epoch 2/15


Training: 100%|██████████| 15898/15898 [55:36<00:00,  4.77it/s]


Train loss: 1.0732, Type accuracy: 0.9324, Disaster accuracy: 0.6850, Type F1-score: 0.9294, Disaster F1-score: 0.6677


Validation: 100%|██████████| 1988/1988 [02:10<00:00, 15.25it/s]


Validation loss: 1.2735, Type accuracy: 0.9156, Disaster accuracy: 0.6515, Type F1-score: 0.9111, Disaster F1-score: 0.6379
EarlyStopping counter: 1 out of 3

Epoch 3/15


Training: 100%|██████████| 15898/15898 [55:36<00:00,  4.77it/s]


Train loss: 0.9543, Type accuracy: 0.9462, Disaster accuracy: 0.7150, Type F1-score: 0.9444, Disaster F1-score: 0.7035


Validation: 100%|██████████| 1988/1988 [02:10<00:00, 15.25it/s]


Validation loss: 1.2903, Type accuracy: 0.9165, Disaster accuracy: 0.6582, Type F1-score: 0.9145, Disaster F1-score: 0.6483
EarlyStopping counter: 2 out of 3

Epoch 4/15


Training: 100%|██████████| 15898/15898 [55:36<00:00,  4.77it/s]


Train loss: 0.8392, Type accuracy: 0.9581, Disaster accuracy: 0.7504, Type F1-score: 0.9572, Disaster F1-score: 0.7424


Validation: 100%|██████████| 1988/1988 [02:10<00:00, 15.25it/s]

Validation loss: 1.4249, Type accuracy: 0.9099, Disaster accuracy: 0.6502, Type F1-score: 0.9086, Disaster F1-score: 0.6406
EarlyStopping counter: 3 out of 3
Early stopping





In [None]:
# Load the best checkpoint written during training.
# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved from a CUDA run also loads on a CPU-only runtime.
model.load_state_dict(torch.load('best_model.pt', map_location=device))

In [None]:
# Measure the accuracy of the `type` head on the validation split and collect
# every misclassified sample for inspection. The notebook defines no test
# split, so `valid_loader` is used; the model takes input_ids / attention_mask
# and returns one logits tensor per head.
correct = 0
total = 0
# Entries are (input_ids, true_label, predicted_label, logits), all on the CPU
wrong_predict_list = []

model.eval()

with torch.no_grad():
    for batch in valid_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['type_labels'].to(device)

        outputs, logits_disaster = model(input_ids=input_ids, attention_mask=attention_mask)
        predicted = torch.max(outputs, 1).indices
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

        # Keep the samples whose prediction differs from the label
        for i in range(len(labels)):
            if predicted[i] != labels[i]:
                wrong_predict_list.append(
                    (input_ids[i].cpu(), labels[i].item(), predicted[i].item(), outputs[i].cpu())
                )
    print(f'Validation Accuracy (type): {100 * correct / total} %')

In [None]:
# Show a few misclassified samples: the class probabilities of the `type` head
# as a bar chart, followed by the decoded transcript. Each entry of
# `wrong_predict_list` is expected to be
# (input_ids, true_label, predicted_label, logits).
class_names = list(type_label_encoder.classes_)

# random.choices raises on an empty population, so guard the no-error case
samples = random.choices(population=wrong_predict_list, k=5) if wrong_predict_list else []

for token_ids, true_label, pred_label, output in samples:
    true_label, pred_label = int(true_label), int(pred_label)
    probabilities = torch.softmax(output, dim=0).cpu().numpy()

    # Turn the token ids back into text (padding and other special tokens removed)
    text = tokenizer.decode(token_ids.tolist(), skip_special_tokens=True)

    # Blue marks the true class, red the predicted one
    colors = ['gray'] * len(class_names)
    colors[true_label] = 'blue'
    colors[pred_label] = 'red'

    title = f"Truth : {class_names[true_label]}, Predict : {class_names[pred_label]}"
    plt.figure(figsize=(4, 1))
    plt.ylim(0, 1)
    plt.bar(class_names, probabilities, color = colors)
    plt.title(title)
    plt.ylabel("Probability")
    plt.xticks(rotation=45)
    plt.show()

    print(text)

In [None]:
import matplotlib.pyplot as plt

# Epoch numbers for the x axis
epochs = range(1, len(train_losses) + 1)

# One inner list per figure; each panel is
# (title, y-axis label, training series, validation series).
figure_specs = [
    # Loss plus the accuracy and F1-score of the `type` head
    [
        ('Loss', 'Loss', train_losses, val_losses),
        ('Accuracy (Type)', 'Accuracy', train_accs_type, val_accs_type),
        ('F1-score (Type)', 'F1-score', train_f1s_type, val_f1s_type),
    ],
    # Accuracy and F1-score of the disaster head
    [
        ('Accuracy (Disaster)', 'Accuracy', train_accs_disaster, val_accs_disaster),
        ('F1-score (Disaster)', 'F1-score', train_f1s_disaster, val_f1s_disaster),
    ],
]

for panels in figure_specs:
    plt.figure(figsize=(12, 4))
    for position, (panel_title, y_label, train_series, val_series) in enumerate(panels, start=1):
        plt.subplot(1, len(panels), position)
        plt.plot(epochs, train_series, label=f'Train {panel_title}')
        plt.plot(epochs, val_series, label=f'Validation {panel_title}')
        plt.title(panel_title)
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        plt.legend()

    plt.tight_layout()
    plt.show()

NameError: name 'train_losses' is not defined

7. 예측 함수 수정


In [None]:
def predict_emergency(text, model, tokenizer, summary_model, summary_tokenizer, device):
    """Classify one transcript and attach a summary and keywords.

    The text is tokenized to `MAX_LEN` tokens, both heads of `model` are
    evaluated without gradients, and the arg-max class indices are mapped back
    to label names with the module-level `type_label_encoder` and
    `disaster_label_encoder`. `summarize_text` and `extract_keywords` are
    called as-is; they are defined outside the visible part of this notebook.

    Returns:
        (type_label, disaster_label, keywords, summary)
    """
    # Encode the input
    tokenize_options = dict(
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    encoded = tokenizer.encode_plus(text, **tokenize_options)
    model_inputs = {
        key: encoded[key].to(device) for key in ('input_ids', 'attention_mask')
    }

    # Model prediction
    model.eval()
    with torch.no_grad():
        type_scores, disaster_scores = model(**model_inputs)
        type_index = torch.max(type_scores, dim=1).indices
        disaster_index = torch.max(disaster_scores, dim=1).indices

    # Decode the class indices back to label names
    predicted_type = type_label_encoder.inverse_transform(type_index.cpu().numpy())[0]
    predicted_disaster = disaster_label_encoder.inverse_transform(disaster_index.cpu().numpy())[0]

    # Generate the summary, then extract the keywords
    summary = summarize_text(text, summary_tokenizer, summary_model, device)
    return predicted_type, predicted_disaster, extract_keywords(text), summary