# 감성 분석 모델 학습 및 추론

In [11]:
import numpy as np
import pandas as pd
import openpyxl

### 1. 데이터 로드

In [12]:
# Load the Korean single-turn dialogue dataset from a local Excel workbook.
# NOTE(review): the path is absolute and machine-specific, so it has to be
# adjusted before this cell can run anywhere else.
# The preview shows that, besides 'Sentence' and 'Emotion', the header row
# yields extra columns ('Unnamed: 2'..'Unnamed: 4', '공포', 5468).
communication_df = pd.read_excel('/Users/woojin/Desktop/SK Networks Family AI Camp_17/SKN-17-Family-AI-Camp/NATURAL_LANGUAGE_PROCESSING/04_dl_nlp_basic/한국어_단발성_대화_데이터셋.xlsx')
communication_df.head()

Unnamed: 0,Sentence,Emotion,Unnamed: 2,Unnamed: 3,Unnamed: 4,공포,5468
0,언니 동생으로 부르는게 맞는 일인가요..??,공포,,,,놀람,5898.0
1,그냥 내 느낌일뿐겠지?,공포,,,,분노,5665.0
2,아직너무초기라서 그런거죠?,공포,,,,슬픔,5267.0
3,유치원버스 사고 낫다던데,공포,,,,중립,4830.0
4,근데 원래이런거맞나요,공포,,,,행복,6037.0


### 2. 데이터 전처리

In [13]:
# Keep only the text and label columns.
# Selecting the wanted columns (instead of dropping the unwanted ones by name)
# avoids depending on header artefacts such as the integer column label 5468,
# which is a class count taken from the sheet and would differ in another
# export, making a name-based drop raise KeyError.
communication_df = communication_df[['Sentence', 'Emotion']].reset_index(drop=True)
communication_df.head()

Unnamed: 0,Sentence,Emotion
0,언니 동생으로 부르는게 맞는 일인가요..??,공포
1,그냥 내 느낌일뿐겠지?,공포
2,아직너무초기라서 그런거죠?,공포
3,유치원버스 사고 낫다던데,공포
4,근데 원래이런거맞나요,공포


In [14]:
# Count missing values per column (the captured output reports none).
communication_df.isnull().sum()

Sentence    0
Emotion     0
dtype: int64

In [15]:
# Inspect the class distribution: the captured output lists seven emotion
# labels with between 4830 and 6037 samples each.
communication_df['Emotion'].value_counts()

Emotion
행복    6037
놀람    5898
분노    5665
공포    5468
혐오    5429
슬픔    5267
중립    4830
Name: count, dtype: int64

In [16]:
from konlpy.tag import Okt
from tqdm import tqdm
import re

# Single Okt analyzer instance reused by preprocess_korean for every sentence.
okt = Okt()

# Tokens removed after morphological analysis (particles, conjunctions and
# the generic verb "하다").
stopwords = {
    "은", "는", "이", "가", "을", "를", "에", "의", "도", "로",
    "으로", "그리고", "하지만", "또는", "에서", "하다",
}

# Patterns used by preprocess_korean, compiled once.
_NON_HANGUL_PATTERN = re.compile(r"[^ㄱ-ㅎㅏ-ㅣ가-힣\s]")
_WHITESPACE_RUN_PATTERN = re.compile(r"\s+")


def preprocess_korean(text: str, use_stem=True):
    """Clean a sentence and return its filtered Okt morphemes.

    Every character that is neither Hangul (jamo or syllable) nor whitespace
    is replaced by a space, whitespace runs are collapsed to one space and the
    result is stripped. The cleaned text is split with ``okt.morphs``; tokens
    that are in ``stopwords`` or are a single character long are discarded.

    Args:
        text: Input sentence; it is converted with ``str()`` before cleaning.
        use_stem: Forwarded to ``okt.morphs`` as its ``stem`` argument.

    Returns:
        A list of the remaining token strings, in their original order.
    """
    hangul_only = _NON_HANGUL_PATTERN.sub(" ", str(text))
    normalized = _WHITESPACE_RUN_PATTERN.sub(" ", hangul_only).strip()

    kept = []
    for morph in okt.morphs(normalized, stem=use_stem):
        # Skip stopwords and one-character tokens.
        if len(morph) <= 1 or morph in stopwords:
            continue
        kept.append(morph)
    return kept

# Not populated in this cell; kept so the name stays defined for any later use.
preprocessed_data = []

# Tokenise every sentence with preprocess_korean and store the token lists in
# the 'Sentence' column. The whole column is assigned in one statement instead
# of writing element by element through communication_df['Sentence'][idx]:
# that form is chained assignment on an intermediate Series, which pandas
# flags with a warning and which leaves the DataFrame unchanged once
# copy-on-write is active.
communication_df['Sentence'] = [
    preprocess_korean(sentence) for sentence in tqdm(communication_df['Sentence'])
]

# The position in this list is the integer class id (행복=0 ... 중립=6).
labels_str = ['행복', '놀람', '분노', '공포', '혐오', '슬픔', '중립']
label_to_int = {label: i for i, label in enumerate(labels_str)}

# Replace the string labels with their integer ids; a value that is not in
# labels_str would become NaN under map().
communication_df['Emotion'] = communication_df['Emotion'].map(label_to_int)

communication_df

100%|██████████| 38594/38594 [00:38<00:00, 1013.70it/s]


Unnamed: 0,Sentence,Emotion
0,"[언니, 동생, 부르다, 맞다, 인가요]",3
1,"[그냥, 느낌, 겠다]",3
2,"[아직, 너무, 초기, 라서, 그런]",3
3,"[유치원, 버스, 사고, 낫다]",3
4,"[근데, 원래, 이렇다, 맞다]",3
...,...,...
38589,"[솔직하다, 예보, 제대로, 세금, 이라도, 아끼다, 그냥, 폐지]",4
38590,"[재미, 없다, 망하다]",4
38591,"[공장, 도시락, 비우다, 적임, 아르바이트, 화장실, 가성, 않씯, 재료, 담다,...",4
38592,"[코딱지, 나라, 지다, 들다, 끼리, 피터지다, 싸우다, 세다, 클래스, ㅉㅉㅉ]",4


In [19]:
from gensim.models import Word2Vec

# Token lists (Word2Vec training corpus) and integer emotion labels.
sentences = communication_df['Sentence']
y = communication_df['Emotion']

# Train 200-dimensional word vectors on the tokenised sentences.
# NOTE(review): the embeddings are fitted on every sentence, i.e. before the
# train/test split made in the training cell, so held-out sentences also
# contribute to the vectors — confirm this is acceptable for the evaluation.
w2v_size = 200
w2v = Word2Vec(
    sentences=sentences,
    vector_size=w2v_size,
    window=5,
    min_count=2,  # words seen fewer than 2 times get no vector
    sg=1,  # 1 selects the skip-gram training algorithm in gensim
    epochs=10    
)

In [20]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Word -> integer id in Word2Vec vocabulary order; id 0 is reserved for padding.
vocab = {word: idx for idx, word in enumerate(w2v.wv.index_to_key, start=1)}
# Id shared by every out-of-vocabulary token (one past the last known word).
unk = len(vocab) + 1


def to_ids(tokens):
    """Map a token list to vocabulary ids, using ``unk`` for unknown tokens."""
    return [vocab.get(token, unk) for token in tokens]


X_ids = [to_ids(token_list) for token_list in sentences]

# Sequence length = 95th percentile of the encoded lengths; longer sequences
# are cut at the end and shorter ones are zero-padded at the end.
sequence_lengths = [len(ids) for ids in X_ids]
max_len = int(np.percentile(sequence_lengths, 95))
X = pad_sequences(
    X_ids,
    maxlen=max_len,
    padding='post',
    truncating='post',
    value=0,
)
y_np = y.values

In [21]:
# Embedding matrix: row 0 (padding) stays all zeros, rows 1..len(vocab) hold
# the Word2Vec vectors, and the last row is the out-of-vocabulary vector.
vocab_size = len(vocab)
emb = np.zeros((vocab_size + 2, w2v_size))
for word, row in vocab.items():
    emb[row] = w2v.wv[word]
# The unknown-token vector is the mean of all known word vectors.
emb[unk] = emb[1:vocab_size + 1].mean(axis=0)

### 3. 모델 정의 및 생성

In [27]:
import tensorflow as tf
from tensorflow.keras import Input, layers, models

# Number of output units = number of distinct integer labels.
num_classes = int(len(np.unique(y_np)))

# Bidirectional LSTM classifier over frozen Word2Vec embeddings.
# The Embedding layer is initialised from `emb` and not trained further.
# NOTE(review): mask_zero=False means padded positions (id 0, an all-zero
# embedding row) are processed by the LSTM like real tokens; since the inputs
# are post-padded, enabling masking may be worth evaluating — confirm.
model = models.Sequential([
    Input(shape=(max_len,)),
    layers.Embedding(input_dim=emb.shape[0], output_dim=emb.shape[1],
                     weights=[emb], mask_zero=False, trainable=False),
    layers.Bidirectional(layers.LSTM(128)),
    layers.Dropout(0.3),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(num_classes, activation='softmax')
])
# The sparse loss is used because the labels are integer class ids rather
# than one-hot vectors.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

### 4. 모델 학습

In [28]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Hold out 10% of the data as a test set, stratified by label, with a fixed seed.
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y_np, test_size=0.1, random_state=42, stratify=y_np
)

# Stop once val_accuracy has not improved for 3 epochs and roll back to the
# best weights; also save the best model seen so far to 'best_lstm.h5'.
es = EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True)
ckpt = ModelCheckpoint('best_lstm.h5', monitor='val_accuracy', save_best_only=True)

# validation_split=0.1 sets aside a further 10% of the training portion for
# the val_* metrics monitored by both callbacks.
history = model.fit(
    X_tr, y_tr,
    validation_split=0.1,
    epochs=15,
    batch_size=128,
    callbacks=[es, ckpt],
    verbose=1
)

# Final score on the untouched test split (the captured run logs 0.3995
# after training stopped at epoch 9).
test_loss, test_acc = model.evaluate(X_te, y_te, verbose=0)
print(f'Test Acc: {test_acc:.4f}')

Epoch 1/15
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step - accuracy: 0.3039 - loss: 1.7266



[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 68ms/step - accuracy: 0.3368 - loss: 1.6628 - val_accuracy: 0.3880 - val_loss: 1.5917
Epoch 2/15
[1m244/245[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 59ms/step - accuracy: 0.3601 - loss: 1.6078



[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 64ms/step - accuracy: 0.3655 - loss: 1.5938 - val_accuracy: 0.3964 - val_loss: 1.5536
Epoch 3/15
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 64ms/step - accuracy: 0.3783 - loss: 1.5736 - val_accuracy: 0.3949 - val_loss: 1.5447
Epoch 4/15
[1m244/245[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 59ms/step - accuracy: 0.3867 - loss: 1.5503



[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 64ms/step - accuracy: 0.3892 - loss: 1.5484 - val_accuracy: 0.4102 - val_loss: 1.5373
Epoch 5/15
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 64ms/step - accuracy: 0.3892 - loss: 1.5407 - val_accuracy: 0.3978 - val_loss: 1.5465
Epoch 6/15
[1m244/245[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 59ms/step - accuracy: 0.4012 - loss: 1.5247



[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 64ms/step - accuracy: 0.3964 - loss: 1.5293 - val_accuracy: 0.4131 - val_loss: 1.5136
Epoch 7/15
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 64ms/step - accuracy: 0.4044 - loss: 1.5146 - val_accuracy: 0.4047 - val_loss: 1.5155
Epoch 8/15
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 64ms/step - accuracy: 0.4070 - loss: 1.5111 - val_accuracy: 0.4056 - val_loss: 1.5059
Epoch 9/15
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 64ms/step - accuracy: 0.4112 - loss: 1.5004 - val_accuracy: 0.4088 - val_loss: 1.5114
Test Acc: 0.3995


### 5. 추론

Word2Vec 학습 (선택)
	•	gensim으로 네 토큰 리스트 전체에 대해 Word2Vec 학습 → 임베딩 확인.
	•	이걸 RNN/LSTM 분류기 입력으로 사용 가능. 이 방법으로 하고 싶은데 현재 내 파일 상황을 보여줄게. 그다음에 어떤 걸 해야 하는지 분석해서 알려줘.

In [36]:
def predict_tokens(tokens):
    """Predict the emotion class for a single sentence.

    Args:
        tokens: Either a list of tokens, or a raw sentence string. A raw
            string is first tokenised with ``preprocess_korean`` so that it
            goes through the same cleaning as the training data; without this
            step a string would be iterated character by character and each
            character looked up as if it were a token. A token list is used
            as given.

    Returns:
        A tuple ``(label, pred_class, confidence, probs)``: the emotion name,
        its integer class id, the highest predicted probability as a float,
        and the full probability vector returned by the model.
    """
    if isinstance(tokens, str):
        tokens = preprocess_korean(tokens)

    # Same token -> id mapping as used to build the training matrix.
    ids = to_ids(tokens)
    arr = pad_sequences([ids], maxlen=max_len, padding='post', truncating='post', value=0)

    probs = model.predict(arr, verbose=0)[0]
    pred_class = int(np.argmax(probs))
    confidence = float(probs[pred_class])

    # Invert label_to_int to turn the class id back into the emotion name.
    int_to_label = {class_id: name for name, class_id in label_to_int.items()}
    label = int_to_label[pred_class]

    return label, pred_class, confidence, probs

In [37]:
# Sanity check on a hand-written token list; the captured output predicts
# 행복 with confidence 0.9695.
example = ["오늘", "날씨", "좋다", "기분", "좋다"]
label, pred, conf, probs = predict_tokens(example)

print("입력 토큰:", example)
print("예측 클래스:", label)
print("신뢰도:", f"{conf:.4f}")
print("전체 확률분포:", probs)

입력 토큰: ['오늘', '날씨', '좋다', '기분', '좋다']
예측 클래스: 행복
신뢰도: 0.9695
전체 확률분포: [0.9694503  0.00314005 0.001149   0.00455268 0.00260659 0.00723143
 0.01186989]


In [48]:
# Second hand-written example.
# NOTE: preprocess_korean drops one-character tokens, so "비" and "와" can
# never be in the training vocabulary and are both mapped to the unknown id;
# the near-uniform probabilities in the captured output are consistent with
# the model seeing almost no known tokens.
example = ["비", "많이", "와"]
label, pred, conf, probs = predict_tokens(example)

print("입력 토큰:", example)
print("예측 클래스:", label)
print("신뢰도:", f"{conf:.4f}")
print("전체 확률분포:", probs)

입력 토큰: ['비', '많이', '와']
예측 클래스: 공포
신뢰도: 0.2471
전체 확률분포: [0.08591814 0.23476201 0.05705829 0.24710582 0.06371237 0.16405419
 0.14738919]


In [85]:
# Third hand-written example; the captured output shows a low-confidence
# prediction (0.2190), i.e. no class clearly dominates.
example = ["다이어트", "실패", "돼지"]
label, pred, conf, probs = predict_tokens(example)

print("입력 토큰:", example)
print("예측 클래스:", label)
print("신뢰도:", f"{conf:.4f}")
print("전체 확률분포:", probs)

입력 토큰: ['다이어트', '실패', '돼지']
예측 클래스: 분노
신뢰도: 0.2190
전체 확률분포: [0.11259221 0.17622636 0.21897945 0.09990779 0.15013286 0.0898891
 0.15227222]
