In [4]:
import numpy as np

# 기준점: 코 끝 (landmark index 1)
def extract_relative_face_keypoints(landmarks):
    base = landmarks[1]  # 기준점 (코 끝)
    relative = []
    for lm in landmarks:
        dx = lm.x - base.x
        dy = lm.y - base.y
        relative.append([dx, dy])
    return np.array(relative).flatten()  # 936개 (468점 x 2)


In [10]:
import os
import cv2
import numpy as np
import mediapipe as mp
from tqdm import tqdm

LABELS = ['laugh', 'serious', 'surprise', 'yawn', 'none']
DATA_PATH = '../face_data'
X, y = [], []

mp_face_mesh = mp.solutions.face_mesh
face_mesh = mp_face_mesh.FaceMesh(static_image_mode=True)

for idx, label in enumerate(LABELS):
    folder = os.path.join(DATA_PATH, label)
    for file in tqdm(os.listdir(folder), desc=label):
        if not file.endswith('.jpg'):
            continue
        img_path = os.path.join(folder, file)
        img = cv2.imread(img_path)
        rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        results = face_mesh.process(rgb)
        if results.multi_face_landmarks:
            landmarks = results.multi_face_landmarks[0].landmark
            keypoints = extract_relative_face_keypoints(landmarks)
            X.append(keypoints)
            y.append(idx)

X = np.array(X)
y = np.array(y)
print("✅ 데이터셋 shape:", X.shape, y.shape)


laugh: 100%|██████████| 511/511 [00:03<00:00, 142.74it/s]
serious: 100%|██████████| 509/509 [00:03<00:00, 144.44it/s]
surprise: 100%|██████████| 539/539 [00:03<00:00, 143.05it/s]
yawn: 100%|██████████| 292/292 [00:01<00:00, 147.01it/s]
none: 100%|██████████| 338/338 [00:02<00:00, 147.54it/s]

✅ 데이터셋 shape: (2157, 936) (2157,)





In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

y_cat = to_categorical(y)
X_train, X_val, y_train, y_val = train_test_split(X, y_cat, test_size=0.2, stratify=y, random_state=42)

model = Sequential([
    Dense(256, activation='relu', input_shape=(X.shape[1],)),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(len(LABELS), activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_val, y_val))

model.save('face_expression_landmark_model.h5')


Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.2924 - loss: 1.4983 - val_accuracy: 0.5069 - val_loss: 1.3039
Epoch 2/100
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.4572 - loss: 1.2815 - val_accuracy: 0.5856 - val_loss: 1.1517
Epoch 3/100
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5480 - loss: 1.1145 - val_accuracy: 0.6319 - val_loss: 0.9426
Epoch 4/100
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6588 - loss: 0.8821 - val_accuracy: 0.6713 - val_loss: 0.7850
Epoch 5/100
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6921 - loss: 0.7669 - val_accuracy: 0.6921 - val_loss: 0.7526
Epoch 6/100
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7191 - loss: 0.7116 - val_accuracy: 0.7685 - val_loss: 0.6374
Epoch 7/100
[1m54/54[0m [32m━━━━━━━━━━━━━━━



In [14]:
import cv2
import numpy as np
import tensorflow as tf
import mediapipe as mp

LABELS = ['laugh', 'serious', 'surprise', 'yawn', 'none']
EMOJIS = {
    "laugh": ":D",         
    "serious": "-_-",
    "surprise": "!!",
    "yawn": "Zz",
    "none": "..."
}


model = tf.keras.models.load_model('face_expression_landmark_model.h5')

mp_face_mesh = mp.solutions.face_mesh
face_mesh = mp_face_mesh.FaceMesh(max_num_faces=1, refine_landmarks=False, min_detection_confidence=0.7)

def extract_relative_keypoints(landmarks):
    base = landmarks[1]
    relative = []
    for lm in landmarks:
        dx = lm.x - base.x
        dy = lm.y - base.y
        relative.append([dx, dy])
    return np.array(relative).flatten()

MOUTH_IDX = [13, 14, 78, 82, 87, 88, 95, 61, 146, 91, 181, 308, 317, 312, 311, 402]

def is_mouth_covered(landmarks, threshold=0.003):
    base = landmarks[1]
    mouth_movement = 0
    for i in MOUTH_IDX:
        lm = landmarks[i]
        dx = abs(lm.x - base.x)
        dy = abs(lm.y - base.y)
        mouth_movement += dx + dy
    avg_move = mouth_movement / len(MOUTH_IDX)
    return avg_move < threshold

def is_mouth_closed(landmarks, threshold=0.015):
    upper_lip = landmarks[13].y
    lower_lip = landmarks[14].y
    return abs(lower_lip - upper_lip) < threshold

mouth_labels = ['laugh', 'yawn', 'surprise']

cap = cv2.VideoCapture(0)
print("🎥 실시간 표정 추론 시작 — 'q' 눌러 종료")

while True:
    ret, frame = cap.read()
    if not ret:
        break

    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = face_mesh.process(rgb)

    if results.multi_face_landmarks:
        landmarks = results.multi_face_landmarks[0].landmark

        if is_mouth_covered(landmarks):
            label = "mouth covered"
            cv2.putText(frame, label, (30, 50),
                        cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 255, 255), 2)
        else:
            keypoints = extract_relative_keypoints(landmarks)
            input_data = keypoints.reshape(1, -1).astype(np.float32)

            pred_probs = model.predict(input_data, verbose=0)
            pred = np.argmax(pred_probs)
            conf = np.max(pred_probs)
            label = LABELS[pred]

            if conf >= 0.8:
                # 입 벌림 없는 상태에서 mouth-required label은 억제
                if label in mouth_labels and is_mouth_closed(landmarks):
                    cv2.putText(frame, "Uncertain (mouth closed)", (30, 50),
                                cv2.FONT_HERSHEY_SIMPLEX, 1.2, (100, 100, 100), 2)
                else:
                    emoji = EMOJIS[label]
                    cv2.putText(frame, f"{emoji} {label} ({conf:.2f})", (30, 50),
                        cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 255, 0), 2)

            else:
                cv2.putText(frame, "Uncertain", (30, 50),
                            cv2.FONT_HERSHEY_SIMPLEX, 1.2, (100, 100, 100), 2)
    else:
        cv2.putText(frame, "No face detected", (30, 50),
                    cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 0, 255), 2)

    cv2.imshow("Expression (Landmark)", frame)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()




🎥 실시간 표정 추론 시작 — 'q' 눌러 종료
