In [5]:
!pip install openai-whisper
!pip install transformers accelerate
!apt-get update && apt-get install -y ffmpeg

Collecting openai-whisper
  Downloading openai-whisper-20240930.tar.gz (800 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->openai-whisper)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->openai-whisper)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->openai-whisper)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch->openai-whisper)
  Downloading nvidia_cudnn_cu12-9.1.0

음성 → 텍스트 변환(STT)에는 OpenAI Whisper 패키지(`openai-whisper`)를 사용한다.

In [22]:
import whisper

# Keep the speech model under its own name. A later cell rebinds the global
# `model` to the EfficientNet classifier; if transcribe_audio() looked up
# `model` at call time it would then call .transcribe() on the wrong object.
whisper_model = whisper.load_model("base")


def transcribe_audio(audio_path, language="ko"):
    """Transcribe an audio file to text with the Whisper "base" model.

    Args:
        audio_path: Path of the audio file to transcribe.
        language: Language code forwarded to Whisper. Defaults to "ko"
            (Korean), which is what this notebook originally hard-coded.

    Returns:
        The value stored under the "text" key of Whisper's result.
    """
    result = whisper_model.transcribe(audio_path, language=language)
    return result["text"]

In [23]:
# Recording to transcribe (presumably a Google Drive path mounted in Colab).
audio_file_path = "/content/drive/MyDrive/data/record/imsosory.m4a"  # wav, mp3, m4a, webm, ogg, flac are also accepted
# `text_from_audio` is read again later when the LLM evaluation is invoked.
text_from_audio = transcribe_audio(audio_file_path)
print("Transcribed Text:", text_from_audio)



Transcribed Text:  너무너무 미안해


표정은 EfficientNet-B0 모델에 학습된 가중치(weight)를 불러와 추론(inference)한다.

In [31]:
from torchvision.models import efficientnet_b0
from PIL import Image
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, models, transforms

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build the bare architecture only. Every parameter and buffer is overwritten
# by the fine-tuned checkpoint loaded below (load_state_dict is strict by
# default), so downloading ImageNet weights through the deprecated
# `pretrained=True` flag would be wasted work.
model = efficientnet_b0(weights=None)

# Swap the stock head for a 7-way emotion classifier; the input width is read
# from the original Linear layer so it always matches the backbone.
model.classifier = nn.Sequential(
    nn.Dropout(0.5),
    nn.Linear(model.classifier[1].in_features, 7)
)

# Augmenting pipeline (random crop/flip/rotation/colour jitter). It is not
# used by the inference code in this notebook.
train_transforms = transforms.Compose([
    transforms.RandomResizedCrop(224, scale=(0.8, 1.0)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(30),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# Deterministic pipeline used for inference: fixed 224x224 resize plus the
# same per-channel normalisation as the training pipeline.
val_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# weights_only=True limits unpickling to tensors and plain containers, so a
# tampered checkpoint file cannot execute arbitrary code while loading.
model.load_state_dict(
    torch.load(
        '/content/drive/MyDrive/weight/efficient_fer2013_pretrained.pth',
        map_location=device,
        weights_only=True,
    )
)
model = model.to(device)

# Class index -> label for the 7 classifier outputs.
# NOTE(review): the order must match the class indices used when the
# checkpoint was trained (it looks alphabetical) — confirm against training.
emotion_labels = ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprised']

def final_emotion(image_path, model, transform, device):
    """Classify the facial emotion shown in a single image.

    Args:
        image_path: Path of the image file to classify.
        model: Classifier whose output indices line up with the module-level
            `emotion_labels` list. It is switched to eval mode by this call.
        transform: Callable that turns the RGB PIL image into a tensor.
        device: Device the input tensor is moved to before the forward pass.

    Returns:
        A `(label, confidence)` tuple: the label with the highest softmax
        probability and that probability as a Python float.
    """
    rgb_image = Image.open(image_path).convert('RGB')
    # Add a leading batch dimension of size 1 before moving to the device.
    batch = transform(rgb_image).unsqueeze(0).to(device)

    model.eval()
    with torch.no_grad():
        probabilities = F.softmax(model(batch), dim=1)

    top_index = torch.argmax(probabilities, dim=1).item()
    return emotion_labels[top_index], probabilities[0][top_index].item()

In [32]:
test_image_path = '/content/drive/MyDrive/data/pic/IMG_3688.jpg'
image_emotion, prob = final_emotion(test_image_path, model, val_transforms, device)
print(f"Predicted emotion: {image_emotion} ({prob*100:.2f}%)")

Predicted emotion: fear (49.42%)


LLM(Groq API로 호출하는 Llama 3)으로 발화 텍스트와 표정의 일치도를 평가한다. (로컬에서 실행하는 LLM이 아니라 호스팅된 API를 사용한다.)

In [33]:
!pip install groq
from groq import Groq
from google.colab import userdata

api_key = userdata.get('GROQ_API_KEY')
client = Groq(api_key=api_key)



In [24]:
# Korean role-play scenario that is interpolated into the evaluator prompt.
# Summary: a mother calls her child Eunwoo to dinner three times while Eunwoo
# keeps playing with toys; her voice gets a little louder, and the text ends
# by asking how Eunwoo should answer.
scenario = """
엄마: 은우야, 밥 먹으러 오렴.
은우: 네 알겠어요.
그렇지만 은우는 계속 장난감을 가지고 놀았어.
엄마: 은우야, 그만 놀고 밥 먹으러 나오렴.
은우: 네 알겠어요.
그치만 장난감을 가지고 노는 게 너무 재밌는걸.
엄마: 은우야, 밥 먹으러 오라고 3번째 말하고 있어.
엄마의 목소리는 조금 큰 것 같아.
이때 은우는 어떻게 답해야 할까?
"""

In [37]:
def evaluate_alignment(text, image_emotion, scenario_text=None, llm_client=None,
                       model_name="llama3-8b-8192"):
    """Ask an LLM to score how well spoken text and facial emotion agree.

    The prompt now embeds the `text` argument. Previously the parameter was
    ignored and the global `text_from_audio` was interpolated instead, so the
    function always scored whatever had been transcribed last.

    Args:
        text: What the user said (speech transcript).
        image_emotion: Emotion label predicted from the user's face.
        scenario_text: Scenario description placed in the prompt. Defaults to
            the module-level `scenario`.
        llm_client: Object exposing `chat.completions.create(model=, messages=)`.
            Defaults to the module-level Groq `client`.
        model_name: Chat model identifier sent with the request.

    Returns:
        The content of the first choice's message, i.e. the raw score report.

    NOTE(review): the prompt's "Only return" list names InferredSpeechEmotion
    but the "Format" section has no such line, so replies may omit it.
    """
    # Fall back to the notebook-level objects only when nothing was injected.
    if scenario_text is None:
        scenario_text = scenario
    if llm_client is None:
        llm_client = client

    prompt = f"""
You are an evaluator of emotional appropriateness in context.

Scenario:
"{scenario_text}"

User said (via speech): "{text}"
Their facial expression was: "{image_emotion}"

Step 1: Identify the emotion implied in the speech. Use Ekman's 6 basic emotions: happy, sad, angry, fear, surprise, disgust.

Step 2: Evaluate the emotional alignment between the speech and facial expression using:
1. Emotion Label Match (same emotion)
2. Valence Match (positive/negative)
3. Arousal Match (low/medium/high activation)
4. Contextual Fit (appropriate emotion for the scenario)

Score each from 0 to 25. Total is 100.

Only return:
LabelScore, ValenceScore, ArousalScore, ContextScore, TotalScore, InferredSpeechEmotion, Reason (one sentence).

Important instructions:
- Only output raw scores and a short reason.
- DO NOT restate the inputs.
- Format:
LabelScore: [0-25]
ValenceScore: [0-25]
ArousalScore: [0-25]
ContextScore: [0-25]
TotalScore: [0-100]
Reason: [short sentence]
"""

    response = llm_client.chat.completions.create(
        model=model_name,  # "llama3-70b-8192" is the larger alternative
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content

In [38]:
# Stored as `alignment_report` rather than `eval` so the built-in eval() is
# not shadowed for the rest of the kernel session.
alignment_report = evaluate_alignment(text_from_audio, image_emotion)
print(alignment_report)

LabelScore: 25
ValenceScore: 0
ArousalScore: 20
ContextScore: 20
TotalScore: 65
Reason: The speech implies fear, matching the facial expression, conveying a sense of apprehension, and fitting well in the context of the mother's increasing frustration.
