In [6]:
#!/usr/bin/env python
"""
视频分析统合系统
将视频分解为图片和音频，使用BLIP、CLAP和ResNet18进行分析，
然后统合结果推测狗的想法和心情
"""
"""
import os
import cv2
import torch
import librosa
import numpy as np
import pandas as pd
from PIL import Image
from transformers import (
    BitsAndBytesConfig, 
    Blip2Processor, 
    Blip2ForConditionalGeneration,
    ClapModel, 
    ClapProcessor,
    pipeline
)
from torchvision import transforms, models
from torch import nn
import tempfile
import json
from typing import Dict, List, Tuple
import warnings
warnings.filterwarnings('ignore')

class VideoAnalysisSystem:
    def __init__(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"[INFO] Using device: {self.device}")
        
        # 初始化所有模型
        self._init_blip_model()
        self._init_clap_model()
        self._init_resnet_model()
        self._init_llm_model()
        
    def _init_blip_model(self):
        """Load the BLIP-2 processor and model used for scene descriptions."""
        print("[INFO] Loading BLIP model...")
        checkpoint_name = "Salesforce/blip2-opt-2.7b"

        # 8-bit loading with the fp32 CPU-offload flag switched on.
        eight_bit_config = BitsAndBytesConfig(
            llm_int8_enable_fp32_cpu_offload=True,
            load_in_8bit=True,
        )

        self.blip_processor = Blip2Processor.from_pretrained(checkpoint_name, use_fast=True)
        self.blip_model = Blip2ForConditionalGeneration.from_pretrained(
            checkpoint_name,
            device_map="auto",
            quantization_config=eight_bit_config,
        )
        self.blip_model.eval()
        
    def _init_clap_model(self):
        """Load CLAP for audio analysis and precompute the prompt text embeddings.

        Sets ``audio_prompts`` (class name -> prompt list), ``prompt_cls`` (the
        class of each flattened prompt) and ``text_emb`` (L2-normalized text
        features, one row per prompt).
        """
        print("[INFO] Loading CLAP model...")
        clap_checkpoint = "laion/clap-htsat-fused"
        self.clap_model = ClapModel.from_pretrained(clap_checkpoint).to(self.device)
        self.clap_processor = ClapProcessor.from_pretrained(clap_checkpoint)

        # Text prompts describing each bark category.
        self.audio_prompts = {
            "Alert": [
                "sharp mid-pitch alert bark",
                "brief clear alarm bark",
                "short crisp warning bark",
                "fast staccato alert bark"
            ],
            "Territorial": [
                "deep sustained territorial bark",
                "low throaty guard bark",
                "prolonged defensive bark",
                "slow booming territorial bark"
            ],
            "Excited": [
                "high-pitched rapid excited yips",
                "quick playful bark",
                "series of lively yips",
                "fast bright excited bark"
            ],
            "Demand": [
                "steady attention-seeking bark",
                "regular rhythmic demand bark",
                "persistent repetitive request bark",
                "moderate-pitch insistence bark"
            ],
            "Fear": [
                "high-pitched trembling fearful bark",
                "quivering anxious bark",
                "shaky high anxious bark",
                "piercing nervous bark"
            ],
            "Aggressive": [
                "low guttural aggressive bark",
                "deep harsh threat bark",
                "rough menacing bark",
                "raspy growling attack bark"
            ],
            "Pain": [
                "single sharp pain yelp",
                "shrill acute pain bark",
                "sudden high pain yelp",
                "short piercing pain bark"
            ],
            "Lonely": [
                "slow spaced lonely bark",
                "mournful drawn-out bark",
                "distant monotone lonely bark",
                "long-interval melancholy bark"
            ],
            "Howl": [
                "long plaintive canine howl",
                "careless, spontaneous howl",
                "sustained mournful howl",
                "extended melodic dog howl"
            ]
        }

        # Flatten the prompts, remembering which class each one belongs to.
        flat_prompts = [
            prompt for plist in self.audio_prompts.values() for prompt in plist
        ]
        owning_class = [
            cls for cls, plist in self.audio_prompts.items() for _ in plist
        ]
        self.prompt_cls = np.array(owning_class)

        tokenized = self.clap_processor(text=flat_prompts, return_tensors="pt", padding=True)
        tokenized = {name: tensor.to(self.device) for name, tensor in tokenized.items()}

        with torch.no_grad():
            raw_text_features = self.clap_model.get_text_features(**tokenized)
            self.text_emb = torch.nn.functional.normalize(raw_text_features, dim=-1)
        
    def _init_resnet_model(self):
        """Build the ResNet18 emotion classifier and its image preprocessing.

        When ``dog_emotion.pth`` exists, the class count is read from the
        checkpoint's ``fc.weight`` (4 if that key is missing) and the weights
        are loaded. If the file is absent or loading fails, the default four
        labels and a freshly created output layer are used instead.
        """
        print("[INFO] Loading ResNet18 model...")
        self.resnet_model = models.resnet18(pretrained=True)

        def install_default_head():
            # Fallback: default labels plus a new, untrained output layer.
            emotion_classes = ['happy', 'sad', 'angry', 'calm']
            self.emotion_classes = emotion_classes
            self.resnet_model.fc = nn.Linear(self.resnet_model.fc.in_features, len(emotion_classes))

        if not os.path.exists('dog_emotion.pth'):
            print("[INFO] No pre-trained emotion model found, using default classes")
            install_default_head()
        else:
            try:
                state = torch.load('dog_emotion.pth', map_location=self.device)

                # Infer the class count from the output layer's weight rows.
                if 'fc.weight' not in state:
                    num_classes = 4
                    print(f"[INFO] Using default {num_classes} emotion classes")
                else:
                    num_classes = state['fc.weight'].shape[0]
                    print(f"[INFO] Detected {num_classes} emotion classes from checkpoint")

                # Known label sets by class count; anything else gets generic names.
                named_sets = {
                    4: ['happy', 'sad', 'angry', 'calm'],
                    5: ['happy', 'sad', 'angry', 'calm', 'excited'],
                }
                labels = named_sets.get(num_classes)
                if labels is None:
                    labels = [f'emotion_{i}' for i in range(num_classes)]

                self.emotion_classes = labels
                self.resnet_model.fc = nn.Linear(self.resnet_model.fc.in_features, num_classes)

                self.resnet_model.load_state_dict(state)
                print("[INFO] Loaded pre-trained emotion model")

            except Exception as e:
                print(f"[WARNING] Error loading emotion model: {e}")
                print("[INFO] Using default emotion classes")
                install_default_head()

        self.resnet_model = self.resnet_model.to(self.device)
        self.resnet_model.eval()

        # Resize to 224x224, convert to tensor, then normalize per channel.
        preprocessing_steps = [
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225]),
        ]
        self.image_transform = transforms.Compose(preprocessing_steps)
        
    def _init_llm_model(self):
        """初始化LLM模型用于最终分析"""
        print("[INFO] Loading LLM model...")
        try:
            self.llm = pipeline(
                "text-generation",
                model="microsoft/DialoGPT-medium",
                device=0 if torch.cuda.is_available() else -1
            )
        except:
            print("[WARNING] Could not load LLM model, will use rule-based analysis")
            self.llm = None
    
    def extract_video_components(self, video_path: str, sample_rate: int = 48000) -> Tuple[List[np.ndarray], np.ndarray]:
        """Split a video into sampled RGB frames and a mono audio track.

        Args:
            video_path: Path of the video file to read.
            sample_rate: Rate in Hz passed to ``librosa.load`` for the audio.

        Returns:
            ``(frames, audio)``: the sampled frames converted from BGR to RGB,
            and the audio samples (an empty array if the audio could not be
            loaded).
        """
        print(f"[INFO] Extracting components from {video_path}")

        frames = []
        cap = cv2.VideoCapture(video_path)
        try:
            fps = cap.get(cv2.CAP_PROP_FPS)

            # Keep one frame every `int(fps)` frames (about one per second).
            # Clamp to 1 so a reported rate below 1 fps cannot produce a
            # modulo-by-zero; fall back to 30 when no rate is reported.
            frame_interval = max(1, int(fps)) if fps > 0 else 30

            frame_count = 0
            while True:
                ret, frame = cap.read()
                if not ret:
                    break

                if frame_count % frame_interval == 0:
                    frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

                frame_count += 1
        finally:
            # Release the capture even if reading or conversion raises.
            cap.release()
        print(f"[INFO] Extracted {len(frames)} frames")

        # Audio is read from the same file; failure is non-fatal.
        try:
            audio, sr = librosa.load(video_path, sr=sample_rate, mono=True)
            print(f"[INFO] Extracted audio: {len(audio)} samples at {sr} Hz")
        except Exception as e:
            print(f"[WARNING] Could not extract audio: {e}")
            audio = np.array([])

        return frames, audio
    
    def analyze_scenes(self, frames: List[np.ndarray]) -> List[str]:
        """使用BLIP分析场景"""
        print("[INFO] Analyzing scenes with BLIP...")
        scene_descriptions = []
        
        for i, frame in enumerate(frames):
            try:
                # 转换为PIL Image
                img = Image.fromarray(frame).convert("RGB")
                
                # BLIP处理
                inputs = self.blip_processor(images=img, return_tensors="pt")
                inputs = {k: v.to(self.blip_model.device) for k, v in inputs.items()}
                
                generated_ids = self.blip_model.generate(
                    **inputs,
                    max_new_tokens=80,
                    num_beams=5,
                    no_repeat_ngram_size=2,
                )
                
                description = self.blip_processor.batch_decode(
                    generated_ids, skip_special_tokens=True
                )[0]
                
                scene_descriptions.append(description)
                print(f"[INFO] Frame {i+1}: {description}")
                
            except Exception as e:
                print(f"[WARNING] Error analyzing frame {i+1}: {e}")
                scene_descriptions.append("Unable to analyze scene")
                
        return scene_descriptions
    
    def analyze_emotions(self, frames: List[np.ndarray]) -> List[Dict]:
        """使用ResNet18分析情绪"""
        print("[INFO] Analyzing emotions with ResNet18...")
        emotion_results = []
        
        for i, frame in enumerate(frames):
            try:
                # 转换为PIL Image并预处理
                img = Image.fromarray(frame).convert("RGB")
                img_tensor = self.image_transform(img).unsqueeze(0).to(self.device)
                
                with torch.no_grad():
                    outputs = self.resnet_model(img_tensor)
                    probabilities = torch.nn.functional.softmax(outputs[0], dim=0)
                    
                    # 获取预测结果
                    pred_idx = torch.argmax(probabilities).item()
                    confidence = probabilities[pred_idx].item()
                    
                    emotion_result = {
                        'emotion': self.emotion_classes[pred_idx],
                        'confidence': confidence,
                        'all_probabilities': {
                            emotion: prob.item() 
                            for emotion, prob in zip(self.emotion_classes, probabilities)
                        }
                    }
                    
                    emotion_results.append(emotion_result)
                    print(f"[INFO] Frame {i+1}: {emotion_result['emotion']} ({confidence:.3f})")
                    
            except Exception as e:
                print(f"[WARNING] Error analyzing emotion for frame {i+1}: {e}")
                emotion_results.append({
                    'emotion': 'unknown',
                    'confidence': 0.0,
                    'all_probabilities': {}
                })
                
        return emotion_results
    
    def analyze_audio(self, audio: np.ndarray, sample_rate: int = 48000) -> Dict:
        """使用CLAP分析音频"""
        print("[INFO] Analyzing audio with CLAP...")
        
        if len(audio) == 0:
            return {
                'predicted_class': 'Silent',
                'confidence': 0.0,
                'all_probabilities': {}
            }
        
        try:
            # 分段处理音频
            seg_seconds = 10
            seg_len = seg_seconds * sample_rate
            segments = [audio[i:i+seg_len] for i in range(0, len(audio), seg_len)]
            seg_sims = []
            
            for seg in segments:
                if len(seg) < 1000:  # 跳过过短片段
                    continue
                    
                a_in = self.clap_processor(
                    audios=[seg], 
                    sampling_rate=sample_rate,
                    return_tensors="pt", 
                    padding=True
                )
                a_in = {k: v.to(self.device) for k, v in a_in.items()}
                
                with torch.no_grad():
                    a_emb = torch.nn.functional.normalize(
                        self.clap_model.get_audio_features(**a_in), dim=-1
                    )
                    
                sim = (a_emb @ self.text_emb.T).softmax(dim=-1)[0].cpu().numpy()
                seg_sims.append(sim)
            
            if not seg_sims:
                seg_sims = [np.zeros(len(self.text_emb))]
            
            sim_avg = np.mean(np.vstack(seg_sims), axis=0)
            
            # 聚合到类别
            cls_prob = {}
            for cls in self.audio_prompts:
                cls_prob[cls] = float(np.mean(sim_avg[self.prompt_cls == cls]))
            
            # 预测类别
            pred_cls = max(cls_prob, key=cls_prob.get)
            confidence = cls_prob[pred_cls]
            
            print(f"[INFO] Audio analysis: {pred_cls} ({confidence:.3f})")
            
            return {
                'predicted_class': pred_cls,
                'confidence': confidence,
                'all_probabilities': cls_prob
            }
            
        except Exception as e:
            print(f"[WARNING] Error analyzing audio: {e}")
            return {
                'predicted_class': 'Unknown',
                'confidence': 0.0,
                'all_probabilities': {}
            }
    
    def synthesize_analysis(self, scene_descriptions: List[str], 
                          emotion_results: List[Dict], 
                          audio_result: Dict) -> Dict:
        """综合分析结果"""
        print("[INFO] Synthesizing analysis...")
        
        # 统计最常见的情绪
        emotions = [result['emotion'] for result in emotion_results if result['emotion'] != 'unknown']
        most_common_emotion = max(set(emotions), key=emotions.count) if emotions else 'unknown'
        
        # 平均情绪置信度
        avg_emotion_confidence = np.mean([
            result['confidence'] for result in emotion_results 
            if result['confidence'] > 0
        ]) if emotion_results else 0.0
        
        # 场景关键词提取
        scene_keywords = []
        for desc in scene_descriptions:
            # 简单的关键词提取
            keywords = [word.lower() for word in desc.split() 
                       if len(word) > 3 and word.lower() not in ['the', 'and', 'with', 'that', 'this']]
            scene_keywords.extend(keywords)
        
        # 生成综合分析
        analysis = {
            'dominant_emotion': most_common_emotion,
            'emotion_confidence': avg_emotion_confidence,
            'audio_behavior': audio_result['predicted_class'],
            'audio_confidence': audio_result['confidence'],
            'scene_context': scene_descriptions,
            'scene_keywords': list(set(scene_keywords)),
            'detailed_emotions': emotion_results,
            'audio_details': audio_result
        }
        
        # 生成狗的想法和心情推测
        dog_analysis = self._generate_dog_thoughts(analysis)
        analysis['dog_thoughts'] = dog_analysis
        
        return analysis
    
    def _generate_dog_thoughts(self, analysis: Dict) -> Dict:
        """基于分析结果推测狗的想法和心情"""
        
        emotion = analysis['dominant_emotion']
        audio_behavior = analysis['audio_behavior']
        scene_keywords = analysis['scene_keywords']
        
        # 规则基础的推测
        thoughts = {
            'mood': 'neutral',
            'likely_thoughts': [],
            'behavioral_interpretation': '',
            'needs_attention': False
        }
        
        # 基于情绪和音频行为的推测
        if emotion == 'happy' and audio_behavior in ['Excited', 'Demand']:
            thoughts['mood'] = 'playful'
            thoughts['likely_thoughts'] = [
                'I want to play!',
                'This is fun!',
                'Pay attention to me!'
            ]
            thoughts['behavioral_interpretation'] = 'The dog appears to be in a playful mood and seeking interaction.'
            
        elif emotion == 'sad' or audio_behavior in ['Lonely', 'Pain']:
            thoughts['mood'] = 'distressed'
            thoughts['likely_thoughts'] = [
                'I feel lonely',
                'I need comfort',
                'Something is bothering me'
            ]
            thoughts['behavioral_interpretation'] = 'The dog may be experiencing distress and needs comfort or attention.'
            thoughts['needs_attention'] = True
            
        elif audio_behavior in ['Alert', 'Territorial']:
            thoughts['mood'] = 'vigilant'
            thoughts['likely_thoughts'] = [
                'Something is happening',
                'I need to protect my territory',
                'Alert! Someone is coming'
            ]
            thoughts['behavioral_interpretation'] = 'The dog is in alert mode, possibly responding to external stimuli.'
            
        elif emotion == 'angry' or audio_behavior == 'Aggressive':
            thoughts['mood'] = 'defensive'
            thoughts['likely_thoughts'] = [
                'I feel threatened',
                'Stay away from me',
                'I need to defend myself'
            ]
            thoughts['behavioral_interpretation'] = 'The dog is displaying defensive or aggressive behavior.'
            thoughts['needs_attention'] = True
            
        elif emotion == 'calm':
            thoughts['mood'] = 'relaxed'
            thoughts['likely_thoughts'] = [
                'I feel comfortable',
                'Everything is peaceful',
                'I am content'
            ]
            thoughts['behavioral_interpretation'] = 'The dog appears to be in a relaxed and comfortable state.'
            
        # 处理未知情绪或其他情绪类别
        elif emotion and emotion not in ['unknown', 'neutral']:
            thoughts['mood'] = emotion
            thoughts['likely_thoughts'] = [f'I am feeling {emotion}']
            thoughts['behavioral_interpretation'] = f'The dog is displaying {emotion} behavior.'
            
        else:
            thoughts['mood'] = 'neutral'
            thoughts['likely_thoughts'] = ['Observing the environment']
            thoughts['behavioral_interpretation'] = 'The dog is in a neutral state, observing its surroundings.'
        
        # 基于场景关键词的额外推测
        if 'food' in scene_keywords or 'eating' in scene_keywords:
            thoughts['likely_thoughts'].append('Food! I want some!')
            
        if 'person' in scene_keywords or 'human' in scene_keywords:
            thoughts['likely_thoughts'].append('My human is here!')
            
        if 'outside' in scene_keywords or 'park' in scene_keywords:
            thoughts['likely_thoughts'].append('Time for adventure!')
        
        return thoughts
    
    def analyze_video(self, video_path: str) -> Dict:
        """主函数：分析视频"""
        print(f"[INFO] Starting video analysis for: {video_path}")
        
        # 1. 提取视频组件
        frames, audio = self.extract_video_components(video_path)
        
        # 2. 分析场景
        scene_descriptions = self.analyze_scenes(frames)
        
        # 3. 分析情绪
        emotion_results = self.analyze_emotions(frames)
        
        # 4. 分析音频
        audio_result = self.analyze_audio(audio)
        
        # 5. 综合分析
        final_analysis = self.synthesize_analysis(
            scene_descriptions, emotion_results, audio_result
        )
        
        print("[INFO] Video analysis completed!")
        return final_analysis
    
    def save_results(self, results: Dict, output_path: str):
        """保存分析结果"""
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
        print(f"[INFO] Results saved to {output_path}")

def main():
    """Analyze ``dog.mp4``, print a summary and save the full result as JSON.

    Returns early, before any model is loaded, when the video file is missing.
    """
    video_path = "dog.mp4"  # replace with the path to your video

    # Check the input before building the analyzer: constructing it loads
    # every model, which is wasted work when there is nothing to analyze.
    if not os.path.exists(video_path):
        print(f"[ERROR] Video file not found: {video_path}")
        print("[INFO] Please make sure the video file exists or update the video_path variable")
        return

    analyzer = VideoAnalysisSystem()

    try:
        results = analyzer.analyze_video(video_path)

        # Print a short summary of the result.
        print("\n" + "="*50)
        print("🐕 DOG ANALYSIS SUMMARY")
        print("="*50)
        print(f"Mood: {results['dog_thoughts']['mood']}")
        print(f"Dominant Emotion: {results['dominant_emotion']}")
        print(f"Audio Behavior: {results['audio_behavior']}")
        print(f"Needs Attention: {results['dog_thoughts']['needs_attention']}")
        print(f"Behavioral Interpretation: {results['dog_thoughts']['behavioral_interpretation']}")
        print("\nLikely Thoughts:")
        for thought in results['dog_thoughts']['likely_thoughts']:
            print(f"  - {thought}")

        # Save the complete result.
        analyzer.save_results(results, "dog_analysis_results.json")

    except Exception as e:
        # Top-level boundary: report the failure instead of a traceback.
        print(f"[ERROR] An error occurred during analysis: {e}")
        print("[INFO] Please check your video file and model files")

if __name__ == "__main__":
    main()
"""

[INFO] Using device: cuda
[INFO] Loading BLIP model...


Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████| 2/2 [00:37<00:00, 18.60s/it]
Some parameters are on the meta device because they were offloaded to the cpu.


[INFO] Loading CLAP model...
[INFO] Loading ResNet18 model...
[INFO] Detected 4 emotion classes from checkpoint
[INFO] Loaded pre-trained emotion model
[INFO] Loading LLM model...



Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Device set to use cuda:0


[INFO] Starting video analysis for: dog.mp4
[INFO] Extracting components from dog.mp4
[INFO] Extracted 14 frames
[INFO] Analyzing scenes with BLIP...
[INFO] Frame 1: a person walking with a dog in a park

[INFO] Frame 2: a woman is petting a golden retriever in a park

[INFO] Frame 3: a person standing next to a dog in a park

[INFO] Frame 4: a person walking with a dog in a park

[INFO] Frame 5: a woman is playing with a dog in a park

[INFO] Frame 6: a golden retriever is playing with its owner in a park

[INFO] Frame 7: a person walking with a golden retriever in a park

[INFO] Frame 8: a woman is playing with a dog in a park

[INFO] Frame 9: a golden retriever is playing with a person in a park

[INFO] Frame 10: golden retriever playing with his owner in the park

[INFO] Frame 11: a woman is playing with a dog in a park

[INFO] Frame 12: a person walking with a dog in a park

[INFO] Frame 13: a golden retriever playing with a frisbee in a park

[INFO] Frame 14: a person petting a d

In [15]:
import os
import cv2
import torch
import librosa
import numpy as np
import pandas as pd
from PIL import Image
from transformers import (
    BitsAndBytesConfig, 
    Blip2Processor, 
    Blip2ForConditionalGeneration,
    ClapModel, 
    ClapProcessor,
    pipeline
)
from torchvision import transforms, models
from torch import nn
import tempfile
import json
from typing import Dict, List, Tuple
import warnings
warnings.filterwarnings('ignore')
import subprocess

class VideoAnalysisSystem:
    def __init__(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"[INFO] Using device: {self.device}")
        
        # 初始化所有模型
        self._init_blip_model()
        self._init_clap_model()
        self._init_resnet_model()
        self._init_llm_model()
        
    def _init_blip_model(self):
        """Load the BLIP-2 processor and model used for scene descriptions."""
        print("[INFO] Loading BLIP model...")
        checkpoint_name = "Salesforce/blip2-opt-2.7b"

        # 8-bit loading with the fp32 CPU-offload flag switched on.
        eight_bit_config = BitsAndBytesConfig(
            llm_int8_enable_fp32_cpu_offload=True,
            load_in_8bit=True,
        )

        self.blip_processor = Blip2Processor.from_pretrained(checkpoint_name, use_fast=True)

        # Offloaded weights, if any, are written under the "offload" folder.
        load_options = dict(
            quantization_config=eight_bit_config,
            device_map="auto",
            offload_folder="offload",
            offload_state_dict=True,
        )
        self.blip_model = Blip2ForConditionalGeneration.from_pretrained(
            checkpoint_name, **load_options
        )
        self.blip_model.eval()

        
    def _init_clap_model(self):
        """Load CLAP for audio analysis and precompute the prompt text embeddings.

        Sets ``audio_prompts`` (class name -> prompt list), ``prompt_cls`` (the
        class of each flattened prompt) and ``text_emb`` (L2-normalized text
        features, one row per prompt).
        """
        print("[INFO] Loading CLAP model...")
        clap_checkpoint = "laion/clap-htsat-fused"
        self.clap_model = ClapModel.from_pretrained(clap_checkpoint).to(self.device)
        self.clap_processor = ClapProcessor.from_pretrained(clap_checkpoint)

        # Text prompts describing each bark category.
        self.audio_prompts = {
            "Alert": [
                "sharp mid-pitch alert bark",
                "brief clear alarm bark",
                "short crisp warning bark",
                "fast staccato alert bark"
            ],
            "Territorial": [
                "deep sustained territorial bark",
                "low throaty guard bark",
                "prolonged defensive bark",
                "slow booming territorial bark"
            ],
            "Excited": [
                "high-pitched rapid excited yips",
                "quick playful bark",
                "series of lively yips",
                "fast bright excited bark"
            ],
            "Demand": [
                "steady attention-seeking bark",
                "regular rhythmic demand bark",
                "persistent repetitive request bark",
                "moderate-pitch insistence bark"
            ],
            "Fear": [
                "high-pitched trembling fearful bark",
                "quivering anxious bark",
                "shaky high anxious bark",
                "piercing nervous bark"
            ],
            "Aggressive": [
                "low guttural aggressive bark",
                "deep harsh threat bark",
                "rough menacing bark",
                "raspy growling attack bark"
            ],
            "Pain": [
                "single sharp pain yelp",
                "shrill acute pain bark",
                "sudden high pain yelp",
                "short piercing pain bark"
            ],
            "Lonely": [
                "slow spaced lonely bark",
                "mournful drawn-out bark",
                "distant monotone lonely bark",
                "long-interval melancholy bark"
            ],
            "Howl": [
                "long plaintive canine howl",
                "careless, spontaneous howl",
                "sustained mournful howl",
                "extended melodic dog howl"
            ]
        }

        # Flatten the prompts, remembering which class each one belongs to.
        flat_prompts = [
            prompt for plist in self.audio_prompts.values() for prompt in plist
        ]
        owning_class = [
            cls for cls, plist in self.audio_prompts.items() for _ in plist
        ]
        self.prompt_cls = np.array(owning_class)

        tokenized = self.clap_processor(text=flat_prompts, return_tensors="pt", padding=True)
        tokenized = {name: tensor.to(self.device) for name, tensor in tokenized.items()}

        with torch.no_grad():
            raw_text_features = self.clap_model.get_text_features(**tokenized)
            self.text_emb = torch.nn.functional.normalize(raw_text_features, dim=-1)
        
    def _init_resnet_model(self):
        """Build the ResNet18 emotion classifier and its image preprocessing.

        When ``dog_emotion.pth`` exists, the class count is read from the
        checkpoint's ``fc.weight`` (4 if that key is missing) and the weights
        are loaded. If the file is absent or loading fails, the default four
        labels and a freshly created output layer are used instead.
        """
        print("[INFO] Loading ResNet18 model...")
        self.resnet_model = models.resnet18(pretrained=True)

        def install_default_head():
            # Fallback: default labels plus a new, untrained output layer.
            emotion_classes = ['angry', 'happy', 'relaxed', 'sad']
            self.emotion_classes = emotion_classes
            self.resnet_model.fc = nn.Linear(self.resnet_model.fc.in_features, len(emotion_classes))

        if not os.path.exists('dog_emotion.pth'):
            print("[INFO] No pre-trained emotion model found, using default classes")
            install_default_head()
        else:
            try:
                state = torch.load('dog_emotion.pth', map_location=self.device)

                # Infer the class count from the output layer's weight rows.
                if 'fc.weight' not in state:
                    num_classes = 4
                    print(f"[INFO] Using default {num_classes} emotion classes")
                else:
                    num_classes = state['fc.weight'].shape[0]
                    print(f"[INFO] Detected {num_classes} emotion classes from checkpoint")

                # Known label sets by class count; anything else gets generic names.
                named_sets = {
                    4: ['angry', 'happy', 'relaxed', 'sad'],
                    5: ['angry', 'happy', 'relaxed', 'sad', 'calm'],
                }
                labels = named_sets.get(num_classes)
                if labels is None:
                    labels = [f'emotion_{i}' for i in range(num_classes)]

                self.emotion_classes = labels
                self.resnet_model.fc = nn.Linear(self.resnet_model.fc.in_features, num_classes)

                self.resnet_model.load_state_dict(state)
                print("[INFO] Loaded pre-trained emotion model")

            except Exception as e:
                print(f"[WARNING] Error loading emotion model: {e}")
                print("[INFO] Using default emotion classes")
                install_default_head()

        self.resnet_model = self.resnet_model.to(self.device)
        self.resnet_model.eval()

        # Resize to 224x224, convert to tensor, then normalize per channel.
        preprocessing_steps = [
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225]),
        ]
        self.image_transform = transforms.Compose(preprocessing_steps)
        
    def _init_llm_model(self):
        print("[INFO] Loading LLM model...")
        try:
            self.llm = pipeline(
                "text-generation",
                model="microsoft/DialoGPT-medium",
                device=0 if torch.cuda.is_available() else -1
            )
        except:
            print("[WARNING] Could not load LLM model, will use rule-based analysis")
            self.llm = None
    
    def extract_video_components(self, video_path: str, sample_rate: int = 48000) -> Tuple[List[np.ndarray], np.ndarray]:
        """Grab up to three RGB frames plus the mono audio track of a video.

        When the capture reports a positive frame count, the first, middle
        and last frame indices are read; otherwise up to three frames are read
        sequentially from the start.

        Args:
            video_path: Path of the video file to read.
            sample_rate: Rate in Hz passed to ``librosa.load`` for the audio.

        Returns:
            ``(frames, audio)``: the frames converted from BGR to RGB, and the
            audio samples (an empty array if the audio could not be loaded).
        """
        print(f"[INFO] Extracting components from {video_path}")

        frames = []
        cap = cv2.VideoCapture(video_path)
        try:
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            if total_frames <= 0:
                print("[WARNING] Could not get total frame count, reading sequentially.")
                for _ in range(3):
                    ret, frame = cap.read()
                    if not ret:
                        break
                    frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            else:
                # The set removes duplicate indices for very short videos.
                for idx in sorted({0, total_frames // 2, total_frames - 1}):
                    cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
                    ret, frame = cap.read()
                    if not ret:
                        print(f"[WARNING] Could not read frame at index {idx}")
                        continue
                    frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        finally:
            # Release the capture even if reading or conversion raises.
            cap.release()
        print(f"[INFO] Extracted {len(frames)} frames")

        # Audio is read from the same file; failure is non-fatal.
        try:
            audio, sr = librosa.load(video_path, sr=sample_rate, mono=True)
            print(f"[INFO] Extracted audio: {len(audio)} samples at {sr} Hz")
        except Exception as e:
            print(f"[WARNING] Could not extract audio: {e}")
            audio = np.array([])

        return frames, audio

    
    def analyze_scenes(self, frames: List[np.ndarray]) -> List[str]:
        print("[INFO] Analyzing scenes with BLIP...")
        scene_descriptions = []
        
        for i, frame in enumerate(frames):
            try:
                img = Image.fromarray(frame).convert("RGB")
                
                inputs = self.blip_processor(images=img, return_tensors="pt")
                inputs = {k: v.to(self.blip_model.device) for k, v in inputs.items()}
                
                generated_ids = self.blip_model.generate(
                    **inputs,
                    max_new_tokens=80,
                    num_beams=5,
                    no_repeat_ngram_size=2,
                )
                
                description = self.blip_processor.batch_decode(
                    generated_ids, skip_special_tokens=True
                )[0]
                
                scene_descriptions.append(description)
                print(f"[INFO] Frame {i+1}: {description}")
                
            except Exception as e:
                print(f"[WARNING] Error analyzing frame {i+1}: {e}")
                scene_descriptions.append("Unable to analyze scene")
                
        return scene_descriptions
    
    def analyze_emotions(self, frames: List[np.ndarray]) -> List[Dict]:
        print("[INFO] Analyzing emotions with ResNet18...")
        emotion_results = []
        
        for i, frame in enumerate(frames):
            try:
                # 转换为PIL Image并预处理
                img = Image.fromarray(frame).convert("RGB")
                img_tensor = self.image_transform(img).unsqueeze(0).to(self.device)
                
                with torch.no_grad():
                    outputs = self.resnet_model(img_tensor)
                    probabilities = torch.nn.functional.softmax(outputs[0], dim=0)
                    
                    # 获取预测结果
                    pred_idx = torch.argmax(probabilities).item()
                    confidence = probabilities[pred_idx].item()
                    
                    emotion_result = {
                        'emotion': self.emotion_classes[pred_idx],
                        'confidence': confidence,
                        'all_probabilities': {
                            emotion: prob.item() 
                            for emotion, prob in zip(self.emotion_classes, probabilities)
                        }
                    }
                    
                    emotion_results.append(emotion_result)
                    print(f"[INFO] Frame {i+1}: {emotion_result['emotion']} ({confidence:.3f})")
                    
            except Exception as e:
                print(f"[WARNING] Error analyzing emotion for frame {i+1}: {e}")
                emotion_results.append({
                    'emotion': 'unknown',
                    'confidence': 0.0,
                    'all_probabilities': {}
                })
                
        return emotion_results
    
    def analyze_audio(self, audio: np.ndarray, sample_rate: int = 48000) -> Dict:
        print("[INFO] Analyzing audio with CLAP...")
        
        if len(audio) == 0:
            return {
                'predicted_class': 'Silent',
                'confidence': 0.0,
                'all_probabilities': {}
            }
        
        try:
            # 分段处理音频
            seg_seconds = 10
            seg_len = seg_seconds * sample_rate
            segments = [audio[i:i+seg_len] for i in range(0, len(audio), seg_len)]
            seg_sims = []
            
            for seg in segments:
                if len(seg) < 1000:
                    continue
                    
                a_in = self.clap_processor(
                    audios=[seg], 
                    sampling_rate=sample_rate,
                    return_tensors="pt", 
                    padding=True
                )
                a_in = {k: v.to(self.device) for k, v in a_in.items()}
                
                with torch.no_grad():
                    a_emb = torch.nn.functional.normalize(
                        self.clap_model.get_audio_features(**a_in), dim=-1
                    )
                    
                sim = (a_emb @ self.text_emb.T).softmax(dim=-1)[0].cpu().numpy()
                seg_sims.append(sim)
            
            if not seg_sims:
                seg_sims = [np.zeros(len(self.text_emb))]
            
            sim_avg = np.mean(np.vstack(seg_sims), axis=0)
            
            cls_prob = {}
            for cls in self.audio_prompts:
                cls_prob[cls] = float(np.mean(sim_avg[self.prompt_cls == cls]))
            
            pred_cls = max(cls_prob, key=cls_prob.get)
            confidence = cls_prob[pred_cls]
            
            print(f"[INFO] Audio analysis: {pred_cls} ({confidence:.3f})")
            
            return {
                'predicted_class': pred_cls,
                'confidence': confidence,
                'all_probabilities': cls_prob
            }
            
        except Exception as e:
            print(f"[WARNING] Error analyzing audio: {e}")
            return {
                'predicted_class': 'Unknown',
                'confidence': 0.0,
                'all_probabilities': {}
            }
    
    def synthesize_analysis(self, scene_descriptions: List[str], 
                          emotion_results: List[Dict], 
                          audio_result: Dict) -> Dict:
        print("[INFO] Synthesizing analysis...")
        
        emotions = [result['emotion'] for result in emotion_results if result['emotion'] != 'unknown']
        most_common_emotion = max(set(emotions), key=emotions.count) if emotions else 'unknown'
        
        avg_emotion_confidence = np.mean([
            result['confidence'] for result in emotion_results 
            if result['confidence'] > 0
        ]) if emotion_results else 0.0
        
        scene_keywords = []
        for desc in scene_descriptions:
            keywords = [word.lower() for word in desc.split() 
                       if len(word) > 3 and word.lower() not in ['the', 'and', 'with', 'that', 'this']]
            scene_keywords.extend(keywords)
        
        analysis = {
            'dominant_emotion': most_common_emotion,
            'emotion_confidence': avg_emotion_confidence,
            'audio_behavior': audio_result['predicted_class'],
            'audio_confidence': audio_result['confidence'],
            'scene_context': scene_descriptions,
            'scene_keywords': list(set(scene_keywords)),
            'detailed_emotions': emotion_results,
            'audio_details': audio_result
        }
        
        dog_analysis = self._generate_dog_thoughts(analysis)
        analysis['dog_thoughts'] = dog_analysis
        
        return analysis
    
    def _generate_dog_thoughts(self, analysis: Dict) -> Dict:
        emotion = analysis['dominant_emotion']
        audio_behavior = analysis['audio_behavior']
        scene_keywords = analysis['scene_keywords']
        
        thoughts = {
            'mood': 'neutral',
            'likely_thoughts': [],
            'behavioral_interpretation': '',
            'needs_attention': False
        }
        
        if emotion == 'happy' and audio_behavior in ['Excited', 'Demand']:
            thoughts['mood'] = 'playful'
            thoughts['likely_thoughts'] = [
                'I want to play!',
                'This is fun!',
                'Pay attention to me!'
            ]
            thoughts['behavioral_interpretation'] = 'The dog appears to be in a playful mood and seeking interaction.'
            
        elif emotion == 'sad' or audio_behavior in ['Lonely', 'Pain']:
            thoughts['mood'] = 'distressed'
            thoughts['likely_thoughts'] = [
                'I feel lonely',
                'I need comfort',
                'Something is bothering me'
            ]
            thoughts['behavioral_interpretation'] = 'The dog may be experiencing distress and needs comfort or attention.'
            thoughts['needs_attention'] = True
            
        elif audio_behavior in ['Alert', 'Territorial']:
            thoughts['mood'] = 'vigilant'
            thoughts['likely_thoughts'] = [
                'Something is happening',
                'I need to protect my territory',
                'Alert! Someone is coming'
            ]
            thoughts['behavioral_interpretation'] = 'The dog is in alert mode, possibly responding to external stimuli.'
            
        elif emotion == 'angry' or audio_behavior == 'Aggressive':
            thoughts['mood'] = 'defensive'
            thoughts['likely_thoughts'] = [
                'I feel threatened',
                'Stay away from me',
                'I need to defend myself'
            ]
            thoughts['behavioral_interpretation'] = 'The dog is displaying defensive or aggressive behavior.'
            thoughts['needs_attention'] = True
            
        elif emotion == 'calm':
            thoughts['mood'] = 'relaxed'
            thoughts['likely_thoughts'] = [
                'I feel comfortable',
                'Everything is peaceful',
                'I am content'
            ]
            thoughts['behavioral_interpretation'] = 'The dog appears to be in a relaxed and comfortable state.'
            
        elif emotion and emotion not in ['unknown', 'neutral']:
            thoughts['mood'] = emotion
            thoughts['likely_thoughts'] = [f'I am feeling {emotion}']
            thoughts['behavioral_interpretation'] = f'The dog is displaying {emotion} behavior.'
            
        else:
            thoughts['mood'] = 'neutral'
            thoughts['likely_thoughts'] = ['Observing the environment']
            thoughts['behavioral_interpretation'] = 'The dog is in a neutral state, observing its surroundings.'
        
        if 'food' in scene_keywords or 'eating' in scene_keywords:
            thoughts['likely_thoughts'].append('Food! I want some!')
            
        if 'person' in scene_keywords or 'human' in scene_keywords:
            thoughts['likely_thoughts'].append('My human is here!')
            
        if 'outside' in scene_keywords or 'park' in scene_keywords:
            thoughts['likely_thoughts'].append('Time for adventure!')
        
        return thoughts
    
    def analyze_video(self, video_path: str) -> Dict:
        """Run the whole pipeline on one video file.

        Extracts frames and audio, captions and emotion-classifies the
        frames, classifies the audio, and combines everything with
        ``synthesize_analysis``.

        Args:
            video_path: Path of the video to analyze.

        Returns:
            The dict produced by ``synthesize_analysis``.
        """
        print(f"[INFO] Starting video analysis for: {video_path}")

        video_frames, waveform = self.extract_video_components(video_path)

        # Both visual analyses use the same sampled frames.
        captions = self.analyze_scenes(video_frames)
        frame_emotions = self.analyze_emotions(video_frames)
        sound = self.analyze_audio(waveform)

        report = self.synthesize_analysis(captions, frame_emotions, sound)

        print("[INFO] Video analysis completed!")
        return report
    
    def save_results(self, results: Dict, output_path: str):
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
        print(f"[INFO] Results saved to {output_path}")

def main(video_path: str = "dog.mp4", output_path: str = "dog_analysis_results.json"):
    """Analyze one video, print a summary and save the results as JSON.

    Args:
        video_path: Video file to analyze.
        output_path: Where the JSON results are written.

    Returns:
        ``None``. If ``video_path`` does not exist, an error is printed and
        no models are loaded.
    """
    # Validate the input first: constructing the analyzer loads every model,
    # which is slow and pointless when there is nothing to analyze.
    if not os.path.exists(video_path):
        print(f"[ERROR] Video file not found: {video_path}")
        print("[INFO] Please make sure the video file exists or update the video_path variable")
        return

    analyzer = VideoAnalysisSystem()

    try:
        results = analyzer.analyze_video(video_path)
        dog_thoughts = results['dog_thoughts']

        print("\n" + "="*50)
        print("🐕 DOG ANALYSIS SUMMARY")
        print("="*50)
        print(f"Mood: {dog_thoughts['mood']}")
        print(f"Dominant Emotion: {results['dominant_emotion']}")
        print(f"Audio Behavior: {results['audio_behavior']}")
        print(f"Needs Attention: {dog_thoughts['needs_attention']}")
        print(f"Behavioral Interpretation: {dog_thoughts['behavioral_interpretation']}")
        print("\nLikely Thoughts:")
        for thought in dog_thoughts['likely_thoughts']:
            print(f"  - {thought}")

        analyzer.save_results(results, output_path)

    except Exception as e:
        # Top-level boundary: report the failure instead of a traceback.
        print(f"[ERROR] An error occurred during analysis: {e}")
        print("[INFO] Please check your video file and model files")

# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

[INFO] Using device: cuda
[INFO] Loading BLIP model...


Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████| 2/2 [02:17<00:00, 68.63s/it]


[INFO] Loading CLAP model...
[INFO] Loading ResNet18 model...
[INFO] Detected 4 emotion classes from checkpoint
[INFO] Loaded pre-trained emotion model
[INFO] Loading LLM model...


Device set to use cuda:0


[INFO] Starting video analysis for: dog.mp4
[INFO] Extracting components from dog.mp4
[INFO] Extracted 3 frames
[INFO] Analyzing scenes with BLIP...
[INFO] Frame 1:  and
[INFO] Frame 2:  and
[INFO] Frame 3:  and
[INFO] Analyzing emotions with ResNet18...
[INFO] Frame 1: happy (0.855)
[INFO] Frame 2: happy (0.816)
[INFO] Frame 3: happy (0.976)
[INFO] Analyzing audio with CLAP...
[INFO] Synthesizing analysis...
[INFO] Video analysis completed!

🐕 DOG ANALYSIS SUMMARY
Mood: happy
Dominant Emotion: happy
Audio Behavior: Silent
Needs Attention: False
Behavioral Interpretation: The dog is displaying happy behavior.

Likely Thoughts:
  - I am feeling happy
[INFO] Results saved to dog_analysis_results.json
