<a href="https://colab.research.google.com/github/Aayxsh/AI-ML-Projects/blob/main/better_ATI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import librosa
import os
import sys
import torch
from PIL import Image
from datetime import datetime
import time
!pip install -q clip openai-clip
import clip

# Install required packages
!pip install -q tensorflow-hub
!pip install -q git+https://github.com/openai/whisper.git
!pip install -q librosa
!pip install -q diffusers transformers accelerate
!pip install -q git+https://github.com/openai/whisper.git
# Import after installation
import tensorflow_hub as hub
import whisper
from diffusers import (
    StableDiffusionXLPipeline,
    DPMSolverMultistepScheduler,
    StableDiffusionUpscalePipeline
)

# Choose the input audio file and the image output folder.
# Local defaults are assigned first; when the notebook runs inside Google
# Colab, Drive is mounted and both paths are redirected to Drive locations.
IN_COLAB = 'google.colab' in sys.modules
AUDIO_FILE_PATH = "path/to/your/audio.wav"  # Replace with local path
OUTPUT_DIR = "./AI_Generated_Images"
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')
    AUDIO_FILE_PATH = "/content/drive/MyDrive/I wish i had a glass of water"
    OUTPUT_DIR = "/content/drive/MyDrive/AI_Generated_Images"

# Generated images are written here, so make sure the folder exists up front.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Configure GPU
print("Configuring GPU...")
physical_devices = tf.config.list_physical_devices('GPU')
# `device` / `torch_dtype` are only consumed by the PyTorch models below
# (CLIP, Stable Diffusion), so they are derived from PyTorch's own view of
# CUDA rather than from TensorFlow's device list.
cuda_available = torch.cuda.is_available()

if physical_devices:
    try:
        # Let TensorFlow grow its GPU allocation on demand instead of
        # reserving the whole card. The loop variable is deliberately not
        # called `device`, which is the global PyTorch device string.
        for gpu in physical_devices:
            tf.config.experimental.set_memory_growth(gpu, True)
        print(f"GPU configured: {len(physical_devices)} GPU(s) found")
    except Exception as e:
        # A failure to set memory growth does not mean the GPU is unusable
        # for PyTorch, so it no longer forces the CPU fallback.
        print(f"Error configuring GPU: {e}")
elif not cuda_available:
    print("No GPU found. Running on CPU.")

if cuda_available:
    device = "cuda"
    torch_dtype = torch.float16
else:
    device = "cpu"
    torch_dtype = torch.float32

# Load CLIP model globally for efficiency
print("Loading CLIP model...")
clip_model, preprocess = clip.load("ViT-B/32", device=device)
print("CLIP model loaded successfully")

# Function to download YAMNet class map
def get_yamnet_class_map():
    """Return the local path of the YAMNet class-map CSV, downloading it if absent.

    The file is stored as 'yamnet_class_map.csv' in the current working
    directory and is only fetched when that file does not already exist.

    Returns:
        str: the relative path 'yamnet_class_map.csv'.

    Raises:
        Whatever ``urllib.request.urlretrieve`` / ``os.replace`` raise when
        the download or the final rename fails.
    """
    class_map_path = 'yamnet_class_map.csv'
    if not os.path.exists(class_map_path):
        print("Downloading YAMNet class map...")
        import urllib.request
        url = 'https://raw.githubusercontent.com/tensorflow/models/master/research/audioset/yamnet/yamnet_class_map.csv'
        # Download under a temporary name and rename only on success, so an
        # interrupted transfer never leaves a truncated CSV that later runs
        # would accept as complete because the file "exists".
        partial_path = class_map_path + '.part'
        try:
            urllib.request.urlretrieve(url, partial_path)
            os.replace(partial_path, class_map_path)
        finally:
            if os.path.exists(partial_path):
                os.remove(partial_path)
    return class_map_path

# Load YAMNet model
# The model is fetched from TF-Hub inside a tf.device('/cpu:0') scope.
print("Loading YAMNet model...")
try:
    with tf.device('/cpu:0'):
        yamnet_model = hub.load('https://tfhub.dev/google/yamnet/1')
    print("YAMNet model loaded successfully on CPU")
except Exception as e:
    # Best-effort: on failure the global is set to None; the prompt builders
    # below test `yamnet_model` for truthiness before classifying sounds.
    print(f"Error loading YAMNet model: {e}")
    yamnet_model = None

def load_yamnet_class_names():
    """Return the values of the class map's third column as a list.

    The CSV path comes from get_yamnet_class_map(); the first row is treated
    as the header.
    """
    table = pd.read_csv(get_yamnet_class_map(), header=0)
    third_column = table.columns[2]
    return table[third_column].tolist()

def classify_environmental_sounds(audio_path, threshold=0.15):
    """Score an audio file with YAMNet and list the classes above a threshold.

    The audio is loaded at 16 kHz and processed in 5-second windows; windows
    shorter than one second are skipped. Scores from all windows are
    concatenated and averaged per class.

    Args:
        audio_path: path of the audio file handed to librosa.
        threshold: minimum mean score a class needs to be reported.

    Returns:
        A list of (class_name, mean_score) tuples sorted by descending score.
        An empty list is returned when no window was scored or when any
        error occurs (the error is printed, not raised).
    """
    try:
        samples, rate = librosa.load(audio_path, sr=16000)
        samples = samples.astype(np.float32)
        window = 5 * rate

        per_window_scores = []
        for start in range(0, len(samples), window):
            segment = samples[start:start + window]
            if len(segment) >= rate:
                segment_tensor = tf.convert_to_tensor(segment, dtype=tf.float32)
                with tf.device('/cpu:0'):
                    frame_scores, _, _ = yamnet_model(segment_tensor)
                per_window_scores.append(frame_scores)

        if not per_window_scores:
            return []

        stacked = tf.concat(per_window_scores, axis=0)
        mean_scores = tf.reduce_mean(stacked, axis=0).numpy()
        class_names = load_yamnet_class_names()

        hits = []
        for idx in np.where(mean_scores > threshold)[0]:
            # Guard against a class map shorter than the score vector.
            if idx < len(class_names):
                hits.append((class_names[idx], float(mean_scores[idx])))
        hits.sort(key=lambda pair: pair[1], reverse=True)
        return hits
    except Exception as e:
        print(f"Error in classify_environmental_sounds: {e}")
        return []

# Load Whisper model
# The "base" checkpoint is placed on CUDA when PyTorch reports it available,
# otherwise on CPU.
print("Loading Whisper model...")
try:
    whisper_device = "cuda" if torch.cuda.is_available() else "cpu"
    whisper_model = whisper.load_model("base", device=whisper_device)
    print(f"Whisper model loaded on {whisper_device}")
except Exception as e:
    # Best-effort: transcribe_with_validation() checks for None and then
    # returns an empty, invalid transcription.
    print(f"Error loading Whisper model: {e}")
    whisper_model = None

def transcribe_with_validation(audio_path, min_confidence=0.5, min_words=3):
    """Transcribe speech with the global Whisper model and judge its usability.

    Args:
        audio_path: path of the audio file passed to ``whisper_model.transcribe``.
        min_confidence: minimum average segment confidence for a valid result.
        min_words: minimum number of whitespace-separated words required.

    Returns:
        dict with keys:
            'text'       - stripped transcription ('' on failure),
            'confidence' - mean per-segment confidence as a float
                           (0.8 when there are no segments, 0 on failure),
            'is_valid'   - True when confidence, word count and the presence
                           of at least one alphabetic character all pass.
        Errors are printed and reported as an empty, invalid result.
    """
    try:
        if whisper_model is None:
            return {'text': '', 'confidence': 0, 'is_valid': False}
        result = whisper_model.transcribe(audio_path)
        transcription = result.get("text", "").strip()
        segments = result.get("segments", [])

        confidences = []
        for seg in segments:
            if 'confidence' in seg:
                confidences.append(float(seg['confidence']))
            elif 'avg_logprob' in seg:
                # Whisper segments expose the mean token log-probability as
                # 'avg_logprob' rather than a 'confidence' field; exp() maps
                # it to a probability, capped at 1.0.
                confidences.append(min(1.0, float(np.exp(seg['avg_logprob']))))
            else:
                # No usable signal on this segment: keep the neutral default.
                confidences.append(0.7)

        avg_confidence = float(np.mean(confidences)) if confidences else 0.8
        # bool() keeps the flag a plain Python bool rather than a NumPy bool.
        is_valid = bool(
            avg_confidence >= min_confidence and
            len(transcription.split()) >= min_words and
            any(c.isalpha() for c in transcription)
        )
        return {
            'text': transcription,
            'confidence': avg_confidence,
            'is_valid': is_valid
        }
    except Exception as e:
        print(f"Error in transcribe_with_validation: {e}")
        return {'text': '', 'confidence': 0, 'is_valid': False}

def analyze_audio_mood(audio_path):
    """Derive prompt-ready mood descriptors from simple audio features.

    The audio is loaded at 22.05 kHz; mean RMS decides the energy descriptor
    and the mean spectral centroid decides the lighting descriptor. Tempo is
    also classified into the internal ``mood`` dict but is not currently
    mapped to a descriptor.

    Args:
        audio_path: path of the audio file handed to librosa.

    Returns:
        list[str]: two descriptors (atmosphere, lighting), or an empty list
        if any step fails (the error is printed, not raised).
    """
    try:
        # Load audio
        y, sr = librosa.load(audio_path, sr=22050)

        # Extract features. Depending on the librosa version, beat_track may
        # return the tempo as a one-element array, so reduce it to a plain
        # float before comparing against thresholds.
        tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
        tempo = float(np.atleast_1d(tempo)[0])
        spectral_centroid = float(librosa.feature.spectral_centroid(y=y, sr=sr).mean())
        rms = float(librosa.feature.rms(y=y).mean())

        # Determine mood based on features
        mood = {}
        mood['energy'] = 'high' if rms > 0.1 else 'low'
        mood['tempo'] = 'fast' if tempo > 120 else 'moderate' if tempo > 90 else 'slow'
        mood['brightness'] = 'bright' if spectral_centroid > 2000 else 'warm' if spectral_centroid > 1000 else 'dark'

        # Create mood descriptors for prompts
        mood_descriptors = []
        if mood['energy'] == 'high':
            mood_descriptors.append('energetic atmosphere')
        else:
            mood_descriptors.append('calm atmosphere')

        if mood['brightness'] == 'bright':
            mood_descriptors.append('bright lighting')
        elif mood['brightness'] == 'warm':
            mood_descriptors.append('warm lighting')
        else:
            mood_descriptors.append('moody lighting')

        return mood_descriptors
    except Exception as e:
        print(f"Error analyzing audio mood: {e}")
        return []


def create_image_prompt(audio_path, env_threshold=0.15):
    """Build a simple image prompt from detected sounds and transcribed speech.

    Args:
        audio_path: audio file to analyse.
        env_threshold: score threshold forwarded to the sound classifier.

    Returns:
        (prompt, audio_metrics) where audio_metrics holds
        'avg_sound_confidence', 'transcription_confidence' and
        'is_transcription_valid'. On any error a generic prompt and zeroed
        metrics are returned and the error is printed.
    """
    try:
        detected = []
        if yamnet_model:
            detected = classify_environmental_sounds(audio_path, threshold=env_threshold)
        speech = transcribe_with_validation(audio_path)

        # Compute audio metrics
        if detected:
            mean_sound_score = np.mean([score for _, score in detected])
        else:
            mean_sound_score = 0
        audio_metrics = {
            'avg_sound_confidence': mean_sound_score,
            'transcription_confidence': speech['confidence'],
            'is_transcription_valid': speech['is_valid']
        }

        # Valid speech leads the prompt; otherwise fall back to the sounds,
        # and finally to a generic scene.
        labels = [label for label, _ in detected]
        if speech['is_valid']:
            prompt = f"A scene with a person speaking: {speech['text']}"
            if labels:
                prompt += f" in an environment with {', '.join(labels[:3])}"
        elif labels:
            prompt = f"A realistic scene with {', '.join(labels[:5])}"
        else:
            prompt = "A realistic environmental scene"

        return prompt, audio_metrics
    except Exception as e:
        print(f"Error in create_image_prompt: {e}")
        return "Realistic natural environment scene", {'avg_sound_confidence': 0, 'transcription_confidence': 0, 'is_transcription_valid': False}

def create_enhanced_image_prompt(audio_path, env_threshold=0.15):
    """Build a detailed, photorealistic image prompt from an audio file.

    Args:
        audio_path: audio file to analyse.
        env_threshold: score threshold forwarded to the sound classifier.

    Returns:
        (prompt, audio_metrics) where audio_metrics holds
        'avg_sound_confidence', 'transcription_confidence' and
        'is_transcription_valid'. Unlike create_image_prompt, this function
        has no error handler of its own.
    """
    if yamnet_model:
        detections = classify_environmental_sounds(audio_path, threshold=env_threshold)
    else:
        detections = []
    speech = transcribe_with_validation(audio_path)

    # At most five sound labels are ever used in a prompt.
    top_labels = [name for name, _ in detections[:5]]

    audio_metrics = {
        'avg_sound_confidence': np.mean([score for _, score in detections]) if detections else 0,
        'transcription_confidence': speech['confidence'],
        'is_transcription_valid': speech['is_valid']
    }

    # No usable speech: describe the scene through its sounds, if any.
    if not speech['is_valid']:
        if top_labels:
            scene = ", ".join(top_labels)
            return f"Photorealistic scene featuring {scene}, natural lighting, detailed environment, high resolution", audio_metrics
        return "Photorealistic natural environment, perfect lighting, detailed scene, high resolution", audio_metrics

    # Usable speech: assemble the prompt from ordered fragments.
    spoken_text = speech['text']
    fragments = [f"Photorealistic scene depicting '{spoken_text}', "]

    # Punctuation in the transcript is used as a cue for the mood.
    if "?" in spoken_text:
        fragments.append("with a curious or questioning atmosphere, ")
    elif "!" in spoken_text:
        fragments.append("with an excited or emphatic mood, ")

    if top_labels:
        fragments.append(f"in an environment with {', '.join(top_labels[:3])}, ")

    fragments.append("high detail, perfect lighting, cinematic composition, 8k resolution")
    return "".join(fragments), audio_metrics


class ImageGenerator:
    """Lazy wrapper around the Stable Diffusion XL text-to-image pipeline.

    The pipeline is only loaded on the first call that needs it, and every
    generated image is saved as a PNG under ``OUTPUT_DIR``.
    """

    def __init__(self):
        self.base_model_loaded = False
        # The upscaler attributes are initialised here but not used by any
        # method of this class.
        self.upscaler_loaded = False
        self.base_model = None
        self.upscaler = None

    def load_base_model(self):
        """Load and configure the SDXL pipeline once; later calls are no-ops."""
        if self.base_model_loaded:
            return

        print("Loading Stable Diffusion XL model...")
        pipeline = StableDiffusionXLPipeline.from_pretrained(
            "stabilityai/stable-diffusion-xl-base-1.0",
            torch_dtype=torch_dtype,
            variant="fp16",
            use_safetensors=True
        )
        pipeline.scheduler = DPMSolverMultistepScheduler.from_config(
            pipeline.scheduler.config,
            algorithm_type="sde-dpmsolver++",
            use_karras_sigmas=True
        )
        pipeline.enable_attention_slicing()
        if torch.cuda.is_available():
            # Model CPU offload manages device placement itself, so the
            # pipeline is not moved to the GPU by hand beforehand.
            pipeline.enable_model_cpu_offload()
        else:
            pipeline = pipeline.to(device)

        self.base_model = pipeline
        self.base_model_loaded = True
        print("Base model loaded successfully")

    def generate_image(self, prompt, negative_prompt=None, guidance_scale=7.5, steps=30, width=1024, height=1024, num_images=1):
        """Generate images for a prompt and save them to ``OUTPUT_DIR``.

        Args:
            prompt: text prompt for the pipeline.
            negative_prompt: text to steer away from; a default is used when None.
            guidance_scale: passed to the pipeline as ``guidance_scale``.
            steps: passed to the pipeline as ``num_inference_steps``.
            width, height: output size passed to the pipeline.
            num_images: passed to the pipeline as ``num_images_per_prompt``.

        Returns:
            (images, info) where ``images`` is the pipeline's ``.images`` and
            ``info`` is a dict with 'generation_time' (seconds spent in the
            pipeline call) and 'image_paths' (saved file paths).
        """
        if not self.base_model_loaded:
            self.load_base_model()
        if negative_prompt is None:
            negative_prompt = "deformed, bad anatomy, blurry, low-res"

        start_time = time.time()
        images = self.base_model(
            prompt=prompt,
            negative_prompt=negative_prompt,
            guidance_scale=guidance_scale,
            num_inference_steps=steps,
            width=width,
            height=height,
            num_images_per_prompt=num_images
        ).images
        gen_time = time.time() - start_time

        # Save images and collect paths. One timestamp per batch, with
        # microseconds, so two calls within the same second cannot overwrite
        # each other's files.
        os.makedirs(OUTPUT_DIR, exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
        image_paths = []
        for idx, img in enumerate(images):
            filename = f"{OUTPUT_DIR}/image_{timestamp}_{idx}.png"
            img.save(filename)
            image_paths.append(filename)

        return images, {'generation_time': gen_time, 'image_paths': image_paths}

def enhanced_clip_score(image, text):
    """Score image/text agreement with CLIP over several phrasings of the text.

    The text is scored as-is and wrapped in three templates; the image and
    each phrasing are embedded, L2-normalised and compared by dot product.

    Args:
        image: image accepted by the global CLIP ``preprocess`` transform
            (presumably a PIL image - confirm against callers).
        text: prompt to compare the image against; over-long token sequences
            are truncated by the tokenizer.

    Returns:
        float: the highest similarity among the phrasings, multiplied by 100.
    """
    phrasings = (
        text,
        f"a photo of {text}",
        f"a high-quality image showing {text}",
        f"a realistic scene with {text}",
    )

    pixels = preprocess(image).unsqueeze(0).to(device)

    similarities = []
    with torch.no_grad():
        # The image embedding is computed once and reused for every phrasing.
        image_vec = clip_model.encode_image(pixels)
        image_vec /= image_vec.norm(dim=-1, keepdim=True)

        for phrase in phrasings:
            tokens = clip.tokenize([phrase], truncate=True).to(device)
            text_vec = clip_model.encode_text(tokens)
            text_vec /= text_vec.norm(dim=-1, keepdim=True)
            similarities.append((image_vec @ text_vec.T).item())

    # Scale to 0-100 range for readability
    best_similarity = max(similarities)
    return best_similarity * 100

def optimize_prompt_with_clip(base_prompt, iterations=2, max_variations=3):
    """Iteratively refine a prompt using CLIP feedback on small test images.

    Each round scores the current prompt and some "current prompt + quality
    modifier" variants by generating a 256x256 test image for each and
    computing its CLIP score. The best candidate of a round becomes the
    starting point of the next round.

    Args:
        base_prompt: prompt to start from.
        iterations: number of refinement rounds; 0 returns the prompt unchanged.
        max_variations: maximum number of candidates (including the current
            prompt itself) rendered per round.

    Returns:
        (best_prompt, best_score): the best-scoring prompt seen and its CLIP
        score; ``(base_prompt, 0)`` when nothing was scored.
    """
    current_prompt = base_prompt
    best_prompt = base_prompt
    best_score = None

    # Potential modifiers to enhance visual quality
    modifiers = [
        "highly detailed", "sharp focus", "dramatic lighting",
        "professional photography", "8k resolution", "award-winning photograph",
        "stunning composition", "photorealistic", "vivid colors"
    ]

    # One generator is shared across all test renders so the diffusion
    # pipeline is loaded once instead of once per candidate. It is created
    # lazily so that iterations=0 never loads the model.
    generator = None

    for _ in range(iterations):
        # The unmodified prompt goes first so it always survives the
        # max_variations cut; modifiers already present are skipped to avoid
        # stacking the same phrase twice.
        lowered = current_prompt.lower()
        candidates = [current_prompt]
        candidates.extend(
            f"{current_prompt}, {modifier}"
            for modifier in modifiers
            if modifier.lower() not in lowered
        )
        candidates = candidates[:max_variations]
        if not candidates:
            break

        if generator is None:
            generator = ImageGenerator()

        # Generate a small test image per candidate and score it.
        test_scores = []
        for var_prompt in candidates:
            test_img, _ = generator.generate_image(
                var_prompt, width=256, height=256, steps=15, num_images=1
            )
            score = enhanced_clip_score(test_img[0], var_prompt)
            test_scores.append((var_prompt, score))

        # Keep the overall best, and continue from this round's winner.
        best_var, score = max(test_scores, key=lambda x: x[1])
        if best_score is None or score > best_score:
            best_prompt = best_var
            best_score = score
        current_prompt = best_var

    return best_prompt, (0 if best_score is None else best_score)


def improved_audio_to_image(audio_file_path, num_images=1):
    """Run the enhanced audio-to-image pipeline and collect metrics.

    Steps: build the enhanced prompt, append mood descriptors, generate the
    images, then score each image against the final prompt with CLIP.

    Args:
        audio_file_path: audio file to convert.
        num_images: number of images to generate for the prompt.

    Returns:
        (images, final_prompt, metrics) where metrics is a dict of timings,
        audio metrics, prompts, mood descriptors and CLIP scores.
    """
    started_at = time.time()

    # Step 1: base prompt from the audio content.
    base_prompt, audio_metrics = create_enhanced_image_prompt(audio_file_path)

    # Step 2: append mood descriptors when any were extracted.
    mood_descriptors = analyze_audio_mood(audio_file_path)
    if mood_descriptors:
        enhanced_prompt = f"{base_prompt}, {', '.join(mood_descriptors)}"
    else:
        enhanced_prompt = base_prompt

    # Step 3: CLIP-guided prompt optimisation is available through
    # optimize_prompt_with_clip(enhanced_prompt) but is skipped here because
    # it is time-consuming.
    final_prompt = enhanced_prompt

    # Step 4: generate the final images.
    images, image_metrics = ImageGenerator().generate_image(
        final_prompt,
        negative_prompt="deformed, distorted, low quality, blurry, nsfw, watermark, signature",
        guidance_scale=8.5,  # Slightly higher guidance for better prompt adherence
        steps=40,  # More steps for higher quality
        num_images=num_images
    )

    # Step 5: score every image against the prompt that produced it.
    clip_scores = []
    for picture in images:
        clip_scores.append(enhanced_clip_score(picture, final_prompt))

    elapsed = time.time() - started_at
    metrics = {
        'total_execution_time': elapsed,
        'generation_time': image_metrics['generation_time'],
        'avg_sound_confidence': audio_metrics['avg_sound_confidence'],
        'transcription_confidence': audio_metrics['transcription_confidence'],
        'is_transcription_valid': audio_metrics['is_transcription_valid'],
        'prompt_length': len(final_prompt.split()),
        'mood_descriptors': mood_descriptors,
        'base_prompt': base_prompt,
        'final_prompt': final_prompt,
        'clip_scores': clip_scores,
        'avg_clip_score': np.mean(clip_scores) if clip_scores else 0,
        'success': True
    }

    return images, final_prompt, metrics

# Script entry point: run the enhanced pipeline on AUDIO_FILE_PATH and print
# a report of the prompt and collected metrics.
if __name__ == "__main__":
    print(f"Using audio file: {AUDIO_FILE_PATH}")
    if os.path.exists(AUDIO_FILE_PATH):
        print("\nProcessing audio to generate images...")
        try:
            images, prompt, metrics = improved_audio_to_image(AUDIO_FILE_PATH, num_images=1)

            print("\n=== AUDIO ANALYSIS AND IMAGE GENERATION COMPLETE ===")
            print(f"Prompt: {prompt}")
            print(f"Generated {len(images)} images in {OUTPUT_DIR}")

            print("\n=== METRICS ===")
            print(f"Base Prompt: {metrics['base_prompt']}")
            print(f"Final Prompt: {metrics['final_prompt']}")
            print(f"Mood Descriptors: {metrics['mood_descriptors']}")

            print(f"Total Execution Time: {metrics['total_execution_time']:.2f} seconds")
            print(f"Image Generation Time: {metrics['generation_time']:.2f} seconds")
            print(f"Avg Sound Confidence: {metrics['avg_sound_confidence']:.2f}")
            print(f"Transcription Confidence: {metrics['transcription_confidence']:.2f}")
            print(f"Transcription Valid: {metrics['is_transcription_valid']}")
            print(f"Prompt Length: {metrics['prompt_length']} words")
            # Scores are rendered with two decimals before being printed as a list.
            score_strings = [f'{score:.2f}' for score in metrics['clip_scores']]
            print(f"CLIP Scores: {score_strings}")
            print(f"Avg CLIP Score: {metrics['avg_clip_score']:.2f}")
            print(f"Success: {metrics['success']}")
            print("=================")
        except Exception as e:
            # Any failure is reported on stdout; nothing is re-raised.
            print(f"Error during processing: {e}")
            print("Metrics collection incomplete due to failure")
    else:
        print(f"ERROR: File not found at {AUDIO_FILE_PATH}")

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Configuring GPU...
GPU configured: 1 GPU(s) found
Loading CLIP model...
CLIP model loaded successfully
Loading YAMNet model...
YAMNet model loaded successfully on CPU
Loading Whisper model...


  checkpoint = torch.load(fp, map_location=device)


Whisper model loaded on cuda
Using audio file: /content/drive/MyDrive/I wish i had a glass of water

Processing audio to generate images...


  waveform, sr = librosa.load(audio_path, sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  y, sr = librosa.load(audio_path, sr=22050)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Loading Stable Diffusion XL model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model_index.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

Fetching 19 files:   0%|          | 0/19 [00:00<?, ?it/s]

special_tokens_map.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/575 [00:00<?, ?B/s]

scheduler_config.json:   0%|          | 0.00/479 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/565 [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

model.fp16.safetensors:   0%|          | 0.00/246M [00:00<?, ?B/s]

model.fp16.safetensors:   0%|          | 0.00/1.39G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/737 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/460 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/725 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.68k [00:00<?, ?B/s]

diffusion_pytorch_model.fp16.safetensors:   0%|          | 0.00/5.14G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

diffusion_pytorch_model.fp16.safetensors:   0%|          | 0.00/167M [00:00<?, ?B/s]

diffusion_pytorch_model.fp16.safetensors:   0%|          | 0.00/167M [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

Base model loaded successfully


  0%|          | 0/40 [00:00<?, ?it/s]


=== AUDIO ANALYSIS AND IMAGE GENERATION COMPLETE ===
Prompt: Photorealistic scene depicting 'I wish I had something to drink, maybe a glass of water, maybe some orange juice, or maybe some glass of cold cola.', in an environment with Speech, high detail, perfect lighting, cinematic composition, 8k resolution, calm atmosphere, warm lighting
Generated 1 images in /content/drive/MyDrive/AI_Generated_Images

=== METRICS ===
Base Prompt: Photorealistic scene depicting 'I wish I had something to drink, maybe a glass of water, maybe some orange juice, or maybe some glass of cold cola.', in an environment with Speech, high detail, perfect lighting, cinematic composition, 8k resolution
Final Prompt: Photorealistic scene depicting 'I wish I had something to drink, maybe a glass of water, maybe some orange juice, or maybe some glass of cold cola.', in an environment with Speech, high detail, perfect lighting, cinematic composition, 8k resolution, calm atmosphere, warm lighting
Mood Descriptors: 

Better Version:

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import librosa
import os
import sys
import torch
from PIL import Image
from datetime import datetime
import time
!pip install -q clip openai-clip
import clip

# Install required packages
!pip install -q tensorflow-hub
!pip install -q git+https://github.com/openai/whisper.git
!pip install -q librosa
!pip install -q diffusers transformers accelerate
!pip install -q git+https://github.com/openai/whisper.git
# Import after installation
import tensorflow_hub as hub
import whisper
from diffusers import (
    StableDiffusionXLPipeline,
    DPMSolverMultistepScheduler,
    StableDiffusionUpscalePipeline
)

# Choose the input audio file and the image output folder.
# Local defaults are assigned first; when the notebook runs inside Google
# Colab, Drive is mounted and both paths are redirected to Drive locations.
IN_COLAB = 'google.colab' in sys.modules
AUDIO_FILE_PATH = "path/to/your/audio.wav"  # Replace with local path
OUTPUT_DIR = "./AI_Generated_Images"
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')
    AUDIO_FILE_PATH = "/content/drive/MyDrive/I wish i had a glass of water"
    OUTPUT_DIR = "/content/drive/MyDrive/AI_Generated_Images"

# Generated images are written here, so make sure the folder exists up front.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Configure GPU
print("Configuring GPU...")
physical_devices = tf.config.list_physical_devices('GPU')
# `device` / `torch_dtype` are only consumed by the PyTorch models below
# (CLIP, Stable Diffusion), so they are derived from PyTorch's own view of
# CUDA rather than from TensorFlow's device list.
cuda_available = torch.cuda.is_available()

if physical_devices:
    try:
        # Let TensorFlow grow its GPU allocation on demand instead of
        # reserving the whole card. The loop variable is deliberately not
        # called `device`, which is the global PyTorch device string.
        for gpu in physical_devices:
            tf.config.experimental.set_memory_growth(gpu, True)
        print(f"GPU configured: {len(physical_devices)} GPU(s) found")
    except Exception as e:
        # A failure to set memory growth does not mean the GPU is unusable
        # for PyTorch, so it no longer forces the CPU fallback.
        print(f"Error configuring GPU: {e}")
elif not cuda_available:
    print("No GPU found. Running on CPU.")

if cuda_available:
    device = "cuda"
    torch_dtype = torch.float16
else:
    device = "cpu"
    torch_dtype = torch.float32

# Load CLIP model globally for efficiency
print("Loading CLIP model...")
clip_model, preprocess = clip.load("ViT-B/32", device=device)
print("CLIP model loaded successfully")

# Function to download YAMNet class map
def get_yamnet_class_map():
    """Return the local path of the YAMNet class-map CSV, downloading it if absent.

    The file is stored as 'yamnet_class_map.csv' in the current working
    directory and is only fetched when that file does not already exist.

    Returns:
        str: the relative path 'yamnet_class_map.csv'.

    Raises:
        Whatever ``urllib.request.urlretrieve`` / ``os.replace`` raise when
        the download or the final rename fails.
    """
    class_map_path = 'yamnet_class_map.csv'
    if not os.path.exists(class_map_path):
        print("Downloading YAMNet class map...")
        import urllib.request
        url = 'https://raw.githubusercontent.com/tensorflow/models/master/research/audioset/yamnet/yamnet_class_map.csv'
        # Download under a temporary name and rename only on success, so an
        # interrupted transfer never leaves a truncated CSV that later runs
        # would accept as complete because the file "exists".
        partial_path = class_map_path + '.part'
        try:
            urllib.request.urlretrieve(url, partial_path)
            os.replace(partial_path, class_map_path)
        finally:
            if os.path.exists(partial_path):
                os.remove(partial_path)
    return class_map_path

# Load YAMNet model
# The model is fetched from TF-Hub inside a tf.device('/cpu:0') scope.
print("Loading YAMNet model...")
try:
    with tf.device('/cpu:0'):
        yamnet_model = hub.load('https://tfhub.dev/google/yamnet/1')
    print("YAMNet model loaded successfully on CPU")
except Exception as e:
    # Best-effort: on failure the global is set to None; create_image_prompt
    # tests `yamnet_model` for truthiness before classifying sounds.
    print(f"Error loading YAMNet model: {e}")
    yamnet_model = None

def load_yamnet_class_names():
    """Return the values of the class map's third column as a list.

    The CSV path comes from get_yamnet_class_map(); the first row is treated
    as the header.
    """
    table = pd.read_csv(get_yamnet_class_map(), header=0)
    third_column = table.columns[2]
    return table[third_column].tolist()

def classify_environmental_sounds(audio_path, threshold=0.15):
    """Score an audio file with YAMNet and list the classes above a threshold.

    The audio is loaded at 16 kHz and processed in 5-second windows; windows
    shorter than one second are skipped. Scores from all windows are
    concatenated and averaged per class.

    Args:
        audio_path: path of the audio file handed to librosa.
        threshold: minimum mean score a class needs to be reported.

    Returns:
        A list of (class_name, mean_score) tuples sorted by descending score.
        An empty list is returned when no window was scored or when any
        error occurs (the error is printed, not raised).
    """
    try:
        samples, rate = librosa.load(audio_path, sr=16000)
        samples = samples.astype(np.float32)
        window = 5 * rate

        per_window_scores = []
        for start in range(0, len(samples), window):
            segment = samples[start:start + window]
            if len(segment) >= rate:
                segment_tensor = tf.convert_to_tensor(segment, dtype=tf.float32)
                with tf.device('/cpu:0'):
                    frame_scores, _, _ = yamnet_model(segment_tensor)
                per_window_scores.append(frame_scores)

        if not per_window_scores:
            return []

        stacked = tf.concat(per_window_scores, axis=0)
        mean_scores = tf.reduce_mean(stacked, axis=0).numpy()
        class_names = load_yamnet_class_names()

        hits = []
        for idx in np.where(mean_scores > threshold)[0]:
            # Guard against a class map shorter than the score vector.
            if idx < len(class_names):
                hits.append((class_names[idx], float(mean_scores[idx])))
        hits.sort(key=lambda pair: pair[1], reverse=True)
        return hits
    except Exception as e:
        print(f"Error in classify_environmental_sounds: {e}")
        return []

# Load Whisper model
# The "base" checkpoint is placed on CUDA when PyTorch reports it available,
# otherwise on CPU.
print("Loading Whisper model...")
try:
    whisper_device = "cuda" if torch.cuda.is_available() else "cpu"
    whisper_model = whisper.load_model("base", device=whisper_device)
    print(f"Whisper model loaded on {whisper_device}")
except Exception as e:
    # Best-effort: transcribe_with_validation() checks for None and then
    # returns an empty, invalid transcription.
    print(f"Error loading Whisper model: {e}")
    whisper_model = None

def transcribe_with_validation(audio_path, min_confidence=0.5, min_words=3):
    """Transcribe speech with the global Whisper model and judge its usability.

    Args:
        audio_path: path of the audio file passed to ``whisper_model.transcribe``.
        min_confidence: minimum average segment confidence for a valid result.
        min_words: minimum number of whitespace-separated words required.

    Returns:
        dict with keys:
            'text'       - stripped transcription ('' on failure),
            'confidence' - mean per-segment confidence as a float
                           (0.8 when there are no segments, 0 on failure),
            'is_valid'   - True when confidence, word count and the presence
                           of at least one alphabetic character all pass.
        Errors are printed and reported as an empty, invalid result.
    """
    try:
        if whisper_model is None:
            return {'text': '', 'confidence': 0, 'is_valid': False}
        result = whisper_model.transcribe(audio_path)
        transcription = result.get("text", "").strip()
        segments = result.get("segments", [])

        confidences = []
        for seg in segments:
            if 'confidence' in seg:
                confidences.append(float(seg['confidence']))
            elif 'avg_logprob' in seg:
                # Whisper segments expose the mean token log-probability as
                # 'avg_logprob' rather than a 'confidence' field; exp() maps
                # it to a probability, capped at 1.0.
                confidences.append(min(1.0, float(np.exp(seg['avg_logprob']))))
            else:
                # No usable signal on this segment: keep the neutral default.
                confidences.append(0.7)

        avg_confidence = float(np.mean(confidences)) if confidences else 0.8
        # bool() keeps the flag a plain Python bool rather than a NumPy bool.
        is_valid = bool(
            avg_confidence >= min_confidence and
            len(transcription.split()) >= min_words and
            any(c.isalpha() for c in transcription)
        )
        return {
            'text': transcription,
            'confidence': avg_confidence,
            'is_valid': is_valid
        }
    except Exception as e:
        print(f"Error in transcribe_with_validation: {e}")
        return {'text': '', 'confidence': 0, 'is_valid': False}

def create_image_prompt(audio_path, env_threshold=0.15):
    """Build a simple image prompt from detected sounds and transcribed speech.

    Args:
        audio_path: audio file to analyse.
        env_threshold: score threshold forwarded to the sound classifier.

    Returns:
        (prompt, audio_metrics) where audio_metrics holds
        'avg_sound_confidence', 'transcription_confidence' and
        'is_transcription_valid'. On any error a generic prompt and zeroed
        metrics are returned and the error is printed.
    """
    try:
        detected = []
        if yamnet_model:
            detected = classify_environmental_sounds(audio_path, threshold=env_threshold)
        speech = transcribe_with_validation(audio_path)

        # Compute audio metrics
        if detected:
            mean_sound_score = np.mean([score for _, score in detected])
        else:
            mean_sound_score = 0
        audio_metrics = {
            'avg_sound_confidence': mean_sound_score,
            'transcription_confidence': speech['confidence'],
            'is_transcription_valid': speech['is_valid']
        }

        # Valid speech leads the prompt; otherwise fall back to the sounds,
        # and finally to a generic scene.
        labels = [label for label, _ in detected]
        if speech['is_valid']:
            prompt = f"A scene with a person speaking: {speech['text']}"
            if labels:
                prompt += f" in an environment with {', '.join(labels[:3])}"
        elif labels:
            prompt = f"A realistic scene with {', '.join(labels[:5])}"
        else:
            prompt = "A realistic environmental scene"

        return prompt, audio_metrics
    except Exception as e:
        print(f"Error in create_image_prompt: {e}")
        return "Realistic natural environment scene", {'avg_sound_confidence': 0, 'transcription_confidence': 0, 'is_transcription_valid': False}

class ImageGenerator:
    """Lazy wrapper around the Stable Diffusion XL text-to-image pipeline.

    The pipeline is only loaded on the first call that needs it, and every
    generated image is saved as a PNG under ``OUTPUT_DIR``.
    """

    def __init__(self):
        self.base_model_loaded = False
        # The upscaler attributes are initialised here but not used by any
        # method of this class.
        self.upscaler_loaded = False
        self.base_model = None
        self.upscaler = None

    def load_base_model(self):
        """Load and configure the SDXL pipeline once; later calls are no-ops."""
        if self.base_model_loaded:
            return

        print("Loading Stable Diffusion XL model...")
        pipeline = StableDiffusionXLPipeline.from_pretrained(
            "stabilityai/stable-diffusion-xl-base-1.0",
            torch_dtype=torch_dtype,
            variant="fp16",
            use_safetensors=True
        )
        pipeline.scheduler = DPMSolverMultistepScheduler.from_config(
            pipeline.scheduler.config,
            algorithm_type="sde-dpmsolver++",
            use_karras_sigmas=True
        )
        pipeline.enable_attention_slicing()
        if torch.cuda.is_available():
            # Model CPU offload manages device placement itself, so the
            # pipeline is not moved to the GPU by hand beforehand.
            pipeline.enable_model_cpu_offload()
        else:
            pipeline = pipeline.to(device)

        self.base_model = pipeline
        self.base_model_loaded = True
        print("Base model loaded successfully")

    def generate_image(self, prompt, negative_prompt=None, guidance_scale=7.5, steps=30, width=1024, height=1024, num_images=1):
        """Generate images for a prompt and save them to ``OUTPUT_DIR``.

        Args:
            prompt: text prompt for the pipeline.
            negative_prompt: text to steer away from; a default is used when None.
            guidance_scale: passed to the pipeline as ``guidance_scale``.
            steps: passed to the pipeline as ``num_inference_steps``.
            width, height: output size passed to the pipeline.
            num_images: passed to the pipeline as ``num_images_per_prompt``.

        Returns:
            (images, info) where ``images`` is the pipeline's ``.images`` and
            ``info`` is a dict with 'generation_time' (seconds spent in the
            pipeline call) and 'image_paths' (saved file paths).
        """
        if not self.base_model_loaded:
            self.load_base_model()
        if negative_prompt is None:
            negative_prompt = "deformed, bad anatomy, blurry, low-res"

        start_time = time.time()
        images = self.base_model(
            prompt=prompt,
            negative_prompt=negative_prompt,
            guidance_scale=guidance_scale,
            num_inference_steps=steps,
            width=width,
            height=height,
            num_images_per_prompt=num_images
        ).images
        gen_time = time.time() - start_time

        # Save images and collect paths. One timestamp per batch, with
        # microseconds, so two calls within the same second cannot overwrite
        # each other's files.
        os.makedirs(OUTPUT_DIR, exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
        image_paths = []
        for idx, img in enumerate(images):
            filename = f"{OUTPUT_DIR}/image_{timestamp}_{idx}.png"
            img.save(filename)
            image_paths.append(filename)

        return images, {'generation_time': gen_time, 'image_paths': image_paths}

def compute_clip_score(image, text):
    """Return the CLIP cosine similarity between an image and a text prompt.

    Args:
        image: image accepted by the global CLIP ``preprocess`` transform
            (presumably a PIL image - confirm against callers).
        text: prompt to compare against; over-long token sequences are
            truncated by the tokenizer.

    Returns:
        float: cosine similarity of the two CLIP embeddings.
    """
    image_input = preprocess(image).unsqueeze(0).to(device)
    text_input = clip.tokenize([text], truncate=True).to(device)
    with torch.no_grad():
        image_features = clip_model.encode_image(image_input)
        text_features = clip_model.encode_text(text_input)
        # L2-normalise both embeddings so the dot product is a cosine
        # similarity; without this the value also reflects the embeddings'
        # magnitudes and is not comparable between images or prompts.
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)
        similarity = (image_features @ text_features.T).item()
    return similarity

def audio_to_image(audio_file_path, num_images=1):
    """Run the basic audio-to-image pipeline and collect metrics.

    Steps: build a prompt from the audio, generate the images with default
    generation settings, then score each image against the prompt with CLIP.

    Args:
        audio_file_path: audio file to convert.
        num_images: number of images to generate for the prompt.

    Returns:
        (images, prompt, metrics) where metrics is a dict of timings, audio
        metrics, prompt length and CLIP scores.
    """
    started_at = time.time()

    # Prompt plus the audio-side metrics that go into the report.
    prompt, audio_metrics = create_image_prompt(audio_file_path)

    # Image generation with the generator's default settings.
    images, image_metrics = ImageGenerator().generate_image(prompt, num_images=num_images)

    # One CLIP score per generated image.
    clip_scores = []
    for picture in images:
        clip_scores.append(compute_clip_score(picture, prompt))

    elapsed = time.time() - started_at
    metrics = {
        'total_execution_time': elapsed,
        'generation_time': image_metrics['generation_time'],
        'avg_sound_confidence': audio_metrics['avg_sound_confidence'],
        'transcription_confidence': audio_metrics['transcription_confidence'],
        'is_transcription_valid': audio_metrics['is_transcription_valid'],
        'prompt_length': len(prompt.split()),
        'clip_scores': clip_scores,
        'avg_clip_score': np.mean(clip_scores) if clip_scores else 0,
        'success': True
    }

    return images, prompt, metrics

# Script entry point: run the basic pipeline on AUDIO_FILE_PATH and print a
# report of the prompt and collected metrics.
if __name__ == "__main__":
    print(f"Using audio file: {AUDIO_FILE_PATH}")
    if os.path.exists(AUDIO_FILE_PATH):
        print("\nProcessing audio to generate images...")
        try:
            images, prompt, metrics = audio_to_image(AUDIO_FILE_PATH, num_images=1)

            print("\n=== AUDIO ANALYSIS AND IMAGE GENERATION COMPLETE ===")
            print(f"Prompt: {prompt}")
            print(f"Generated {len(images)} images in {OUTPUT_DIR}")

            print("\n=== METRICS ===")
            print(f"Total Execution Time: {metrics['total_execution_time']:.2f} seconds")
            print(f"Image Generation Time: {metrics['generation_time']:.2f} seconds")
            print(f"Avg Sound Confidence: {metrics['avg_sound_confidence']:.2f}")
            print(f"Transcription Confidence: {metrics['transcription_confidence']:.2f}")
            print(f"Transcription Valid: {metrics['is_transcription_valid']}")
            print(f"Prompt Length: {metrics['prompt_length']} words")
            # Scores are rendered with two decimals before being printed as a list.
            score_strings = [f'{score:.2f}' for score in metrics['clip_scores']]
            print(f"CLIP Scores: {score_strings}")
            print(f"Avg CLIP Score: {metrics['avg_clip_score']:.2f}")
            print(f"Success: {metrics['success']}")
            print("=================")
        except Exception as e:
            # Any failure is reported on stdout; nothing is re-raised.
            print(f"Error during processing: {e}")
            print("Metrics collection incomplete due to failure")
    else:
        print(f"ERROR: File not found at {AUDIO_FILE_PATH}")