In [1]:
import cv2
from pydub.generators import Sine
from pydub import AudioSegment
import numpy as np
from tqdm import tqdm  # For progress tracking

def pixel_to_sound(image, base_frequency=200, max_frequency=2000, block_size=16, max_duration=15000):
    height, width, _ = image.shape
    sound = AudioSegment.silent(duration=0)  # Start with silent audio
    total_tones = max_duration // 50  # Estimate total tones based on ~50ms per tone
    row_step = max(1, height // total_tones)

    for i in tqdm(range(0, height, row_step), desc="Processing rows", unit="row"):
        for j in range(0, width, block_size):
            # Use raw R, G, B values from the current pixel or block
            pixel = image[i, j]

            # Extract R, G, B components
            red, green, blue = pixel

            # Map R (Red) to frequency
            frequency = base_frequency + (red / 255) * (max_frequency - base_frequency)

            # Map G (Green) to volume
            volume = int((green / 255) * 10) - 5  # Scale volume to [-5dB, 5dB]

            # Map B (Blue) to duration
            tone_duration = 20 + (blue / 255) * 80  # Scale duration to 20-100ms

            tone = Sine(frequency).to_audio_segment(duration=tone_duration).apply_gain(volume)

            # Mix the tone into the overall sound
            sound += tone

            # Apply constraint, Stop if the maximum duration is reached
            if len(sound) >= max_duration:
                break

    return sound[:max_duration]  # This trim the sound to the maximum allowed duration

image_path = "image_used.png"
image = cv2.imread(image_path) 
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  

# Downsample the image to speed up processing and reduce sound length
image = cv2.resize(image, (100, 100))  # Resize to 100x100 pixels

print("Generating sound from the image...")
sound = pixel_to_sound(image)

sound.export("image_sound_raw_rgb.wav", format="wav")
print("Sound generated and saved as 'image_sound_raw_rgb.wav'.")



Generating sound from the image...


Processing rows: 100%|█████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 183.02row/s]

Sound generated and saved as 'image_sound_raw_rgb.wav'.



