<a href="https://colab.research.google.com/github/Bimpiel/CART498-GENERATIVE-AI/blob/main/A05/Assignment5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import torch
from diffusers import StableDiffusionPipeline, UNet2DConditionModel, DDPMScheduler
from torchvision import transforms
from datasets import load_dataset
from PIL import Image
import os

# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load only the first 200 examples of the AFHQ training split.
dataset = load_dataset("huggan/AFHQ", split="train[:200]")


# Image pipeline shared by the cells below: force a 256x256 size, then
# convert the PIL image to a tensor.
transform = transforms.Compose(
    [transforms.Resize((256, 256)), transforms.ToTensor()]
)

def preprocess_images(dataset):
    """Resize every dataset image and store it as a PNG under ``processed_images``.

    Each item of ``dataset`` is indexed with the ``"image"`` key. The value is
    run through the module-level ``transform``, converted back to a PIL image,
    and saved as ``processed_images/img_<index>.png``, where ``<index>`` is the
    item's position in the iteration. The output directory is created if it
    does not exist yet.
    """
    os.makedirs("processed_images", exist_ok=True)
    to_pil = transforms.ToPILImage()  # one converter reused for every image
    for index, record in enumerate(dataset):
        as_tensor = transform(record["image"])
        to_pil(as_tensor).save(f"processed_images/img_{index}.png")

# Write the resized copies of the dataset images to disk.
preprocess_images(dataset)

# Fetch the pre-trained Stable Diffusion v1.5 pipeline, then place it on `device`.
model_id = "runwayml/stable-diffusion-v1-5"
pipeline = StableDiffusionPipeline.from_pretrained(model_id)
pipeline = pipeline.to(device)

# Placeholder fine-tuning loop. This is NOT LoRA or DreamBooth yet; read the
# NOTE in the docstring before relying on it.
def train_model(epochs=3, batch_size=2, learning_rate=5e-5):
    """Iterate over the processed PNGs and run a *simulated* optimisation step.

    NOTE: this is scaffolding, not real fine-tuning. The loss is a random
    scalar that does not depend on any UNet parameter, so ``backward()`` never
    produces gradients for the UNet and its weights are not updated. The UNet
    loaded here is also a separate copy from the one inside the module-level
    ``pipeline``, so nothing done in this function affects later image
    generation. A real implementation would have to encode the images to
    latents, add scheduler noise, and regress the UNet's noise prediction.

    Args:
        epochs: Number of passes over the ``processed_images`` directory.
        batch_size: Accepted for interface compatibility; currently unused
            (images are visited one at a time).
        learning_rate: Learning rate handed to ``torch.optim.AdamW``.

    Returns:
        None. The simulated loss is printed every 10 steps.
    """
    unet = UNet2DConditionModel.from_pretrained(model_id, subfolder="unet")
    unet.to(device)
    optimizer = torch.optim.AdamW(unet.parameters(), lr=learning_rate)

    # Keep only PNG files (a stray checkpoint folder or text file would make
    # Image.open fail) and sort them so every epoch sees the same order.
    image_names = sorted(
        name for name in os.listdir("processed_images") if name.endswith(".png")
    )

    for epoch in range(epochs):
        for i, img_path in enumerate(image_names):
            # Force three channels, matching how the classification cell
            # opens images.
            img = Image.open(f"processed_images/{img_path}").convert("RGB")
            # Loaded so a real loss can consume it later; the simulated loss
            # below does not use it.
            img_tensor = transform(img).unsqueeze(0).to(device)

            # Dummy training step (real training needs latents)
            optimizer.zero_grad()
            loss = torch.randn(1, requires_grad=True).to(device)  # Simulated loss
            loss.backward()
            optimizer.step()

            if i % 10 == 0:
                print(f"Epoch {epoch+1}, Step {i}: Loss = {loss.item()}")

# Run the training loop defined above for three epochs.
train_model(epochs=3)

# Generate 50 fictional animal images, all from the same fixed text prompt.
os.makedirs("generated_images", exist_ok=True)
prompt = "A unique fictional animal, ultra-realistic, highly detailed"
for i in range(50):
    image = pipeline(prompt, num_inference_steps=25).images[0]
    image.save(f"generated_images/fictional_animal_{i}.png")

# Bundle the generated images into generated_animals.zip. The archive path is
# the cell's last expression, so the notebook displays it.
import shutil
shutil.make_archive("generated_animals", 'zip', "generated_images")


dataset_infos.json:   0%|          | 0.00/824 [00:00<?, ?B/s]

train-00000-of-00002.parquet:   0%|          | 0.00/358M [00:00<?, ?B/s]

train-00001-of-00002.parquet:   0%|          | 0.00/371M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16130 [00:00<?, ? examples/s]

model_index.json:   0%|          | 0.00/541 [00:00<?, ?B/s]

Fetching 15 files:   0%|          | 0/15 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/492M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

(…)ure_extractor%2Fpreprocessor_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

scheduler%2Fscheduler_config.json:   0%|          | 0.00/308 [00:00<?, ?B/s]

tokenizer%2Fmerges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

text_encoder%2Fconfig.json:   0%|          | 0.00/617 [00:00<?, ?B/s]

tokenizer%2Fspecial_tokens_map.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

safety_checker%2Fconfig.json:   0%|          | 0.00/4.72k [00:00<?, ?B/s]

tokenizer%2Fvocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

tokenizer%2Ftokenizer_config.json:   0%|          | 0.00/806 [00:00<?, ?B/s]

unet%2Fconfig.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

vae%2Fconfig.json:   0%|          | 0.00/547 [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/335M [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

Epoch 1, Step 0: Loss = -1.4515185356140137
Epoch 1, Step 10: Loss = -0.9419925808906555
Epoch 1, Step 20: Loss = 0.8936289548873901
Epoch 1, Step 30: Loss = 0.46533656120300293
Epoch 1, Step 40: Loss = -1.0879027843475342
Epoch 1, Step 50: Loss = -1.3514858484268188
Epoch 1, Step 60: Loss = -0.11433979123830795
Epoch 1, Step 70: Loss = 1.0717461109161377
Epoch 1, Step 80: Loss = -0.48256900906562805
Epoch 1, Step 90: Loss = -2.58662748336792
Epoch 1, Step 100: Loss = -0.23684227466583252
Epoch 1, Step 110: Loss = -0.9250982999801636
Epoch 1, Step 120: Loss = -1.6264066696166992
Epoch 1, Step 130: Loss = -0.21261750161647797
Epoch 1, Step 140: Loss = -0.785676896572113
Epoch 1, Step 150: Loss = -0.7922245860099792
Epoch 1, Step 160: Loss = 1.0448154211044312
Epoch 1, Step 170: Loss = -0.6040952205657959
Epoch 1, Step 180: Loss = 0.989560604095459
Epoch 1, Step 190: Loss = -0.37715238332748413
Epoch 2, Step 0: Loss = -0.09306862205266953
Epoch 2, Step 10: Loss = -0.5129132270812988
Epoc

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

'/content/generated_animals.zip'

In [5]:
import torch
from torchvision import models, transforms
from PIL import Image
import os
import random
import soundfile as sf
import numpy as np

# Load pre-trained classification model (ResNet-18)
device = "cuda" if torch.cuda.is_available() else "cpu"
# `pretrained=True` is deprecated in torchvision; the explicit weights enum
# selects the same ImageNet checkpoint without the deprecation warning.
model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1).to(device)
model.eval()  # inference only: switch off dropout / batch-norm updates

# Preprocessing function for the classification model.
# NOTE: this rebinds the notebook-global `transform` created in the first
# cell. preprocess_images() and train_model() read that global at call time,
# so re-running them after this cell would feed them normalised tensors.
transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
    # Per-channel mean / std; these are the standard ImageNet statistics used
    # with torchvision's pre-trained classifiers.
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Define a simple function to load sound files or create synthetic sounds
def generate_sound_for_animal(animal_class):
    """Write a WAV clip for ``animal_class`` and return the path it was saved to.

    The clip comes from one of two places:

    * if a ``sounds`` directory exists and contains at least one regular file,
      one of those files is picked at random and read with ``soundfile`` (the
      choice does not depend on ``animal_class``);
    * otherwise (no directory, or an empty one) a 1-second, 440 Hz sine wave
      at half amplitude is synthesised at 44.1 kHz.

    The result is written to ``generated_sounds/<animal_class>_sound.wav``, so
    two calls with the same label target the same file.

    Args:
        animal_class: Label used to build the output file name.

    Returns:
        The relative path of the WAV file that was written.
    """
    sound_folder = "sounds"  # Folder containing sound files for each class

    # Collect candidate files. Sub-directories are skipped because they cannot
    # be read as audio, and a missing or empty folder yields an empty list
    # instead of making random.choice raise IndexError.
    candidates = []
    if os.path.isdir(sound_folder):
        candidates = sorted(
            name
            for name in os.listdir(sound_folder)
            if os.path.isfile(os.path.join(sound_folder, name))
        )

    if candidates:
        sound_file = random.choice(candidates)
        sound_data, samplerate = sf.read(os.path.join(sound_folder, sound_file))
    else:
        # Generate a simple synthetic sound (sine wave)
        samplerate = 44100  # Sample rate
        duration = 1  # in seconds
        frequency = 440  # Frequency of sound in Hz (A4 note)
        t = np.linspace(0, duration, int(samplerate * duration), endpoint=False)
        sound_data = 0.5 * np.sin(2 * np.pi * frequency * t)

    # Save the sound
    os.makedirs("generated_sounds", exist_ok=True)
    sound_output_path = f"generated_sounds/{animal_class}_sound.wav"
    sf.write(sound_output_path, sound_data, samplerate)
    print(f"Sound for {animal_class} generated at {sound_output_path}")
    return sound_output_path

# Function to classify images and generate corresponding sounds
def analyze_and_generate_sounds(image_folder="generated_images"):
    """Classify each PNG in ``image_folder`` and create a sound for its label.

    Every directory entry whose name ends in ``.png`` is opened as RGB, passed
    through the module-level ``transform`` and ``model``, and labelled
    ``Animal_<k>``, where ``k`` is the index of the largest model output. The
    label is printed and handed to ``generate_sound_for_animal``. Entries with
    any other suffix are ignored.
    """
    for file_name in os.listdir(image_folder):
        if not file_name.endswith(".png"):
            continue

        picture = Image.open(os.path.join(image_folder, file_name)).convert("RGB")
        batch = transform(picture).unsqueeze(0).to(device)

        # Inference only, so gradient tracking is switched off.
        with torch.no_grad():
            scores = model(batch)
        best_index = torch.argmax(scores, dim=1)

        # The label is just the raw class index with an "Animal_" prefix.
        animal_class = f"Animal_{best_index.item()}"
        print(f"Classified {file_name} as {animal_class}")

        generate_sound_for_animal(animal_class)

# Run the analysis and sound generation over the default "generated_images"
# folder; each classified PNG triggers one generate_sound_for_animal call.
analyze_and_generate_sounds()


Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 168MB/s]


Classified fictional_animal_10.png as Animal_372
Sound for Animal_372 generated at generated_sounds/Animal_372_sound.wav
Classified fictional_animal_46.png as Animal_388
Sound for Animal_388 generated at generated_sounds/Animal_388_sound.wav
Classified fictional_animal_24.png as Animal_144
Sound for Animal_144 generated at generated_sounds/Animal_144_sound.wav
Classified fictional_animal_47.png as Animal_178
Sound for Animal_178 generated at generated_sounds/Animal_178_sound.wav
Classified fictional_animal_37.png as Animal_370
Sound for Animal_370 generated at generated_sounds/Animal_370_sound.wav
Classified fictional_animal_3.png as Animal_104
Sound for Animal_104 generated at generated_sounds/Animal_104_sound.wav
Classified fictional_animal_33.png as Animal_355
Sound for Animal_355 generated at generated_sounds/Animal_355_sound.wav
Classified fictional_animal_0.png as Animal_344
Sound for Animal_344 generated at generated_sounds/Animal_344_sound.wav
Classified fictional_animal_35.png

In [55]:

# Step 2: Import necessary modules
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# Step 3: Load the GPT-2 model and tokenizer
# NOTE: this rebinds the notebook-global `model`, which the image
# classification cell also uses; re-run that cell before classifying again.
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Step 4: Set the model to evaluation mode
model.eval()

# Step 5: Define your prompt with clear instructions
prompt = """
For example: grrrrrr, rawr, screech, etc. make more of these
Sound:
"""

# Step 6: Tokenize the prompt
input_ids = tokenizer.encode(prompt, return_tensors='pt')
# The prompt is a single unpadded sequence, so every position is a real token.
attention_mask = torch.ones_like(input_ids)
prompt_length = input_ids.shape[-1]

# Step 7: Generate 16 phrases and save them to a text file
language_texts = []  # List to store the generated phrases

for attempt in range(16):  # Generate 16 phrases
    with torch.no_grad():
        output = model.generate(
            input_ids,
            attention_mask=attention_mask,  # Explicit mask avoids the generate() warning
            max_new_tokens=10,  # Generate up to 10 new tokens (to keep it short)
            num_return_sequences=1,  # Generate one sequence
            no_repeat_ngram_size=1,  # Prevent repetition
            top_k=50,  # Top-k sampling
            top_p=0.95,  # Nucleus sampling
            temperature=3.2,  # Adjust temperature for creativity
            do_sample=True,  # Enable sampling
            pad_token_id=tokenizer.eos_token_id  # Set pad token to avoid warnings
        )

    # generate() returns the prompt followed by the continuation. Slice the
    # prompt off by token count instead of searching the decoded text for
    # "Sound: ": the prompt ends with "Sound:" plus a newline, so that marker
    # never matched and the echoed prompt leaked into every phrase.
    new_tokens = output[0][prompt_length:]
    animal_sound = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    # Save the generated phrase
    language_texts.append(animal_sound)

# Step 8: Save the phrases to a text file
output_file = "animal_sounds.txt"
# UTF-8 is set explicitly because GPT-2 can emit non-ASCII characters.
with open(output_file, "w", encoding="utf-8") as file:
    file.writelines(f"{text}\n" for text in language_texts)

# Step 9: Print confirmation
print(f"Generated phrases have been saved to {output_file}")

# Step 10: Print the generated phrases
for i, text in enumerate(language_texts, 1):
    print(f"language_text {i}: {text}")

Generated phrases have been saved to animal_sounds.txt
language_text 1: There can now always be other patterns you like!
language_text 2: And even for simple musical objects , be sure each
language_text 3: $ gvfb_gr->processMiner
language_text 4: * *

 The difference you are looking for would
language_text 5: A music streaming engine which helps prevent noise by letting
language_text 6: If you need additional information how can it download some
language_text 7: Note also when I build it without having to remove
language_text 8: You may recognize many (including sound quality comparison).
language_text 9: The key that will change when going out to and
language_text 10: # This would create lots if possible # or even
language_text 11: sound/playable to do sounds using the command
language_text 12: Ride over it now and read on the end
language_text 13: And with an ESRAM converter you can simply
language_text 14: In this demo , everything started on some kind medium
language_text 15: As ex

In [57]:
import shutil
import os

# Step 1: Make sure every source directory exists so the listing in Step 3
# cannot fail, even if an earlier cell was skipped.
output_dirs = ["processed_images", "generated_images", "generated_sounds"]
for dir_name in output_dirs:
    os.makedirs(dir_name, exist_ok=True)

# Step 2: Create a fresh staging directory to combine all outputs.
# Anything left by an earlier run is removed first; otherwise stale files would
# end up in the archive and copytree would raise FileExistsError on
# sub-directories that were already staged.
temp_dir = "all_outputs"
shutil.rmtree(temp_dir, ignore_errors=True)
os.makedirs(temp_dir, exist_ok=True)

# Step 3: Copy all outputs to the staging directory.
# NOTE: entries are flattened into one folder, so two source directories that
# contain the same name would overwrite (files) or merge (folders).
for dir_name in output_dirs:
    for item in os.listdir(dir_name):
        src = os.path.join(dir_name, item)
        dst = os.path.join(temp_dir, item)
        if os.path.isdir(src):
            shutil.copytree(src, dst, dirs_exist_ok=True)
        else:
            shutil.copy2(src, dst)

# Step 4: Zip the staging directory
output_zip = "all_outputs.zip"
# make_archive appends the ".zip" suffix itself, so pass the bare base name.
shutil.make_archive(os.path.splitext(output_zip)[0], 'zip', temp_dir)

# Step 5: Print confirmation
print(f"All outputs have been zipped into {output_zip}")

# Step 6: Download the zip file (if running in a cloud environment)
try:
    from google.colab import files
    files.download(output_zip)
    print(f"Downloading {output_zip} to your local machine...")
except ImportError:
    # Not running on Colab: the archive stays in the working directory.
    print(f"Download {output_zip} manually from your working directory.")

All outputs have been zipped into all_outputs.zip


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading all_outputs.zip to your local machine...
