# Downloads

In [None]:
!pip install torchcodec --quiet
!pip install torchinfo --quiet
!pip install transformers accelerate sentencepiece torchaudio diffusers datasets soundfile pillow --quiet
!pip install rouge-score
!pip install git+https://github.com/openai/CLIP.git

# Imports

In [None]:
import os
import shutil
import soundfile as sf
from IPython.display import display

import clip
import torch
import torchaudio
import torch.nn as nn
from torchinfo import summary
from torchvision import datasets, transforms
from torchvision.models import vit_b_16, ViT_B_16_Weights
from torch.utils.data import DataLoader, random_split, TensorDataset, Dataset

from PIL import Image
from diffusers import FluxPipeline
from transformers import WhisperProcessor, WhisperModel, WhisperForConditionalGeneration, pipeline, AutoTokenizer, AutoModelForSequenceClassification, AutoModelForCausalLM, GenerationConfig

import gc
import json
import uuid
import time
import random
import librosa
import logging
import kagglehub
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
from pathlib import Path
import matplotlib.pyplot as plt
from dataclasses import dataclass
from sklearn.svm import LinearSVC
from collections import defaultdict
from rouge_score import rouge_scorer
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Authenticate against the Hugging Face Hub; login() is called without arguments.
from huggingface_hub import login
login()
# HF_TOKEN is set to an empty string here.
# NOTE(review): presumably a redacted placeholder — confirm no real token is expected.
os.environ["HF_TOKEN"] = ""

# Create the images from the rewritten LLM prompts

Generate images from the rewritten prompts; these images are later classified for the EmoA evaluation.

In [None]:
class ImageGeneration(nn.Module):
    """Thin ``nn.Module`` wrapper around a FLUX text-to-image pipeline.

    The pipeline is loaded once in ``__init__``; ``forward`` runs it on one
    or more prompts with a generator that is re-seeded on every call.
    """

    def __init__(
        self,
        flux_model_name="black-forest-labs/FLUX.1-schnell",
        device="cuda",
    ):
        """Load the FLUX pipeline.

        Args:
            flux_model_name: Model identifier passed to
                ``FluxPipeline.from_pretrained``.
            device: Stored on the instance and forwarded as ``device_map``
                when the pipeline is loaded.
        """
        super().__init__()

        self.device = device

        # NOTE(review): the FLUX model card examples load the pipeline with
        # torch.bfloat16; float16 is kept here to preserve current behavior —
        # confirm the dtype is intentional.
        self.flux = FluxPipeline.from_pretrained(
            flux_model_name,
            torch_dtype=torch.float16,
            device_map=device,
        )
        # Disable the pipeline's own progress bar.
        self.flux.set_progress_bar_config(disable=True)

    def forward(
        self,
        prompt,
        flux_height=512,
        flux_width=512,
        flux_steps=40,
        flux_guidance=3.0,
        flux_seed=0,
    ):
        """Run the FLUX pipeline on one or more prompts.

        Forward pass (batched): prompt(s) -> FLUX pipeline -> image(s).

        Args:
            prompt: A single prompt string or a list of prompt strings. A
                single string is wrapped in a one-element list.
            flux_height: Forwarded to the pipeline as ``height``.
            flux_width: Forwarded to the pipeline as ``width``.
            flux_steps: Forwarded as ``num_inference_steps``.
            flux_guidance: Forwarded as ``guidance_scale``.
            flux_seed: Seed for the CPU ``torch.Generator`` created for this
                call.

        Returns:
            dict with two keys: ``"Prompt"`` (the prompts as passed to the
            pipeline) and ``"images"`` (the ``images`` attribute of the
            pipeline output).
        """
        if isinstance(prompt, str):
            prompt = [prompt]

        # A new generator is seeded on every call, so identical arguments use
        # the same random seed each time.
        generator = torch.Generator("cpu").manual_seed(flux_seed)

        out = self.flux(
            prompt,
            height=flux_height,
            width=flux_width,
            guidance_scale=flux_guidance,
            num_inference_steps=flux_steps,
            generator=generator,
        )

        return {
            "Prompt": prompt,
            "images": out.images,
        }


In [None]:
!cp "/content/drive/MyDrive/GenAI/Project/modified_prompts.json" "/content/"

In [None]:
def clean_prompt(p, max_words=70):
    """Normalize a rewritten prompt before it is used for image generation.

    Every ``<|endoftext|>`` marker is replaced by a space, all runs of
    whitespace are collapsed to single spaces, and at most ``max_words``
    whitespace-separated words are kept.

    Args:
        p: Raw prompt string.
        max_words: Maximum number of words to keep. Defaults to 70, the
            limit that was previously hard-coded.

    Returns:
        The cleaned prompt as a single-spaced string.
    """
    # split() with no arguments already collapses whitespace and strips the
    # ends, so one split followed by a slice covers cleaning and truncation.
    words = p.replace("<|endoftext|>", " ").split()
    return " ".join(words[:max_words])


# Emotion label -> class index; the key order also defines emotion_names.
vit_classes = {'disgust': 0, 'fear': 1, 'happy': 2, 'sad': 3, 'surprised': 4}
emotion_names = list(vit_classes)

# Read the rewritten prompts with an explicit encoding so the result does not
# depend on the platform's default locale.
with open("/content/modified_prompts.json", "r", encoding="utf-8") as f:
    rewritten_prompts = json.load(f)

# One (cleaned prompt, emotion) pair per entry of the JSON file.
data = [
    (clean_prompt(item["new_prompt"]), item["emotion"])
    for item in rewritten_prompts
]

batch_size = 1
test_ds_ImgGen = data
test_loader = DataLoader(test_ds_ImgGen, batch_size=batch_size, shuffle=False)

# len(test_loader) is the number of batches; report the number of samples so
# the message stays correct if batch_size is changed.
print("Total samples:", len(test_ds_ImgGen))


In [None]:
# Build the generation wrapper with its default arguments, apply
# nn.Module.to("cuda") to it, then switch it to evaluation mode.
image_model = ImageGeneration()
image_model = image_model.to("cuda")
image_model.eval()

In [None]:
# Use the GPU when PyTorch reports one as available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load CLIP ViT-B/32 together with its image preprocessing transform.
clip_model, clip_preprocess = clip.load("ViT-B/32", device=device)

def compute_clip_score(pil_image, prompt):
    """Return the cosine similarity between an image and a prompt under CLIP.

    The prompt is cut to its first 70 whitespace-separated words when it is
    longer than that, then tokenized with ``truncate=True``. Image and text
    embeddings are L2-normalized before their dot product is taken.

    Args:
        pil_image: Image passed to ``clip_preprocess``.
        prompt: Text prompt to compare the image against.

    Returns:
        The similarity as a Python float.
    """
    words = prompt.split()
    if len(words) > 70:
        prompt = " ".join(words[:70])

    # Both inputs get a leading batch dimension of size one.
    image_batch = clip_preprocess(pil_image).unsqueeze(0).to(device)
    text_batch = clip.tokenize([prompt], truncate=True).to(device)

    with torch.no_grad():
        image_emb = clip_model.encode_image(image_batch)
        text_emb = clip_model.encode_text(text_batch)

        # Unit-normalize so the matrix product below is a cosine similarity.
        image_emb = image_emb / image_emb.norm(dim=-1, keepdim=True)
        text_emb = text_emb / text_emb.norm(dim=-1, keepdim=True)

        similarity = image_emb @ text_emb.T

    return similarity.item()

In [None]:
!rm -rf /content/Images
# Output root with one sub-folder per emotion.
save_root = "/content/Images"
os.makedirs(save_root, exist_ok=True)
for emo in emotion_names:
    emo_dir = os.path.join(save_root, emo)
    os.makedirs(emo_dir, exist_ok=True)

# One record per generated image: prompt, emotion, image object, CLIP score.
all_samples = []

for prompts, emotions in tqdm(test_loader):
    generated = image_model(
        prompt=list(prompts),
        flux_height=512,
        flux_width=512,
        flux_steps=30,
        flux_guidance=3.0,
        flux_seed=0,
    )

    # Score each image against the prompt it was generated from.
    all_samples.extend(
        {
            "prompt": prompt,
            "emotion": emo,
            "image": img,
            "clip_score": compute_clip_score(img, prompt),
        }
        for prompt, emo, img in zip(prompts, emotions, generated["images"])
    )


In [None]:
# Make sure the output tree exists: the root plus one sub-folder per emotion.
os.makedirs(save_root, exist_ok=True)
for emo in emotion_names:
    os.makedirs(os.path.join(save_root, emo), exist_ok=True)

# Per-emotion running index used to build the file names.
counters = defaultdict(int)

# JSON-serializable copies of the samples, with the image replaced by its path.
metadata = []

for sample in all_samples:
    emo = str(sample["emotion"])
    img = sample["image"]

    if not hasattr(img, "save"):
        raise TypeError(f"Image for emotion '{emo}' is not a PIL Image")

    idx = counters[emo]
    counters[emo] += 1

    filename = f"{emo}_{idx:05d}.png"
    save_path = os.path.join(save_root, emo, filename)

    img.save(save_path)

    # Record the path on a copy instead of overwriting sample["image"]:
    # all_samples keeps its image objects, so this cell can be re-run
    # (e.g. after a failed save) without hitting the TypeError above.
    metadata.append({**sample, "image": save_path})

json_path = os.path.join(save_root, "metadata.json")
with open(json_path, "w", encoding="utf-8") as f:
    json.dump(metadata, f, ensure_ascii=False, indent=4)

print(f"Saved metadata to {json_path}")

In [None]:
# Display how many entries all_samples holds.
len(all_samples)

In [None]:
import shutil

# Pack the contents of /content/Images into a zip archive; make_archive
# appends the ".zip" extension to base_name.
shutil.make_archive(
    base_name="/content/Images",
    format="zip",
    root_dir="/content/Images",
)


# EmoA evaluation

Load the model, prepare the dataset loader, and run the test.

In [None]:
!unzip -q -o "/content/drive/MyDrive/GenAI/Project/Images.zip" -d "/content/data"

In [None]:
!find /content/data -maxdepth 3 -type d


In [None]:
!rm -rf /content/data/.ipynb_checkpoints
!rm -rf "/content/data/angry"

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# Emotion label -> class index; the classifier head below is sized from it.
vit_classes = {'disgust': 0, 'fear': 1, 'happy': 2, 'sad': 3, 'surprised': 4}

# May need to be changed depending on where the images were extracted.
test_dir = "/content/data"

val_test_transform = transforms.Compose([
    transforms.Resize((384, 384)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

test_ds = datasets.ImageFolder(test_dir, transform=val_test_transform)
test_loader = DataLoader(test_ds, batch_size=16, shuffle=False)

class_names = list(test_ds.class_to_idx.keys())
print("Classes:", class_names)
print("Samples:", len(test_ds))

# ImageFolder derives label indices from the sub-folders it finds. A stray or
# missing folder would shift the indices relative to vit_classes and make the
# metrics below meaningless, so fail loudly instead.
if test_ds.class_to_idx != vit_classes:
    raise ValueError(
        f"Folder classes {test_ds.class_to_idx} do not match the expected "
        f"mapping {vit_classes}; check the contents of {test_dir}."
    )

vit_path = "/content/drive/MyDrive/GenAI/Project/vit_swag_best_val (3).pt"

from torchvision.models import vit_b_16, ViT_B_16_Weights

# Build the ViT-B/16 backbone, then replace the classification head with one
# that has len(vit_classes) outputs.
weights = ViT_B_16_Weights.IMAGENET1K_SWAG_E2E_V1
model = vit_b_16(weights=weights)

in_features = model.heads.head.in_features
model.heads.head = nn.Linear(in_features, len(vit_classes))

# Restore the weights stored under "model_state_dict" in the checkpoint.
checkpoint = torch.load(vit_path, map_location=device)
state = checkpoint["model_state_dict"]
model.load_state_dict(state)

model = model.to(device)
model.eval()

criterion = nn.CrossEntropyLoss()

tot_loss = tot_correct = tot = 0
all_preds, all_labels = [], []

# Pure evaluation: no gradients are needed anywhere in the loop.
with torch.no_grad():
    for x, y in test_loader:
        x, y = x.to(device), y.to(device)

        out = model(x)
        loss = criterion(out, y)
        pred = out.argmax(1)

        # criterion returns the batch mean, so weight it by the batch size to
        # get a per-sample average at the end.
        tot_loss += loss.item() * x.size(0)
        tot_correct += (pred == y).sum().item()
        tot += y.size(0)

        all_preds.extend(pred.cpu().tolist())
        all_labels.extend(y.cpu().tolist())

avg_loss = tot_loss / tot
acc = tot_correct / tot

print(f"\nTest Loss: {avg_loss:.4f} | Accuracy: {acc:.4f}\n")
# Pass labels explicitly so target_names stays aligned with the class indices.
print(classification_report(
    all_labels,
    all_preds,
    labels=list(range(len(class_names))),
    target_names=class_names,
    digits=4,
))


# CLIP Score Evaluation

Compute the average CLIP score for each emotion and the global average.

In [None]:
!unzip -q -o "/content/Images.zip" -d "/content/data"

In [None]:
# Emotions to report on; entries with any other emotion are ignored.
vit_classes = {'disgust': 0, 'fear': 1, 'happy': 2, 'sad': 3, 'surprised': 4}

# The metadata file is written as UTF-8 with ensure_ascii=False, so read it
# back with the same encoding instead of the platform default.
with open("/content/data/metadata.json", "r", encoding="utf-8") as f:
    data = json.load(f)

emotion_sums = {emo: 0.0 for emo in vit_classes}
emotion_counts = {emo: 0 for emo in vit_classes}
all_scores = []

# Single pass: accumulate per-emotion totals and collect the scores used for
# the global average.
for entry in data:
    emo = entry["emotion"]
    score = entry["clip_score"]

    if emo in vit_classes:
        emotion_sums[emo] += score
        emotion_counts[emo] += 1
        all_scores.append(score)

# None marks an emotion without any sample.
emotion_averages = {
    emo: (emotion_sums[emo] / emotion_counts[emo]
          if emotion_counts[emo] > 0 else None)
    for emo in vit_classes
}

global_average = sum(all_scores) / len(all_scores) if all_scores else None

print("Average CLIP score per emotion:")
for emo, avg in emotion_averages.items():
    print(f"{emo}: {avg:.4f}" if avg is not None else f"{emo}: No samples")

print("\nOverall average CLIP score:")
print(global_average)
