<a href="https://colab.research.google.com/github/AdmiralJPJ/ai/blob/main/ML_HW4_Q1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import warnings
# Suppress every warning for the rest of the session to keep notebook output
# readable. NOTE(review): this also hides deprecation and numerical warnings
# raised by any library used in later cells.
warnings.filterwarnings('ignore')

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so files stored there can be read by
# path from this Colab runtime.
drive.mount('/content/drive')

*Resources used: Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow book by Aurélien Géron, ChatGPT-4o, Google Gemini*

## Applied Machine Learning HW 3 Q1, Group 36, **Jack Tyndall**

In [None]:
import os
from PIL import Image
from torch.utils.data import Dataset
from transformers import CLIPProcessor

class ClipImageDataset(Dataset):
    """Dataset of images from one condition folder, labelled by filename.

    Files directly inside ``<root_dir>/<condition>`` are kept when their name
    ends with a supported image extension and contains a category name
    (substring match against the lower-cased filename). The label of a sample
    is the index of the matching category in ``categories``. A file whose
    name contains several category names is listed once per matching
    category.

    Args:
        root_dir: Directory that contains one sub-folder per condition.
        condition: Name of the sub-folder to read images from.
        categories: Category names. They are compared against lower-cased
            filenames, so they should be lower-case themselves.
        processor: Callable invoked as
            ``processor(images=image, return_tensors="pt")`` whose result is
            indexed with ``"pixel_values"`` (e.g. a ``CLIPProcessor``).

    Raises:
        OSError: If ``<root_dir>/<condition>`` cannot be listed.
    """

    # Extensions include the dot so that a name which merely ends in the
    # letters "jpg"/"png" (without being that file type) is not picked up.
    _IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png")

    def __init__(self, root_dir: str, condition: str, categories, processor):
        self.processor = processor
        self.categories = categories
        self.image_paths = []
        self.labels = []

        base_path = os.path.join(root_dir, condition)

        # List the directory once (it does not change between categories) and
        # sort it: os.listdir returns entries in arbitrary order, which would
        # make the sample order differ between runs and machines.
        image_files = sorted(
            entry
            for entry in os.listdir(base_path)
            if entry.lower().endswith(self._IMAGE_EXTENSIONS)
        )

        for label_idx, category in enumerate(categories):
            for filename in image_files:
                if category in filename.lower():
                    self.image_paths.append(os.path.join(base_path, filename))
                    self.labels.append(label_idx)

    def __len__(self) -> int:
        """Return the number of (image, label) samples found."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Load and preprocess one sample.

        Returns:
            A ``(pixel_values, label, path)`` tuple, where ``pixel_values`` is
            the processor output with its first dimension squeezed away if it
            has size 1 (presumably the batch dimension the processor adds for
            a single image -- confirm against the processor in use).
        """
        path = self.image_paths[idx]
        label = self.labels[idx]

        # Image.open keeps the file handle open until the image is closed;
        # convert() returns an independent in-memory copy, so the handle can
        # be released right away instead of leaking one per sample.
        with Image.open(path) as raw_image:
            image = raw_image.convert("RGB")

        processed = self.processor(images=image, return_tensors="pt")
        pixel_values = processed["pixel_values"].squeeze(0)

        return pixel_values, label, path



In [None]:
from transformers import CLIPModel, CLIPProcessor
from torch.utils.data import DataLoader

# Object classes; each one is matched against image filenames by the dataset.
categories = ["airplane", "car", "chair", "cup", "dog", "donkey", "duck", "hat"]

# Processor and model are loaded from the same pretrained CLIP checkpoint.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)
model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)

# Every image condition lives in its own sub-folder of this directory.
DATA_ROOT = "/content/drive/MyDrive/HW3/v0"

# One dataset per condition, built in the order the folder names are listed.
(
    real_dataset,
    feature_dataset,
    geon_dataset,
    sil_dataset,
    blur_dataset,
) = (
    ClipImageDataset(
        root_dir=DATA_ROOT,
        condition=condition_name,
        categories=categories,
        processor=processor,
    )
    for condition_name in ("realistic", "features", "geons", "silhouettes", "blurred")
)

# One loader per dataset: a single image per batch, in dataset order.
(
    real_dataloader,
    feature_dataloader,
    geon_dataloader,
    sil_dataloader,
    blur_dataloader,
) = (
    DataLoader(condition_dataset, batch_size=1, shuffle=False)
    for condition_dataset in (
        real_dataset,
        feature_dataset,
        geon_dataset,
        sil_dataset,
        blur_dataset,
    )
)


In [None]:
# Dump the complete model configuration.
print(model.config)

# Sum the element counts of all parameter tensors.
parameter_sizes = [tensor.numel() for tensor in model.parameters()]
total_params = sum(parameter_sizes)
print(f"Total parameters: {total_params:,}")

# Vision encoder: module structure plus the key sizes from its config.
vision_cfg = model.config.vision_config
print("Vision model:", model.vision_model)
print("Number of vision layers:", vision_cfg.num_hidden_layers)
print("Hidden size:", vision_cfg.hidden_size)
print("Number of attention heads:", vision_cfg.num_attention_heads)

# Text encoder: module structure plus the key sizes from its config.
text_cfg = model.config.text_config
print("Text model:", model.text_model)
print("Number of text layers:", text_cfg.num_hidden_layers)
print("Text hidden size:", text_cfg.hidden_size)

# Per-parameter breakdown: name, tensor shape and element count.
for name, param in model.named_parameters():
    element_count = param.numel()
    print(f"{name}: {param.shape} = {element_count:,} parameters")

### **HW3Q1 1.**

The images are loaded, converted to grayscale, and resized to 128x128. The images for each condition are stored in a dict.

The number of components needed to retain 95% of the variance is displayed for each condition. As expected, the higher-quality, more detailed images require more components to retain the same share of variance. Blurred images, which have little detail, required the fewest components, while geons, which appear to be high-definition, simple 3D renderings, required the most.
