# HW05: Predictive Models

## Setup

Run the following cell if you need to install the PyTorch or Transformers libraries on a lab computer:

In [None]:
# Optional install step: uncomment the lines below to install or upgrade the libraries.
# The second command pulls the PyTorch packages from the cu124 wheel index.
# !pip install diffusers transformers accelerate safetensors torchvision --upgrade
# !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124 --upgrade

This code imports the libraries we need to run our inference pipelines:

In [None]:
import torch
from PIL import Image
from transformers import pipeline
from transformers import DetrImageProcessor, DetrForObjectDetection

## Depth [Prediction](https://huggingface.co/depth-anything/Depth-Anything-V2-Base-hf)

In [None]:
# Build the depth-estimation pipeline around the Depth-Anything-V2 base checkpoint.
depth_estimator = pipeline(
  "depth-estimation",
  model="depth-anything/Depth-Anything-V2-Base-hf",
)

In [None]:
# Run the depth pipeline on the sample flower photo and render the entry
# the pipeline stores under the "depth" key.
image = Image.open("./imgs/flowers.jpg")
result = depth_estimator(image)
display(result["depth"])

## Object [Detection](https://huggingface.co/facebook/detr-resnet-101)

Some models don't work with the pipeline inference object, but the Transformers library still has some consistent-ish interfaces for running these models.

In [None]:
# One checkpoint id shared by the image processor and the detection model,
# so the two can never drift apart.
OBJ_MODEL = "facebook/detr-resnet-101"

# Load the processor first, then the model, both from the same checkpoint.
detr_processor, detr_model = (
  loader.from_pretrained(OBJ_MODEL)
  for loader in (DetrImageProcessor, DetrForObjectDetection)
)

In [None]:
# Detect objects in the sample photo with DETR.
# Convert to RGB so grayscale/RGBA inputs are handled the same way as in the
# captioning cells further down.
image = Image.open("./imgs/people.jpg").convert("RGB")
iw, ih = image.size  # PIL reports size as (width, height)

detr_inputs = detr_processor(images=image, return_tensors="pt")

# Inference only: disable autograd so no graph is built and the returned
# tensors do not carry a grad_fn.
with torch.no_grad():
  detr_output = detr_model(**detr_inputs)

# Keep only detections scoring above 0.99 and rescale the boxes to the original
# image. target_sizes is passed as one (height, width) pair per image.
detr_results = detr_processor.post_process_object_detection(
  detr_output,
  threshold=0.99,
  target_sizes=[(ih, iw)],
)

display(detr_results)

In [None]:
# Translate each predicted class id into its human-readable label.
id2label = detr_model.config.id2label
for label_id in detr_results[0]["labels"]:
  print(id2label[int(label_id)])

## Image to [Text](https://huggingface.co/Salesforce/blip-image-captioning-large)

In [None]:
# Smoke test adapted from the Hugging Face example: caption one demo image,
# first with a text prompt and then without one.
# NOTE: this loads the *base* checkpoint, while the section heading links to
# the large variant.

import requests
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

# Single source of truth for the checkpoint so processor and model always match.
BLIP_MODEL = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(BLIP_MODEL)
model = BlipForConditionalGeneration.from_pretrained(BLIP_MODEL)

img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'

# Bound the wait with a timeout, fail loudly on an HTTP error instead of handing
# an error page to PIL, and release the streamed connection when done.
with requests.get(img_url, stream=True, timeout=30) as response:
    response.raise_for_status()
    # convert() decodes the whole image, so the pixels are in memory before
    # the connection is closed.
    raw_image = Image.open(response.raw).convert('RGB')

# conditional image captioning
text = "a photography of"
inputs = processor(raw_image, text, return_tensors="pt")

out = model.generate(**inputs)
print(processor.decode(out[0], skip_special_tokens=True))
# >>> a photography of a woman and her dog

# unconditional image captioning
inputs = processor(raw_image, return_tensors="pt")

out = model.generate(**inputs)
print(processor.decode(out[0], skip_special_tokens=True))


In [None]:
import os
import torch

# Unprompted captions for the subway-train photos.

# Image folder relative to the notebook's working directory, matching the
# "./imgs/..." paths used by the depth and detection cells, rather than a
# machine-specific absolute path.
img_folder = "./imgs"

# Run on the GPU when one is available; inputs are moved to the same device below.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

train_imgs = [
    "A_train_02.jpg", "A_train_03.jpg", "A_train_04.webp", "A_train_05.webp",
    "A_train_06.jpeg", "A_train_06.webp", "A_train_07.webp", "A_train_08.jpg",
    "A_train_09.jpeg", "A_train_10.jpeg", "A_train_11.jpg", "A_train_12.webp",
    "A_train_13.jpg", "A_train_14.webp", "A_train_15.jpg", "A_train_16.webp",
    "A_train_17.webp", "A_train_18.webp", "A_train_19.jpg", "A_train_20.jpg"
]

# enumerate() numbers the captions from 1 without a hand-maintained counter.
for count, photo in enumerate(train_imgs, start=1):
    img_path = os.path.join(img_folder, photo)
    raw_image = Image.open(img_path).convert("RGB")

    inputs = processor(raw_image, return_tensors="pt").to(device)
    out = model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)

    print(f"{count} - {caption}\n")

In [None]:
# Prompted captions: the same train photos, with a text prompt passed to the
# processor alongside each image.
prompt = "this new subway train is "

# enumerate() numbers the captions from 1 without a hand-maintained counter.
for count, photo in enumerate(train_imgs, start=1):
    img_path = os.path.join(img_folder, photo)
    raw_image = Image.open(img_path).convert("RGB")

    inputs = processor(raw_image, text=prompt, return_tensors="pt").to(device)

    out = model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)

    print(f"{count} - {caption}\n")


In [None]:
# Second prompt experiment on the train photos: name the location up front.
prompt = "New York City Subway: this photo shows "

# enumerate() numbers the captions from 1 without a hand-maintained counter.
for count, photo in enumerate(train_imgs, start=1):
    img_path = os.path.join(img_folder, photo)
    raw_image = Image.open(img_path).convert("RGB")

    inputs = processor(raw_image, text=prompt, return_tensors="pt").to(device)

    out = model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)

    print(f"new prompt {count} - {caption}\n")

In [None]:
# Animal photos: caption each one without a prompt, then again with a prompt.
# Results are labelled by file name, so no running counter is needed.

animal_imgs = [
    "catdog.webp", "cats.webp", "dogs.jpg"
]

# Pass 1: unprompted captions.
for photo in animal_imgs:
    img_path = os.path.join(img_folder, photo)
    raw_image = Image.open(img_path).convert("RGB")

    inputs = processor(raw_image, return_tensors="pt").to(device)
    out = model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)

    print(f"{photo} - {caption}\n")

# Pass 2: prompted captions.
# Earlier attempt, kept for reference:
#prompt = "how many animals are in the picture "
prompt = "Describe the animals in the image are "

# Prompts ending in a colon produced captions that made no sense, so the prompt
# was changed from "the animal(s) in the image are:" to the one above.
for photo1 in animal_imgs:
    img_path = os.path.join(img_folder, photo1)
    raw_image = Image.open(img_path).convert("RGB")

    inputs = processor(raw_image, text=prompt, return_tensors="pt").to(device)

    out = model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)

    print(f"{photo1} - {caption}\n")


catdog.webp - two pugs laying on a blanket with a cat

cats.webp - a group of cats sitting in a circle

dogs.jpg - three dogs are sitting next to each other dogs

catdog.webp - how many animals are in the picture of a cat and a dog

cats.webp - how many animals are in the picture

dogs.jpg - how many animals are in the picture



In [31]:
# Group photos of people of various races: caption each image without a prompt,
# then again with a prompt about skin color.

ppl_imgs = [
    "whitemen.webp",
    "eastasianmen.webp",
    "asian actresses.webp",
    "blackmen.jpg",
]

# Pass 1: unprompted captions.
for photo in ppl_imgs:
    img_path = os.path.join(img_folder, photo)
    raw_image = Image.open(img_path).convert("RGB")

    inputs = processor(images=raw_image, return_tensors="pt").to(device)
    out = model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)

    print(f"{photo} - {caption}\n")

# Pass 2: the same images, with the text prompt supplied to the processor.
prompt = "the skin color of the people in the picture is "

for photo in ppl_imgs:
    img_path = os.path.join(img_folder, photo)
    raw_image = Image.open(img_path).convert("RGB")

    inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(device)
    out = model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)

    print(f"{photo} - {caption}\n")

 

whitemen.webp - the cast of the vampire

eastasianmen.webp - four men in suits and ties

asian actresses.webp - the nominees nominees nominees nominees nominees nominees nominees nominees nominees nominees nominees nominees nominees nominees nominees nominees nominees nominees nominees

blackmen.jpg - the cast of the oscars

whitemen.webp - the skin color of the people in the picture is the same

eastasianmen.webp - the skin color of the people in the picture is the same, but the hair color is different

asian actresses.webp - the skin color of the people in the picture is the most popular

blackmen.jpg - the skin color of the people in the picture is a black man



In [41]:
pip install pytesseract

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
# Subway exit signs: unprompted captions first, then OCR text extraction plus a
# prompted caption for each image.
import pytesseract

sub_imgs = [
    "subway exit 1.jpg",
    "subway exit 2.webp",
    "subway_exit_signs.jpg",
]

# Pass 1: unprompted captions.
for photo in sub_imgs:
    img_path = os.path.join(img_folder, photo)
    raw_image = Image.open(img_path).convert("RGB")

    inputs = processor(images=raw_image, return_tensors="pt").to(device)
    out = model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)

    print(f"{photo} - {caption}\n")

# Pass 2: try to read the sign text with Tesseract, then caption with a prompt.
# The text extraction did not have much luck on these photos.
prompt = "the street name on the exit sign says "

for photo in sub_imgs:
    img_path = os.path.join(img_folder, photo)
    raw_image = Image.open(img_path).convert("RGB")

    extracted_text = pytesseract.image_to_string(raw_image)
    print(f"Extracted text from {photo}: {extracted_text}\n")

    inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(device)
    out = model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)

    print(f"{photo} - {caption}\n")

subway exit 1.jpg - a sign on the floor in the subway station

subway exit 2.webp - exit sign in subway station

subway_exit_signs.jpg - a sign that says exit to the subway

Extracted text from subway exit 1.jpg: 

subway exit 1.jpg - the street name on the exit sign says & amp & amp & amp & amp & amp & amp & amp & amp & amp & amp

Extracted text from subway exit 2.webp: 

subway exit 2.webp - the street name on the exit sign says exit

Extracted text from subway_exit_signs.jpg: Sage Sets

Use last two

stairways
for) @eo



subway_exit_signs.jpg - the street name on the exit sign says exit

