# HW05: Predictive Models

## Setup

Uncomment and run the following cell if you need to install the PyTorch or Transformers libraries on a lab computer:

In [3]:
# !pip install diffusers transformers accelerate safetensors torchvision --upgrade
# !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124 --upgrade

This code imports the libraries we need to run our inference pipelines:

In [4]:
import torch
from PIL import Image
from transformers import pipeline
from transformers import DetrImageProcessor, DetrForObjectDetection

## Depth [Prediction](https://huggingface.co/depth-anything/Depth-Anything-V2-Base-hf)

In [None]:
# Depth-estimation pipeline backed by the Depth-Anything-V2 "Base" checkpoint.
depth_estimator = pipeline("depth-estimation", model="depth-anything/Depth-Anything-V2-Base-hf")

In [None]:
# Estimate depth for the flowers photo and render the predicted depth map inline.
image = Image.open("./imgs/flowers.jpg")
result = depth_estimator(image)
depth_map = result["depth"]
display(depth_map)

## Object [Detection](https://huggingface.co/facebook/detr-resnet-101)

Some models don't work with the pipeline inference object, but the Transformers library still has some consistent-ish interfaces for running these models.

In [None]:
# Load the DETR image processor and the detection model from the same checkpoint.
OBJ_MODEL = "facebook/detr-resnet-101"
detr_processor, detr_model = (
    DetrImageProcessor.from_pretrained(OBJ_MODEL),
    DetrForObjectDetection.from_pretrained(OBJ_MODEL),
)

In [None]:
# Detect objects in the people photo with DETR.
image = Image.open("./imgs/people.jpg")
iw, ih = image.size  # PIL reports size as (width, height)

detr_inputs = detr_processor(images=image, return_tensors="pt")

# Inference only: skip autograd bookkeeping so the outputs do not carry a graph.
with torch.no_grad():
    detr_output = detr_model(**detr_inputs)

# target_sizes takes one (height, width) pair per image so the boxes are scaled
# back to the original resolution; only detections scoring above 0.99 are kept.
detr_results = detr_processor.post_process_object_detection(
    detr_output, threshold=0.99, target_sizes=[(ih, iw)]
)

display(detr_results)

In [None]:
# Translate each predicted class id into its human-readable label.
id2label = detr_model.config.id2label
for label_id in detr_results[0]["labels"]:
  print(id2label[int(label_id)])

## Image to [Text](https://huggingface.co/Salesforce/blip-image-captioning-large)

In [5]:
# BLIP captioning demo taken from Hugging Face, used to test how the model runs.
# NOTE(review): the section heading links to blip-image-captioning-large, but this
# cell loads the -base checkpoint — confirm which one is intended.

import requests
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

# Single source for the checkpoint id so processor and model cannot drift apart.
BLIP_MODEL = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(BLIP_MODEL)
model = BlipForConditionalGeneration.from_pretrained(BLIP_MODEL)

img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'
# Bound the wait and surface HTTP errors here, instead of hanging indefinitely or
# handing an error page to PIL.
response = requests.get(img_url, stream=True, timeout=30)
response.raise_for_status()
raw_image = Image.open(response.raw).convert('RGB')

# conditional image captioning: the text is passed to the processor with the image
text = "a photography of"
inputs = processor(raw_image, text, return_tensors="pt")

out = model.generate(**inputs)
print(processor.decode(out[0], skip_special_tokens=True))
# >>> a photography of a woman and her dog
# (sample output copied with the demo code; the exact wording may differ per run/version)

# unconditional image captioning: image only, no text
inputs = processor(raw_image, return_tensors="pt")

out = model.generate(**inputs)
print(processor.decode(out[0], skip_special_tokens=True))


a photography of a woman and her dog on the beach
a woman sitting on the beach with her dog


In [6]:
import os
import torch

# Unprompted captions for the train photos.

# Relative folder, matching the "./imgs/..." paths the earlier cells read from, so
# the notebook does not depend on one machine's absolute checkout location.
img_folder = "./imgs"
# Use the GPU when one is available; the model and every input batch must share a device.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

train_imgs = [
    "A_train_02.jpg", "A_train_03.jpg", "A_train_04.webp", "A_train_05.webp",
    "A_train_06.jpeg", "A_train_06.webp", "A_train_07.webp", "A_train_08.jpg",
    "A_train_09.jpeg", "A_train_10.jpeg", "A_train_11.jpg", "A_train_12.webp",
    "A_train_13.jpg", "A_train_14.webp", "A_train_15.jpg", "A_train_16.webp",
    "A_train_17.webp", "A_train_18.webp", "A_train_19.jpg", "A_train_20.jpg"
]

# enumerate supplies the 1-based position, so no hand-maintained counter is needed.
for count, photo in enumerate(train_imgs, start=1):
    img_path = os.path.join(img_folder, photo)
    # Release the file handle as soon as the RGB copy exists.
    with Image.open(img_path) as img:
        raw_image = img.convert("RGB")

    inputs = processor(raw_image, return_tensors="pt").to(device)
    out = model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)

    print(f"{count} - {caption}\n")

1 - a train is stopped at a station in tokyo

2 - a train is pulling passengers on the tracks

3 - a train on the tracks

4 - a train at a station with people standing around

5 - a train on the tracks

6 - a train is parked at the station

7 - a train with a sign on the side of it

8 - a train at the station in the city of cologne, germany - stock image

9 - a train is coming down the tracks in a city

10 - two trains on the tracks

11 - a train traveling through the city of chicago

12 - a train is parked at a station in the night

13 - a train traveling down the tracks in a city

14 - a train traveling down the tracks in a city

15 - a train is stopped at a train station

16 - a train traveling down the tracks in the snow

17 - a train traveling down the tracks in the snow

18 - a train traveling down the tracks in the snow

19 - a train is coming down the tracks in the snow

20 - a train is coming down the tracks in the snow



In [8]:
# Prompted captions: the prompt text is passed to the processor with each train photo.

prompt = "this new subway train type is "

# enumerate supplies the 1-based position, so no hand-maintained counter is needed.
for count, photo in enumerate(train_imgs, start=1):
    img_path = os.path.join(img_folder, photo)
    # Release the file handle as soon as the RGB copy exists.
    with Image.open(img_path) as img:
        raw_image = img.convert("RGB")

    inputs = processor(raw_image, text=prompt, return_tensors="pt").to(device)

    out = model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)

    print(f"{count} - {caption}\n")


1 - this new subway train type is the most in the world

2 - this new subway train type is the only in the city

3 - this new subway train type is red and white

4 - this new subway train type is the first in the world to be built

5 - this new subway train type is a high speed

6 - this new subway train type is the most efficient in the world

7 - this new subway train type is parked at the station

8 - this new subway train type is the only used in the city of toronto, canada, and is now used for public transportation stock photo

9 - this new subway train type is running through the city of kolkata

10 - this new subway train type is a great way to get around the city

11 - this new subway train type is seen in the lower manhattan area of new york, new york, new york, new york, new

12 - this new subway train type is a little bit of a train

13 - this new subway train type is the only available in the city of boston

14 - this new subway train type is the most in the world

15 - thi

In [None]:
# Second prompted run over the train photos, using a New York City-specific prompt.
prompt = "this new york city subway car type is  "

# enumerate supplies the 1-based position, so no hand-maintained counter is needed.
for count, photo in enumerate(train_imgs, start=1):
    img_path = os.path.join(img_folder, photo)
    # Release the file handle as soon as the RGB copy exists.
    with Image.open(img_path) as img:
        raw_image = img.convert("RGB")

    inputs = processor(raw_image, text=prompt, return_tensors="pt").to(device)

    out = model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)

    print(f"new prompt {count} - {caption}\n")

In [None]:
# Animal photos: caption each one without a prompt, then again with a text prompt.

animal_imgs = [
    "catdog.webp", "cats.webp", "dogs.jpg"
]

# Decode every image once; both passes below reuse the same RGB copies.
animal_photos = []
for photo in animal_imgs:
    with Image.open(os.path.join(img_folder, photo)) as img:
        animal_photos.append((photo, img.convert("RGB")))

# Pass 1: unprompted captions.
for photo, raw_image in animal_photos:
    inputs = processor(raw_image, return_tensors="pt").to(device)
    out = model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)

    print(f"{photo} - {caption}\n")

# Earlier prompt that was tried: "how many animals are in the picture "
prompt = "Describe the animals in the image are "

# Pass 2: prompted captions. A prompt ending in a colon
# ("the animal(s) in the image are:") produced output that made no sense,
# so the prompt was changed to the one above.
for photo1, raw_image in animal_photos:
    inputs = processor(raw_image, text=prompt, return_tensors="pt").to(device)

    out = model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)

    print(f"{photo1} - {caption}\n")


In [None]:
# Photos of people of various races: unprompted captions first, then prompted.

ppl_imgs = [
    "whitemen.webp", "eastasianmen.webp", "asian actresses.webp", "blackmen.jpg"
]

# Decode every image once; both passes below reuse the same RGB copies.
ppl_photos = []
for photo in ppl_imgs:
    with Image.open(os.path.join(img_folder, photo)) as img:
        ppl_photos.append((photo, img.convert("RGB")))

# Pass 1: unprompted captions.
for photo, raw_image in ppl_photos:
    inputs = processor(raw_image, return_tensors="pt").to(device)
    out = model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)

    print(f"{photo} - {caption}\n")

prompt = "the skin color of the people in the picture is "

# Pass 2: the same photos, with the prompt passed to the processor as text.
for photo1, raw_image in ppl_photos:
    inputs = processor(raw_image, text=prompt, return_tensors="pt").to(device)

    out = model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)

    print(f"{photo1} - {caption}\n")


 

In [None]:
pip install pytesseract

In [None]:
# Subway exit signs: unprompted captions, then prompted captions alongside OCR.
# Prompting alone was tried first but did not work well, so Tesseract (an OCR tool)
# is used through pytesseract for the text extraction.
import pytesseract

sub_imgs = [
    "subway exit 1.jpg","subway exit 2.webp", "subway_exit_signs.jpg"
]

# Decode every image once; both passes below reuse the same RGB copies.
sub_photos = []
for photo in sub_imgs:
    with Image.open(os.path.join(img_folder, photo)) as img:
        sub_photos.append((photo, img.convert("RGB")))

# Pass 1: unprompted captions.
for photo, raw_image in sub_photos:
    inputs = processor(raw_image, return_tensors="pt").to(device)
    out = model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)

    print(f"{photo} - {caption}\n")

prompt = "the street name on the exit sign says "
# Pass 2: OCR text plus a prompted caption per photo.
# Tried to extract the text this way, but did not have a lot of luck.
for photo1, raw_image in sub_photos:
    extracted_text = pytesseract.image_to_string(raw_image)
    print(f"Extracted text from {photo1}: {extracted_text}\n")

    inputs = processor(raw_image, text=prompt, return_tensors="pt").to(device)

    out = model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)

    print(f"{photo1} - {caption}\n")