In [22]:
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from PIL import Image

# Load the captioning model together with its image processor and tokenizer.
# All three come from the same Hugging Face checkpoint.
CHECKPOINT = "nlpconnect/vit-gpt2-image-captioning"
model = VisionEncoderDecoderModel.from_pretrained(CHECKPOINT)
feature_extractor = ViTImageProcessor.from_pretrained(CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)

# Read the local image, force RGB mode, and encode it as a one-image batch of tensors
image_path = "cat1.jpg"  # Replace with your image path
image = Image.open(image_path).convert("RGB")
pixel_values = feature_extractor(images=[image], return_tensors="pt").pixel_values

# Decoding settings, collected in one place so they are easy to tune
generation_kwargs = dict(
    min_length=25,           # lower bound on output length, to encourage longer captions
    max_length=100,          # upper bound on output length, to avoid run-away captions
    num_beams=5,             # beam search with 5 beams
    no_repeat_ngram_size=2,  # never emit the same 2-gram twice
    early_stopping=True,     # end beam search once enough finished candidates exist
)
generated_ids = model.generate(pixel_values, **generation_kwargs)

# Turn the first (only) generated sequence back into text, dropping special tokens
caption = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

print("Caption:", caption)


Caption: an orange and white cat is looking at the camera with its mouth open and its eyes wide open, with the cat's tongue hanging out 


In [10]:
import torch
from PIL import Image
import matplotlib.pyplot as plt
import torchvision.transforms as T

# Load the pretrained YOLOv5-small model from the Ultralytics hub repository.
# torch.hub needs internet access here unless the repo is already in the local
# hub cache (the recorded run of this cell failed for exactly that reason).
model = torch.hub.load('ultralytics/yolov5', 'yolov5s')

# Load the image from a local file (same sample image as the other cells)
image_path = "cat1.jpg"  # Replace with your image path
image = Image.open(image_path)

# Perform inference on the image loaded above. Previously the image was opened
# but never used, and the model was handed the path instead.
results = model(image)

# Print a text summary of the detections and display the result
results.print()
results.show()


RuntimeError: It looks like there is no internet connection and the repo could not be found in the cache (/Users/cth/.cache/torch/hub)

In [9]:
%pip install torch torchvision

Collecting torchvision
  Downloading torchvision-0.18.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (6.6 kB)
Downloading torchvision-0.18.0-cp312-cp312-macosx_11_0_arm64.whl (1.6 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.6/1.6 MB 11.2 MB/s eta 0:00:00
Installing collected packages: torchvision
Successfully installed torchvision-0.18.0
Note: you may need to restart the kernel to use updated packages.


In [13]:
import torch
from torchvision import models, transforms
from PIL import Image

# Load ResNet-50 with ImageNet weights.
# `pretrained=True` is deprecated in torchvision; the `weights` enum is the
# supported API. IMAGENET1K_V1 is the checkpoint `pretrained=True` resolved to,
# so the loaded parameters are unchanged.
model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
model.eval()  # switch to evaluation mode before inference

# Preprocessing pipeline: resize, centre-crop to 224x224, convert to a tensor,
# then normalise each of the three channels with the given mean / std.
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

image_path = "cat1.jpg"  # Replace with your image path
# Force RGB: Normalize above is configured for exactly three channels, so a
# grayscale or RGBA file would otherwise fail during preprocessing.
image = Image.open(image_path).convert("RGB")
image = preprocess(image)
image = image.unsqueeze(0)  # Add batch dimension

# Perform inference without tracking gradients
with torch.no_grad():
    output = model(image)
# Index of the highest-scoring class along the class dimension
_, predicted = torch.max(output, 1)

print("Predicted class:", predicted.item())


Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /Users/cth/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth


URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1000)>

In [23]:
from diffusers import DiffusionPipeline

# Load a diffusion pipeline from the Hugging Face Hub repo "nDimensional/Experience".
# NOTE(review): the recorded run of this cell failed with a 404 for
# `model_index.json` in that repo, so the repo presumably is not published in the
# layout `DiffusionPipeline.from_pretrained` expects. Confirm the repo contents
# (or the intended repo id) before relying on this cell.
pipeline = DiffusionPipeline.from_pretrained("nDimensional/Experience")

EntryNotFoundError: 404 Client Error. (Request ID: Root=1-665e8bf4-647e75d83dc3df7e5d21474e;c2a4d893-1f5b-4753-a705-395672fbee1b)

Entry Not Found for url: https://huggingface.co/nDimensional/Experience/resolve/main/model_index.json.

In [24]:
# Install huggingface_hub via the %pip magic, which targets the running kernel's environment.
# TODO: pin a version (huggingface_hub==X.Y.Z) so a re-run installs the same release.
%pip install huggingface_hub

Note: you may need to restart the kernel to use updated packages.


In [31]:
import requests
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

# Load the processor and model
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")

# Load the image from a local file
image_path = "cat1.jpg"  # Replace with your image path
raw_image = Image.open(image_path).convert("RGB")  # Ensure the image is in RGB mode

# Conditional image captioning: the image is encoded together with a text prompt
text = "a photography of"
inputs = processor(raw_image, text, return_tensors="pt")

out = model.generate(**inputs)
conditional_caption = processor.decode(out[0], skip_special_tokens=True)
print("Conditional Caption:", conditional_caption)

# Unconditional image captioning: image only, no text prompt
inputs = processor(raw_image, return_tensors="pt")

out = model.generate(**inputs)
unconditional_caption = processor.decode(out[0], skip_special_tokens=True)
print("Unconditional Caption:", unconditional_caption)

# Generate a longer caption with beam search. The image-only `inputs` built for
# the unconditional caption are reused as-is; re-running the processor on the
# same image would only repeat identical preprocessing.
generated_ids = model.generate(
    **inputs,
    min_length=25,  # Set a minimum length to encourage longer captions
    max_length=100,  # Set a maximum length to avoid excessively long captions
    num_beams=5,  # Use beam search to improve the quality of the generated caption
    no_repeat_ngram_size=2,  # Avoid repeating n-grams of the specified size
    early_stopping=True  # End beam search once enough finished candidates exist
)

caption = processor.decode(generated_ids[0], skip_special_tokens=True)

# Ensure the caption has exactly TARGET_WORD_COUNT whitespace-separated words:
# longer captions are truncated, shorter ones are padded with literal '<pad>' words.
TARGET_WORD_COUNT = 20
words = caption.split()
if len(words) > TARGET_WORD_COUNT:
    caption = ' '.join(words[:TARGET_WORD_COUNT])
elif len(words) < TARGET_WORD_COUNT:
    caption = ' '.join(words + ['<pad>'] * (TARGET_WORD_COUNT - len(words)))

print("Caption:", caption)


Conditional Caption: a photography of a cat yawning with its mouth open
Unconditional Caption: there is a cat that is yawning and looking at the camera
Caption: there is an orange and white cat yawning with its mouth wide open while sitting on top of a window
