In [1]:
#NLP CONNECT MODEL 
#https://huggingface.co/nlpconnect/vit-gpt2-image-captioning

In [2]:
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
import torch
from PIL import Image

model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
feature_extractor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)



max_length = 16
num_beams = 4
gen_kwargs = {"max_length": max_length, "num_beams": num_beams}
def predict_step(image_paths):
  images = []
  for image_path in image_paths:
    i_image = Image.open(image_path)
    if i_image.mode != "RGB":
      i_image = i_image.convert(mode="RGB")

    images.append(i_image)

  pixel_values = feature_extractor(images=images, return_tensors="pt").pixel_values
  pixel_values = pixel_values.to(device)

  output_ids = model.generate(pixel_values, **gen_kwargs)

  preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
  preds = [pred.strip() for pred in preds]
  return preds


result = predict_step(['/kaggle/input/img-roads/Generate camera captured shot of a indian road in urban area with people navigating through traffic including pedestrians cyclists and motorcyclists of year 2019.png'])
result



Downloading (…)lve/main/config.json:   0%|          | 0.00/4.61k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/982M [00:00<?, ?B/s]

Downloading (…)rocessor_config.json:   0%|          | 0.00/228 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/241 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/120 [00:00<?, ?B/s]

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


['a street filled with lots of cars and people']

In [3]:

from transformers import pipeline

image_to_text = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")

image_to_text("/kaggle/input/img-roads/Generate camera captured shot of a indian road in urban area with people navigating through traffic including pedestrians cyclists and motorcyclists of year 2019.png")


Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


[{'generated_text': 'a busy street filled with cars and pedestrians '}]

In [4]:
from PIL import Image
import os

# Open the image file
image = Image.open("/kaggle/input/img-roads/Generate camera captured shot of a indian road in urban area with people navigating through traffic including pedestrians cyclists and motorcyclists of year 2019.png")
file_name = os.path.basename("/kaggle/input/img-roads/Generate camera captured shot of a indian road in urban area with people navigating through traffic including pedestrians cyclists and motorcyclists of year 2019.png")
# Get image metadata
file_size = image.info.get('filesize')
file_type = image.format
image_size = image.size
megapixels = (image_size[0] * image_size[1]) / 1000000  # Calculate megapixels
image_width = image_size[0]
image_height = image_size[1]

# Print the extracted metadata
print(f"FileName: {file_name}")
print(f"FileType: {file_type}")
print(f"ImageSize: {image_size[0]}x{image_size[1]}")
print(f"Megapixels: {megapixels:.2f}")


FileName: Generate camera captured shot of a indian road in urban area with people navigating through traffic including pedestrians cyclists and motorcyclists of year 2019.png
FileType: PNG
ImageSize: 1024x1024
Megapixels: 1.05


In [5]:
# pip install accelerate
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-large", device_map="auto")

input_text = "Extract the image's weather from the following filename. If it is mentioned, use it; otherwise, default to sunny." + file_name 
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")

outputs = model.generate(input_ids)
description = tokenizer.decode(outputs[0])

print(tokenizer.decode(outputs[0]))
description = description.replace('<pad>', '').strip()
description = description.replace('</s>', '').strip()
print(description)

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Downloading (…)lve/main/config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

<pad> sunny</s>
sunny


In [6]:
# Import necessary libraries
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the model and tokenizer
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-large", device_map="auto")

# Define your three prompts
prompt_image_description = "Extract a brief description of the image from the following filename : "  + file_name
prompt_daytime = "Extract the image's day duration from the following filename. If it is mentioned, use it; otherwise, default to daytime."  + file_name
prompt_weather = "Extract the image's weather conditions from the following filename. If it is mentioned, use it; otherwise, default to sunny."  + file_name

# Process each prompt and generate the corresponding output
output_image_description = model.generate(tokenizer(prompt_image_description, return_tensors="pt").input_ids.to("cuda"))
output_daytime = model.generate(tokenizer(prompt_daytime, return_tensors="pt").input_ids.to("cuda"))
output_weather = model.generate(tokenizer(prompt_weather, return_tensors="pt").input_ids.to("cuda"))

# Decode and clean the generated outputs
description = tokenizer.decode(output_image_description[0], skip_special_tokens=True).strip()
daytime = tokenizer.decode(output_daytime[0], skip_special_tokens=True).strip()
weather = tokenizer.decode(output_weather[0], skip_special_tokens=True).strip()

# Print and use the generated outputs as needed
print("Image Description:", description)
print("Daytime:", daytime)
print("Weather:", weather)


Image Description: a camera captured shot of a indian road in urban area with people navigating through
Daytime: daytime
Weather: sunny


In [7]:
outputs 

tensor([[    0, 13546,     1]], device='cuda:0')

In [8]:
import pandas as pd

In [9]:
dict = {
    'Image Caption' : result,
    'Image Description' : description,
    'Image DayTime' :daytime,
    'Image Weather' :weather,
    'File Type' : file_type,
    'Image Size' : f"{image_size[0]}x{image_size[1]}" ,
    'Megapixels' : f"{megapixels:.2f}",
    'Image Width' : image_width,
    'Image Height' : image_height,
}

In [10]:
df = pd.DataFrame(dict)

In [11]:
df.head()

Unnamed: 0,Image Caption,Image Description,Image DayTime,Image Weather,File Type,Image Size,Megapixels,Image Width,Image Height
0,a street filled with lots of cars and people,a camera captured shot of a indian road in urb...,daytime,sunny,PNG,1024x1024,1.05,1024,1024
