<a href="https://colab.research.google.com/github/BRIAN12682/Automation-Projects/blob/main/NLPMalaria1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sample Code

In [7]:

from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
import torch
from PIL import Image

model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
feature_extractor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)



max_length = 30
num_beams = 4
gen_kwargs = {"max_length": max_length, "num_beams": num_beams}


In [12]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
def predict_step(image_paths):
  images = []
  for image_path in image_paths:
    i_image = Image.open(image_path)
    if i_image.mode != "RGB":
      i_image = i_image.convert(mode="RGB")

    images.append(i_image)

  pixel_values = feature_extractor(images=images, return_tensors="pt").pixel_values
  pixel_values = pixel_values.to(device)

  output_ids = model.generate(pixel_values, **gen_kwargs)

  preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
  preds = [pred.strip() for pred in preds]
  return preds


predict_step(['/content/drive/MyDrive/thier.jpg']) # ['a woman in a hospital bed with a woman in a hospital bed']


['a woman standing in the middle of a forest']

# The Application

In [10]:
from IPython.display import Image, display, clear_output
!pip install transformers torch torchvision pillow lxml
clear_output()

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [26]:
import os
import xml.etree.ElementTree as ET
import pandas as pd

def parse_xml_annotations_with_bboxes(annotation_dir):
    annotations = []
    for xml_file in os.listdir(annotation_dir):
        if xml_file.endswith('.xml'):
            file_path = os.path.join(annotation_dir, xml_file)
            print(f"Processing file: {file_path}")
            try:
                tree = ET.parse(file_path)
                root = tree.getroot()

                filename_element = root.find('filename')
                if filename_element is None or filename_element.text is None or filename_element.text.strip() == "":
                    # Use the XML file name (minus .xml) as the filename
                    filename = os.path.splitext(xml_file)[0] + ".jpg"
                    print(f"Using fallback filename: {filename}")
                else:
                    filename = filename_element.text.strip()
                    print(f"Found filename: {filename}")

                boxes = []
                for obj in root.findall('object'):
                    bbox = obj.find('bndbox')
                    if bbox is not None:
                        xmin = float(bbox.find('xmin').text)
                        ymin = float(bbox.find('ymin').text)
                        xmax = float(bbox.find('xmax').text)
                        ymax = float(bbox.find('ymax').text)
                        boxes.append([xmin, ymin, xmax, ymax])
                    else:
                        print(f"Missing bounding box in {xml_file}")
                annotations.append({'filename': filename, 'boxes': boxes})
            except ET.ParseError as e:
                print(f"Error parsing {xml_file}: {e}")
    return pd.DataFrame(annotations)

annotations_dir = '/content/drive/MyDrive/DATASETS/MalariaPI/annotation'
annotations_df = parse_xml_annotations_with_bboxes(annotations_dir)

print(annotations_df.head())


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Processing file: /content/drive/MyDrive/DATASETS/MalariaPI/annotation/plasmodium-1244.xml
Using fallback filename: plasmodium-1244.jpg
Processing file: /content/drive/MyDrive/DATASETS/MalariaPI/annotation/plasmodium-1633.xml
Using fallback filename: plasmodium-1633.jpg
Processing file: /content/drive/MyDrive/DATASETS/MalariaPI/annotation/plasmodium-0937.xml
Using fallback filename: plasmodium-0937.jpg
Processing file: /content/drive/MyDrive/DATASETS/MalariaPI/annotation/plasmodium-1348.xml
Using fallback filename: plasmodium-1348.jpg
Processing file: /content/drive/MyDrive/DATASETS/MalariaPI/annotation/plasmodium-1095.xml
Using fallback filename: plasmodium-1095.jpg
Processing file: /content/drive/MyDrive/DATASETS/MalariaPI/annotation/plasmodium-0824.xml
Using fallback filename: plasmodium-0824.jpg
Processing file: /content/drive/MyDrive/DATASETS/MalariaPI/annotation/plasmodium-0595.xml
Using fallback filename: plasmodium

In [27]:
from PIL import Image, ImageDraw
from torchvision import transforms

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

def draw_bounding_boxes(image, boxes):
    for box in boxes:
        draw = ImageDraw.Draw(image)
        draw.rectangle(box, outline="red", width=2)
    return image

def load_image_with_boxes(image_path, boxes):
    image = Image.open(image_path).convert('RGB')
    image = draw_bounding_boxes(image, boxes)
    return transform(image)


In [31]:
import torch
from PIL import Image, ImageDraw
from torchvision import transforms
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer

# Model and tokenizer initialization
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
feature_extractor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Image transformation
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

normalize_transform = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

def draw_bounding_boxes(image, boxes):
    for box in boxes:
        draw = ImageDraw.Draw(image)
        draw.rectangle(box, outline="red", width=2)
    return image

def load_image_with_boxes(image_path, boxes):
    image = Image.open(image_path).convert('RGB')
    image = draw_bounding_boxes(image, boxes)
    return transform(image)

def predict_step(image_paths, annotations_df):
    images = []
    for image_path in image_paths:
        filename = os.path.basename(image_path)
        if filename not in annotations_df['filename'].values:
            print(f"Filename {filename} not found in annotations")
            continue

        boxes = annotations_df[annotations_df['filename'] == filename]['boxes'].values[0]
        image = load_image_with_boxes(image_path, boxes)
        images.append(image)

    if not images:
        return []

    pixel_values = torch.stack(images)
    pixel_values = normalize_transform(pixel_values)
    pixel_values = pixel_values.to(device)

    output_ids = model.generate(pixel_values, max_length=30, num_beams=4)

    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    preds = [pred.strip() for pred in preds]
    return preds

In [32]:
print(annotations_df.head())

              filename                                              boxes
0  plasmodium-1617.jpg  [[191.2484, 44.764, 241.2484, 94.764], [403.52...
1  plasmodium-2080.jpg  [[188.7954, 70.6534, 238.7954, 120.6534], [-4....
2  plasmodium-1241.jpg  [[573.1466, 579.1525, 623.1466, 629.1525], [47...
3  plasmodium-2524.jpg  [[658.2828, 276.4392, 708.2828, 326.4392], [75...
4  plasmodium-2108.jpg  [[909.7578, 243.9193, 959.7578, 293.9193], [67...


In [33]:
# Example usage
image_paths = ['/content/drive/MyDrive/DATASETS/MalariaPI/images/plasmodium-0000.jpg']
preds = predict_step(image_paths, annotations_df)
print(preds)


['a black and white photo of a blue and white object']
