<a href="https://colab.research.google.com/github/AdopleAIOrg/Image-To-Text/blob/main/Image_To_Text_Product.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -r /content/requirements.txt

In [None]:
%%writefile app.py
import streamlit as st
import torch
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from PIL import Image

class ImageToText:
    """Generate captions for images and expose them through a Streamlit page.

    The model, image processor and tokenizer are all loaded from the same
    pretrained checkpoint (``MODEL_NAME``) when an instance is created.
    """

    # Single source of truth for the checkpoint used by the model, the image
    # processor and the tokenizer.
    MODEL_NAME = "nlpconnect/vit-gpt2-image-captioning"

    def __init__(self, max_length: int = 16, num_beams: int = 4):
        """
        Initializes the ImageToText class.

        Args:
            max_length (int): ``max_length`` value forwarded to ``generate``.
            num_beams (int): ``num_beams`` value forwarded to ``generate``.
        """
        self.model = VisionEncoderDecoderModel.from_pretrained(self.MODEL_NAME)
        self.feature_extractor = ViTImageProcessor.from_pretrained(self.MODEL_NAME)
        self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL_NAME)

        # Use CUDA when torch reports it as available, otherwise the CPU.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

        # Keyword arguments passed to ``self.model.generate`` on every call.
        self.gen_kwargs = {"max_length": max_length, "num_beams": num_beams}

    def _predict_step(self, image_paths: list) -> list:
        """
        Generates captions for the given list of image paths.

        Args:
            image_paths (list): Images to caption. Each entry is handed to
                ``PIL.Image.open``; ``streamlit_interface`` passes the
                uploaded file object rather than a filesystem path.

        Returns:
            list: One stripped caption string per input image, in the same
            order. An empty input yields an empty list.
        """
        # Nothing to caption: return early instead of running the image
        # processor and the model on an empty batch.
        if not image_paths:
            return []

        images = []
        for image_path in image_paths:
            # ``convert`` returns a fully loaded copy, so the source image can
            # be released as soon as the ``with`` block ends. This avoids
            # leaving file handles open when real paths are passed in.
            with Image.open(image_path) as source_image:
                images.append(source_image.convert(mode="RGB"))

        pixel_values = self.feature_extractor(images=images, return_tensors="pt").pixel_values
        pixel_values = pixel_values.to(self.device)

        output_ids = self.model.generate(pixel_values, **self.gen_kwargs)

        preds = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        return [pred.strip() for pred in preds]

    def streamlit_interface(self) -> None:
        """
        Defines the Streamlit user interface and logic.

        Shows a title and a file uploader. Once a file is uploaded it is
        displayed, and pressing the button writes the generated caption
        below it.
        """
        st.title("Image Captioning App")

        # Let the user pick a single image file.
        image_file = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])

        if image_file is not None:
            # Open the upload once here purely for display.
            image = Image.open(image_file)
            st.image(image, caption="Uploaded Image", use_column_width=True)

            # Captioning only runs when the button is pressed.
            if st.button("Generate "):
                # _predict_step works on a batch; send a one-item batch and
                # take its single result.
                caption = self._predict_step([image_file])[0]

                st.write("Generated Text :", caption)

if __name__ == "__main__":

    # Streamlit re-executes this script on every widget interaction, so a
    # plain ImageToText() here would reload the model on each button click.
    # st.cache_resource keeps one instance alive across reruns. The getattr
    # fallback is a no-op decorator for Streamlit releases that do not
    # provide cache_resource (the app then behaves as before).
    _cache_resource = getattr(st, "cache_resource", lambda func: func)

    @_cache_resource
    def _load_image_to_text():
        """Build the ImageToText instance (model, processor, tokenizer)."""
        return ImageToText()

    img_to_txt = _load_image_to_text()
    img_to_txt.streamlit_interface()

In [None]:
!streamlit run app.py & npx localtunnel --port 8501