CAPTION AND KEYWORD GENERATOR USING BLIP **MODEL**

In [29]:
#IMPORT LIBRARIES

import gradio as gr
import os
import cv2
import requests
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration  #BLIP model
from io import BytesIO
import tempfile

In [30]:
#LOAD THE PRETRAINED BLIP PROCESSOR AND CAPTIONING MODEL

# The processor and the model are both loaded from the same checkpoint name,
# so it is spelled out once and reused for the two from_pretrained() calls.
_BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"

processor = BlipProcessor.from_pretrained(_BLIP_CHECKPOINT)
model = BlipForConditionalGeneration.from_pretrained(_BLIP_CHECKPOINT)


In [31]:
#FUNCTION TO DOWNLOAD VIDEO FROM URL

def download_video_from_url(video_url):
    """Download a video over HTTP into a temporary ``.mp4`` file.

    Args:
        video_url: URL of the video to fetch.

    Returns:
        The path of the temporary file holding the downloaded bytes, or
        ``None`` when the server does not answer with HTTP 200 or when any
        error occurs during the download. The caller owns the returned file
        and is responsible for deleting it.
    """
    temp_path = None
    try:
        # stream=True avoids holding the whole video in memory; the timeout
        # keeps an unresponsive server from blocking the caller forever.
        with requests.get(video_url, stream=True, timeout=30) as response:
            if response.status_code != 200:
                return None
            # Write through the handle NamedTemporaryFile already opened
            # (instead of reopening the path) so the descriptor is closed
            # deterministically and the code also works on Windows.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as temp_file:
                temp_path = temp_file.name
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        temp_file.write(chunk)
        return temp_path
    except Exception as e:
        # Best-effort boundary: report the problem and signal failure with None.
        print(f"Error downloading video: {e}")
        # Do not leave a partially written file behind.
        if temp_path is not None:
            try:
                os.remove(temp_path)
            except OSError:
                pass
        return None

#FUNCTION TO PROCESS VIDEO AND GENERATE CAPTIONS

def process_video(video_file, video_url):
    """Caption sampled frames of a video and derive keywords from the captions.

    Exactly one source is used: ``video_file`` takes precedence, otherwise
    ``video_url`` is downloaded with ``download_video_from_url``.

    Args:
        video_file: Uploaded file. Either a path string or an object exposing
            the path through a ``name`` attribute; falsy when nothing was
            uploaded.
        video_url: URL of a video to download; falsy when not supplied.

    Returns:
        A ``(captions, keywords)`` tuple of strings. ``captions`` holds the
        unique captions, one per line; ``keywords`` holds the unique
        whitespace-separated caption words joined by ``", "``. Both are listed
        in order of first appearance. When no usable input is available, both
        elements contain the same error message.
    """
    downloaded_path = None
    if video_file:
        # Accept both a plain path string and a tempfile-like wrapper.
        video_path = getattr(video_file, "name", video_file)
    elif video_url:
        downloaded_path = download_video_from_url(video_url)
        if not downloaded_path:
            message = "Error: Video URL could not be downloaded."
            return message, message
        video_path = downloaded_path
    else:
        message = "Error: No video provided."
        return message, message

    frame_interval = 10
    # Dicts are used as insertion-ordered sets so the output is deterministic.
    unique_captions = {}
    unique_keywords = {}

    cap = cv2.VideoCapture(video_path)
    try:
        frame_index = 0
        # grab() advances one frame without decoding it; only every
        # frame_interval-th frame is decoded and captioned, so the samples
        # span the whole video rather than just its first frames.
        while cap.grab():
            if frame_index % frame_interval == 0:
                ret, frame = cap.retrieve()
                if not ret:
                    break
                pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                inputs = processor(pil_image, return_tensors="pt")
                out = model.generate(**inputs)
                caption = processor.decode(out[0], skip_special_tokens=True)
                unique_captions.setdefault(caption, None)

                #KEYWORD EXTRACTION BY SPLITTING CAPTIONS
                for word in caption.split():
                    unique_keywords.setdefault(word, None)
            frame_index += 1
    finally:
        # Release the capture even if captioning fails, and remove the
        # temporary download so repeated runs do not fill the disk.
        cap.release()
        if downloaded_path is not None:
            try:
                os.remove(downloaded_path)
            except OSError:
                pass

    #COMBINE UNIQUE CAPTIONS
    combined_captions = "\n".join(unique_captions)
    keywords = ", ".join(unique_keywords)  # Combine unique keywords

    return combined_captions, keywords

#INTERFACE OF GRADIO

def build_interface():
    """Assemble the Gradio Blocks UI for the caption and keyword generator.

    The layout has a row with the two alternative video inputs (file upload
    and URL textbox), a row with the two text outputs, and a button that runs
    ``process_video`` on the inputs and writes its results to the outputs.

    Returns:
        The constructed ``gr.Blocks`` object; the caller launches it.
    """
    demo = gr.Blocks()
    with demo:
        gr.Markdown("# Video Caption and Keyword Generator")

        # Input row: an uploaded file or a pasted link.
        with gr.Row():
            upload_box = gr.File(
                label="Upload Video",
                file_types=[".mp4", ".avi", ".mov"],
            )
            link_box = gr.Textbox(
                label="Paste Video Link",
                placeholder="Enter video URL",
            )

        # Output row: generated captions and extracted keywords.
        with gr.Row():
            captions_box = gr.Textbox(label="Captions", lines=10)
            keywords_box = gr.Textbox(label="Keywords", lines=5)

        # Button that triggers processing.
        generate_button = gr.Button("Generate")

        click_inputs = [upload_box, link_box]
        click_outputs = [captions_box, keywords_box]
        generate_button.click(
            fn=process_video,
            inputs=click_inputs,
            outputs=click_outputs,
        )

    return demo


# Build and launch the UI only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    interface = build_interface()
    # launch() serves the Blocks app returned by build_interface().
    interface.launch()


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://9a75eb57e3b668f472.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
