In [1]:
# Install necessary packages
!pip install --upgrade transformers
!pip install torch
!pip install gradio
!pip install pyngrok
!pip install pillow

# Imports
import html
import os
from io import BytesIO

import gradio as gr
import requests
import torch
from PIL import Image
from pyngrok import ngrok
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import BlipProcessor, BlipForConditionalGeneration
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import pipeline
# Check if GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load the BLIP captioning model and its processor; the model is moved to `device`.
blip_model_id = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(blip_model_id)
blip_model = BlipForConditionalGeneration.from_pretrained(blip_model_id).to(device)

# Credentials are read from the environment so they never have to be typed into
# (and accidentally committed with) the notebook.
# Hugging Face token; None lets the library fall back to any stored login.
access_token = os.environ.get("HF_TOKEN") or None
# Brave Search API credentials
api_key = os.environ.get("BRAVE_API_KEY", "")
subscription_token = os.environ.get("BRAVE_SUBSCRIPTION_TOKEN", "")

# Load the instruction-tuned LLM once and reuse it for the pipeline below.
# `token=` replaces the deprecated `use_auth_token=` keyword.
model_id = "meta-llama/Llama-3.2-1B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    token=access_token,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    token=access_token,
)

# Example inputs:
# image_context = "A black hoodie with superman logo"
# user_prompt = "Show me t-shirts with the same design"

# Build the text-generation pipeline from the already-loaded model/tokenizer
# instead of passing the model id, which would load the weights a second time.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

Collecting transformers
  Downloading transformers-4.45.2-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Downloading tokenizers-0.20.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.45.2-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m93.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.20.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m80.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.19.1
    Uninstalling tokenizers-0.19.1:
      Successfully uninstalled tokenizers-0.1

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [2]:
# Function to process image and user prompt
def process_input(image, user_prompt):
    """Caption an image, turn caption + question into a search query, and search the web.

    The image is captioned with the BLIP model, the caption and the user's
    question are sent to the text-generation pipeline to produce a refined
    search query, and that query is sent to the Brave web search endpoint.

    Args:
        image: Image object exposing ``convert('RGB')`` (the Gradio input is
            configured with ``type="pil"``), or ``None`` if nothing was uploaded.
        user_prompt: The user's question as text.

    Returns:
        A 3-tuple of strings ``(image_context, refined_question, results_html)``.
        When the image is missing or cannot be converted, the first element
        holds the message and the other two are empty strings. When the search
        request fails, ``results_html`` holds the error message instead of links.
    """
    if image is None:
        return "No image provided.", "", ""
    try:
        image = image.convert('RGB')  # The image is already a PIL object
    except Exception as e:
        return f"Error loading image: {e}", "", ""

    # Preprocess the image and move the tensors to the selected device
    inputs = processor(image, return_tensors="pt").to(device)

    # Generate a caption for the image using BLIP (inference only, no gradients)
    with torch.no_grad():
        caption_ids = blip_model.generate(**inputs)
        image_context = processor.decode(caption_ids[0], skip_special_tokens=True)

    # Combine image context with the user prompt
    messages = [
        {
            "role": "system",
            "content": """
            You are an intelligent assistant refining search queries by combining image descriptions and user queries.
            Always prioritize the user's query, using the image context only when relevant.
            Your output must be a clean, concise, and to-the-point search query with no additional explanations, phrases, or unnecessary words.
            """
        },
        {"role": "user",
         "content": f"Image description- {image_context} , User's question- {user_prompt}"},
    ]
    outputs = pipe(
        messages,
        max_new_tokens=64,
    )
    # The pipeline returns the whole conversation; the last message is the model's reply.
    refined_question = outputs[0]["generated_text"][-1]['content'].strip()

    # Perform web search using the Brave API with the refined question
    params = {
        'q': refined_question,
        'count': 10,  # Number of results
        'offset': 0   # Offset for pagination
    }
    headers = {
        'Authorization': f'Bearer {api_key}',
        'x-subscription-token': subscription_token
    }
    try:
        # A timeout keeps the UI from hanging forever on a stalled connection.
        response = requests.get(
            'https://api.search.brave.com/res/v1/web/search',
            headers=headers,
            params=params,
            timeout=10,
        )
        response.raise_for_status()
        search_results = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a response body that is not valid JSON.
        return image_context, refined_question, f"Search request failed: {html.escape(str(e))}"

    # `web` may be absent or null in the payload; treat both as "no results".
    web_results = (search_results.get('web') or {}).get('results') or []
    return image_context, refined_question, _format_results_html(web_results)


def _format_results_html(web_results):
    """Render search results as clickable HTML links, escaping API-supplied text."""
    if not web_results:
        return "No results found."
    links = []
    for result in web_results:
        # Titles and URLs come from the web and must not be injected as raw HTML.
        title = html.escape(str(result.get('title', 'No title available')))
        url = html.escape(str(result.get('url', 'No URL available')), quote=True)
        links.append(
            f"<a href='{url}' target='_blank' rel='noopener noreferrer'>{title}</a><br><br>"
        )
    return "".join(links)


# Build the Gradio UI: an image plus a question go in; the caption, the refined
# query and the rendered search results come out of process_input.
iface = gr.Interface(
    title="Google Lens Pro Max",
    # HTML blurb rendered under the title in the web app
    description="""
        <h4 style='font-size: 18px; text-align: center;'>Image and Question-based Search Assistant</h4>
        <p>Upload an image, enter your question, and get relevant search results along with image results.</p>
    """,
    fn=process_input,
    inputs=[
        # Delivered to process_input as a PIL image
        gr.Image(label="Drop Image or Upload", type="pil"),
        # Single-line free-text question
        gr.Textbox(
            label="User Question",
            placeholder="Enter your question here...",
            lines=1,
        ),
    ],
    outputs=[
        # 1st return value: the image caption
        gr.Textbox(label="Image Context"),
        # 2nd return value: the refined search query
        gr.Textbox(label="Final Search Query"),
        # 3rd return value: search results as HTML with clickable links
        gr.HTML(label="Search Results"),
    ],
)


# Stop any ngrok process left over from a previous run of this cell.
ngrok.kill()

# The ngrok authtoken is read from the environment instead of being hard-coded.
# NOTE: the token that used to be written in this cell has been exposed and
# should be revoked in the ngrok dashboard.
ngrok_authtoken = os.environ.get("NGROK_AUTHTOKEN")
if ngrok_authtoken:
    ngrok.set_auth_token(ngrok_authtoken)
    # Start a new tunnel to the port Gradio serves on by default
    public_url = ngrok.connect(7860)
    print(f"Public URL: {public_url}")
else:
    # Without a token the tunnel cannot be opened; still launch the app below.
    public_url = None
    print("NGROK_AUTHTOKEN is not set; skipping the ngrok tunnel.")

# Launch the Gradio interface
iface.launch()


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml
Public URL: NgrokTunnel: "https://c2d4-34-86-161-172.ngrok-free.app" -> "http://localhost:7860"
Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://046239cb3067de89e5.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


