### DESCRIPTION:
    This example shows how to use Azure AI Computer vision SDK to extract dense captions from an image and then use Azure OpenAI GPT3.5 to create a text ad based on the elements identified in the image

### REQUIREMENTS:
    Create an .env file with your OpenAI API key and save it in the root directory of this project.




In [1]:
from dotenv import load_dotenv
import openai
import os

load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") 
OPENAI_DEPLOYMENT_ENDPOINT = os.getenv("OPENAI_DEPLOYMENT_ENDPOINT")
OPENAI_DEPLOYMENT_NAME = os.getenv("OPENAI_DEPLOYMENT_NAME")
OPENAI_MODEL_NAME = os.getenv("OPENAI_MODEL_NAME")
OPENAI_DEPLOYMENT_VERSION = os.getenv("OPENAI_DEPLOYMENT_VERSION")

AZURE_COMPUTER_VISION_ENDPOINT = os.getenv("AZURE_COMPUTER_VISION_ENDPOINT")
AZURE_COMPUTER_VISION_KEY = os.getenv("AZURE_COMPUTER_VISION_KEY")


In [3]:
import os
import azure.ai.vision as visionsdk


def analyze_image(image_url):
    service_options = visionsdk.VisionServiceOptions(AZURE_COMPUTER_VISION_ENDPOINT, AZURE_COMPUTER_VISION_KEY)

    # Specify the image file on disk to analyze. sample.jpg is a good example to show most features
    # vision_source = visionsdk.VisionSource(filename="sample.jpg")

    # Or, instead of the above, specify a publicly accessible image URL to analyze. For example:
    vision_source = visionsdk.VisionSource(url=image_url)

    analysis_options = visionsdk.ImageAnalysisOptions()

    # Mandatory. You must set one or more features to analyze. Here we use the full set of features.
    # Note that "CAPTION" and "DENSE_CAPTIONS" are only supported in Azure GPU regions (East US, France Central,
    # Korea Central, North Europe, Southeast Asia, West Europe, West US). Remove "CAPTION" and "DENSE_CAPTIONS"
    # from the list below if your Computer Vision key is not from one of those regions.
    analysis_options.features = (
        # visionsdk.ImageAnalysisFeature.CROP_SUGGESTIONS |
        visionsdk.ImageAnalysisFeature.CAPTION |
        visionsdk.ImageAnalysisFeature.DENSE_CAPTIONS |
        visionsdk.ImageAnalysisFeature.OBJECTS |
        visionsdk.ImageAnalysisFeature.PEOPLE |
        visionsdk.ImageAnalysisFeature.TEXT |
        visionsdk.ImageAnalysisFeature.TAGS
    )

    # Optional, and only relevant when you select ImageAnalysisFeature.CROP_SUGGESTIONS.
    # Define one or more aspect ratios for the desired cropping. Each aspect ratio needs
    # to be in the range [0.75, 1.8]. If you do not set this, the service will return one
    # crop suggestion with the aspect ratio it sees fit.
    # analysis_options.cropping_aspect_ratios = [0.9, 1.33]

    # Optional. Default is "en" for English. See https://aka.ms/cv-languages for a list of supported
    # language codes and which visual features are supported for each language.
    analysis_options.language = "en"
    analysis_options.model_version = "latest"
    # Set this to "true" to get a gender neutral caption (the default is "false").
    analysis_options.gender_neutral_caption = True

    # Create the image analyzer object
    image_analyzer = visionsdk.ImageAnalyzer(service_options, vision_source, analysis_options)

    # This call creates the network connection and blocks until Image Analysis results
    # return (or an error occurred). Note that there is also an asynchronous (non-blocking)
    # version of this method: image_analyzer.analyze_async().
    result = image_analyzer.analyze()

    # Checks result.
    if result.reason == visionsdk.ImageAnalysisResultReason.ANALYZED:

        print(" Image height: {}".format(result.image_height))
        print(" Image width: {}".format(result.image_width))
        print(" Model version: {}".format(result.model_version))

        if result.caption is not None:
            print(" Caption:")
            print("   '{}', Confidence {:.4f}".format(result.caption.content, result.caption.confidence))

        if result.dense_captions is not None:
            print(" Dense Captions:")
            for caption in result.dense_captions:
                print("   '{}', {}, Confidence: {:.4f}".format(caption.content, caption.bounding_box, caption.confidence))

        if result.objects is not None:
            print(" Objects:")
            for object in result.objects:
                print("   '{}', {}, Confidence: {:.4f}".format(object.name, object.bounding_box, object.confidence))

        if result.tags is not None:
            print(" Tags:")
            for tag in result.tags:
                print("   '{}', Confidence {:.4f}".format(tag.name, tag.confidence))

        if result.people is not None:
            print(" People:")
            for person in result.people:
                print("   {}, Confidence {:.4f}".format(person.bounding_box, person.confidence))

        if result.crop_suggestions is not None:
            print(" Crop Suggestions:")
            for crop_suggestion in result.crop_suggestions:
                print("   Aspect ratio {}: Crop suggestion {}"
                      .format(crop_suggestion.aspect_ratio, crop_suggestion.bounding_box))

        if result.text is not None:
            print(" Text:")
            for line in result.text.lines:
                points_string = "{" + ", ".join([str(int(point)) for point in line.bounding_polygon]) + "}"
                print("   Line: '{}', Bounding polygon {}".format(line.content, points_string))
                for word in line.words:
                    points_string = "{" + ", ".join([str(int(point)) for point in word.bounding_polygon]) + "}"
                    print("     Word: '{}', Bounding polygon {}, Confidence {:.4f}"
                          .format(word.content, points_string, word.confidence))

        result_details = visionsdk.ImageAnalysisResultDetails.from_result(result)
        print(" Result details:")
        print("   Image ID: {}".format(result_details.image_id))
        print("   Result ID: {}".format(result_details.result_id))
        print("   Connection URL: {}".format(result_details.connection_url))
        print("   JSON result: {}".format(result_details.json_result))

    else:
        error_details = visionsdk.ImageAnalysisErrorDetails.from_result(result)
        print(" Analysis failed.")
        print("   Error reason: {}".format(error_details.reason))
        print("   Error code: {}".format(error_details.error_code))
        print("   Error message: {}".format(error_details.message))
        print(" Did you set the computer vision endpoint and key?")

    return result_details.json_result

#### Analyze a picture using Azure Cognitve services to extract text from a picture

In [4]:
# image_url = "https://aka.ms/azai/vision/image-analysis-sample.jpg"
image_url = "https://www.tradeinn.com/f/13738/137387495/levis---essential-western-long-sleeve-shirt.jpg"
json_result = analyze_image(image_url)

 Image height: 1000
 Image width: 1000
 Model version: 2023-02-01-preview
 Caption:
   'a person with curly hair wearing jeans and a blue shirt', Confidence 0.7689
 Dense Captions:
   'a person with curly hair wearing jeans and a blue shirt', Rectangle(x=0, y=0, w=1000, h=1000), Confidence: 0.7689
   'a person wearing a blue shirt and jeans', Rectangle(x=309, y=16, w=386, h=974), Confidence: 0.7428
   'a white belt with a circle on it', Rectangle(x=463, y=521, w=60, h=59), Confidence: 0.7001
   'a person with curly hair', Rectangle(x=331, y=12, w=291, h=286), Confidence: 0.7511
   'a person wearing a blue shirt', Rectangle(x=295, y=203, w=376, h=395), Confidence: 0.7504
   'a close-up of a person's legs', Rectangle(x=335, y=505, w=350, h=487), Confidence: 0.8434
   'a person with curly hair wearing jeans and a blue shirt', Rectangle(x=0, y=0, w=977, h=977), Confidence: 0.7781
   'a close up of a belt', Rectangle(x=386, y=505, w=216, h=77), Confidence: 0.8305
   'a hand with a ring on i

#### Extract all dense captions from the json result

In [5]:
# JSON result: {"captionResult":{"text":"a person wearing a mask sitting at a table with a laptop","confidence":1.0},"objectsResult":{"values":[{"boundingBox":{"x":303,"y":194,"w":181,"h":223},"tags":[{"name":"person","confidence":0.765}]},{"boundingBox":{"x":221,"y":289,"w":159,"h":80},"tags":[{"name":"Laptop","confidence":0.574}]}]},"readResult":{"stringIndexType":"TextElements","content":"Sample text\nHand writing\n123 456","pages":[{"height":432.0,"width":648.0,"angle":0.5729,"pageNumber":1,"words":[{"content":"Sample","boundingBox":[542.0,377.0,588.0,377.0,587.0,389.0,542.0,389.0],"confidence":0.992,"span":{"offset":0,"length":6}},{"content":"text","boundingBox":[598.0,377.0,630.0,376.0,630.0,390.0,598.0,389.0],"confidence":0.989,"span":{"offset":7,"length":4}},{"content":"Hand","boundingBox":[540.0,394.0,569.0,394.0,569.0,407.0,540.0,407.0],"confidence":0.991,"span":{"offset":12,"length":4}},{"content":"writing","boundingBox":[573.0,394.0,613.0,395.0,613.0,409.0,573.0,407.0],"confidence":0.995,"span":{"offset":17,"length":7}},{"content":"123","boundingBox":[542.0,412.0,561.0,411.0,561.0,424.0,542.0,424.0],"confidence":0.998,"span":{"offset":25,"length":3}},{"content":"456","boundingBox":[568.0,411.0,590.0,412.0,590.0,424.0,568.0,424.0],"confidence":0.998,"span":{"offset":29,"length":3}}],"spans":[{"offset":0,"length":32}],"lines":[{"content":"Sample text","boundingBox":[541.0,376.0,632.0,376.0,632.0,389.0,541.0,389.0],"spans":[{"offset":0,"length":11}]},{"content":"Hand writing","boundingBox":[540.0,393.0,613.0,395.0,613.0,408.0,540.0,406.0],"spans":[{"offset":12,"length":12}]},{"content":"123 456","boundingBox":[542.0,411.0,592.0,411.0,592.0,424.0,542.0,423.0],"spans":[{"offset":25,"length":7}]}]}],"styles":[],"modelVersion":"2022-04-30"},"denseCaptionsResult":{"values":[{"text":"a person wearing a mask sitting at a table with a laptop","confidence":1.0,"boundingBox":{"x":0,"y":0,"w":648,"h":432}},{"text":"a person using a laptop","confidence":1.0,"boundingBox":{"x":220,"y":289,"w":144,"h":73}},{"text":"a person wearing a colorful face mask","confidence":1.0,"boundingBox":{"x":285,"y":178,"w":202,"h":249}},{"text":"a green chair in a room","confidence":1.0,"boundingBox":{"x":463,"y":160,"w":117,"h":184}},{"text":"a close-up of a person's hand","confidence":1.0,"boundingBox":{"x":217,"y":162,"w":109,"h":180}},{"text":"a person sitting in a chair","confidence":1.0,"boundingBox":{"x":418,"y":320,"w":105,"h":109}},{"text":"a blue and green background","confidence":1.0,"boundingBox":{"x":456,"y":163,"w":60,"h":155}},{"text":"a close-up of a wooden table","confidence":1.0,"boundingBox":{"x":59,"y":318,"w":55,"h":58}},{"text":"a person sitting at a table","confidence":1.0,"boundingBox":{"x":287,"y":315,"w":236,"h":113}},{"text":"a close up of a laptop on a table","confidence":1.0,"boundingBox":{"x":88,"y":338,"w":354,"h":88}}]},"modelVersion":"2023-02-01-preview","metadata":{"width":648,"height":432},"tagsResult":{"values":[{"name":"furniture","confidence":0.9880748987197876},{"name":"clothing","confidence":0.9805002212524414},{"name":"person","confidence":0.948422908782959},{"name":"houseplant","confidence":0.9420685768127441},{"name":"desk","confidence":0.9156692028045654},{"name":"indoor","confidence":0.9050061106681824},{"name":"computer","confidence":0.8922904133796692},{"name":"laptop","confidence":0.8696370124816895},{"name":"sitting","confidence":0.819450855255127},{"name":"wall","confidence":0.7605603933334351},{"name":"woman","confidence":0.7446085214614868},{"name":"table","confidence":0.6902506351470947},{"name":"plant","confidence":0.641657829284668},{"name":"using","confidence":0.5301232933998108}]},"peopleResult":{"values":[{"boundingBox":{"x":296,"y":181,"w":196,"h":250},"confidence":0.9578750729560852},{"boundingBox":{"x":2,"y":30,"w":8,"h":23},"confidence":0.004669021349400282},{"boundingBox":{"x":623,"y":182,"w":24,"h":193},"confidence":0.0030183952767401934}]}}
import json
dict = json.loads(json_result)
dense_captions = dict["denseCaptionsResult"]["values"]

text = ""
for caption in dense_captions:
    text += caption["text"] + "\n"
print(text)

a person with curly hair wearing jeans and a blue shirt
a person wearing a blue shirt and jeans
a white belt with a circle on it
a person with curly hair
a person wearing a blue shirt
a close-up of a person's legs
a person with curly hair wearing jeans and a blue shirt
a close up of a belt
a hand with a ring on it
a close up of a blue jean jacket



#### Generate a product description from the text extracted from the photo using OpenAI

In [7]:
# let's use a sequence of prompts to create a chain
from langchain.prompts import PromptTemplate
from langchain import LLMChain
from langchain.llms import AzureOpenAI
# Configure OpenAI API
openai.api_type = "azure"
openai.api_version = OPENAI_DEPLOYMENT_VERSION
openai.api_base = OPENAI_DEPLOYMENT_ENDPOINT
openai.api_key = OPENAI_API_KEY

llm = AzureOpenAI(deployment_name=OPENAI_DEPLOYMENT_NAME, 
            model_name=OPENAI_MODEL_NAME, 
            openai_api_base=OPENAI_DEPLOYMENT_ENDPOINT, 
            openai_api_key=OPENAI_API_KEY)

# summarize the video transcript from youtube
template = """Write a witty product description in a conversational style so young adult shoppers understand what this product does and how it benefits them. Use the following product details to summarize your description:
The Product description should not exceed {length} words.
Answer in a concise manner only with the product description.
Title: Levi´s® Essential Western Long Sleeve Shirt
Image description sentences:
"""

prompt_template = PromptTemplate(input_variables=["length"], template=template)
prompt_template.format(length=100)

chain1=LLMChain(llm=llm,prompt=prompt_template)
chain1.run(text)

'• The Levi´s® Essential Western Long Sleeve Shirt is a versatile and timeless piece that every young adult should have in their wardrobe.\n• The shirt is made of a soft and breathable fabric that will keep you comfortable throughout the day.\n• It is perfect for any occasion, from casual outings to more formal events.\n• The Western-inspired details give the shirt a unique and stylish look.\n• The shirt features snap closures and two chest pockets with snap flaps.\n• It comes in a slim fit that will flatter your figure and enhance your style.\n• Pair it with jeans for a classic and effortless look.\n• Get your Levi´s® Essential Western Long Sleeve Shirt today and elevate your wardrobe to the next level!<|im_end|>'