In [None]:
!pip install openai==1.14.2

In [None]:
from    IPython.display     import Image
import  base64
import  time
import  matplotlib.pyplot   as plt
import  matplotlib.image    as mpimg
from    openai              import OpenAI, RateLimitError

In [None]:
# Read the OpenAI API key from Colab Secrets (stored under the name 'OA_TOKEN_P'),
# so that the key itself never appears in the notebook.
# `widgets` is imported here as well; print_res uses it to lay out the results.
from    google.colab        import userdata, widgets
KEY     = userdata.get( 'OA_TOKEN_P' )

# Build the OpenAI client that complete() uses for every request.
client          = OpenAI( api_key=KEY )

### Data download

In [None]:
my_image_1  = "img1.jpg"
my_image_2  = "img2.jpg"
my_image_3  = "img3.jpg"
my_image_4  = "img4.jpg"

!wget -O {my_image_1} https://www.dropbox.com/scl/fi/5fkn00h925xjf51jcodcj/c1.jpg?rlkey=2a7kjoqphqaz5mt1wl3c5bp4i&dl=0
!wget -O {my_image_2} https://www.dropbox.com/scl/fi/s5089zy9pt5ed10ocn46y/c2.jpg?rlkey=01qndot441zgst8g8cdz0emqw&dl=0
!wget -O {my_image_3} https://www.dropbox.com/scl/fi/u65cc3yaz91y45yu28eq8/c3.jpg?rlkey=6ynt5npd51k5z0zqy2k4s17fe&dl=0
!wget -O {my_image_4} https://www.dropbox.com/scl/fi/djmfggr7ho1os4nihkzg5/c6.jpeg?rlkey=hdp41x87gac6pvggn7rgf989v&dl=0

### Functions and parameters

In [None]:
MODEL           = "gpt-4-vision-preview"        # model used for every completion request
# TOP_P           = 1
TEMP            = 0.5                           # sampling temperature in [0...2]; 0.8 is already a high value
N_RET           = 3                             # number of chat completion choices generated for each request
MX_TOK          = 1000                          # maximum number of tokens that can be generated in a chat completion
DETAIL          = "high"                        # image fidelity: with 'low' the image is represented with 65 tokens,
                                                # with 'high' with 129 tokens
                                                # NOTE(review): token figures come from the guide below — confirm they are still current
                                                # https://platform.openai.com/docs/guides/vision/low-or-high-fidelity-image-understanding

OPENAI_ERRORS   = ( RateLimitError, )           # exceptions that complete() catches before retrying the request;
                                                # rate limits are restrictions that OpenAI imposes on the number of times
                                                # a user can access its services within a specified period of time
                                                # https://platform.openai.com/docs/guides/rate-limits/rate-limits
EMERGENCY_DELAY = 4                             # base delay, in seconds, that complete() waits before retrying

In [None]:
# the model accepts images in base 64 encoded format
# https://platform.openai.com/docs/guides/vision/uploading-base-64-encoded-images
def encode_b64( img ):
    """Return the content of the file at path `img` as a base64 string.

    The file is read in binary mode; the base64 bytes are decoded as UTF-8
    so that the result can be embedded directly in a text payload.
    """
    with open( img, "rb" ) as handle:
        raw_bytes = handle.read()

    encoded = base64.b64encode( raw_bytes )
    return encoded.decode( 'utf-8' )

In [None]:
def complete( prmpt, img_b64 ):
    """Ask the vision model to answer `prmpt` about one image.

    Args:
        prmpt:   text of the user prompt.
        img_b64: the image as a base64 string (sent as a JPEG data URL).

    Returns:
        A list with the message content of every choice in the response
        (the request asks for N_RET choices).

    Raises:
        Any exception not listed in OPENAI_ERRORS propagates immediately.
        An exception listed in OPENAI_ERRORS triggers one pause and one retry;
        whatever the retry raises propagates to the caller.
    """
    # https://platform.openai.com/docs/api-reference/messages/object
    message = [ {
        'role': 'user',
        'content': [
            { 'type': 'text',       'text': prmpt  },
            { 'type': 'image_url',  'image_url': {
                'url':      f"data:image/jpeg;base64,{img_b64}",
                'detail':   DETAIL
                }
            }
        ]
    } ]

    def _request():
        # single place where the request is built, shared by first attempt and retry
        # https://platform.openai.com/docs/api-reference/chat/create
        return client.chat.completions.create(
            model       = MODEL,
            messages    = message,
            max_tokens  = MX_TOK,
            n           = N_RET,
            # top_p       = TOP_P,
            temperature = TEMP
        )

    try:
        res     = _request()

    # rate limits are restrictions that OpenAI imposes on the number of times
    # a user can access its services within a specified period of time
    # https://platform.openai.com/docs/guides/rate-limits/rate-limits
    except OPENAI_ERRORS as e:
        delay   = EMERGENCY_DELAY
        # `e` is an exception instance, so it must be tested with isinstance:
        # an identity check against the class is always true and would apply
        # the longer delay to rate-limit errors as well
        if not isinstance( e, RateLimitError ):
            delay   = 5 * delay
        print( f"Catched error: {e}. Now sleeping for {delay} seconds." )
        time.sleep( delay )

        # one retry only: a second failure is left to the caller
        res     = _request()

    return [ t.message.content for t in res.choices ]

In [None]:
# visualize results in a grid using widgets
def print_res( prmpt, img, compl ):
    """Show the input image, the prompt and the completions in a widget grid.

    Layout: the first row holds the image (left) and the prompt (right);
    the second row holds a tab bar with one tab per completion.

    Args:
        prmpt: prompt text to print next to the image.
        img:   path of the image file to display.
        compl: list of completion texts, one tab each.
    """
    # one tab per completion actually received: relying on N_RET here would
    # raise IndexError whenever fewer choices than requested come back
    labels  = [ f"COMPL-{k+1}" for k in range( len( compl ) ) ]

    grid = widgets.Grid(2, 1)

    with grid.output_to( 0, 0 ):
        # first row: cell 1 and 2
        grid2 = widgets.Grid(1, 2)

        # cell 1: input image
        with grid2.output_to( 0, 0 ):
            picture = mpimg.imread( img )
            plt.imshow( picture )
            plt.axis( 'off' )
            plt.show()

        # cell 2: prompt
        with grid2.output_to( 0, 1 ):
            print( prmpt )

    # second row: cell 3 with one tab for each completion
    with grid.output_to( 1, 0 ):
        # no tab bar is built when there is nothing to show
        if labels:
            tabbar  = widgets.TabBar( labels )
            for label, text in zip( labels, compl ):
                with tabbar.output_to( label ):
                    print( text )

In [None]:
# execute completion
def do_exec( prmpt, img ):
    """Run the prompt on one image, display the outcome and return it.

    The image file at `img` is base64-encoded, sent to the model together
    with `prmpt`, and the completions are rendered with print_res.

    Returns:
        The list of completions produced by complete().
    """
    completions = complete( prmpt, encode_b64( img ) )
    print_res( prmpt, img, completions )
    return completions

### Usage

In [None]:
# Prompt sent with every image: the model plays the driver, describes the scene
# and then states what it plans to do.
my_prompt = (
    "You're the mature driver behind the wheel and this image is what you see now. "
    "Describe what you see and what are the relevant elements. "
    "Then, describe what you plan to do accordingly."
)

In [None]:
# Run the prompt on the first image; do_exec renders the result grid and
# the returned completions are kept in my_compl_1.
my_compl_1 = do_exec( my_prompt, my_image_1 )

In [None]:
# Same prompt on the second image; completions are kept in my_compl_2.
my_compl_2 = do_exec( my_prompt, my_image_2 )

In [None]:
# Same prompt on the third image; completions are kept in my_compl_3.
my_compl_3 = do_exec( my_prompt, my_image_3 )

In [None]:
# Same prompt on the fourth image; completions are kept in my_compl_4.
my_compl_4 = do_exec( my_prompt, my_image_4 )