In [1]:
from pathlib import Path
from typing import List
import base64
from io import BytesIO
import hashlib
import json
import time

from PIL import Image

from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_openai import ChatOpenAI, AzureChatOpenAI
from langchain_core.pydantic_v1 import BaseModel, Field
from dotenv import dotenv_values

import pandas as pd


# Config

In [2]:
# Project folders are resolved relative to the notebook: the parent of the
# current working directory is treated as the project root.
BASE_DIR = Path.cwd().parent
DATA_DIR = BASE_DIR / "data"

# Settings are read from the project's .env file only; the process
# environment is not merged in.
ENV_VARIABLES = dict(dotenv_values(str(BASE_DIR / ".env")))

# Report type identifiers.
GERENCIAL_REPORT_NAME = "gerencial"
DETAIL_REPORT_NAME = "detallado"

# Azure OpenAI connection details; a missing .env entry raises KeyError here.
gpt4_vision_key = ENV_VARIABLES["AZURE_OPENAI_KEY"]
gpt4_vision_endpoint = f"https://{ENV_VARIABLES['AZURE_OPENAI_SERVICE']}.openai.azure.com"
gpt4_vision_version = "2024-04-01-preview"
gpt4_vision_name = ENV_VARIABLES['AZURE_OPENAI_MODEL_GPT4O']

# Chat client shared by every model call in this notebook.
llm = AzureChatOpenAI(
    azure_endpoint=gpt4_vision_endpoint,
    azure_deployment=gpt4_vision_name,
    api_version=gpt4_vision_version,
    openai_api_key=gpt4_vision_key,
    temperature=0.2,
    max_tokens=2000,
)

# Utils

In [3]:
def image_to_base64(image_path):
    """Re-encode an image as JPEG and return it as base64 text.

    Args:
        image_path: Anything accepted by ``PIL.Image.open`` (path or file object).

    Returns:
        str: Base64 representation (decoded as UTF-8 text) of the JPEG bytes.
    """
    # Modes the JPEG writer accepts directly; anything else (RGBA, LA, P, ...)
    # makes ``save(format="JPEG")`` raise, so those are converted to RGB first.
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")

    with Image.open(image_path) as image:
        jpeg_ready = image if image.mode in jpeg_modes else image.convert("RGB")
        # Encode into an in-memory buffer instead of touching the disk.
        buffered = BytesIO()
        jpeg_ready.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode('utf-8')

def encode_image(image_path):
    """Return the raw bytes of the file at ``image_path`` as base64 text.

    The file is read as-is (no image decoding or re-encoding takes place).
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    return base64.b64encode(raw_bytes).decode("utf-8")
        
def generate_sha256_hash(input_string):
    """Return the SHA-256 digest of ``input_string`` as a hexadecimal string.

    The text is encoded as UTF-8 before hashing.
    """
    return hashlib.sha256(input_string.encode('utf-8')).hexdigest()

def sort_key(path):
    """Sort key for frame files: the integer found after the first underscore.

    ``path`` must expose ``.stem`` (e.g. a ``pathlib.Path``); the stem is split
    on "_" and its second piece is converted to ``int``.
    """
    frame_token = path.stem.split("_")[1]
    return int(frame_token)

## prompts

### for report type: gerencial

In [294]:
# Prompt for the first pass: asks for a short numbered list of the "take"
# actions seen in the frame sequence. In this notebook it is sent as the text
# part of a HumanMessage (not a SystemMessage) together with the frames.
# NOTE(review): the prompt text contains typos ("consice", "posible", "taked");
# they are left untouched because the text is runtime input to the model.
reflection_system_prompt = """
You are a vision model designed to operate in an autonomous store. Your goal is to analyze the images captured in the store and list information about customer actions. Your specific tasks include:

Identify how many possible take actions exist in the image sequence.
write very consice conclusion where indicate posible actions and products related

IMPORTANT: ## Focus on Hand Movements:
Concentrate on the movement of the customer's hands.
Identify the objects being held in the moving hand.

format:
1. product taked, it seem a ..
2. product taked, it seem a ..
...

"""

# Prompt for the second pass: sent with the same frames plus the first-pass
# analysis to the structured-output model.
# NOTE(review): it asks for real-time reports and a time/location log, but the
# Action schema in this notebook only carries `action` and `product`.
sys_prompt = """
You are a vision model designed to operate in an autonomous store. Your goal is to analyze the images captured in the store and list information about customer actions. Your specific tasks include:

## Action Identification:
1. Detect if a customer is taking a product from a shelf.
2. Detect if a customer is returning a product to a shelf.

## Action Listing:
1. List all detected actions of taking or returning products.
2. Specify which product is being taken or returned.

## Additional Guidelines:
 - Provide high accuracy in identifying customer actions.
 - You must include all action detected.
 - Ensure clear differentiation between the actions of taking and returning products.
 - Generate real-time reports for each detected action, including a summary of the product involved.
 - Maintain a log of the time and location within the store where the actions were detected.
"""

# Placeholder: contains only a newline and is not referenced in the visible
# part of the notebook.
final_description_system_prompt = """
"""


In [295]:
# Schema for one detected customer action. Both fields default to None, so
# neither is required. The allowed product names appear only in the field
# description text; the `str` type itself does not enforce them.
# NOTE(review): presumably the docstring and descriptions are forwarded to the
# model as part of the output schema, so treat them as prompt text — confirm.
class Action(BaseModel):
    """Information about an action identified."""

    action: str = Field(default=None, description="Action detected taking/returning")
    product: str = Field(default=None, description="product detected in the action. products allowed: [red-beer, yellow-beer, gray-beer, chips]")

# Top-level schema handed to `llm.with_structured_output(schema=Data)`; the
# parsed result exposes the detected actions through `.actions`.
class Data(BaseModel):
    """Extracted data about actions."""

    # Wrapping Action in a list lets a single response carry several actions.
    actions: List[Action]

## Pipeline

In [296]:
# Locate the camera frames of the store session under analysis.
store_session = "store0011235537879556"
images_dir = DATA_DIR.joinpath(store_session, "imagenes_camaras")

In [297]:
# Frame-number windows analysed one after another by the pipeline cell.
# The pipeline compares with strict `>` / `<`, so both bounds of each window
# are excluded (frames 20, 60, 61 and 100 are never selected).
# NOTE(review): confirm that dropping the boundary frames is intended.
filter_range = [(20, 60), (61, 100)]

In [298]:
# Variant of `llm` whose responses are returned as `Data` instances.
llm_structured =  llm.with_structured_output(schema=Data)

In [299]:
# Two-pass analysis for every frame window in `filter_range` and every camera:
#   1. free-text description of the "take" actions seen in the frames,
#   2. structured extraction (Data/Action) using the frames plus that description.
# Detected actions are accumulated per camera across all windows.
predictions_by_cameras = {
    0: [],
    1: [],
    2: []
}
for range_ob in filter_range:
    images_camera = {
        0: [],
        1: [],
        2: []
    }

    # Group the frames of this window by camera. File stems are expected to
    # look like "<camera>_<frame>...".
    for image_dir in images_dir.iterdir():
        parts = image_dir.stem.split("_")
        try:
            camera = int(parts[0])
            n_frame = int(parts[1])
        except (IndexError, ValueError):
            # Stray files that do not follow the naming scheme (hidden/system
            # files, notes, ...) would otherwise abort the whole run.
            continue
        # Both bounds are exclusive.
        # NOTE(review): with windows (20, 60) and (61, 100) the boundary frames
        # are never analysed — confirm this is intended.
        if range_ob[0] < n_frame < range_ob[1]:
            images_camera[camera].append(image_dir)
    for i in range(3):
        images_camera[i] = sorted(images_camera[i], key=sort_key)

    # Predict camera by camera.
    for camera in images_camera.keys():
        print(f"Predicting for camera {camera}")
        # Keep every other frame to reduce the number of images per request.
        sample = images_camera[camera][::2]
        if not sample:
            # Without frames the model would only see the prompt text, so any
            # "detected" action would be invented; skip the two calls.
            print(f"No frames for camera {camera} in range {range_ob}, skipping")
            continue

        base64_images = [encode_image(image_file_dir) for image_file_dir in sample]

        # NOTE(review): the data URL always declares image/png, whatever the
        # actual file type is — confirm the frames are PNG.
        imgs_content = [{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_bs64}"}} for image_bs64 in base64_images]

        # First pass: free-text scene description.
        first_description = llm.invoke(
            [
                HumanMessage(
                    content=[{"type": "text", "text": reflection_system_prompt}] + imgs_content
                )
            ]
        )

        print(f"Analysis {first_description.content}")

        # Second pass: structured output, primed with the first-pass analysis.
        prev_analysis = [{"type": "text", "text": f"## scene analysis:\n {first_description.content}\n Start to generate the structured output"}]
        structured_output = llm_structured.invoke(
            [
                HumanMessage(
                    content=[{"type": "text", "text": sys_prompt}] + imgs_content + prev_analysis
                )
            ]
        )

        # The structured call can presumably come back empty (None) when the
        # model produces no schema-conforming answer; treat that as "no
        # actions" rather than failing on `.actions` — TODO confirm.
        actions_detected = structured_output.actions if structured_output is not None else []
        predictions_by_cameras[camera].extend(actions_detected)

Predicting for camera 0
Analysis 1. Product taken, it seems to be a can of soda.
2. Product taken, it seems to be another can of soda.
Predicting for camera 1
Analysis 1. Product taken, it seems a small yellow and red item (possibly a snack).
2. Product taken, it seems a black and red can (possibly a beverage).
3. Product taken, it seems a red and white can (possibly a beverage).
Predicting for camera 2
Analysis 1. Product taken, it seems a red can.
2. Product taken, it seems a red can.
Predicting for camera 0
Analysis 1. Product taken, it seems to be a can of "Speed" energy drink.
2. Product taken, it seems to be a bag of chips.
Predicting for camera 1
Analysis 1. Product taken, it seems a yellow snack pack.
2. Product taken, it seems a green can (Speed energy drink).
3. Product taken, it seems a blue and white snack pack.
Predicting for camera 2
Analysis 1. Product taken, it seems to be a yellow can.
2. Product taken, it seems to be a blue and white bag.


In [300]:
# Flatten the Action objects into plain dicts (JSON-friendly), one list per
# camera index 0..2.
prediction_result = {
    camera: [
        {"action": detected.action, "product": detected.product}
        for detected in predictions_by_cameras[camera]
    ]
    for camera in range(3)
}

In [301]:
prediction_result

{0: [{'action': 'taking', 'product': 'gray-beer'},
  {'action': 'taking', 'product': 'gray-beer'},
  {'action': 'taking', 'product': 'yellow-beer'},
  {'action': 'taking', 'product': 'chips'}],
 1: [{'action': 'taking', 'product': 'chips'},
  {'action': 'taking', 'product': 'red-beer'},
  {'action': 'taking', 'product': 'yellow-beer'},
  {'action': 'taking', 'product': 'chips'},
  {'action': 'taking', 'product': 'yellow-beer'},
  {'action': 'taking', 'product': 'gray-beer'}],
 2: [{'action': 'taking', 'product': 'red-beer'},
  {'action': 'taking', 'product': 'red-beer'},
  {'action': 'taking', 'product': 'yellow-beer'},
  {'action': 'taking', 'product': 'chips'}]}

In [302]:
# Save the result.
# Serialise first and write afterwards: opening the file in 'w' mode truncates
# it immediately, so a serialisation error would otherwise wipe a previously
# saved result.
# Note: JSON object keys are always strings, so the integer camera ids come
# back as '0', '1', '2' when this file is loaded again.
serialized_result = json.dumps(prediction_result, indent=4)
with open(DATA_DIR / 'prediction_result.json', 'w') as file:
    file.write(serialized_result)

In [303]:
# Read the saved predictions back; the camera ids are string keys after the
# JSON round trip.
prediction_result = json.loads((DATA_DIR / 'prediction_result.json').read_text())

In [304]:
prediction_result

{'0': [{'action': 'taking', 'product': 'gray-beer'},
  {'action': 'taking', 'product': 'gray-beer'},
  {'action': 'taking', 'product': 'yellow-beer'},
  {'action': 'taking', 'product': 'chips'}],
 '1': [{'action': 'taking', 'product': 'chips'},
  {'action': 'taking', 'product': 'red-beer'},
  {'action': 'taking', 'product': 'yellow-beer'},
  {'action': 'taking', 'product': 'chips'},
  {'action': 'taking', 'product': 'yellow-beer'},
  {'action': 'taking', 'product': 'gray-beer'}],
 '2': [{'action': 'taking', 'product': 'red-beer'},
  {'action': 'taking', 'product': 'red-beer'},
  {'action': 'taking', 'product': 'yellow-beer'},
  {'action': 'taking', 'product': 'chips'}]}

In [236]:
# One empty frame list per camera (ids 0, 1 and 2).
images_camera = {camera_id: [] for camera_id in range(3)}

In [237]:
# Single-window variant: keep frames strictly between the two bounds and file
# them under their camera id (stems look like "<camera>_<frame>...").
filter_range = (20, 60) # 62-100
for image_dir in images_dir.iterdir():
    tokens = image_dir.stem.split("_")
    camera = int(tokens[0])
    n_frame = int(tokens[1])
    if filter_range[0] < n_frame < filter_range[1]:
        images_camera[camera].append(image_dir)

In [238]:
# Order each camera's frames by frame number.
images_camera.update(
    {camera_id: sorted(images_camera[camera_id], key=sort_key) for camera_id in range(3)}
)

In [239]:
# Keep every third frame of camera 0.
sample = images_camera[0][::3]

In [240]:
len(sample)

13

In [241]:
# Base64-encode the sampled frames and wrap each one as an image_url content
# part for the chat request.
base64_images = [encode_image(frame_path) for frame_path in sample]

imgs_content = []
for image_bs64 in base64_images:
    imgs_content.append(
        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_bs64}"}}
    )

In [242]:
# First pass (free-text description of the frames); prints the elapsed
# wall-clock seconds of the call.
start = time.time()
first_description = llm.invoke(
    [HumanMessage(content=[{"type": "text", "text": reflection_system_prompt}, *imgs_content])]
)
end = time.time()
print(end - start)

27.238897562026978


In [243]:
first_description.content

'In the provided image sequence, the customer performs the following actions:\n\n1. **Take Actions:**\n   - The customer reaches for a product from the shelf containing canned beverages.\n   - The customer picks up a can from the shelf and places it into a bag.\n\n2. **Return Actions:**\n   - The customer does not appear to return any products to the shelf in the provided sequence.\n\n**Analysis:**\n- The customer performs a total of 1 take action.\n- There are no product return actions observed in the sequence.'

In [244]:
# Text part that feeds the first-pass description into the structured request.
prev_analysis = [{"type": "text", "text": f"## scene analysis:\n {first_description.content}\n Start to generate the structured output"}]

In [245]:
# Variant of `llm` whose responses are returned as `Data` instances.
# NOTE(review): the same assignment also appears earlier in the notebook.
llm_structured =  llm.with_structured_output(schema=Data)

In [246]:
# Second pass (structured extraction from prompt + frames + first-pass
# analysis); prints the elapsed wall-clock seconds of the call.
start = time.time()
structured_output = llm_structured.invoke(
    [HumanMessage(content=[{"type": "text", "text": sys_prompt}, *imgs_content, *prev_analysis])]
)
end = time.time()
print(end - start)

16.975375652313232


In [247]:
structured_output.actions

[Action(action='taking', product='yellow-beer')]

## Prepare data