## Planning to build this Architecture

![My First Board](https://github.com/Ak62007/Multimodel-Reasoning/blob/main/AI/My%20First%20Board.jpg?raw=true)

In [1]:
import os
import sys
from pathlib import Path

# Put the directory two levels above the working directory on sys.path so the
# project packages imported below (`src...`, `AI...`) can be resolved.
current_dir = Path.cwd()
project_root = current_dir.parent.parent
root_entry = str(project_root)
if root_entry not in sys.path:
    # Insert at the front so the project root wins over other entries.
    sys.path.insert(0, root_entry)
    print("done")

done


In [29]:
import os
import logfire
import pandas as pd
import numpy as np
from rich.console import Console
from rich.markdown import Markdown
from pydantic_ai import Agent
from pydantic_ai.models.openai import OpenAIChatModel
from pydantic_ai.providers.openai import OpenAIProvider
from src.utils.save_data import load_df_parquet_safe
from src.utils.datamodels import (
    Blink,
    Smile,
    Gaze,
    Jaw,
    LoudnessState,
    PitchState,
    PitchStd,
    WPS,
    PausePercentageIncrease,
    FillerPercentageIncrease
) 
from AI.prompts import VISUAL_PROMPT

# Rich console; used further down to render the agent's reply as Markdown.
console = Console()

# Configure Logfire first, then hook it into pydantic_ai so agent runs and
# model calls are logged (see the "agent run" / "chat ..." lines in later output).
logfire.configure()
logfire.instrument_pydantic_ai()

In [3]:
logfire.info("hello, {place}!", place='World')

19:16:46.179 hello, World!


In [3]:
# Load environment variables via python-dotenv; presumably this is where
# OPENROUTER_API_KEY (read with os.getenv below) comes from — TODO confirm.
# NOTE(review): this import lives outside the main import cell, and
# logfire.configure() appears earlier in the file. If Logfire needs values
# from the same .env file, this call has to run before it.
from dotenv import load_dotenv
load_dotenv()

True

In [4]:
# Merged feature table: <project_root>/data/processed/merge/final_dataframe.parquet
path = project_root.joinpath("data", "processed", "merge", "final_dataframe.parquet")
# The project loader is given the path as a plain string.
df = load_df_parquet_safe(path=str(path))

### OpenRouter

In [5]:
# OpenRouter endpoint, consumed by the OpenAI provider configured below.
OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1"
# Read from the environment; this is None when the variable is not set.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")

In [6]:
# The OpenAI provider is pointed at the OpenRouter base URL and API key, so
# the OpenAI chat model class talks to OpenRouter instead of OpenAI.
openrouter_provider = OpenAIProvider(
    base_url=OPENROUTER_BASE_URL,
    api_key=OPENROUTER_API_KEY,
)

# Chat model selected by its OpenRouter model id.
model = OpenAIChatModel(
    model_name='deepseek/deepseek-r1',
    provider=openrouter_provider,
)

In [7]:
# Agent combining the model configured above with the visual-analysis system prompt.
agent = Agent(model=model, system_prompt=VISUAL_PROMPT)

In [8]:
async def main(input: str) -> str:
    """Run the notebook-level ``agent`` on a prompt and return its output.

    Args:
        input: Prompt text passed unchanged to ``agent.run``. The name
            shadows the builtin, but callers pass it by keyword, so it is kept.

    Returns:
        The ``output`` attribute of the awaited run result.
    """
    return (await agent.run(input)).output

In [26]:
df.head()

Unnamed: 0,blinking_data,gaze_data,jaw_movement_data,smile_data,loudness_data,average_pitch_data,pitch_standard_deviation,words_per_sec,filler_words_usage,pauses_taken,Time,words,text_concat
0,"{'intensity': None, 'asymmetry': None, 'is_bli...","{'horizontal_deviation': None, 'vertical_devia...","{'open': 0.023826630786061287, 'lateral': None...","{'intensity': 0.00020072988331669704, 'asymmet...",,,,,,,0.0,"[We're, starting]",We're starting
1,"{'intensity': None, 'asymmetry': None, 'is_bli...","{'horizontal_deviation': None, 'vertical_devia...","{'open': 0.01953260414302349, 'lateral': None,...","{'intensity': 0.00020793307342685806, 'asymmet...",,,,,,,0.5,"[now., [*]]",now. [*]
2,"{'intensity': None, 'asymmetry': None, 'is_bli...","{'horizontal_deviation': None, 'vertical_devia...","{'open': 0.0008870773017406464, 'lateral': Non...","{'intensity': 6.213216343535776e-05, 'asymmetr...",,,,,,,1.0,,
3,"{'intensity': None, 'asymmetry': None, 'is_bli...","{'horizontal_deviation': None, 'vertical_devia...","{'open': 0.0015012087533250451, 'lateral': Non...","{'intensity': 4.647151249770331e-06, 'asymmetr...",,,,,,,1.5,"[So, welcome]",So welcome
4,"{'intensity': None, 'asymmetry': None, 'is_bli...","{'horizontal_deviation': None, 'vertical_devia...","{'open': 0.00043307070154696703, 'lateral': No...","{'intensity': 4.343396951611567e-05, 'asymmetr...",,,,,,,2.0,"[to, the]",to the


In [30]:
df.shape

(1264, 13)

In [27]:
df.columns

Index(['blinking_data', 'gaze_data', 'jaw_movement_data', 'smile_data',
       'loudness_data', 'average_pitch_data', 'pitch_standard_deviation',
       'words_per_sec', 'filler_words_usage', 'pauses_taken', 'Time', 'words',
       'text_concat'],
      dtype='object')

In [9]:
visual_columns = ['blinking_data', 'gaze_data', 'smile_data']

In [10]:
df[visual_columns].head()

Unnamed: 0,blinking_data,gaze_data,smile_data
0,"{'intensity': None, 'asymmetry': None, 'is_bli...","{'horizontal_deviation': None, 'vertical_devia...","{'intensity': 0.00020072988331669704, 'asymmet..."
1,"{'intensity': None, 'asymmetry': None, 'is_bli...","{'horizontal_deviation': None, 'vertical_devia...","{'intensity': 0.00020793307342685806, 'asymmet..."
2,"{'intensity': None, 'asymmetry': None, 'is_bli...","{'horizontal_deviation': None, 'vertical_devia...","{'intensity': 6.213216343535776e-05, 'asymmetr..."
3,"{'intensity': None, 'asymmetry': None, 'is_bli...","{'horizontal_deviation': None, 'vertical_devia...","{'intensity': 4.647151249770331e-06, 'asymmetr..."
4,"{'intensity': None, 'asymmetry': None, 'is_bli...","{'horizontal_deviation': None, 'vertical_devia...","{'intensity': 4.343396951611567e-05, 'asymmetr..."


In [11]:
df['blinking_data'].iloc[0]

{'intensity': None,
 'asymmetry': None,
 'is_blinking': True,
 'rz_score': 3.57146416776886,
 'is_anomalous': False,
 'continuos_anomaly': False,
 'part_of_anomalous_range': None}

In [12]:
df[visual_columns].iloc[0]

blinking_data    {'intensity': None, 'asymmetry': None, 'is_bli...
gaze_data        {'horizontal_deviation': None, 'vertical_devia...
smile_data       {'intensity': 0.00020072988331669704, 'asymmet...
Name: 0, dtype: object

In [13]:
# Positional index of the single row used to try out the agent.
sample_pos = 2

# Pull the raw per-signal dicts for that row, then validate each with its model.
blink_cell, smile_cell, gaze_cell = (
    df[col].iloc[sample_pos] for col in ('blinking_data', 'smile_data', 'gaze_data')
)
blink_data = Blink(**blink_cell)
smile_data = Smile(**smile_cell)
gaze_data = Gaze(**gaze_cell)

# Timestamp of the same row (interpolated into the prompt as seconds).
time = df['Time'].iloc[sample_pos]

In [14]:
# Build the user prompt for the sampled timestamp: one line per visual signal.
# Each model instance is interpolated in its default string form
# (space-separated field=value pairs, as the printed output shows).
# NOTE(review): "Analize" is a typo for "Analyze". The text is left untouched
# here because it is sent to the model verbatim; fix it deliberately and re-run.
ip = f"""Analize the following visual data for time: {time} sec in the video:  
blinking_data: {blink_data}
smiling_data: {smile_data}
gaze_data: {gaze_data}
"""

# Show the exact prompt before sending it.
print(ip)

Analize the following visual data for time: 1.0 sec in the video:  
blinking_data: intensity=None asymmetry=None is_blinking=False rz_score=1.2975444582065774 is_anomalous=False continuos_anomaly=False part_of_anomalous_range=None
smiling_data: intensity=6.213216343535776e-05 asymmetry=None left_intensity=None right_intensity=None mouth_stretch=None is_smiling=False rz_score=1.0561262510548828 is_anomalous=False continuos_anomaly=False part_of_anomalous_range=None
gaze_data: horizontal_deviation=None vertical_deviation=None primary_direction='down' rz_score=1.1958070888359338 is_anomalous=False continuos_anomaly=False part_of_anomalous_range=None



In [15]:
# Send the prompt to the agent (top-level await, as used in this notebook)
# and render the returned text as Markdown through the rich console.
# NOTE(review): the stored output below shows only the Logfire trace lines and
# no rendered reply — confirm the run actually completed.
result = await main(input=ip)
console.print(Markdown(result))

20:00:57.058 agent run
20:00:57.089   chat deepseek/deepseek-r1


In [19]:
df.columns

Index(['blinking_data', 'gaze_data', 'jaw_movement_data', 'smile_data',
       'loudness_data', 'average_pitch_data', 'pitch_standard_deviation',
       'words_per_sec', 'filler_words_usage', 'pauses_taken', 'Time', 'words',
       'text_concat'],
      dtype='object')

In [40]:
visual_columns = ['blinking_data', 'gaze_data', 'smile_data', 'jaw_movement_data']
audio_columns = ['loudness_data', 'average_pitch_data', 'pitch_standard_deviation']
vocabulary_columns = ['words_per_sec', 'filler_words_usage', 'pauses_taken','text_concat']

# Not read anywhere in this cell; kept only in case a later cell relies on it.
count = 0

# Model class used to rebuild each feature column's per-row dict.
# 'text_concat' is deliberately absent: it holds plain text, not a feature dict.
MODEL_BY_COLUMN = {
    # visual
    "blinking_data": Blink,
    "gaze_data": Gaze,
    "smile_data": Smile,
    "jaw_movement_data": Jaw,
    # audio
    "loudness_data": LoudnessState,
    "average_pitch_data": PitchState,
    "pitch_standard_deviation": PitchStd,
    # vocab
    "words_per_sec": WPS,
    "filler_words_usage": FillerPercentageIncrease,
    "pauses_taken": PausePercentageIncrease,
}


def collect_continuous_anomalies(frame, columns):
    """Collect model instances for every cell flagged as a continuous anomaly.

    The previous version of this cell looped over ``visual_columns`` only, so
    its audio and vocabulary branches could never run and those two result
    lists always stayed empty. This helper is applied to each column group.

    Args:
        frame: DataFrame whose feature cells are dicts (or missing values).
        columns: Column names to scan. Columns without an entry in
            ``MODEL_BY_COLUMN`` are ignored.

    Returns:
        A list of model instances in row-major order (rows in frame order,
        columns in the order given), one per cell whose
        ``"continuos_anomaly"`` entry is truthy.
    """
    found = []
    for _, row in frame.iterrows():
        for column in columns:
            model_cls = MODEL_BY_COLUMN.get(column)
            cell = row[column]
            # Skip columns with no model and cells that are not dicts: the
            # audio/vocabulary columns are empty for some rows (see df.head()).
            if model_cls is None or not isinstance(cell, dict):
                continue
            # "continuos_anomaly" (sic) is the key actually used in the data.
            # Assumes audio/vocabulary dicts carry the same key as the visual
            # ones — TODO confirm; .get() keeps a missing key from raising.
            if cell.get("continuos_anomaly"):
                found.append(model_cls(**cell))
    return found


visual_cont_anom = collect_continuous_anomalies(df, visual_columns)
audio_cont_anom = collect_continuous_anomalies(df, audio_columns)
vocabulary_cont_anom = collect_continuous_anomalies(df, vocabulary_columns)

In [41]:
visual_cont_anom

[Jaw(open=0.09769998490810394, lateral=-0.008335927486768924, forward=8.645569323562086e-05, is_open=False, rz_score=0.19769195175620863, is_anomalous=True, continuos_anomaly=True, part_of_anomalous_range=[27.0, 28.0, 29.0]),
 Jaw(open=0.14945782721042633, lateral=-0.06227132874482777, forward=0.00012334305210970342, is_open=False, rz_score=1.4374498313673931, is_anomalous=True, continuos_anomaly=True, part_of_anomalous_range=[27.0, 28.0, 29.0]),
 Jaw(open=0.09816937148571014, lateral=-0.01106842583976686, forward=8.533063373761252e-05, is_open=False, rz_score=1.3546747034156208, is_anomalous=True, continuos_anomaly=True, part_of_anomalous_range=[27.0, 28.0, 29.0]),
 Smile(intensity=0.03983886547646804, asymmetry=0.029761669960440426, left_intensity=0.024958030496247827, right_intensity=0.05471970045668825, mouth_stretch=0.046422227285802364, is_smiling=False, rz_score=108.13762851196142, is_anomalous=True, continuos_anomaly=True, part_of_anomalous_range=[46.0, 47.0, 48.0, 49.0, 50.0, 

In [42]:
len(visual_cont_anom)

349