## The architecture we plan to build

![My First Board](https://github.com/Ak62007/Multimodel-Reasoning/blob/main/AI/My%20First%20Board.jpg?raw=true)

In [1]:
import os
import sys
from pathlib import Path

# Put the directory two levels above the current working directory at the front
# of sys.path; presumably this is what lets the `src.*` / `AI.*` imports in the
# following cells resolve when the notebook is started from its own folder.
current_dir = Path.cwd()
project_root = current_dir.parent.parent
project_root_entry = str(project_root)
already_on_path = project_root_entry in sys.path
if not already_on_path:
    # "done" is printed only when the entry was actually added.
    sys.path[:0] = [project_root_entry]
    print("done")

done


In [2]:
import os
import logfire
import pandas as pd
import numpy as np
import asyncio
from pprint import pprint
from rich.console import Console
from rich.markdown import Markdown
from pydantic_ai import Agent, RunContext
from typing import Union
from pydantic_ai.models.groq import GroqModel
from groq import Groq
from pydantic_ai.models.openai import OpenAIChatModel
from pydantic_ai.providers.openai import OpenAIProvider
from src.utils.save_data import load_df_parquet_safe
from src.utils.datamodels import (
    Blink,
    Smile,
    Gaze,
    Jaw,
    LoudnessState,
    PitchState,
    PitchStd,
    WPS,
    PausePercentageIncrease,
    FillerPercentageIncrease
)
from AI.datamodels import (
    VisualAnalysisReport,
    AudioAnalysisReport,
    VocabularyAnalysisReport,
    IntegratedBehavioralReport,
    FinalReport
    
)
from AI.prompts import (
    VISUAL_PROMPT,
    AUDIO_PROMPT,
    VOCABULARY_PROMPT,
    CORR_CONT_PROMPT,
    JUDGE_PROMPT,
)

console = Console()

logfire.configure()
logfire.instrument_pydantic_ai()

In [3]:
logfire.info("hello, {place}!", place='World')

20:31:21.782 hello, World!


In [4]:
from dotenv import load_dotenv
load_dotenv()

True

In [5]:
path = project_root / "data" / "processed" / "merge" / "final_dataframe.parquet"
df = load_df_parquet_safe(path=str(path))

### API keys and OpenRouter configuration

In [6]:
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
HUGGING_FACE = os.getenv("HUGGING_FACE")
OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1"

In [7]:
df.head()

Unnamed: 0,blinking_data,gaze_data,jaw_movement_data,smile_data,loudness_data,average_pitch_data,pitch_standard_deviation,words_per_sec,filler_words_usage,pauses_taken,Time,words,text_concat,speaker
0,"{'intensity': 0.5590226024389267, 'asymmetry':...","{'horizontal_deviation': -0.03615564603520849,...","{'open': 0.023826630786061287, 'lateral': -0.0...","{'intensity': 0.00020072988331669704, 'asymmet...",,,,,,,0.0,"[We're, starting]",We're starting,
1,"{'intensity': 0.20927320420742035, 'asymmetry'...","{'horizontal_deviation': -0.08192594102298828,...","{'open': 0.01953260414302349, 'lateral': -0.00...","{'intensity': 0.00020793307342685806, 'asymmet...",,,,,,,0.5,"[now., [*]]",now. [*],A
2,"{'intensity': 0.2994565159082413, 'asymmetry':...","{'horizontal_deviation': -0.03495631815543154,...","{'open': 0.0008870773017406464, 'lateral': -0....","{'intensity': 6.213216343535776e-05, 'asymmetr...",,,,,,,1.0,,,A
3,"{'intensity': 0.28583882749080664, 'asymmetry'...","{'horizontal_deviation': 0.005119363319349368,...","{'open': 0.0015012087533250451, 'lateral': -0....","{'intensity': 4.647151249770331e-06, 'asymmetr...",,,,,,,1.5,"[So, welcome]",So welcome,A
4,"{'intensity': 0.37450549006462097, 'asymmetry'...","{'horizontal_deviation': 0.0903685688076663, '...","{'open': 0.00043307070154696703, 'lateral': -0...","{'intensity': 4.343396951611567e-05, 'asymmetr...",,,,,,,2.0,"[to, the]",to the,A


In [8]:
df.shape

(1264, 14)

In [9]:
df.columns

Index(['blinking_data', 'gaze_data', 'jaw_movement_data', 'smile_data',
       'loudness_data', 'average_pitch_data', 'pitch_standard_deviation',
       'words_per_sec', 'filler_words_usage', 'pauses_taken', 'Time', 'words',
       'text_concat', 'speaker'],
      dtype='object')

In [10]:
visual_columns = ['blinking_data', 'gaze_data', 'smile_data']

In [11]:
df[visual_columns].head()

Unnamed: 0,blinking_data,gaze_data,smile_data
0,"{'intensity': 0.5590226024389267, 'asymmetry':...","{'horizontal_deviation': -0.03615564603520849,...","{'intensity': 0.00020072988331669704, 'asymmet..."
1,"{'intensity': 0.20927320420742035, 'asymmetry'...","{'horizontal_deviation': -0.08192594102298828,...","{'intensity': 0.00020793307342685806, 'asymmet..."
2,"{'intensity': 0.2994565159082413, 'asymmetry':...","{'horizontal_deviation': -0.03495631815543154,...","{'intensity': 6.213216343535776e-05, 'asymmetr..."
3,"{'intensity': 0.28583882749080664, 'asymmetry'...","{'horizontal_deviation': 0.005119363319349368,...","{'intensity': 4.647151249770331e-06, 'asymmetr..."
4,"{'intensity': 0.37450549006462097, 'asymmetry'...","{'horizontal_deviation': 0.0903685688076663, '...","{'intensity': 4.343396951611567e-05, 'asymmetr..."


In [12]:
df['blinking_data'].iloc[0]

{'intensity': 0.5590226024389267,
 'asymmetry': 0.018826717138290427,
 'is_blinking': True,
 'rz_score': 3.57146416776886,
 'is_anomalous': False,
 'continuous_anomaly': False,
 'part_of_anomalous_range': None}

In [13]:
df[visual_columns].iloc[0]

blinking_data    {'intensity': 0.5590226024389267, 'asymmetry':...
gaze_data        {'horizontal_deviation': -0.03615564603520849,...
smile_data       {'intensity': 0.00020072988331669704, 'asymmet...
Name: 0, dtype: object

In [17]:
blink_data = Blink(**df['blinking_data'].iloc[2])
smile_data = Smile(**df['smile_data'].iloc[2])
gaze_data = Gaze(**df['gaze_data'].iloc[2])
time = df['Time'].iloc[2]

In [18]:
ip = f"""Analize the following visual data for time: {time} sec in the video:  
blinking_data: {blink_data}
smiling_data: {smile_data}
gaze_data: {gaze_data}
"""

print(ip)

Analize the following visual data for time: 1.0 sec in the video:  
blinking_data: intensity=None asymmetry=None is_blinking=False rz_score=1.2975444582065774 is_anomalous=False continuous_anomaly=False part_of_anomalous_range=None
smiling_data: intensity=6.213216343535776e-05 asymmetry=None left_intensity=None right_intensity=None mouth_stretch=None is_smiling=False rz_score=1.0561262510548828 is_anomalous=False continuous_anomaly=False part_of_anomalous_range=None
gaze_data: horizontal_deviation=None vertical_deviation=None primary_direction='down' rz_score=1.1958070888359338 is_anomalous=False continuous_anomaly=False part_of_anomalous_range=None



In [19]:
# NOTE(review): `main` is not defined in any cell visible in this notebook, and
# this cell's execution count is out of order with its neighbours, so it looks
# like a leftover from an earlier session. TODO: define `main` above this cell
# or remove the cell so that Restart Kernel -> Run All succeeds.
result = await main(input=ip)
console.print(Markdown(result))

17:32:38.529 agent run
17:32:38.545   chat deepseek/deepseek-r1


In [19]:
df.columns

Index(['blinking_data', 'gaze_data', 'jaw_movement_data', 'smile_data',
       'loudness_data', 'average_pitch_data', 'pitch_standard_deviation',
       'words_per_sec', 'filler_words_usage', 'pauses_taken', 'Time', 'words',
       'text_concat'],
      dtype='object')

In [14]:
visual_columns = ['blinking_data', 'gaze_data', 'smile_data', 'jaw_movement_data']
audio_columns = ['loudness_data', 'average_pitch_data', 'pitch_standard_deviation']
vocabulary_columns = ['words_per_sec', 'filler_words_usage', 'pauses_taken']

In [15]:
def get_cont_anomaly(
    df: pd.DataFrame,
    columns: list[str],
    start_time: float,
    end_time: float,
    step: float = 0.5,
) -> list:
    """Collect the records flagged as part of a continuous anomaly in a time window.

    Rows are selected by position, not by the ``Time`` column: the window is
    ``df.iloc[int(start_time / step) : int(end_time / step)]``. This assumes the
    frame has one row per ``step`` seconds starting at time 0 (TODO confirm for
    frames other than the one loaded in this notebook).

    Args:
        df: Frame whose feature columns hold one dict per row (or a missing value).
        columns: Feature columns to scan. Columns without a known model are ignored.
        start_time: Window start in seconds (inclusive). Negative values are treated as 0.
        end_time: Window end in seconds (exclusive). Values past the end are clamped.
        step: Seconds between consecutive rows. Defaults to 0.5.

    Returns:
        A list of model instances (``Blink``, ``Gaze``, ...) built from every cell whose
        ``"continuous_anomaly"`` entry is truthy, ordered row by row and, within a
        row, in the order given by ``columns``.

    Raises:
        ValueError: If ``step`` is not positive.
    """
    if step <= 0:
        raise ValueError("step must be a positive number of seconds")

    # Column name -> model that wraps the raw dict stored in that column.
    model_by_column = {
        # visual
        "blinking_data": Blink,
        "gaze_data": Gaze,
        "smile_data": Smile,
        "jaw_movement_data": Jaw,
        # audio
        "loudness_data": LoudnessState,
        "average_pitch_data": PitchState,
        "pitch_standard_deviation": PitchStd,
        # vocab
        "words_per_sec": WPS,
        "filler_words_usage": FillerPercentageIncrease,
        "pauses_taken": PausePercentageIncrease,
    }

    # Clamp to [0, len(df)] so a negative time cannot turn into a negative
    # (count-from-the-end) iloc index and silently select the wrong rows.
    first_row = max(int(start_time / step), 0)
    last_row = min(max(int(end_time / step), 0), len(df))

    cont_anom_series = []
    for _, row in df.iloc[first_row:last_row].iterrows():
        for column in columns:
            cell = row[column]
            # Missing cells may be None or NaN; only dict payloads carry the
            # anomaly flags, so anything else is skipped instead of crashing.
            if not isinstance(cell, dict):
                continue
            if not cell.get("continuous_anomaly"):
                continue
            model = model_by_column.get(column)
            if model is None:
                # No model registered for this column: nothing to emit.
                continue
            cont_anom_series.append(model(**cell))

    return cont_anom_series

In [16]:
get_cont_anomaly(
    df=df,
    columns=visual_columns,
    end_time=100,
    start_time=20,
)

[Smile(intensity=0.03983886547646804, asymmetry=0.029761669960440426, left_intensity=0.024958030496247827, right_intensity=0.05471970045668825, mouth_stretch=0.046422227285802364, is_smiling=False, rz_score=108.13762851196142, is_anomalous=True, continuous_anomaly=True, part_of_anomalous_range=[23.0, 23.5, 24.0, 24.5, 25.0, 25.5, 26.0, 26.5]),
 Smile(intensity=2.0560361376809055e-05, asymmetry=5.8263413968973055e-06, left_intensity=1.7647190678360402e-05, right_intensity=2.3473532075257707e-05, mouth_stretch=7.175633618317079e-05, is_smiling=False, rz_score=83.93131624897828, is_anomalous=True, continuous_anomaly=True, part_of_anomalous_range=[23.0, 23.5, 24.0, 24.5, 25.0, 25.5, 26.0, 26.5]),
 Smile(intensity=1.6891864990853377e-05, asymmetry=3.840338350613593e-06, left_intensity=1.4971695815546582e-05, right_intensity=1.8812034166160175e-05, mouth_stretch=5.220679213380208e-05, is_smiling=False, rz_score=65.09418791100305, is_anomalous=True, continuous_anomaly=True, part_of_anomalous_

In [17]:
df.columns

Index(['blinking_data', 'gaze_data', 'jaw_movement_data', 'smile_data',
       'loudness_data', 'average_pitch_data', 'pitch_standard_deviation',
       'words_per_sec', 'filler_words_usage', 'pauses_taken', 'Time', 'words',
       'text_concat', 'speaker'],
      dtype='object')

In [18]:
df[["Time", "blinking_data"]].iloc[-1]

Time                                                         631.5
blinking_data    {'intensity': 0.2976881206035614, 'asymmetry':...
Name: 1263, dtype: object

In [19]:
# Print every filler-word record that belongs to a continuous anomaly.
for filler_cell in df['filler_words_usage']:
    # Rows without a payload (None/NaN) carry no flags; skip them instead of
    # trying to unpack a non-dict value.
    if not isinstance(filler_cell, dict):
        continue
    pp = FillerPercentageIncrease(**filler_cell)
    if pp.continuous_anomaly:
        print(pp)

filler_percentage_level='abnormally high' is_anomalous=True continuous_anomaly=True part_of_anomalous_range=[173.0, 174.5]
filler_percentage_level='abnormally high' is_anomalous=True continuous_anomaly=True part_of_anomalous_range=[173.0, 174.5]
filler_percentage_level='abnormally high' is_anomalous=True continuous_anomaly=True part_of_anomalous_range=[192.5, 193.5]
filler_percentage_level='abnormally high' is_anomalous=True continuous_anomaly=True part_of_anomalous_range=[192.5, 193.5]
filler_percentage_level='abnormally high' is_anomalous=True continuous_anomaly=True part_of_anomalous_range=[225.0, 225.5]
filler_percentage_level='abnormally high' is_anomalous=True continuous_anomaly=True part_of_anomalous_range=[225.0, 225.5]
filler_percentage_level='abnormally high' is_anomalous=True continuous_anomaly=True part_of_anomalous_range=[242.5, 244.5, 245.0, 245.5]
filler_percentage_level='abnormally high' is_anomalous=True continuous_anomaly=True part_of_anomalous_range=[242.5, 244.5, 24

In [20]:
df['text_concat'].iloc[0:10]

0        We're starting
1              now. [*]
2                      
3            So welcome
4                to the
5            interview.
6                 Let's
7            begin with
8    a simple question.
9                      
Name: text_concat, dtype: object

## Working on two more simple helper functions.

In [21]:
from src.sync.Feature_Transformation import get_speaker_timings    

In [22]:
df.columns

Index(['blinking_data', 'gaze_data', 'jaw_movement_data', 'smile_data',
       'loudness_data', 'average_pitch_data', 'pitch_standard_deviation',
       'words_per_sec', 'filler_words_usage', 'pauses_taken', 'Time', 'words',
       'text_concat', 'speaker'],
      dtype='object')

In [23]:
b = get_speaker_timings(speaker_times=df[['Time', 'speaker']], speaker='B')

In [24]:
a = get_speaker_timings(speaker_times=df[['Time', 'speaker']], speaker='A')

In [25]:
a

[(0.5, 11.5), (140.0, 163.5), (288.5, 318.0), (604.0, 631.5)]

In [26]:
b

[(11.5, 140.0), (163.5, 288.5), (318.0, 604.0)]

In [27]:
def g_get_speaker_timings(df: pd.DataFrame, start: float, end: float) -> list:
    """Return the labelled speaker turns that overlap the window ``[start, end]``.

    Turns are obtained from ``get_speaker_timings`` for speaker ``'A'`` (labelled
    "Interviewer") and speaker ``'B'`` (labelled "Interviewee"), merged and sorted
    by turn start time. Each turn is expected to be a ``(turn_start, turn_end)``
    pair, as shown by the outputs earlier in this notebook.

    A turn is kept when ``turn_start <= end`` and ``turn_end > start``. When both
    ``start`` and ``end`` fall inside some turn this matches the previous
    index-based slicing; when either bound lies outside every turn (for example
    ``end`` at or past the end of the recording) the overlapping turns are still
    returned instead of an empty or misaligned slice.

    Args:
        df: Frame passed through unchanged as ``speaker_times`` to
            ``get_speaker_timings`` (the notebook passes the ``Time`` and
            ``speaker`` columns).
        start: Window start in seconds.
        end: Window end in seconds.

    Returns:
        A list of ``((turn_start, turn_end), label)`` tuples in time order; empty
        when no turn overlaps the window.
    """
    interviewer_timings = get_speaker_timings(
        speaker_times=df,
        speaker='A'
    )
    interviewee_timings = get_speaker_timings(
        speaker_times=df,
        speaker='B'
    )

    # Interviewee turns go first so that, with a stable sort, they still win
    # ties on start time exactly as before.
    labelled_turns = [(span, "Interviewee") for span in interviewee_timings]
    labelled_turns += [(span, "Interviewer") for span in interviewer_timings]
    labelled_turns.sort(key=lambda turn: turn[0][0])

    return [
        turn
        for turn in labelled_turns
        if turn[0][0] <= end and turn[0][1] > start
    ]

In [28]:
g_get_speaker_timings(
    df=df[['Time', 'speaker']],
    start = 30,
    end = 200
)

[((11.5, 140.0), 'Interviewee'),
 ((140.0, 163.5), 'Interviewer'),
 ((163.5, 288.5), 'Interviewee')]

In [29]:
df.columns

Index(['blinking_data', 'gaze_data', 'jaw_movement_data', 'smile_data',
       'loudness_data', 'average_pitch_data', 'pitch_standard_deviation',
       'words_per_sec', 'filler_words_usage', 'pauses_taken', 'Time', 'words',
       'text_concat', 'speaker'],
      dtype='object')

In [30]:
df["text_concat"]

0               We're starting
1                     now. [*]
2                             
3                   So welcome
4                       to the
                 ...          
1259                        So
1260                     yeah,
1261                  that's a
1262    really solid approach.
1263                          
Name: text_concat, Length: 1264, dtype: object

In [31]:
def get_transcript(df:pd.DataFrame, start: float, end: float):
    """Build a ``{(turn_start, turn_end): text}`` mapping for the turns in a window.

    The turns come from ``g_get_speaker_timings`` for ``[start, end]``. For each
    turn, the ``text_concat`` values of all rows with
    ``turn_start <= Time < turn_end`` are joined with single spaces. Note that the
    text covers each returned turn in full; it is not clipped to ``start``/``end``.
    """
    turns = g_get_speaker_timings(
        df=df[['Time', 'speaker']],
        start=start,
        end=end,
    )

    times = df["Time"]
    transcript = {}
    for span, _label in turns:
        # Half-open interval so a boundary row belongs to exactly one turn.
        in_turn = (times >= span[0]) & (times < span[1])
        spoken = df.loc[in_turn, "text_concat"].values
        transcript[span] = " ".join(spoken)

    return transcript

In [32]:
" ".join(df.loc[(df["Time"] >= 11.5) & (df["Time"] < 140.0), "text_concat"].values)

"[*]  right now I'm in third year of my [*] college.  I mean, I'm going to be in the sixth semester,  but [*] yeah, [*]  from [*] the start of my [*] college, I think from my second semester towards [*] the end [*] of it, I got interested in [*]  like [*] machine learning. I [*] got to know about [*] machine learning [*] from some [*] channels, from YouTube channels and all. [*] And I thought, yeah, it is an interesting field.  I got to know [*] some things about [*] it. I took [*] a course, a very famous course from Andrew Ng, [*] which is ML [*] specialization course,  which is on course era. So in the [*] break, in the summer break, which I've got in the college, I actually completed that course. [*] And actually I was very intrigued [*] by [*]  how   like we do all this stuff, what we do in the [*] like machine learning field, right? So [*] it was very interesting. [*] And [*]  like [*]  after that, I was just doing some courses, then I got into deep learning.  I then got into some

In [33]:
get_transcript(
    df=df[['Time', 'text_concat', 'speaker']],
    start=30,
    end=200,
)

{(11.5,
  140.0): "[*]  right now I'm in third year of my [*] college.  I mean, I'm going to be in the sixth semester,  but [*] yeah, [*]  from [*] the start of my [*] college, I think from my second semester towards [*] the end [*] of it, I got interested in [*]  like [*] machine learning. I [*] got to know about [*] machine learning [*] from some [*] channels, from YouTube channels and all. [*] And I thought, yeah, it is an interesting field.  I got to know [*] some things about [*] it. I took [*] a course, a very famous course from Andrew Ng, [*] which is ML [*] specialization course,  which is on course era. So in the [*] break, in the summer break, which I've got in the college, I actually completed that course. [*] And actually I was very intrigued [*] by [*]  how   like we do all this stuff, what we do in the [*] like machine learning field, right? So [*] it was very interesting. [*] And [*]  like [*]  after that, I was just doing some courses, then I got into deep learning.  I 

### Let's Build the Agents

#### Visual Agent

In [34]:
get_cont_anomaly(
    df=df,
    columns=vocabulary_columns,
    start_time=0,
    end_time=150
)

[PausePercentageIncrease(pause_percentage_level='abnormally high', is_anomalous=True, continuous_anomaly=True, part_of_anomalous_range=[58.0, 59.0, 59.5]),
 PausePercentageIncrease(pause_percentage_level='abnormally high', is_anomalous=True, continuous_anomaly=True, part_of_anomalous_range=[58.0, 59.0, 59.5]),
 PausePercentageIncrease(pause_percentage_level='abnormally high', is_anomalous=True, continuous_anomaly=True, part_of_anomalous_range=[58.0, 59.0, 59.5]),
 PausePercentageIncrease(pause_percentage_level='abnormally high', is_anomalous=True, continuous_anomaly=True, part_of_anomalous_range=[67.5, 68.5]),
 PausePercentageIncrease(pause_percentage_level='abnormally high', is_anomalous=True, continuous_anomaly=True, part_of_anomalous_range=[67.5, 68.5]),
 PausePercentageIncrease(pause_percentage_level='abnormally high', is_anomalous=True, continuous_anomaly=True, part_of_anomalous_range=[92.0, 93.5]),
 PausePercentageIncrease(pause_percentage_level='abnormally high', is_anomalous=Tr

In [35]:
timings = g_get_speaker_timings(
    df=df[['Time', 'speaker']],
    start = 40,
    end = 80
)

In [36]:
timings

[((11.5, 140.0), 'Interviewee')]

In [37]:
# let's design the input for the vision agent
"""
time_range - {40 - 80} in sec:
comtinous anomalous visual data:
[LoudnessState(level='normal', rz_score=0.21644708892544837, is_anomalous=True, continuous_anomaly=True, part_of_anomalous_range=[51.0, 51.5]),
 LoudnessState(level='normal', rz_score=0.32703274586668574, is_anomalous=True, continuous_anomaly=True, part_of_anomalous_range=[51.0, 51.5]),
 LoudnessState(level='very_quiet', rz_score=-18.420420367335318, is_anomalous=True, continuous_anomaly=True, part_of_anomalous_range=[57.0, 57.5, 58.0, 58.5, 59.5, 60.0, 60.5, 61.0]),
 LoudnessState(level='very_quiet', rz_score=-12.012118282163145, is_anomalous=True, continuous_anomaly=True, part_of_anomalous_range=[57.0, 57.5, 58.0, 58.5, 59.5, 60.0, 60.5, 61.0]),
 LoudnessState(level='very_quiet', rz_score=-8.768461296822297, is_anomalous=True, continuous_anomaly=True, part_of_anomalous_range=[57.0, 57.5, 58.0, 58.5, 59.5, 60.0, 60.5, 61.0]),
 LoudnessState(level='very_quiet', rz_score=-6.124213469142822, is_anomalous=True, continuous_anomaly=True, part_of_anomalous_range=[57.0, 57.5, 58.0, 58.5, 59.5, 60.0, 60.5, 61.0]),
 LoudnessState(level='very_quiet', rz_score=-20.95845085627751, is_anomalous=True, continuous_anomaly=True, part_of_anomalous_range=[57.0, 57.5, 58.0, 58.5, 59.5, 60.0, 60.5, 61.0]),
 LoudnessState(level='very_quiet', rz_score=-14.167723959999458, is_anomalous=True, continuous_anomaly=True, part_of_anomalous_range=[57.0, 57.5, 58.0, 58.5, 59.5, 60.0, 60.5, 61.0]),
 LoudnessState(level='very_quiet', rz_score=-8.739599690081883, is_anomalous=True, continuous_anomaly=True, part_of_anomalous_range=[57.0, 57.5, 58.0, 58.5, 59.5, 60.0, 60.5, 61.0]),
 LoudnessState(level='very_quiet', rz_score=-5.73364850434323, is_anomalous=True, continuous_anomaly=True, part_of_anomalous_range=[57.0, 57.5, 58.0, 58.5, 59.5, 60.0, 60.5, 61.0]),
 PitchState(relative_level='lower', rz_score=-1.5851804435286332, is_anomalous=True, continuous_anomaly=True, part_of_anomalous_range=[62.5, 63.0, 63.5, 64.0]),
 PitchState(relative_level='lower', rz_score=-1.628496432439459, is_anomalous=True, continuous_anomaly=True, part_of_anomalous_range=[62.5, 63.0, 63.5, 64.0]),
 PitchState(relative_level='lower', rz_score=-1.748538550003983, is_anomalous=True, continuous_anomaly=True, part_of_anomalous_range=[62.5, 63.0, 63.5, 64.0]),
 PitchState(relative_level='lower', rz_score=-1.8512040772549108, is_anomalous=True, continuous_anomaly=True, part_of_anomalous_range=[62.5, 63.0, 63.5, 64.0]),
 PitchStd(expressiveness='slightly_expressive', rz_score=-1.9220249920804544, is_anomalous=True, continuous_anomaly=True, part_of_anomalous_range=[67.5, 68.5]),
 PitchStd(expressiveness='slightly_expressive', rz_score=-2.002495804835701, is_anomalous=True, continuous_anomaly=True, part_of_anomalous_range=[67.5, 68.5]),
 LoudnessState(level='normal', rz_score=0.4205996485804423, is_anomalous=True, continuous_anomaly=True, part_of_anomalous_range=[72.5, 73.0]),
 LoudnessState(level='normal', rz_score=0.5440056070530159, is_anomalous=True, continuous_anomaly=True, part_of_anomalous_range=[72.5, 73.0])]
 
 Who is speaking when:
 [((11.5, 140.0), 'Interviewee')]
"""

"\ntime_range - {40 - 80} in sec:\ncomtinous anomalous visual data:\n[LoudnessState(level='normal', rz_score=0.21644708892544837, is_anomalous=True, continuous_anomaly=True, part_of_anomalous_range=[51.0, 51.5]),\n LoudnessState(level='normal', rz_score=0.32703274586668574, is_anomalous=True, continuous_anomaly=True, part_of_anomalous_range=[51.0, 51.5]),\n LoudnessState(level='very_quiet', rz_score=-18.420420367335318, is_anomalous=True, continuous_anomaly=True, part_of_anomalous_range=[57.0, 57.5, 58.0, 58.5, 59.5, 60.0, 60.5, 61.0]),\n LoudnessState(level='very_quiet', rz_score=-12.012118282163145, is_anomalous=True, continuous_anomaly=True, part_of_anomalous_range=[57.0, 57.5, 58.0, 58.5, 59.5, 60.0, 60.5, 61.0]),\n LoudnessState(level='very_quiet', rz_score=-8.768461296822297, is_anomalous=True, continuous_anomaly=True, part_of_anomalous_range=[57.0, 57.5, 58.0, 58.5, 59.5, 60.0, 60.5, 61.0]),\n LoudnessState(level='very_quiet', rz_score=-6.124213469142822, is_anomalous=True, cont

In [38]:
model_name = "llama-3.3-70b-versatile"

In [39]:
from pydantic_ai.models.groq import GroqModel
from pydantic_ai.providers.groq import GroqProvider

model = GroqModel(
    model_name, provider=GroqProvider(api_key=GROQ_API_KEY)
)

In [40]:
visual_agent = Agent(
    model=model,
    system_prompt=VISUAL_PROMPT,
    output_type=VisualAnalysisReport,
    output_retries=2,
    retries=2,
)

In [41]:
audio_agent = Agent(
    model=model,
    system_prompt=AUDIO_PROMPT,
    output_type=AudioAnalysisReport,
    output_retries=2,
    retries=2,
)

In [42]:
vocabulary_agent = Agent(
    model=model,
    system_prompt=VOCABULARY_PROMPT,
    output_type=VocabularyAnalysisReport,
    output_retries=2,
    retries=2
)

In [43]:
corr_and_contr_agent = Agent(
    model=model,
    system_prompt=CORR_CONT_PROMPT,
    output_type=IntegratedBehavioralReport,
    output_retries=2,
    retries=2
)

In [44]:
judge_agent = Agent(
    model=model,
    system_prompt=JUDGE_PROMPT,
    output_type=FinalReport,
    output_retries=2,
    retries=2
)

In [45]:
async def run_agent(agent: Agent, input_prompt: str) -> Union[VisualAnalysisReport, AudioAnalysisReport, VocabularyAnalysisReport, IntegratedBehavioralReport, FinalReport]:
    """Run ``agent`` once on ``input_prompt`` and return the run's ``output``.

    The concrete return type is whatever ``output_type`` the agent was built with;
    in this notebook that is one of the report models listed in the annotation
    (including ``FinalReport`` for the judge agent).

    Any exception raised by ``agent.run`` propagates to the caller.
    """
    result = await agent.run(input_prompt)
    return result.output

In [46]:
async def one_loop(
    df: pd.DataFrame,
    start_time: float,
    end_time: float,
    visual_columns: list[str],
    audio_columns: list[str],
    vocabulary_columns : list[str],
    visual_agent: Agent,
    audio_agent: Agent,
    vocabulary_agent: Agent,
    corr_and_contr_agent: Agent,
    ) -> Union[IntegratedBehavioralReport, None]:
    """Analyse one time window end to end and return the integrated report.

    Steps:
      1. Collect continuous anomalies for the visual, audio and vocabulary columns
         with ``get_cont_anomaly``.
      2. Collect who speaks when (``g_get_speaker_timings``) and what is said
         (``get_transcript``) for the window.
      3. Run the visual, audio and vocabulary agents concurrently.
      4. Feed their three reports plus the speaker and transcript data to
         ``corr_and_contr_agent``.

    Args:
        df: Frame with the feature columns plus ``Time``, ``speaker`` and
            ``text_concat``.
        start_time: Window start in seconds.
        end_time: Window end in seconds.
        visual_columns: Columns scanned for visual anomalies.
        audio_columns: Columns scanned for audio anomalies.
        vocabulary_columns: Columns scanned for vocabulary anomalies.
        visual_agent: Agent given the visual prompt.
        audio_agent: Agent given the audio prompt.
        vocabulary_agent: Agent given the vocabulary prompt.
        corr_and_contr_agent: Agent given the combined prompt.

    Returns:
        The output of ``corr_and_contr_agent``, or ``None`` when any agent run
        raises. Failures are logged and printed rather than re-raised, so callers
        must check for ``None`` before using the result.
    """

    logfire.info("Inside the {start}secs - {end}secs time range loop", start=start_time, end=end_time)

    # preparing inputs
    # visual anoms
    logfire.info("getting the visual continous anomalies")
    visual_cont_anoms = get_cont_anomaly(
        df=df,
        start_time=start_time,
        end_time=end_time,
        columns=visual_columns,
    )
    logfire.info("getting the audio continous anomalies")
    # audio anoms
    audio_cont_anoms = get_cont_anomaly(
        df=df,
        start_time=start_time,
        end_time=end_time,
        columns=audio_columns,
    )
    logfire.info("getting the vocab continous anomalies")
    # vocabulary anoms
    vocab_cont_anoms = get_cont_anomaly(
        df=df,
        start_time=start_time,
        end_time=end_time,
        columns=vocabulary_columns,
    )
    logfire.info("getting the who is speaking when data")
    # Who is speaking when
    who_when = g_get_speaker_timings(
        df=df[['Time', 'speaker']],
        start=start_time,
        end=end_time,
    )
    logfire.info("getting the what is spoken when data")
    # when is what spoken
    who_what = get_transcript(
        df=df[['Time', 'text_concat', 'speaker']],
        start=start_time,
        end=end_time,
    )
    logfire.info("building the prompts...")
    # building the prompts
    # visual_prompt
    visual_prompt = f"""
    Time_Range: {start_time} : {end_time} secs.
    Continuous Anomalous Visual Features: 
    {visual_cont_anoms}
    Who is speaking when: 
    {who_when}
    """

    # audio_prompt
    audio_prompt = f"""
    Time_Range: {start_time} : {end_time} secs.
    Continuous Anomalous Audio Features: 
    {audio_cont_anoms}
    Who is speaking when: 
    {who_when}
    """

    # vocab_prompt
    vocab_prompt = f"""
    Time_Range: {start_time} : {end_time} secs.
    Continuous Anomalous Vocabulary Features: 
    {vocab_cont_anoms}
    Who is speaking when: 
    {who_when}
    """

    logfire.info("Running Visual, Audio, and Vocab Agents in PARALLEL...")

    # coroutines (tasks)
    visual_task = run_agent(agent=visual_agent, input_prompt=visual_prompt)
    audio_task = run_agent(agent=audio_agent, input_prompt=audio_prompt)
    vocab_task = run_agent(agent=vocabulary_agent, input_prompt=vocab_prompt)

    # Running them simultaneously
    try:
        visual_report, audio_report, vocab_report = await asyncio.gather(
            visual_task, 
            audio_task, 
            vocab_task
        )
    except Exception as e:
        # Best-effort: report the failure and signal it to the caller with None.
        logfire.error("Error during parallel agent execution: {e}", e=e)
        print(f"Error during parallel agent execution: {e}")
        return None

    logfire.info("Building prompt for corr_agent...")

    # corr_prompt
    corr_prompt = f"""
    Time_Range: {start_time} : {end_time} secs.
    
    Continuous Anomalous visual analysis report: 
    {visual_report}
    
    Continuous Anomalous audio analysis report: 
    {audio_report}
    
    Continuous Anomalous vocabulary analysis report: 
    {vocab_report}
    
    Who is speaking when: 
    {who_when}
    
    What is spoken when:
    {who_what}
    """

    logfire.info("running corr_agent...")
    # running the corr_agent
    try:
        corr_cont_report = await run_agent(
            agent=corr_and_contr_agent,
            input_prompt=corr_prompt
        )
    except Exception as e:
        # Same best-effort contract as above: log, print, return None.
        logfire.error("Error running corr_agent: {e}", e=e)
        print(f"Error: {e}")
        return None

    logfire.info("corr_agent ran successfully.")
    return corr_cont_report

In [47]:
start = 0
end = 60

result = await one_loop(
    df=df,
    start_time=start,
    end_time=end,
    visual_columns=visual_columns,
    audio_columns=audio_columns,
    vocabulary_columns=vocabulary_columns,
    visual_agent=visual_agent,
    audio_agent=audio_agent,
    vocabulary_agent=vocabulary_agent,
    corr_and_contr_agent=corr_and_contr_agent,
)

20:31:40.592 Inside the 0secs - 60secs time range loop
20:31:40.594 getting the visual continous anomalies
20:31:40.606 getting the audio continous anomalies
20:31:40.617 getting the vocab continous anomalies
20:31:40.628 getting the who is speaking when data
20:31:40.763 getting the what is spoken when data
20:31:40.913 building the prompts...
20:31:40.913 Running Visual, Audio, and Vocab Agents in PARALLEL...
20:31:40.913 agent run
20:31:40.929 agent run
20:31:40.929 agent run
             agent run
20:31:40.929   chat llama-3.3-70b-versatile
             agent run
20:31:40.966   chat llama-3.3-70b-versatile
             agent run
20:31:40.966   chat llama-3.3-70b-versatile
20:31:42.331 Building prompt for corr_agent...
20:31:42.332 running corr_agent...
20:31:42.334 agent run
20:31:42.336   chat llama-3.3-70b-versatile
20:31:44.031 corr_agent ran successfully.


In [48]:
pprint(result.model_dump())

{'executive_summary': 'The subject exhibited high stress and cognitive '
                      'overload, with anomalies in visual, audio, and '
                      'vocabulary analysis, indicating possible nervousness '
                      'and frustration while discussing their background and '
                      "interest in ML engineering. The subject's behavior and "
                      'verbal cues suggest a lack of confidence or '
                      'exaggeration in their claims.',
 'key_insights': [{'anomalies_detected': ['Visual: Jaw Tension'],
                   'behavioral_analysis': 'Subject exhibited jaw tension while '
                                          'discussing their college '
                                          'experience, indicating possible '
                                          'nervousness.',
                   'spoken_content': "right now I'm in third year of my "
                                     'college',
                   '

In [50]:
judge_input_prompt = f"""
Overall multimodel analysis of the interviewee:
{result.model_dump()}
"""

In [53]:
result=await run_agent(
    agent=judge_agent,
    input_prompt=judge_input_prompt
)

20:37:14.833 agent run
20:37:14.836   chat llama-3.3-70b-versatile


In [55]:
pprint(result.model_dump())

{'areas_for_improvement': 'To improve interview presence, the candidate should '
                          'focus on managing cognitive load by taking '
                          'deliberate pauses before answering complex '
                          'questions. They should also work on maintaining a '
                          'consistent tone and volume, avoiding sudden drops '
                          'in volume that may suggest hesitation or secrecy. '
                          'Additionally, the candidate should strive to reduce '
                          'nervous behaviors such as jaw tension, rapid '
                          'blinking, and micro-expression smiles. Finally, '
                          'they should practice answering questions in a more '
                          'straightforward and confident manner, minimizing '
                          'the use of stalling words and phrases.',
 'behavioral_strengths': 'There were no significant moments of strong '
        