## Planning to build this Architecture

![My First Board](https://github.com/Ak62007/Multimodel-Reasoning/blob/main/AI/My%20First%20Board.jpg?raw=true)

In [1]:
import os
import sys
from pathlib import Path

# Resolve the project root as two directories above the current working
# directory and put it at the front of sys.path, so that the project packages
# imported further down (`src.*`, `AI.*`) can be found.
current_dir = Path.cwd()
project_root = current_dir.parent.parent
_root_entry = str(project_root)
if _root_entry not in sys.path:
    sys.path.insert(0, _root_entry)
    print("done")

done


In [2]:
import os
import logfire
import pandas as pd
import numpy as np
from rich.console import Console
from rich.markdown import Markdown
from pydantic_ai import Agent
from pydantic_ai.models.openai import OpenAIChatModel
from pydantic_ai.providers.openai import OpenAIProvider
from src.utils.save_data import load_df_parquet_safe
from src.utils.datamodels import (
    Blink,
    Smile,
    Gaze,
    Jaw,
    LoudnessState,
    PitchState,
    PitchStd,
    WPS,
    PausePercentageIncrease,
    FillerPercentageIncrease
) 
from AI.prompts import VISUAL_PROMPT

# Rich console; used later in the notebook to render the agent's reply as Markdown.
console = Console()

# Configure logfire and instrument pydantic_ai with it. The "agent run" /
# "chat ..." lines in later cell outputs presumably come from this instrumentation.
logfire.configure()
logfire.instrument_pydantic_ai()

In [3]:
# Smoke test: emit one templated log message to confirm logging is wired up.
logfire.info("hello, {place}!", place='World')

19:41:28.755 hello, World!


In [4]:
from dotenv import load_dotenv
# Load environment variables from a .env file. Presumably this is where
# OPENROUTER_API_KEY (read with os.getenv further down) comes from — confirm.
load_dotenv()

True

In [5]:
# Load the merged feature table from <project_root>/data/processed/merge.
# The loader takes the path as a string, hence the str() conversion.
path = project_root / "data" / "processed" / "merge" / "final_dataframe.parquet"
df = load_df_parquet_safe(path=str(path))

### OpenRouter

In [6]:
# OpenRouter credentials and endpoint. os.getenv returns None when the variable
# is unset, so a missing key is not detected at this point.
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1"

In [7]:
# Chat model reached through OpenRouter: the OpenAI provider is pointed at the
# OpenRouter base URL and authenticated with the OpenRouter API key.
model = OpenAIChatModel(
    model_name='deepseek/deepseek-r1',
    provider=OpenAIProvider(
        base_url=OPENROUTER_BASE_URL,
        api_key=OPENROUTER_API_KEY,
    ),
)

In [8]:
# Agent for the visual analysis: uses the model configured above with
# VISUAL_PROMPT (imported from AI.prompts) as its system prompt.
agent = Agent(
    model=model,
    system_prompt=VISUAL_PROMPT
)

In [9]:
async def main(input: str) -> str:
    """Run the module-level ``agent`` on ``input`` and return the run's ``output``.

    Args:
        input: The user prompt passed to ``agent.run``.

    Returns:
        The ``output`` attribute of the awaited run result.
    """
    return (await agent.run(input)).output

In [10]:
# Peek at the first rows of the loaded frame.
df.head()

Unnamed: 0,blinking_data,gaze_data,jaw_movement_data,smile_data,loudness_data,average_pitch_data,pitch_standard_deviation,words_per_sec,filler_words_usage,pauses_taken,Time,words,text_concat,speaker
0,"{'intensity': None, 'asymmetry': None, 'is_bli...","{'horizontal_deviation': None, 'vertical_devia...","{'open': 0.023826630786061287, 'lateral': None...","{'intensity': 0.00020072988331669704, 'asymmet...",,,,,,,0.0,"[We're, starting]",We're starting,
1,"{'intensity': None, 'asymmetry': None, 'is_bli...","{'horizontal_deviation': None, 'vertical_devia...","{'open': 0.01953260414302349, 'lateral': None,...","{'intensity': 0.00020793307342685806, 'asymmet...",,,,,,,0.5,"[now., [*]]",now. [*],A
2,"{'intensity': None, 'asymmetry': None, 'is_bli...","{'horizontal_deviation': None, 'vertical_devia...","{'open': 0.0008870773017406464, 'lateral': Non...","{'intensity': 6.213216343535776e-05, 'asymmetr...",,,,,,,1.0,,,A
3,"{'intensity': None, 'asymmetry': None, 'is_bli...","{'horizontal_deviation': None, 'vertical_devia...","{'open': 0.0015012087533250451, 'lateral': Non...","{'intensity': 4.647151249770331e-06, 'asymmetr...",,,,,,,1.5,"[So, welcome]",So welcome,A
4,"{'intensity': None, 'asymmetry': None, 'is_bli...","{'horizontal_deviation': None, 'vertical_devia...","{'open': 0.00043307070154696703, 'lateral': No...","{'intensity': 4.343396951611567e-05, 'asymmetr...",,,,,,,2.0,"[to, the]",to the,A


In [11]:
# (rows, columns) of the loaded frame.
df.shape

(1264, 14)

In [12]:
# Column names available in the frame.
df.columns

Index(['blinking_data', 'gaze_data', 'jaw_movement_data', 'smile_data',
       'loudness_data', 'average_pitch_data', 'pitch_standard_deviation',
       'words_per_sec', 'filler_words_usage', 'pauses_taken', 'Time', 'words',
       'text_concat', 'speaker'],
      dtype='object')

In [13]:
# Columns used as the visual signals for the first experiment.
# NOTE: this list is redefined in a later cell, where 'jaw_movement_data' is added.
visual_columns = ['blinking_data', 'gaze_data', 'smile_data']

In [14]:
# First rows restricted to the visual columns.
df[visual_columns].head()

Unnamed: 0,blinking_data,gaze_data,smile_data
0,"{'intensity': None, 'asymmetry': None, 'is_bli...","{'horizontal_deviation': None, 'vertical_devia...","{'intensity': 0.00020072988331669704, 'asymmet..."
1,"{'intensity': None, 'asymmetry': None, 'is_bli...","{'horizontal_deviation': None, 'vertical_devia...","{'intensity': 0.00020793307342685806, 'asymmet..."
2,"{'intensity': None, 'asymmetry': None, 'is_bli...","{'horizontal_deviation': None, 'vertical_devia...","{'intensity': 6.213216343535776e-05, 'asymmetr..."
3,"{'intensity': None, 'asymmetry': None, 'is_bli...","{'horizontal_deviation': None, 'vertical_devia...","{'intensity': 4.647151249770331e-06, 'asymmetr..."
4,"{'intensity': None, 'asymmetry': None, 'is_bli...","{'horizontal_deviation': None, 'vertical_devia...","{'intensity': 4.343396951611567e-05, 'asymmetr..."


In [15]:
# Inspect the raw cell stored for blinking in the first row.
df['blinking_data'].iloc[0]

{'intensity': None,
 'asymmetry': None,
 'is_blinking': True,
 'rz_score': 3.57146416776886,
 'is_anomalous': False,
 'continuous_anomaly': False,
 'part_of_anomalous_range': None}

In [16]:
# First row across all visual columns.
df[visual_columns].iloc[0]

blinking_data    {'intensity': None, 'asymmetry': None, 'is_bli...
gaze_data        {'horizontal_deviation': None, 'vertical_devia...
smile_data       {'intensity': 0.00020072988331669704, 'asymmet...
Name: 0, dtype: object

In [17]:
# Build typed model objects from one sample row (positional index 2) by
# unpacking each stored cell into the matching data model.
blink_data = Blink(**df['blinking_data'].iloc[2])
smile_data = Smile(**df['smile_data'].iloc[2])
gaze_data = Gaze(**df['gaze_data'].iloc[2])
# Timestamp of the same row, interpolated into the prompt in the next cell.
time = df['Time'].iloc[2]

In [18]:
# Build the user prompt for the visual agent from the sample row's readings.
# Each model is interpolated through its default string form (field=value pairs).
# Fix: the instruction verb was misspelled ("Analize") in the text sent to the LLM.
ip = f"""Analyze the following visual data for time: {time} sec in the video:  
blinking_data: {blink_data}
smiling_data: {smile_data}
gaze_data: {gaze_data}
"""

print(ip)

Analize the following visual data for time: 1.0 sec in the video:  
blinking_data: intensity=None asymmetry=None is_blinking=False rz_score=1.2975444582065774 is_anomalous=False continuous_anomaly=False part_of_anomalous_range=None
smiling_data: intensity=6.213216343535776e-05 asymmetry=None left_intensity=None right_intensity=None mouth_stretch=None is_smiling=False rz_score=1.0561262510548828 is_anomalous=False continuous_anomaly=False part_of_anomalous_range=None
gaze_data: horizontal_deviation=None vertical_deviation=None primary_direction='down' rz_score=1.1958070888359338 is_anomalous=False continuous_anomaly=False part_of_anomalous_range=None



In [19]:
# Top-level await: relies on the notebook running cells inside an event loop.
result = await main(input=ip)
# Render the model's reply as Markdown through the rich console.
console.print(Markdown(result))

17:32:38.529 agent run
17:32:38.545   chat deepseek/deepseek-r1


In [19]:
# Re-check the column names before looping over them.
df.columns

Index(['blinking_data', 'gaze_data', 'jaw_movement_data', 'smile_data',
       'loudness_data', 'average_pitch_data', 'pitch_standard_deviation',
       'words_per_sec', 'filler_words_usage', 'pauses_taken', 'Time', 'words',
       'text_concat'],
      dtype='object')

In [None]:
# Collect every reading that is flagged as part of a continuous anomaly,
# grouped by modality (visual / audio / vocabulary).
#
# Fixes relative to the first draft of this cell:
#   * the flag is read under its actual key, "continuous_anomaly" (the draft
#     used the misspelling "continuos_anomaly", which raises KeyError);
#   * cells without a reading are skipped instead of being subscripted;
#   * all ten feature columns are visited — the draft only iterated
#     `visual_columns`, so the audio and vocabulary lists could never be filled.
visual_cont_anom = []
audio_cont_anom = []
vocabulary_cont_anom = []

# column name -> (data model wrapping the cell, list that receives the instance)
cont_anom_routes = {
    # visual
    "blinking_data": (Blink, visual_cont_anom),
    "gaze_data": (Gaze, visual_cont_anom),
    "smile_data": (Smile, visual_cont_anom),
    "jaw_movement_data": (Jaw, visual_cont_anom),
    # audio
    "loudness_data": (LoudnessState, audio_cont_anom),
    "average_pitch_data": (PitchState, audio_cont_anom),
    "pitch_standard_deviation": (PitchStd, audio_cont_anom),
    # vocab
    "words_per_sec": (WPS, vocabulary_cont_anom),
    "filler_words_usage": (FillerPercentageIncrease, vocabulary_cont_anom),
    "pauses_taken": (PausePercentageIncrease, vocabulary_cont_anom),
}

for _, row in df.iterrows():
    for column, (model_cls, bucket) in cont_anom_routes.items():
        cell = row[column]
        # Some timesteps have no reading for a column; only dict cells can be parsed.
        if not isinstance(cell, dict):
            continue
        if cell.get("continuous_anomaly"):
            bucket.append(model_cls(**cell))

In [19]:
# Feature columns grouped by modality. One of these lists is passed as the
# `columns` argument of get_cont_anomaly (e.g. vocabulary_columns below).
visual_columns = ['blinking_data', 'gaze_data', 'smile_data', 'jaw_movement_data']
audio_columns = ['loudness_data', 'average_pitch_data', 'pitch_standard_deviation']
vocabulary_columns = ['words_per_sec', 'filler_words_usage', 'pauses_taken']

In [26]:
def get_cont_anomaly(
    df: pd.DataFrame,
    columns: list[str],
    start_time: float,
    time_range_in_secs: float,
    row_interval_secs: float = 0.5,
) -> list:
    """Collect the readings flagged as continuous anomalies inside a time window.

    The window is selected positionally: ``start_time`` and
    ``time_range_in_secs`` are converted to row offsets by dividing by
    ``row_interval_secs``, so this assumes rows are evenly spaced and that row 0
    corresponds to time 0 (as in the frame shown in this notebook — confirm for
    other inputs).

    Args:
        df: Frame whose ``columns`` hold one dict per row (or a missing value).
        columns: Feature columns to scan. Columns without a known data model
            are ignored; a column absent from ``df`` raises ``KeyError``.
        start_time: Start of the window in seconds.
        time_range_in_secs: Length of the window in seconds.
        row_interval_secs: Seconds between consecutive rows. Defaults to 0.5,
            the spacing previously hard-coded here.

    Returns:
        Data-model instances (``Blink``, ``WPS``, ...) in row order, then in the
        order of ``columns``, for every cell whose ``"continuous_anomaly"``
        entry is truthy.

    Raises:
        ValueError: If ``row_interval_secs`` is not positive, or if
            ``start_time`` / ``time_range_in_secs`` is negative (a negative
            offset would silently slice from the end of the frame).
    """
    if row_interval_secs <= 0:
        raise ValueError("row_interval_secs must be positive")
    if start_time < 0 or time_range_in_secs < 0:
        raise ValueError("start_time and time_range_in_secs must be non-negative")

    # Built per call so the model classes are resolved when the function runs.
    model_for_column = {
        # visual
        "blinking_data": Blink,
        "gaze_data": Gaze,
        "smile_data": Smile,
        "jaw_movement_data": Jaw,
        # audio
        "loudness_data": LoudnessState,
        "average_pitch_data": PitchState,
        "pitch_standard_deviation": PitchStd,
        # vocab
        "words_per_sec": WPS,
        "filler_words_usage": FillerPercentageIncrease,
        "pauses_taken": PausePercentageIncrease,
    }

    # round() absorbs floating-point noise (e.g. 0.3 / 0.1) before truncating;
    # for the default 0.5 s spacing this gives the same offsets as int(x / 0.5).
    first_row = int(round(start_time / row_interval_secs, 9))
    row_count = int(round(time_range_in_secs / row_interval_secs, 9))

    # The slice already limits the window, so no extra row counter is needed.
    window_df = df.iloc[first_row: first_row + row_count]

    cont_anom_series = []
    for _, row in window_df.iterrows():
        for column in columns:
            cell = row[column]
            model_cls = model_for_column.get(column)
            # Skip unknown columns and missing readings (None / NaN).
            if model_cls is None or not isinstance(cell, dict):
                continue
            if cell.get("continuous_anomaly"):
                cont_anom_series.append(model_cls(**cell))

    return cont_anom_series

In [27]:
# Continuous anomalies in the vocabulary columns for the 100-second window
# that starts at the 20-second mark.
get_cont_anomaly(
    df=df,
    columns=vocabulary_columns,
    time_range_in_secs=100,
    start_time=20,
)

[WPS(speaking_rate='slow', rz_score=-1.7754858655641423, is_anomalous=True, continuous_anomaly=True, part_of_anomalous_range=[116.0, 119.0]),
 PausePercentageIncrease(pause_percentage_level='abnormally high', is_anomalous=True, continuous_anomaly=True, part_of_anomalous_range=[116.0, 118.0]),
 PausePercentageIncrease(pause_percentage_level='abnormally high', is_anomalous=True, continuous_anomaly=True, part_of_anomalous_range=[116.0, 118.0]),
 WPS(speaking_rate='slow', rz_score=-2.3231221052467834, is_anomalous=True, continuous_anomaly=True, part_of_anomalous_range=[116.0, 119.0]),
 WPS(speaking_rate='fast', rz_score=2.06601146843359, is_anomalous=True, continuous_anomaly=True, part_of_anomalous_range=[229.0, 230.0, 231.0, 232.0]),
 WPS(speaking_rate='fast', rz_score=2.0077234208065966, is_anomalous=True, continuous_anomaly=True, part_of_anomalous_range=[229.0, 230.0, 231.0, 232.0]),
 WPS(speaking_rate='fast', rz_score=2.692078641179067, is_anomalous=True, continuous_anomaly=True, part_

In [29]:
# Column names, for reference while writing the next check.
df.columns

Index(['blinking_data', 'gaze_data', 'jaw_movement_data', 'smile_data',
       'loudness_data', 'average_pitch_data', 'pitch_standard_deviation',
       'words_per_sec', 'filler_words_usage', 'pauses_taken', 'Time', 'words',
       'text_concat'],
      dtype='object')

In [63]:
# Print every filler-word reading that is flagged as part of a continuous anomaly.
for filler_cell in df['filler_words_usage']:
    # Rows without a filler-word reading hold None; there is nothing to parse.
    if filler_cell is None:
        continue
    pp = FillerPercentageIncrease(**filler_cell)
    if pp.continuous_anomaly:
        print(pp)

filler_percentage_level='abnormally high' is_anomalous=True continuous_anomaly=True part_of_anomalous_range=[245.0, 247.0]
filler_percentage_level='abnormally high' is_anomalous=True continuous_anomaly=True part_of_anomalous_range=[245.0, 247.0]
filler_percentage_level='abnormally high' is_anomalous=True continuous_anomaly=True part_of_anomalous_range=[432.0, 434.0]
filler_percentage_level='abnormally high' is_anomalous=True continuous_anomaly=True part_of_anomalous_range=[432.0, 434.0]
filler_percentage_level='abnormally high' is_anomalous=True continuous_anomaly=True part_of_anomalous_range=[469.0, 472.0]
filler_percentage_level='abnormally high' is_anomalous=True continuous_anomaly=True part_of_anomalous_range=[469.0, 472.0]
filler_percentage_level='abnormally high' is_anomalous=True continuous_anomaly=True part_of_anomalous_range=[489.0, 491.0]
filler_percentage_level='abnormally high' is_anomalous=True continuous_anomaly=True part_of_anomalous_range=[489.0, 491.0]
filler_percentag

In [73]:
# First ten transcript fragments.
df['text_concat'].iloc[0:10]

0        We're starting
1              now. [*]
2                      
3            So welcome
4                to the
5            interview.
6                 Let's
7            begin with
8    a simple question.
9                      
Name: text_concat, dtype: object

## Working on two more simple helper functions.

In [28]:
from src.sync.Feature_Transformation import get_speaker_timings    

In [30]:
# Confirm the 'Time' and 'speaker' columns needed below are present.
df.columns

Index(['blinking_data', 'gaze_data', 'jaw_movement_data', 'smile_data',
       'loudness_data', 'average_pitch_data', 'pitch_standard_deviation',
       'words_per_sec', 'filler_words_usage', 'pauses_taken', 'Time', 'words',
       'text_concat', 'speaker'],
      dtype='object')

In [46]:
# Time spans attributed to speaker 'B'.
b = get_speaker_timings(speaker_times=df[['Time', 'speaker']], speaker='B')

In [47]:
# Time spans attributed to speaker 'A'.
a = get_speaker_timings(speaker_times=df[['Time', 'speaker']], speaker='A')

In [48]:
# Speaker 'A' spans as (start, end) pairs.
a

[(0.5, 11.5), (140.0, 163.5), (288.5, 318.0), (604.0, 631.5)]

In [49]:
# Speaker 'B' spans as (start, end) pairs.
b

[(11.5, 140.0), (163.5, 288.5), (318.0, 604.0)]

In [67]:
def g_get_speaker_timings(df: pd.DataFrame, start: float, end: float) -> list:
    """Return the labelled speaker segments that overlap the window [start, end].

    Speaker 'A' is labelled "Interviewer" and speaker 'B' "Interviewee". Each
    segment is treated as half-open, ``[seg_start, seg_end)``: a segment is
    returned when ``seg_start <= end`` and ``seg_end > start``.

    When both ``start`` and ``end`` fall inside some segment this yields the
    run of segments from the one containing ``start`` through the one containing
    ``end``. Unlike an index search that falls back to position 0, it also gives
    a sensible answer when an endpoint lies outside every segment (for example
    ``end`` at or beyond the last boundary).

    Args:
        df: Frame handed to ``get_speaker_timings`` as ``speaker_times``
            (the notebook passes the 'Time' and 'speaker' columns).
        start: Window start in seconds.
        end: Window end in seconds.

    Returns:
        A list of ``((seg_start, seg_end), label)`` tuples ordered by
        ``seg_start``; empty when nothing overlaps the window.
    """
    interviewer_timings = get_speaker_timings(
        speaker_times=df,
        speaker='A'
    )
    interviewee_timings = get_speaker_timings(
        speaker_times=df,
        speaker='B'
    )

    labelled_segments = [(span, "Interviewee") for span in interviewee_timings]
    labelled_segments += [(span, "Interviewer") for span in interviewer_timings]
    # Stable sort on the segment start keeps a deterministic chronological order.
    labelled_segments.sort(key=lambda item: item[0][0])

    return [
        item
        for item in labelled_segments
        if item[0][0] <= end and item[0][1] > start
    ]

In [68]:
# Speaker segments touched by the 30 s – 200 s window.
g_get_speaker_timings(
    df=df[['Time', 'speaker']],
    start = 30,
    end = 200
)

[((11.5, 140.0), 'Interviewee'),
 ((140.0, 163.5), 'Interviewer'),
 ((163.5, 288.5), 'Interviewee')]

In [69]:
# Column names, to locate the transcript text column.
df.columns

Index(['blinking_data', 'gaze_data', 'jaw_movement_data', 'smile_data',
       'loudness_data', 'average_pitch_data', 'pitch_standard_deviation',
       'words_per_sec', 'filler_words_usage', 'pauses_taken', 'Time', 'words',
       'text_concat', 'speaker'],
      dtype='object')

In [70]:
# Transcript fragments, one per row.
df["text_concat"]

0               We're starting
1                     now. [*]
2                             
3                   So welcome
4                       to the
                 ...          
1259                        So
1260                     yeah,
1261                  that's a
1262    really solid approach.
1263                          
Name: text_concat, Length: 1264, dtype: object

In [88]:
def get_transcript(df:pd.DataFrame, start: float, end: float):
    """Build a transcript keyed by speaker segment for the window [start, end].

    The segments come from ``g_get_speaker_timings`` (called with the 'Time' and
    'speaker' columns of ``df``). For each returned segment, the ``text_concat``
    values of all rows with ``seg_start <= Time < seg_end`` are joined with a
    single space. Whole segments are used: the text is not clipped to
    ``start``/``end``.

    Args:
        df: Frame with 'Time', 'speaker' and 'text_concat' columns.
        start: Window start in seconds.
        end: Window end in seconds.

    Returns:
        Dict mapping each ``(seg_start, seg_end)`` tuple to its joined text.
    """
    segments = g_get_speaker_timings(
        df=df[['Time', 'speaker']],
        start=start,
        end=end,
    )

    def _text_between(seg_start, seg_end):
        # Half-open interval: the row at seg_end belongs to the next segment.
        in_segment = (df["Time"] >= seg_start) & (df["Time"] < seg_end)
        return " ".join(df.loc[in_segment, "text_concat"].values)

    return {
        segment[0]: _text_between(segment[0][0], segment[0][1])
        for segment in segments
    }

In [89]:
# Manual check: join every fragment with 11.5 <= Time < 140.0, i.e. the first
# speaker-'B' span listed earlier.
" ".join(df.loc[(df["Time"] >= 11.5) & (df["Time"] < 140.0), "text_concat"].values)

"[*]  right now I'm in third year of my [*] college.  I mean, I'm going to be in the sixth semester,  but [*] yeah, [*]  from [*] the start of my [*] college, I think from my second semester towards [*] the end [*] of it, I got interested in [*]  like [*] machine learning. I [*] got to know about [*] machine learning [*] from some [*] channels, from YouTube channels and all. [*] And I thought, yeah, it is an interesting field.  I got to know [*] some things about [*] it. I took [*] a course, a very famous course from Andrew Ng, [*] which is ML [*] specialization course,  which is on course era. So in the [*] break, in the summer break, which I've got in the college, I actually completed that course. [*] And actually I was very intrigued [*] by [*]  how   like we do all this stuff, what we do in the [*] like machine learning field, right? So [*] it was very interesting. [*] And [*]  like [*]  after that, I was just doing some courses, then I got into deep learning.  I then got into some

In [90]:
# Transcript for the speaker segments touched by the 30 s – 200 s window.
get_transcript(
    df=df[['Time', 'text_concat', 'speaker']],
    start=30,
    end=200,
)

{(11.5,
  140.0): "[*]  right now I'm in third year of my [*] college.  I mean, I'm going to be in the sixth semester,  but [*] yeah, [*]  from [*] the start of my [*] college, I think from my second semester towards [*] the end [*] of it, I got interested in [*]  like [*] machine learning. I [*] got to know about [*] machine learning [*] from some [*] channels, from YouTube channels and all. [*] And I thought, yeah, it is an interesting field.  I got to know [*] some things about [*] it. I took [*] a course, a very famous course from Andrew Ng, [*] which is ML [*] specialization course,  which is on course era. So in the [*] break, in the summer break, which I've got in the college, I actually completed that course. [*] And actually I was very intrigued [*] by [*]  how   like we do all this stuff, what we do in the [*] like machine learning field, right? So [*] it was very interesting. [*] And [*]  like [*]  after that, I was just doing some courses, then I got into deep learning.  I 