# Prior to starting the application, you need to obtain your free Groq API key from https://console.groq.com/keys. ⛔

**Replace the key at this point in the code below ⬇️**

GROQ_API_KEY_USER = "abcdefghijkl_1234567890"  # Replace with your Groq API key

In [None]:
# Groq API key used by the later cells to create the Groq client and the ChatGroq LLM.
# SECURITY: this value is a credential. Keep only a placeholder here when sharing or committing the notebook.
GROQ_API_KEY_USER = "gsk_djt1FIWJ6TjVrSamnbkOWGdyb3FYIPajsahfjshlfd"  # Replace with your Groq API key

# Set the scoring criteria. Set exactly one of the options below to True (the one you want to use) and leave the other two as False.

In [None]:
# Visibility Settings — set exactly one to True
# These three flags are passed by main() to generate_report(), which forwards them to
# analyze_video(); analyze_video() raises ValueError unless exactly one of them is True.
only_hands_visible = False  # body-language score uses gesture frequency only
both_hands_and_legs_visible = False  # body-language score uses posture penalty plus gesture bonus
no_hands_no_legs_visible = True  # same posture+gesture formula, reported as "Dynamic baseline"



---



# To run the application, open the Runtime menu in the navigation bar above and click the Run All option. The code will then run automatically.

# Sometimes an error occurs while the Stage 2 code is running. Do not worry: simply go to Runtime > Restart Session > Run All. Everything will start again, and this time the error will not appear. It is caused by some installation defaults in the code and needs no action on your part.

**NOTE - Make sure you have added your API key above before doing this step.**

**For any concerns write to info@alphaai.biz**






---



# Stage 1 - The dependencies will install automatically. Do not terminate the session, close the browser tab, or interrupt the execution in any way.

In [None]:
# Uninstall conflicting packages
# (removes any copies already present in the runtime before the pinned versions below are installed)
!pip uninstall -y numpy pandas mediapipe librosa speechrecognition opencv-python ffmpeg-python langchain-groq

# Install compatible versions
# Every package is pinned to an exact version and installed one at a time, in this order.
!pip install numpy==1.26.4
!pip install mediapipe==0.10.14
!pip install pandas==2.2.2
!pip install librosa==0.10.2
!pip install speechrecognition==3.10.4
!pip install opencv-python==4.10.0.84
!pip install ffmpeg-python==0.2.0
!pip install langchain-groq==0.3.0

# Install Whisper for speech recognition
# NOTE(review): the Stage 2 code in this notebook transcribes through the Groq API
# (model 'whisper-large-v3') and does not import `whisper`; confirm this local install is still needed.
!pip install openai-whisper==20231117

# If the above installation is successful, there will be a number enclosed in square brackets, for example [1] or [2].



---



# Stage 2 - Here the code will run automatically and ask you to upload your video file for analysis.

In [None]:
# Print the installed groq and httpx versions.
import groq
import httpx
print(f"groq: {groq.__version__}, httpx: {httpx.__version__}")

In [None]:
# Create a Groq client from the key entered in GROQ_API_KEY_USER.
# Note: `client` is created again from the same key in the main Stage 2 cell below.
from groq import Groq
client = Groq(api_key=GROQ_API_KEY_USER)
print("Groq client initialized!")

**NOTE - If you do not wish to work with score ranges, refer to the code below. Nothing too technical: just look for the System Prompt and Report Generation sections and follow the Comment/Uncomment instructions there.**

In [None]:
import cv2
import mediapipe as mp
import ffmpeg
import librosa
import numpy as np
import os
from groq import Groq
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from google.colab import files

# Module-level setup: builds the three shared objects used by the functions below
# (`holistic` for video analysis, `client` for transcription, `llm` for scoring/reporting).
#
# On failure each step prints a message and aborts with `raise SystemExit(1)`.
# `exit()` is deliberately not used: it is a helper installed by the `site` module for
# interactive sessions and is not meant for programs. NOTE(review): IPython-based kernels
# replace `exit` with their own object, so its effect inside a notebook cell should not be
# relied on; raising SystemExit stops the cell at the failing step.

# Initialize MediaPipe Holistic
try:
    mp_holistic = mp.solutions.holistic
    holistic = mp_holistic.Holistic(
        static_image_mode=False,
        min_detection_confidence=0.5,
        min_tracking_confidence=0.5
    )
except Exception as e:
    print(f"Failed to initialize MediaPipe: {e}")
    raise SystemExit(1) from e

# Initialize Groq client
GROQ_API_KEY = GROQ_API_KEY_USER
try:
    client = Groq(api_key=GROQ_API_KEY)
except Exception as e:
    print(f"Failed to initialize Groq client: {e}")
    raise SystemExit(1) from e

# Initialize Groq LLM for report generation
try:
    llm = ChatGroq(
        model="llama-3.3-70b-versatile",
        temperature=0.7,
        max_tokens=2000,
        api_key=GROQ_API_KEY
    )
except Exception as e:
    print(f"Failed to initialize Groq LLM: {e}")
    raise SystemExit(1) from e
### UNCOMMENT THE BELOW PROMPT IF YOU WANT SINGULAR VALUES INSTEAD OF RANGES ###

# # System prompt for three-flag logic
# prompt = ChatPromptTemplate.from_messages([
#     ("system", """
# You are an expert in evaluating public speaking skills for management students, using a scoring system aligned with uSpeek's criteria (all scores out of 5). Given data on body language, facial expressions, speech transcript, audio characteristics, and component scores, generate a detailed report. The report must:

# - **Body Language**: Evaluate posture, gestures, engagement according to flags:
#   - `no_hands_no_legs_visible=True`: compute dynamic baseline by applying posture+gesture formula but skip visibility penalties.
#   - `only_hands_visible=True`: score purely on gesture frequency (0–5) minus engagement penalty (0.5 if freq < 0.5 else 0.2).
#   - `both_hands_and_legs_visible=True`: apply full formula: baseline 3.5 minus posture penalty plus gesture bonus minus engagement penalty.
#   Target ~{body_language_target}/5.

# - **Facial Expressions**: Score 2.0/5 for smiling ratio <10%, 3.0/5 for ≥10%.

# - **Speech Quality**: Analyze modulation (cap at 4.0), pitch (~300 Hz) using YIN, volume (~60 dB). Penalize deviations: pitch penalty abs(avg_pitch-300)/500, volume penalty abs(avg_volume-60)/30. Target ~{speech_quality_target}/5.

# - **Content Quality**: Evaluate clarity, relevance, impact. Penalize filler words (ratio >5%, reduce by 0.15/5). Target ~{content_quality_target}/5.

# - **Final Score**: Average component scores, targeting ~{final_score_target}/5. Scale: 1 = Many areas to improve, …, 5 = Super Star.

# - **Recommendations**: Suggest improvements for each component.

# Use a professional tone and clear structure.
# """),
#     ("human", """
# Body Language: {body_language}
# Facial Expressions: {facial_expressions}
# Speech Transcript: {transcript}
# Audio Characteristics: Pitch std: {pitch_std}, Volume std: {volume_std}, Avg pitch: {avg_pitch} Hz, Avg volume: {avg_volume} dB
# Filler Words: {filler_words}
# Pet Words: {pet_words}
# Preliminary Scores:
# - Body Language: {body_language_score}/5
# - Facial Expressions: {facial_expressions_score}/5
# - Speech Quality: {speech_quality_score}/5
# - Content Quality: {content_quality_score}/5
# """),
# ])

### COMMENT THE BELOW PROMPT IF YOU WANT TO USE SINGLE VALUES FOR THE SCORE INSTEAD OF RANGES ###
# Report prompt (range variant). The names in curly braces are template variables that
# generate_report() supplies through `(prompt | llm).invoke({...})`.
# This variant does not reference {final_score_target}; the commented-out single-value
# variant above does.
prompt = ChatPromptTemplate.from_messages([
    ("system", """
You are an expert in evaluating public speaking skills for management students, using a scoring system aligned with uSpeek's criteria (all scores out of 5). Given data on body language, facial expressions, speech transcript, audio characteristics, and component scores, generate a detailed report. The report must:

- **Body Language**: Evaluate posture, gestures, engagement according to flags:
  - `no_hands_no_legs_visible=True`: compute dynamic baseline by applying posture+gesture formula but skip visibility penalties.
  - `only_hands_visible=True`: score purely on gesture frequency (0–5) minus engagement penalty (0.5 if freq < 0.5 else 0.2).
  - `both_hands_and_legs_visible=True`: apply full formula: baseline 3.5 minus posture penalty plus gesture bonus minus engagement penalty.
  Target ~{body_language_target}/5.

- **Facial Expressions**: Score 2.0/5 for smiling ratio <10%, 3.0/5 for ≥10%.

- **Speech Quality**: Analyze modulation (cap at 4.0), pitch (~300 Hz) using YIN, volume (~60 dB). Penalize deviations: pitch penalty abs(avg_pitch-300)/500, volume penalty abs(avg_volume-60)/30. Target ~{speech_quality_target}/5.

- **Content Quality**: Evaluate clarity, relevance, impact. Penalize filler words (ratio >5%, reduce by 0.15/5). Target ~{content_quality_target}/5.

- **Final Score**: Compute a single score by averaging components, then present a range of +/- 0.2 around that score (clamped to [1,5]).

- **Recommendations**: Suggest improvements for each component.

Use a professional tone and clear structure.
"""),
    ("human", """
Body Language: {body_language}
Facial Expressions: {facial_expressions}
Speech Transcript: {transcript}
Audio Characteristics: Pitch std: {pitch_std}, Volume std: {volume_std}, Avg pitch: {avg_pitch} Hz, Avg volume: {avg_volume} dB
Filler Words: {filler_words}
Pet Words: {pet_words}
Preliminary Scores:
- Body Language: {body_language_score}/5
- Facial Expressions: {facial_expressions_score}/5
- Speech Quality: {speech_quality_score}/5
- Content Quality: {content_quality_score}/5
"""),
])

# Audio extraction

def extract_audio(video_path, audio_path):
    """Write the audio track of ``video_path`` to ``audio_path`` using ffmpeg.

    The output is requested as 16-bit PCM (``pcm_s16le``), mono, sampled at
    16000 Hz. An existing file at ``audio_path`` is overwritten, and ffmpeg's
    stdout/stderr are captured instead of being printed.
    """
    output_options = {'acodec': 'pcm_s16le', 'ar': 16000, 'ac': 1}
    job = ffmpeg.output(ffmpeg.input(video_path), audio_path, **output_options)
    ffmpeg.run(job, overwrite_output=True, capture_stdout=True, capture_stderr=True)

# Transcription

def transcribe_audio(audio_path):
    """Transcribe the audio file at ``audio_path`` through the Groq API.

    The file is read fully into memory and sent to the ``whisper-large-v3``
    model (English, temperature 0.0, JSON response).

    Returns:
        The transcript text, or ``''`` if reading the file or the API call
        fails. Transcription is best-effort: the failure is printed and the
        caller continues with an empty transcript.
    """
    try:
        with open(audio_path, 'rb') as f:
            res = client.audio.transcriptions.create(
                file=(os.path.basename(audio_path), f.read()),
                model='whisper-large-v3', response_format='json', language='en', temperature=0.0
            )
        return res.text
    except Exception as e:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt/SystemExit still propagate, and report the cause
        # instead of silently returning an empty transcript.
        print(f"Transcription failed, continuing with an empty transcript: {e}")
        return ''

# Filler & pet words

def analyze_filler_pet_words(transcript):
    """Count filler words and pet words in a transcript.

    The transcript is lower-cased and split on whitespace. Leading and
    trailing punctuation is stripped from every token so that "Um," or
    "like." are counted as "um" / "like"; tokens that consist only of
    punctuation are dropped.

    Args:
        transcript: The transcript text.

    Returns:
        A tuple ``(filler_counts, pet_counts, filler_ratio)`` where the two
        dicts map each word that occurs at least once to its count, and
        ``filler_ratio`` is the number of filler occurrences divided by the
        number of words (``0`` for an empty transcript).
    """
    from collections import Counter
    import string

    stripped = (token.strip(string.punctuation) for token in transcript.lower().split())
    words = [token for token in stripped if token]
    fillers = ['and', 'that', 'really', 'now', 'just', 'um', 'uh', 'like']
    pets = ['i', 'to', 'the', 'of']

    # One pass over the words instead of a list.count() scan per candidate.
    counts = Counter(words)
    fcnt = {w: counts[w] for w in fillers if counts[w] > 0}
    pcnt = {w: counts[w] for w in pets if counts[w] > 0}

    total = len(words)
    fr = sum(fcnt.values()) / total if total else 0
    return fcnt, pcnt, fr

# Audio analysis

def analyze_audio(audio_path):
    """Compute pitch/volume statistics and a speech score for an audio file.

    Returns:
        A tuple ``(pitch_std, volume_std, avg_pitch, avg_volume, speech_score)``:

        * ``avg_pitch`` / ``pitch_std``: median and standard deviation of the
          positive YIN pitch estimates (search range 80-500); 300.0 / 0.0
          when there are none.
        * ``avg_volume``: ``20 * log10(mean RMS + 1e-10) + 60``.
        * ``volume_std``: standard deviation of the per-frame RMS values.
        * ``speech_score``: modulation score (capped at 4.0) minus the pitch
          and volume penalties, clamped to the range 1..5.
    """
    signal, sample_rate = librosa.load(audio_path)

    # Pitch statistics are taken over the positive estimates only.
    pitch_track = librosa.yin(signal, fmin=80, fmax=500, sr=sample_rate)
    positive_mask = pitch_track > 0
    if np.any(positive_mask):
        positive_pitch = pitch_track[positive_mask]
        avg_pitch = float(np.median(positive_pitch))
        pitch_std = float(np.std(positive_pitch))
    else:
        avg_pitch = 300.0
        pitch_std = 0.0

    frame_rms = librosa.feature.rms(y=signal)[0]
    avg_volume = float(20 * np.log10(np.mean(frame_rms) + 1e-10) + 60)
    volume_std = float(np.std(frame_rms))

    # Reward variation in pitch and volume, then penalise distance from the
    # reference values (300 for pitch, 60 for volume).
    modulation = min(4.0, (pitch_std / 100 + volume_std / 0.01) * 0.8)
    pitch_penalty = abs(avg_pitch - 300) / 500
    volume_penalty = abs(avg_volume - 60) / 30
    speech_score = min(5, max(1, modulation - pitch_penalty - volume_penalty))

    return pitch_std, volume_std, avg_pitch, avg_volume, speech_score

# Content analysis

def analyze_content(transcript, fr):
    """Score the content quality of a transcript on a 1-5 scale.

    The LLM is asked for a single numeric score for clarity, relevance and
    impact. The filler-word penalty is applied here in code only, so it is
    not also requested from the model.

    Args:
        transcript: The speech transcript to evaluate.
        fr: Filler-word ratio (filler occurrences / total words).

    Returns:
        The score clamped to the range 1..5. If no number can be found in the
        model's reply, a base score of 3.5 is used before the penalty.
    """
    import re

    p = ChatPromptTemplate.from_messages([
        ('system',
         'Evaluate the clarity, relevance and impact of the speech transcript on a scale of 1 to 5. '
         'Reply with only the numeric score (for example 3.5) and nothing else. '
         'Do not adjust for filler words; that penalty is applied separately.'),
        ('human', '{transcript}')
    ])
    resp = (p | llm).invoke({'transcript': transcript})

    # Take the first number in the reply so answers such as "Score: 4/5"
    # still parse; fall back to 3.5 only when the reply has no number at all.
    match = re.search(r'\d+(?:\.\d+)?', str(resp.content))
    base = float(match.group()) if match else 3.5

    penalty = 0.15 if fr > 0.05 else 0
    return min(5, max(1, base - penalty))

# Video analysis

def analyze_video(video_path, only_hands_visible=False, both_hands_and_legs_visible=False, no_hands_no_legs_visible=False):
    """Score body language and facial expressions from a video file.

    Every frame is run through the module-level MediaPipe ``holistic`` model.
    For frames with pose landmarks, the vertical offset between the two
    shoulders is recorded, and the frame counts as a gesture frame if either
    hand is detected. For frames with face landmarks, the frame is labelled
    'Smiling' when the distance between face landmarks 61 and 291 exceeds
    0.05, otherwise 'Neutral'.

    Args:
        video_path: Path of the video to analyse.
        only_hands_visible: Score body language from gesture frequency only.
        both_hands_and_legs_visible: Score from posture penalty plus gesture bonus.
        no_hands_no_legs_visible: Same formula as the full-body case, with a
            "Dynamic baseline" description.

    Returns:
        ``(body_description, face_description, body_score, face_score)``.
        ``body_score`` is clamped to 1..5 and rounded to one decimal;
        ``face_score`` is 3.0 when at least 10% of face frames are 'Smiling',
        otherwise 2.0 (also 2.0 when no face was detected).

    Raises:
        ValueError: If not exactly one visibility flag is True, or if the
            video cannot be opened.
    """
    flags = [only_hands_visible, both_hands_and_legs_visible, no_hands_no_legs_visible]
    if flags.count(True) != 1:
        raise ValueError('Set exactly one visibility flag to True')

    frame_count = 0
    gesture_frames = 0
    shoulder_offsets = []
    expressions = []

    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            raise ValueError('Cannot open video')
        while True:
            ret, frm = cap.read()
            if not ret:
                break
            rgb = cv2.cvtColor(frm, cv2.COLOR_BGR2RGB)
            res = holistic.process(rgb)
            if res.pose_landmarks:
                ls = res.pose_landmarks.landmark[mp_holistic.PoseLandmark.LEFT_SHOULDER]
                rs = res.pose_landmarks.landmark[mp_holistic.PoseLandmark.RIGHT_SHOULDER]
                shoulder_offsets.append(abs(ls.y - rs.y))
                # Hands only count in frames where a pose was detected.
                if res.left_hand_landmarks or res.right_hand_landmarks:
                    gesture_frames += 1
            if res.face_landmarks:
                # NOTE(review): 61 and 291 are presumably the mouth corners —
                # confirm against the face-mesh landmark map.
                m1 = res.face_landmarks.landmark[61]
                m2 = res.face_landmarks.landmark[291]
                mouth_width = np.hypot(m2.x - m1.x, m2.y - m1.y)
                expressions.append('Smiling' if mouth_width > 0.05 else 'Neutral')
            frame_count += 1
    finally:
        # Always free the capture handle, even if frame processing raises.
        cap.release()

    gf = gesture_frames / frame_count if frame_count else 0
    eng_pen = 0.5 if gf < 0.5 else 0.2
    avg_post = np.mean(shoulder_offsets) if shoulder_offsets else 0
    post_pen = (avg_post / 0.1) * 0.3

    if no_hands_no_legs_visible:
        raw = 3.5 - post_pen + gf * 2 - eng_pen
        desc = f'Dynamic baseline: posture avg={avg_post:.3f}, gesture freq={gf:.3f}'
    elif only_hands_visible:
        raw = gf * 5 - eng_pen
        desc = f'Gestures only (freq={gf:.3f})'
    else:
        raw = 3.5 - post_pen + gf * 2 - eng_pen
        desc = f'Full-body: posture avg={avg_post:.3f}, gesture freq={gf:.3f}'
    body_sc = round(min(5, max(1, raw)), 1)

    if expressions:
        smile_ratio = expressions.count('Smiling') / len(expressions)
        fe_sc = 2.0 if smile_ratio < 0.1 else 3.0
        fe_desc = f'Smiling {smile_ratio*100:.1f}%'
    else:
        fe_sc = 2.0
        fe_desc = 'No facial data'
    return desc, fe_desc, body_sc, fe_sc

# Report generation

def generate_report(video_path,output_path,only_hands_visible=False,both_hands_and_legs_visible=False,no_hands_no_legs_visible=False):
    """Run the full evaluation pipeline for one video and write the report.

    Steps: extract the audio to a temporary ``temp.wav``, transcribe it,
    compute audio statistics, count filler/pet words, score the content with
    the LLM, analyse the video, then ask the LLM for the written report and
    append the final score (as a +/-0.2 range by default).

    Args:
        video_path: Path of the video to evaluate.
        output_path: Path of the text file the report is written to.
        only_hands_visible, both_hands_and_legs_visible, no_hands_no_legs_visible:
            Visibility flags forwarded to ``analyze_video``; exactly one must be True.

    Returns:
        The report text that was written to ``output_path``.

    The temporary ``temp.wav`` is removed even when a step fails.
    """
    ap='temp.wav'
    try:
        extract_audio(video_path,ap)
        tr=transcribe_audio(ap)
        ps,vs,ap_pitch,av,ss=analyze_audio(ap)
        fcnt,pcnt,fr=analyze_filler_pet_words(tr)
        cs=analyze_content(tr,fr)
        bl_desc,fe_desc,bl_sc,fe_sc=analyze_video(video_path,only_hands_visible,both_hands_and_legs_visible,no_hands_no_legs_visible)

        # Mean of the four component scores, computed once. `final` feeds the
        # single-value report line, `avg_score` the range; both names are kept
        # so either report line below can be enabled.
        final=avg_score=np.mean([bl_sc,fe_sc,ss,cs])

        # Targets
        blt=2.5 if ps<800 else 3.7
        sqt=3.0 if ps<800 else 3.5
        cqt=3.2 if fr>0.05 else 3.5
        fst=2.9 if ps<800 else 3.4

        # Score range: +/-0.2 around the mean, clamped to 1..5.
        low  = round(max(1, avg_score - 0.2), 1)
        high = round(min(5, avg_score + 0.2), 1)

        resp=(prompt|llm).invoke({
            'body_language':bl_desc,
            'facial_expressions':fe_desc,
            'transcript':tr,
            'pitch_std':ps,
            'volume_std':vs,
            'avg_pitch':ap_pitch,
            'avg_volume':av,
            'filler_words':str(fcnt),
            'pet_words':str(pcnt),
            'body_language_score':bl_sc,
            'facial_expressions_score':fe_sc,
            'speech_quality_score':ss,
            'content_quality_score':cs,
            'body_language_target':blt,
            'speech_quality_target':sqt,
            'content_quality_target':cqt,
            'final_score_target':fst
        })
        ### UNCOMMENT THE BELOW LINE TO ENABLE SINGLE VALUE SCORES ###
        # report=f"{resp.content}\n\n**Final Score**: {round(final,1)}/5"

        ### COMMENT THE BELOW LINE TO ENABLE SINGLE VALUE SCORES ###
        report=f"{resp.content}\n\n**Final Score Range**: {low}-{high}/5"

        # Explicit UTF-8: the LLM text may contain non-ASCII characters, and
        # the platform default encoding is not guaranteed to handle them.
        with open(output_path,'w',encoding='utf-8') as f:
            f.write(report)
        return report
    finally:
        if os.path.exists(ap):
            os.remove(ap)

# Main

def main():
    """Prompt for a video upload, generate its report, print it and download it.

    Uses the first uploaded file, writes the report to
    ``evaluation_report.txt`` and reads the three module-level visibility
    flags. Prints 'No file' and returns if nothing was uploaded.
    """
    print('Upload your video')
    uploaded = files.upload()
    if not uploaded:
        print('No file')
        return

    video_name = next(iter(uploaded.keys()))
    report_path = 'evaluation_report.txt'
    report_text = generate_report(
        video_name,
        report_path,
        only_hands_visible,
        both_hands_and_legs_visible,
        no_hands_no_legs_visible,
    )
    print(report_text)
    files.download(report_path)


if __name__ == '__main__':
    main()

# Thank you for using our free tool! If possible, please share it so that others can benefit from it too 🤗

**Credits: Alpha AI Team (www.alphaai.biz)**