# Prior to starting the application, you need to obtain your free Groq API key from https://console.groq.com/keys. ⛔

**Replace the placeholder key in the code cell below with your own key ⬇️**

GROQ_API_KEY = "abcdefghijkl_1234567890"  # Replace with your Groq API key

In [None]:
# Paste your personal Groq API key between the quotes below. Later cells pass
# this value to the Groq client and to the ChatGroq LLM.
# NOTE(review): do not share or commit a copy of this notebook that contains a real key.
GROQ_API_KEY_USER = "gsk_djt1FIWJ6TjVrSamnbkOWGdyasfhljsahdflkhsalhalskh"  # Replace with your Groq API key



---



# To run the application, open the Runtime menu in the navigation bar above and click the Run All option. The code will then run automatically.

**NOTE - Make sure you have added your API key above before doing this step.**

**For any concerns, write to info@alphaai.biz**






---



# Stage 1 - The dependencies will install automatically. Do not terminate the session, close the browser tab, or interrupt the execution in any way.

In [None]:
# Uninstall conflicting packages
# (any preinstalled copies are removed first; -y skips the confirmation prompt)
!pip uninstall -y numpy pandas mediapipe librosa speechrecognition opencv-python ffmpeg-python langchain-groq

# Install compatible versions
# (each package is pinned to an exact version)
!pip install numpy==1.26.4
!pip install mediapipe==0.10.14
!pip install pandas==2.2.2
!pip install librosa==0.10.2
!pip install speechrecognition==3.10.4
!pip install opencv-python==4.10.0.84
!pip install ffmpeg-python==0.2.0
!pip install langchain-groq==0.3.0

# Install Whisper for speech recognition
# NOTE(review): the visible cells transcribe through the Groq API
# (model "whisper-large-v3") and do not import this package - confirm it is still needed.
!pip install openai-whisper==20231117

# If the installation above succeeds, this cell shows a number enclosed in square brackets, for example [1] or [2].



---



# Stage 2 - Here the code will run automatically and ask you to upload your video file for analysis.

In [None]:
# Diagnostic cell: print the installed groq and httpx versions.
import groq
import httpx
print(f"groq: {groq.__version__}, httpx: {httpx.__version__}")

In [None]:
# Build a Groq API client from the key stored in GROQ_API_KEY_USER.
# NOTE(review): the main cell further down assigns `client` again from the same key.
from groq import Groq
client = Groq(api_key=GROQ_API_KEY_USER)
print("Groq client initialized!")

In [None]:
import cv2
import mediapipe as mp
import ffmpeg
import librosa
import numpy as np
import os
from groq import Groq
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from google.colab import files
import re

# Initialize MediaPipe Holistic
# A single Holistic instance is created here and reused by analyze_video().
# On failure the error is printed and SystemExit is raised explicitly: the bare
# `exit()` helper is an interactive convenience (installed by `site`/IPython)
# and is not meant to be relied on to stop a program.
try:
    mp_holistic = mp.solutions.holistic
    mp_drawing = mp.solutions.drawing_utils
    holistic = mp_holistic.Holistic(static_image_mode=False, min_detection_confidence=0.5, min_tracking_confidence=0.5)
except Exception as e:
    print(f"Failed to initialize MediaPipe: {e}")
    raise SystemExit(1)

# Initialize Groq client
# `client` is the module-level handle used by transcribe_audio().
GROQ_API_KEY = GROQ_API_KEY_USER
try:
    client = Groq(api_key=GROQ_API_KEY)
except Exception as e:
    print(f"Failed to initialize Groq client: {e}")
    raise SystemExit(1)

# Initialize Groq LLM for report generation
# `llm` is the module-level chat model used by analyze_content() and generate_report().
try:
    llm = ChatGroq(
        model="llama-3.3-70b-versatile",
        temperature=0.7,
        max_tokens=2000,
        api_key=GROQ_API_KEY
    )
except Exception as e:
    print(f"Failed to initialize Groq LLM: {e}")
    raise SystemExit(1)

# Prompt for uSpeek's criteria
# Two-message chat template used by generate_report() to produce the final report.
# System message placeholders: body_language_target, speech_quality_target,
#   content_quality_target, final_score_target.
# Human message placeholders: body_language, facial_expressions, transcript,
#   pitch_std, volume_std, avg_pitch, avg_volume, filler_words, pet_words,
#   body_language_score, facial_expressions_score, speech_quality_score,
#   content_quality_score.
# Every placeholder must be supplied when the chain is invoked.
prompt = ChatPromptTemplate.from_messages([
    ("system", """You are an expert in evaluating public speaking skills for management students, using a scoring system aligned with uSpeek's criteria (all scores out of 5). Given data on body language, facial expressions, speech transcript, audio characteristics, and preliminary scores, generate a detailed report. The report must:
    - **Body Language**: Evaluate posture, gestures, engagement. If upper_torso=True, focus on upper body without leg penalty. If upper_torso=False, penalize missing legs (0.5/5). Penalize missing hands (0.5/5) and low engagement (0.5/5 if gesture frequency <0.5, else 0.2/5). Reward frequent gestures and balanced posture. Target ~{body_language_target}/5.
    - **Facial Expressions**: Score 2.0/5 for smiling ratio <10%, 3.0/5 for ≥10%. Target ~2.0/5 for limited expressiveness.
    - **Speech Quality**: Analyze modulation, pitch (~300 Hz), volume (~60 dB). Cap at 4.0/5 for modulation. Penalize pitch/volume deviations. Target ~{speech_quality_target}/5.
    - **Content Quality**: Evaluate clarity, relevance, impact. Penalize filler words (ratio >5%, reduce by 0.3/5). Target ~{content_quality_target}/5.
    - **Final Score**: Average component scores, targeting ~{final_score_target}/5. Scale: 1 = Many areas to improve, 2 = Improve your show, 3 = Good show, 4 = Fabulous show, 5 = Super Star.
    - **Recommendations**: Suggest improvements (e.g., reduce fillers, increase gestures).
    Use a professional tone, structure clearly, and prioritize engagement and clarity."""),
    ("human", """Body Language: {body_language}
Facial Expressions: {facial_expressions}
Speech Transcript: {transcript}
Audio Characteristics: Pitch variation (std): {pitch_std}, Volume variation (std): {volume_std}, Average pitch: {avg_pitch} Hz, Average volume: {avg_volume} dB
Filler Words: {filler_words}
Pet Words: {pet_words}
Preliminary Scores:
- Body Language: {body_language_score}/5
- Facial Expressions: {facial_expressions_score}/5
- Speech Quality: {speech_quality_score}/5
- Content Quality: {content_quality_score}/5
Please generate the evaluation report, including justifications for each score and the final aggregated score. You can be critical with you scoring if the apt reasoning is there from your side.""")
])

def extract_audio(video_path, audio_path):
    """Extract the audio track of a video into a WAV file using FFmpeg.

    The output is written with the pcm_s16le codec at 16000 Hz, one channel,
    and an existing file at ``audio_path`` is overwritten.

    Args:
        video_path: Path of the input video file.
        audio_path: Path where the extracted audio is written.

    Raises:
        ffmpeg.Error: Re-raised after FFmpeg's error output has been printed.
    """
    try:
        stream = ffmpeg.input(video_path)
        stream = ffmpeg.output(stream, audio_path, acodec='pcm_s16le', ar=16000, ac=1)
        ffmpeg.run(stream, overwrite_output=True, capture_stdout=True, capture_stderr=True)
    except ffmpeg.Error as e:
        # stderr holds the captured raw bytes of the ffmpeg process. Decode
        # leniently so an undecodable byte (or a missing stderr) cannot raise
        # a second exception that hides the real FFmpeg failure.
        details = e.stderr.decode(errors="replace") if e.stderr else str(e)
        print(f"FFmpeg error: {details}")
        raise

def transcribe_audio(audio_path):
    """Return the transcript of an audio file using Groq's Whisper API.

    The file is read fully into memory and sent to the "whisper-large-v3"
    model with the language fixed to English. Any failure (unreadable file,
    API error, ...) is printed and an empty string is returned instead of
    raising.
    """
    try:
        with open(audio_path, "rb") as audio_file:
            # The API receives a (filename, bytes) pair.
            upload = (os.path.basename(audio_path), audio_file.read())
            result = client.audio.transcriptions.create(
                file=upload,
                model="whisper-large-v3",
                response_format="json",
                language="en",
                temperature=0.0,
            )
        return result.text
    except Exception as e:
        print(f"Groq transcription error: {e}")
        return ""

def analyze_filler_pet_words(transcript):
    """Count filler and pet words in a transcript.

    The transcript is lower-cased and split on whitespace; punctuation
    attached to a token is stripped before counting, so "Um," and "like."
    are recognised as "um" and "like". Tokens that consist only of
    punctuation are ignored.

    Args:
        transcript: The speech transcript (may be empty).

    Returns:
        A tuple ``(filler_count, pet_count, filler_ratio)`` where the first
        two items are dicts mapping each word that occurs at least once to
        its number of occurrences, and ``filler_ratio`` is the number of
        filler-word occurrences divided by the total word count (0 when the
        transcript contains no words).
    """
    from collections import Counter
    import string

    filler_words = ['and', 'that', 'really', 'now', 'just', 'um', 'uh', 'like']
    pet_words = ['i', 'to', 'the', 'of']

    stripped = (token.strip(string.punctuation) for token in transcript.lower().split())
    words = [token for token in stripped if token]
    occurrences = Counter(words)

    filler_count = {word: occurrences[word] for word in filler_words if occurrences[word] > 0}
    pet_count = {word: occurrences[word] for word in pet_words if occurrences[word] > 0}

    total_words = len(words)
    filler_ratio = sum(filler_count.values()) / total_words if total_words > 0 else 0

    return filler_count, pet_count, filler_ratio

def analyze_audio(audio_path):
    """Analyze audio for pitch and volume and compute a speech quality score.

    Pitch statistics are taken over every ``librosa.piptrack`` bin whose
    magnitude is positive; volume statistics are taken over the RMS values
    returned by ``librosa.feature.rms``. The average volume is reported as
    ``20 * log10(mean RMS) + 60``.

    Scoring:
        * modulation: ``(pitch_std / 100 + volume_std / 0.01) * 0.8``, capped at 4.0
        * pitch penalty: ``|avg_pitch - 300| / 1000``
        * volume penalty: ``|avg_volume - 60| / 30``
        * speech score: modulation minus both penalties, clamped to [1, 5]

    Args:
        audio_path: Path of the audio file to analyze.

    Returns:
        A tuple ``(pitch_std, volume_std, avg_pitch, avg_volume, speech_score)``.
        If anything fails, the error is printed and the neutral fallback
        ``(0, 0, 300, 60, 3)`` is returned.
    """
    try:
        y, sr = librosa.load(audio_path)
        pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
        pitch_values = pitches[magnitudes > 0]
        pitch_std = np.std(pitch_values) if len(pitch_values) > 0 else 0
        avg_pitch = np.mean(pitch_values) if len(pitch_values) > 0 else 300

        rms = librosa.feature.rms(y=y)[0]
        # Guard the mean as well as the std so an empty RMS array does not
        # produce NaN (and a NumPy warning).
        mean_rms = np.mean(rms) if len(rms) > 0 else 0
        volume_std = np.std(rms) if len(rms) > 0 else 0
        avg_volume = 20 * np.log10(mean_rms + 1e-10) + 60 if mean_rms > 0 else 60

        # Speech score
        modulation_score = min(4.0, (pitch_std / 100 + volume_std / 0.01) * 0.8)
        pitch_penalty = abs(avg_pitch - 300) / 1000 if avg_pitch > 0 else 0
        # avg_volume drops to zero or below for very quiet recordings (mean RMS
        # <= 0.001). Those must still be penalized; skipping the penalty there
        # would score near-silent audio better than moderately quiet audio.
        volume_penalty = abs(avg_volume - 60) / 30
        speech_score = min(5, max(1, modulation_score - pitch_penalty - volume_penalty))

        print(f"Pitch std: {pitch_std}, Volume std: {volume_std}, Avg pitch: {avg_pitch}, Avg volume: {avg_volume}, Speech score: {speech_score}")
        return pitch_std, volume_std, avg_pitch, avg_volume, speech_score
    except Exception as e:
        print(f"Audio analysis error: {e}")
        return 0, 0, 300, 60, 3

def analyze_content(transcript, filler_ratio):
    """Score the content quality of a transcript with the Groq LLM.

    The model is asked to reply with a score only. The reply is parsed as a
    plain number first; if that fails (e.g. "3.5/5" or "Score: 3.5"), a
    number written as "X/5" is preferred, then the first number found in the
    reply. A further 0.3 is deducted when ``filler_ratio`` exceeds 0.05 and
    the result is clamped to [1, 5].

    Args:
        transcript: The speech transcript to evaluate.
        filler_ratio: Share of filler words in the transcript (0..1).

    Returns:
        The content quality score, or the default 3.5 when the reply
        contains no number at all.
    """
    content_prompt = ChatPromptTemplate.from_messages([
        ("system", """Evaluate the clarity, relevance, and impact of the following speech transcript for a management student presentation. Assign a score out of 5 based on:
        - Clarity: Is the message clear and well-structured?
        - Relevance: Is the content pertinent to management topics?
        - Impact: Does it engage and persuade the audience?
        Penalize for filler words (ratio: {filler_ratio:.2%}, reduce by 0.3/5 if >5%) and lengthy sentences. Target ~{content_quality_target}/5 for clear, engaging content. Provide the score only."""),
        ("human", "{transcript}")
    ])
    # Dynamic target based on filler ratio (proxy for content quality)
    content_quality_target = 3.2 if filler_ratio > 0.05 else 3.5
    chain = content_prompt | llm
    response = chain.invoke({"transcript": transcript, "filler_ratio": filler_ratio, "content_quality_target": content_quality_target})

    reply = response.content.strip()
    try:
        score = float(reply)
    except ValueError:
        # The model does not always honour "score only"; recover the number
        # instead of silently discarding its judgement.
        number = r"(\d+(?:\.\d+)?)"
        found = re.search(number + r"\s*/\s*5\b", reply) or re.search(number, reply)
        if found is None:
            return 3.5  # Default
        score = float(found.group(1))

    # Penalize for high filler ratio
    # NOTE(review): the prompt above already asks the model to deduct 0.3 for
    # a ratio above 5%, so this deduction may be applied twice - confirm intent.
    if filler_ratio > 0.05:
        score -= 0.3
    return min(5, max(1, score))

def analyze_video(video_path, upper_torso=True):
    """Analyze video for body language and facial expressions using MediaPipe.

    Every frame is run through the module-level ``holistic`` model:

    * Posture: absolute difference between the left and right shoulder ``y``
      values; an average below 0.05 is described as "Balanced".
    * Gestures: a frame counts as a gesture frame when pose landmarks and at
      least one hand are detected; the frequency is gesture frames divided
      by all frames, and above 0.1 is described as "Frequent".
    * Legs: only checked when ``upper_torso`` is False; legs count as visible
      once either knee landmark has visibility above 0.5.
    * Expression: a frame is "Smiling" when the distance between face
      landmarks 61 and 291 (the mouth corners) exceeds 0.05, else "Neutral".

    Args:
        video_path: Path of the video file to analyze.
        upper_torso: When True, legs are neither checked nor penalized.

    Returns:
        A tuple ``(body_language, expression_summary, body_language_score,
        expression_score)``: two description strings followed by two scores.
        The body language score is clamped to [1, 5]; the expression score is
        3.0 when at least 10% of the frames with a face are "Smiling",
        otherwise 2.0.

    Raises:
        ValueError: If the video file cannot be opened.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        raise ValueError("Could not open video file.")

    expression_data = []
    frame_count = 0
    gesture_count = 0
    posture_scores = []
    hands_visible = False
    legs_visible = False

    # Release the capture even when frame processing raises.
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            # OpenCV delivers BGR frames; they are converted to RGB before processing.
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            results = holistic.process(frame_rgb)

            if results.pose_landmarks:
                pose = results.pose_landmarks.landmark
                left_shoulder = pose[mp_holistic.PoseLandmark.LEFT_SHOULDER]
                right_shoulder = pose[mp_holistic.PoseLandmark.RIGHT_SHOULDER]
                shoulder_diff = abs(left_shoulder.y - right_shoulder.y)
                posture_scores.append(shoulder_diff)

                if results.left_hand_landmarks or results.right_hand_landmarks:
                    gesture_count += 1
                    hands_visible = True

                if not upper_torso:
                    if pose[mp_holistic.PoseLandmark.LEFT_KNEE].visibility > 0.5 or \
                       pose[mp_holistic.PoseLandmark.RIGHT_KNEE].visibility > 0.5:
                        legs_visible = True

            if results.face_landmarks:
                mouth_left = results.face_landmarks.landmark[61]
                mouth_right = results.face_landmarks.landmark[291]
                mouth_dist = np.sqrt((mouth_right.x - mouth_left.x)**2 + (mouth_right.y - mouth_left.y)**2)
                expression = "Smiling" if mouth_dist > 0.05 else "Neutral"
                expression_data.append(expression)

            frame_count += 1
    finally:
        cap.release()

    avg_posture_score = np.mean(posture_scores) if posture_scores else 0
    posture_desc = "Balanced" if avg_posture_score < 0.05 else "Unbalanced"
    gesture_freq = gesture_count / frame_count if frame_count > 0 else 0
    gesture_desc = "Frequent" if gesture_freq > 0.1 else "Minimal"

    # Penalties and bonuses
    visibility_penalty = 0
    if not hands_visible:
        visibility_penalty += 0.5
    if not upper_torso and not legs_visible:
        visibility_penalty += 0.5
    engagement_penalty = 0.5 if gesture_freq < 0.5 else 0.2

    body_language = (f"Posture: {posture_desc} (avg shoulder diff: {avg_posture_score:.3f}). "
                    f"Gestures: {gesture_desc} (frequency: {gesture_freq:.3f}). "
                    f"Hands visible: {hands_visible}. "
                    f"{'Upper torso only evaluated.' if upper_torso else f'Legs visible: {legs_visible}.'}")
    # Base 3.5, minus 0.3 per 0.1 of shoulder imbalance, plus up to 2 for
    # gesture frequency, minus the penalties; then clamped to [1, 5].
    body_language_score = 3.5 - (avg_posture_score / 0.1 * 0.3) + (gesture_freq * 2) - (visibility_penalty + engagement_penalty)
    body_language_score = min(5, max(1, body_language_score))

    if expression_data:
        smile_ratio = expression_data.count("Smiling") / len(expression_data)
        expression_summary = f"Smiling {smile_ratio*100:.1f}% of the time, Neutral otherwise."
        expression_score = 2.0 if smile_ratio < 0.1 else 3.0
    else:
        expression_summary = "No facial landmarks detected."
        expression_score = 2.0

    return body_language, expression_summary, body_language_score, expression_score

def generate_report(video_path, output_path, upper_torso=True):
    """Generate evaluation report with scores for the video.

    Pipeline: extract the audio track, transcribe it, analyze audio, filler
    words, content and video, then ask the LLM for a written report. The
    final score is the mean of the four component scores and is appended to
    the LLM text. The report is written to ``output_path`` as UTF-8.

    Args:
        video_path: Path of the video to evaluate.
        output_path: Path of the text file the report is written to.
        upper_torso: Forwarded to ``analyze_video``.

    Returns:
        The report text that was written to ``output_path``.

    Raises:
        Whatever the individual pipeline steps raise; the temporary audio
        file is removed in every case.
    """
    import tempfile

    # Use a unique temporary file instead of a fixed name in the working
    # directory, so a user file with the same name is never overwritten or
    # deleted and the input video itself can never be the output path.
    fd, audio_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)  # FFmpeg reopens the path itself
    try:
        extract_audio(video_path, audio_path)
        transcript = transcribe_audio(audio_path)
        pitch_std, volume_std, avg_pitch, avg_volume, speech_quality_score = analyze_audio(audio_path)
        filler_count, pet_count, filler_ratio = analyze_filler_pet_words(transcript)
        content_quality_score = analyze_content(transcript, filler_ratio)
        body_language, facial_expressions, body_language_score, facial_expressions_score = analyze_video(video_path, upper_torso=upper_torso)
        final_score = np.mean([body_language_score, facial_expressions_score, speech_quality_score, content_quality_score])

        # Prompt parameters (dynamic based on metrics)
        # NOTE(review): the body-language and final targets are keyed off
        # pitch_std, an audio metric - confirm this coupling is intended.
        body_language_target = 2.5 if pitch_std < 800 else 3.7  # Lower for weaker modulation
        speech_quality_target = 3.0 if pitch_std < 800 else 3.5
        content_quality_target = 3.2 if filler_ratio > 0.05 else 3.5
        final_score_target = 2.9 if pitch_std < 800 else 3.4

        chain = prompt | llm
        response = chain.invoke({
            "body_language": body_language,
            "facial_expressions": facial_expressions,
            "transcript": transcript,
            "pitch_std": pitch_std,
            "volume_std": volume_std,
            "avg_pitch": avg_pitch,
            "avg_volume": avg_volume,
            "filler_words": str(filler_count),
            "pet_words": str(pet_count),
            "body_language_score": round(body_language_score, 1),
            "facial_expressions_score": round(facial_expressions_score, 1),
            "speech_quality_score": round(speech_quality_score, 1),
            "content_quality_score": round(content_quality_score, 1),
            "body_language_target": body_language_target,
            "speech_quality_target": speech_quality_target,
            "content_quality_target": content_quality_target,
            "final_score_target": final_score_target
        })

        report_content = f"{response.content}\n\n**Final Score**: {round(final_score, 1)}/5"
        # Explicit encoding: the LLM text may contain non-ASCII characters and
        # the platform default encoding is not guaranteed to handle them.
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(report_content)

        return report_content
    finally:
        if os.path.exists(audio_path):
            os.remove(audio_path)

def main():
    """Colab entry point: upload a video, build the report, offer it for download.

    The first uploaded file is evaluated with the upper-torso-only setting.
    The report is saved as "evaluation_report.txt", its first 500 characters
    are printed as a preview, and the file is then downloaded. Any error
    during these steps is printed instead of being raised.
    """
    # Upload video file
    print("Please upload your video file (e.g., speak.mp4)")
    uploaded = files.upload()
    if not uploaded:
        print("No file uploaded.")
        return

    # Only the first uploaded file is analyzed.
    video_path = next(iter(uploaded.keys()))
    output_path = "evaluation_report.txt"

    try:
        # upper_torso=True: legs are not evaluated.
        report = generate_report(video_path, output_path, upper_torso=True)
        print("Report generated successfully. Saved to", output_path)
        preview = report[:500]
        print("\nReport Preview:\n", preview, "...")

        # Download the report
        files.download(output_path)
    except Exception as e:
        print(f"Error: {e}")

if __name__ == "__main__":
    main()

# Thank you for using our free tool! If possible, please share it so that others can benefit from it too 🤗

**Credits: Alpha AI Team (www.alphaai.biz)**