# Install and Import Dependencies

In [1]:
!pip install numpy pandas librosa groq load_dotenv

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
!pip install nltk tiktoken parselmouth pydub psutil

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting parselmouth
  Downloading parselmouth-1.1.1.tar.gz (33 kB)
  Installing build dependencies: started
  Installing build dependencies: still running...
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting googleads==3.8.0 (from parselmouth)
  Downloading googleads-3.8.0.tar.gz (23 kB)
  Installing build dependencies: started
  Installing build dependencies: still running...
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'error'


  error: subprocess-exited-with-error
  
  × Getting requirements to build wheel did not run successfully.
  │ exit code: 1
  ╰─> [1 lines of output]
      error in googleads setup command: use_2to3 is invalid.
      [end of output]
  
  note: This error originates from a subprocess, and is likely not a problem with pip.

[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip
error: subprocess-exited-with-error

× Getting requirements to build wheel did not run successfully.
│ exit code: 1
╰─> See above for output.

note: This error originates from a subprocess, and is likely not a problem with pip.


In [15]:
# Imports
import io
import os
import re
import joblib
import asyncio
import librosa
import tiktoken
import numpy as np
import parselmouth
import soundfile as sf
from pydub import AudioSegment
from nltk.corpus import cmudict
from parselmouth.praat import call
from groq import Groq, AsyncClient
from groq.types.audio import Transcription

# Load environment file
from load_dotenv import load_dotenv
print(load_dotenv('../../.env.local'))

assert os.environ.get('GROQ_API_KEY'), "Groq API key not found in .env file, please set the key before starting this notebook"

# Global variables
client = AsyncClient()                       # async Groq client used for transcription
encoder = tiktoken.get_encoding('gpt2')
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load weights that come from a trusted source.
fluency_model = joblib.load('../fluency/weights/xgboost_model.pkl')

# Load the CMU pronouncing dictionary, downloading the corpus on first use.
# NLTK raises LookupError when a corpus is missing; any other error should surface.
try:
    cmu_dict = cmudict.dict()
except LookupError:
    import nltk
    nltk.download('cmudict')
    cmu_dict = cmudict.dict()

True


# Feature Extraction
Features extracted:
* ZCR
* Pitch
* Jitter
* Shimmer
* Harmonic-to-Noise ratio
* RMS
* MFCC
* DeltaMFCC
* SpeakingRate
* PauseCount
* PauseDuration

In [16]:
# Async Transcription
def split_audio_in_memory(audio_data, max_mb=24):
    """Split a WAV recording into in-memory WAV chunks of bounded size.

    Args:
        audio_data: An ``io.BytesIO`` containing WAV data, or any source accepted
            by ``AudioSegment.from_wav`` (e.g. a file path).
        max_mb: Maximum raw PCM payload per chunk, in MiB.

    Returns:
        A list of ``(filename, buffer)`` tuples in playback order, where ``buffer``
        is a rewound ``io.BytesIO`` holding the chunk exported as WAV.
    """
    if isinstance(audio_data, io.BytesIO):
        audio_data.seek(0)
        audio = AudioSegment.from_file(audio_data, format='wav')
    else:
        audio = AudioSegment.from_wav(audio_data)

    # pydub's frame_width is already channels * sample_width, so multiplying by
    # the channel count again would halve the chunk length for stereo input.
    bytes_per_ms = (audio.frame_rate * audio.frame_width) / 1000
    max_bytes = max_mb * 1024 * 1024
    # Never let the step reach 0, otherwise range() below raises ValueError
    chunk_duration_ms = max(1, int(max_bytes / bytes_per_ms))

    chunks = []
    for index, start_ms in enumerate(range(0, len(audio), chunk_duration_ms)):
        chunk = audio[start_ms:start_ms + chunk_duration_ms]
        buffer = io.BytesIO()
        chunk.export(buffer, format="wav")
        buffer.seek(0)
        chunks.append((f"chunk_{index}.wav", buffer))

    return chunks


async def transcribe_chunk(filename, audio_buffer):
    """Send one audio chunk to the transcription endpoint and return its response.

    The request asks for the verbose JSON format with word-level timestamps.
    """
    request_options = dict(
        file=(filename, audio_buffer),
        model="whisper-large-v3-turbo",
        response_format="verbose_json",
        timestamp_granularities=["word"],
    )
    return await client.audio.transcriptions.create(**request_options)


async def transcribe_audio(audio):
    """Transcribe a recording by splitting it into chunks and merging the results.

    All chunks are transcribed concurrently. Word timestamps returned for a chunk
    are shifted by the summed duration of the preceding chunks so that they refer
    to the timeline of the whole recording.

    Args:
        audio: Source accepted by ``split_audio_in_memory``.

    Returns:
        A ``Transcription`` carrying the concatenated text, the merged word list
        and the summed duration of all chunks.
    """
    chunks = split_audio_in_memory(audio)
    all_transcripts = await asyncio.gather(
        *(transcribe_chunk(name, buffer) for name, buffer in chunks)
    )

    text_parts = []
    all_words = []
    total_duration = 0.0

    for chunk in all_transcripts:
        text_parts.append(chunk.text)
        # `words` may be absent or None; treat both as "no words"
        for word in getattr(chunk, "words", None) or []:
            if isinstance(word, dict):
                # Copy before shifting so the chunk response is left untouched
                word = dict(word)
                for key in ("start", "end"):
                    if isinstance(word.get(key), (int, float)):
                        word[key] += total_duration
            all_words.append(word)
        total_duration += chunk.duration          # type: ignore

    return Transcription(text="".join(text_parts), words=all_words, duration=total_duration)   # type: ignore

In [17]:
# Helper functions for estimating the syllable-based speaking rate
def get_word_syllable_count(word):
    """Return the number of syllables in ``word``.

    The word is lower-cased and stripped of surrounding ``.,?!;:``. If it is in
    ``cmu_dict``, the phonemes of its first pronunciation that end in a digit are
    counted; otherwise the runs of ``aeiouy`` letters are counted, with a minimum of 1.
    """
    token = word.lower().strip(".,?!;:")
    if token not in cmu_dict:
        # Fallback heuristic: one syllable per group of consecutive vowels
        vowel_groups = re.findall(r'[aeiouy]+', token)
        return max(1, len(vowel_groups))

    first_pronunciation = cmu_dict[token][0]
    return sum(1 for phoneme in first_pronunciation if phoneme[-1].isdigit())


def estimate_syllable_rate(transcript, duration_sec):
    """Return syllables per second for ``transcript``, or 0 when the duration is not positive."""
    syllable_total = 0
    for token in transcript.split():
        syllable_total += get_word_syllable_count(token)

    if duration_sec > 0:
        return syllable_total / duration_sec
    return 0


In [18]:
# Extract Pitch statistics, Jitter, Shimmer, and HNR ratio through Parselmouth
def extract_parselmouth_features(data, sr):
    """Compute pitch statistics, local jitter, local shimmer and mean HNR via Praat.

    Args:
        data: Audio samples passed to ``parselmouth.Sound`` as ``values``.
        sr: Sampling frequency passed to ``parselmouth.Sound``.

    Returns:
        A dict with keys ``pitch_mean``, ``pitch_std``, ``pitch_var``,
        ``jitter_local``, ``shimmer_local`` and ``hnr``.
    """
    sound = parselmouth.Sound(values=data, sampling_frequency=sr)

    # Pitch statistics in Hertz over the whole sound (time range 0, 0)
    pitch = sound.to_pitch()
    mean_pitch = call(pitch, "Get mean", 0, 0, "Hertz")
    std_pitch = call(pitch, "Get standard deviation", 0, 0, "Hertz")

    # Jitter and shimmer are both measured on the same periodic point process
    pulses = call(sound, "To PointProcess (periodic, cc)", 75, 500)
    local_jitter = call(pulses, "Get jitter (local)", 0, 0, 0.0001, 0.02, 1.3)
    local_shimmer = call([sound, pulses], "Get shimmer (local)", 0, 0, 0.0001, 0.02, 1.3, 1.6)

    # Mean of the cross-correlation harmonicity object
    harmonicity_obj = call(sound, "To Harmonicity (cc)", 0.01, 75, 0.1, 1.0)
    mean_hnr = call(harmonicity_obj, "Get mean", 0, 0)

    return dict(
        pitch_mean=mean_pitch,
        pitch_std=std_pitch,
        pitch_var=std_pitch**2,
        jitter_local=local_jitter,
        shimmer_local=local_shimmer,
        hnr=mean_hnr,
    )

async def async_extract_parselmouth_features(data, sr, executor):
    """Run ``extract_parselmouth_features(data, sr)`` in ``executor`` and await its result.

    Args:
        data: Audio samples forwarded to the extractor.
        sr: Sampling rate forwarded to the extractor.
        executor: Executor handed to ``loop.run_in_executor``.
    """
    # Inside a coroutine the running loop is the right loop to use;
    # get_running_loop() is the preferred way to obtain it.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(executor, extract_parselmouth_features, data, sr)

In [19]:
# Extract RMS Energy, ZCR, MFCC and Deltas using librosa
def extract_librosa_features(data, sr):
    """Compute ZCR, RMS statistics and MFCC / delta-MFCC means with librosa.

    Args:
        data: Audio samples.
        sr: Sampling rate, used for the MFCC computation.

    Returns:
        A dict with keys ``zcr``, ``rms_mean``, ``rms_std``, ``rms_var``,
        ``mfcc_mean`` and ``delta_mean`` (in that insertion order).
    """
    summary = {"zcr": np.mean(librosa.feature.zero_crossing_rate(data))}

    # Frame-wise RMS energy (first row of librosa's output)
    frame_rms = librosa.feature.rms(y=data)[0]
    summary["rms_mean"] = np.mean(frame_rms)
    summary["rms_std"] = np.std(frame_rms)
    summary["rms_var"] = np.var(frame_rms)

    # 13 MFCCs and their deltas, each collapsed to a single overall mean
    mfcc_matrix = librosa.feature.mfcc(y=data, sr=sr, n_mfcc=13)
    mfcc_deltas = librosa.feature.delta(mfcc_matrix)
    summary["mfcc_mean"] = np.mean(mfcc_matrix)
    summary["delta_mean"] = np.mean(mfcc_deltas)

    return summary

    
async def async_extract_librosa_features(data, sr, executor):
    """Run ``extract_librosa_features(data, sr)`` in ``executor`` and await its result.

    Args:
        data: Audio samples forwarded to the extractor.
        sr: Sampling rate forwarded to the extractor.
        executor: Executor handed to ``loop.run_in_executor``.
    """
    # Inside a coroutine the running loop is the right loop to use;
    # get_running_loop() is the preferred way to obtain it.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(executor, extract_librosa_features, data, sr)

In [38]:
def extract_features_from_wave(data, sr):
    """Return the librosa features merged with the parselmouth features for one waveform."""
    combined = dict(extract_librosa_features(data, sr))
    combined.update(extract_parselmouth_features(data, sr))
    return combined
 
    
async def async_extract_features_from_wave(data, sr, executor):
    """Run the librosa and parselmouth extractors concurrently and merge their results."""
    # Schedule both extractors as tasks before awaiting either of them
    pending = [
        asyncio.create_task(worker(data, sr, executor))
        for worker in (async_extract_librosa_features, async_extract_parselmouth_features)
    ]
    librosa_part, parselmouth_part = await asyncio.gather(*pending)

    merged = dict(librosa_part)
    merged.update(parselmouth_part)
    return merged


def _fluency_label(prediction, rating_map):
    """Map one row of the fluency model's output to its rating label."""
    prediction = np.asarray(prediction)
    # A vector of class scores is reduced with argmax; a scalar prediction is used
    # as the class index directly (argmax of a scalar would always be 0).
    index = int(prediction.argmax()) if prediction.ndim > 0 else int(prediction)
    return rating_map[index]


# Full function to extract all the features of the audio file
async def extract_features(audio_data):
    """Extract acoustic, fluency, tempo and pause features from an audio file.

    The opening part of the clip (at least 10 s, or 5% of the clip if that is
    longer, capped at the clip length) is treated as the speaker's baseline.
    Every acoustic feature is computed for the baseline and for the full clip.

    Args:
        audio_data: A path or file-like object readable by ``soundfile.read`` and
            by ``transcribe_audio``.

    Returns:
        A dict with the transcript, durations, speaking/syllable rates, long-pause
        statistics, fluency ratings, the full-clip features, the same features
        prefixed with ``baseline_``, and ``<feature>_delta`` (full minus baseline).

    Raises:
        AssertionError: If the file holds 160 samples or fewer.
    """
    target_sr = 16000

    # -------------- Load the audio file --------------
    # soundfile handles paths and file-like objects through the same call
    data, sr = sf.read(audio_data)

    assert len(data) > 160, "Your audio file appears to contain no content. Please input a valid file"

    # Convert to mono channel and resample
    if data.ndim > 1:
        data = data.mean(axis=1)
    data = librosa.resample(data, orig_sr=sr, target_sr=target_sr)
    # From here on the samples are at target_sr; keep `sr` in sync so the feature
    # extractors and the pause maths below use the correct rate.
    sr = target_sr

    duration_in_secs = len(data) / sr
    # Minimum duration for baseline is 10 seconds, but never longer than the clip itself
    baseline_duration = min(duration_in_secs, max(10.0, duration_in_secs * 0.05))

    # Start the transcription job
    # NOTE(review): the feature extraction below is synchronous, so this task only
    # makes progress once the coroutine reaches the `await` further down.
    print("Started transcription job")
    transcription_task = asyncio.create_task(transcribe_audio(audio_data))

    # -------------- Get features of baseline and full wave --------------
    baseline_data = data[:min(len(data), int(sr * baseline_duration))]
    baseline_feats = extract_features_from_wave(baseline_data, sr)
    print("Gotten baseline features")
    full_feats = extract_features_from_wave(data, sr)
    print("Gotten full wave features")

    # Get fluency ratings
    features = ['zcr', 'pitch_mean', 'pitch_std', 'rms_mean', 'rms_std', 'rms_var', 'mfcc_mean', 'delta_mean']
    rating_map = ['Low', 'Medium', 'High']

    # NOTE(review): the vector follows the insertion order of the feature dicts, not
    # the order of `features` above — confirm this matches the model's training order.
    baseline_fluency_features = np.array([value for key, value in baseline_feats.items() if key in features])
    full_fluency_features = np.array([value for key, value in full_feats.items() if key in features])

    res = fluency_model.predict(np.vstack((baseline_fluency_features, full_fluency_features)))
    baseline_fluency = _fluency_label(res[0], rating_map)
    full_fluency = _fluency_label(res[1], rating_map)
    print("Get fluency ratings")

    # Get Relative features
    relative_feats = {}
    for key in full_feats:
        if key not in ['mfcc', 'delta_mfcc']:
            base = baseline_feats.get(key, 0.0)
            full = full_feats[key]
            relative_feats[f'{key}_delta'] = full - base

    # -------------- Get transcription and speaking rates --------------
    transcription_json = await transcription_task
    print("Gotten transcriptions")

    # Baseline speaking rate: words that start inside the baseline window
    baseline_transcript = [word_segment['word'] for word_segment in transcription_json.words if word_segment['start'] <= baseline_duration]  # type: ignore
    baseline_word_count = len(baseline_transcript)
    baseline_transcript = " ".join(baseline_transcript)
    baseline_speaking_rate = baseline_word_count / baseline_duration
    baseline_syllables_rate = estimate_syllable_rate(baseline_transcript, baseline_duration)

    # Full data speaking rate
    transcript = transcription_json.text
    word_count = len(transcript.split())
    speaking_rate = word_count / duration_in_secs
    syllables_rate = estimate_syllable_rate(transcript, duration_in_secs)

    # -------------- Pause detection --------------
    # Gaps between consecutive non-silent intervals, keeping only those longer than 1 s
    intervals = librosa.effects.split(data, top_db=30)
    gaps = ((intervals[i][0] - intervals[i - 1][1]) / sr for i in range(1, len(intervals)))
    pauses = [gap for gap in gaps if gap > 1.0]

    long_pause_count = len(pauses)
    long_pause_total = sum(pauses)

    # -------------- Return full feedback for prompt generation --------------
    return {
        "transcript": transcript,
        "duration": duration_in_secs,
        "baseline_duration": int(baseline_duration),
        "speaking_rate": speaking_rate,
        "syllables_rate": syllables_rate,
        "baseline_speaking_rate": baseline_speaking_rate,
        "baseline_syllables_rate": baseline_syllables_rate,
        "long_pause_count": long_pause_count,
        "long_pause_duration": long_pause_total,
        "fluency_rating": full_fluency,
        "baseline_fluency_rating": baseline_fluency,
        **full_feats,
        **{f'baseline_{k}': v for k, v in baseline_feats.items()},
        **relative_feats,
    }

# Send to the LLM (via Groq) for feedback

In [33]:
def get_prompt(audio_features, posture_features = None):
    """Build the narrative voice-coaching prompt from the extracted audio features.

    Args:
        audio_features: Mapping with the transcript, baseline / full-clip metrics
            and ``*_delta`` values referenced in the template below.
        posture_features: Currently unused.

    Returns:
        The prompt as a single string.
    """
    # Guard the ratios: a silent baseline has a rate of 0 and would otherwise
    # raise ZeroDivisionError. In that case the ratio is reported as "nan".
    baseline_speaking_rate = audio_features['baseline_speaking_rate']
    baseline_syllables_rate = audio_features['baseline_syllables_rate']
    speaking_rate_ratio = (
        audio_features['speaking_rate'] / baseline_speaking_rate if baseline_speaking_rate else float('nan')
    )
    syllable_rate_ratio = (
        audio_features['syllables_rate'] / baseline_syllables_rate if baseline_syllables_rate else float('nan')
    )

    prompt = f"""
You are a professional voice coach and delivery analyst tasked with evaluating a speaker's performance based on a variety of acoustic and prosodic features. Below is a detailed snapshot of the speaker’s delivery — both baseline and full-clip — along with their changes. Use this to deliver personalized, context-aware feedback.

## NOTE:
- The **first {int(audio_features['baseline_duration'])} seconds** of the speech are used to define the speaker's personal baseline.
- All relative metrics (e.g., deltas, ratios) are calculated with respect to this baseline.
- Interpret *changes* from baseline as signs of adaptation or stress — not necessarily flaws.
- **Avoid quoting any raw values** in your response. Use intuitive, narrative insights only.
- An 86% accurate ML model was used to rate the fluency of the speech, and that rating has also been provided to you.

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
📝 TRANSCRIPT
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
<transcript>
{audio_features['transcript']}
</transcript>

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
📏 BASELINE METRICS (First {int(audio_features['baseline_duration'])} seconds)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

## Fluency & Tempo
- Fluency rating: {audio_features['baseline_fluency_rating']}
- Words/sec: {audio_features['baseline_speaking_rate']:.2f}
- Syllables/sec: {audio_features['baseline_syllables_rate']:.2f}

## Voice Modulation
- Pitch (Mean / Std / Var): {audio_features['baseline_pitch_mean']:.2f} / {audio_features['baseline_pitch_std']:.2f} / {audio_features['baseline_pitch_var']:.2f}
- Jitter (local): {audio_features['baseline_jitter_local']:.3f}
- Shimmer (local): {audio_features['baseline_shimmer_local']:.3f}
- Harmonic-to-Noise Ratio (HNR): {audio_features['baseline_hnr']:.2f}

## Energy & Dynamics
- RMS Energy (Mean / Std / Var): {audio_features['baseline_rms_mean']:.2f} / {audio_features['baseline_rms_std']:.2f} / {audio_features['baseline_rms_var']:.2f}
- Zero Crossing Rate: {audio_features['baseline_zcr']:.3f}

## Timbre & Articulation
- MFCC Mean: {audio_features['baseline_mfcc_mean']:.2f}
- Delta MFCC Mean: {audio_features['baseline_delta_mean']:.6f}

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
📊 FULL CLIP METRICS
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

## Fluency & Tempo
- Fluency rating: {audio_features['fluency_rating']}
- Words/sec: {audio_features['speaking_rate']:.2f}
- Syllables/sec: {audio_features['syllables_rate']:.2f}
- Long pauses (>1s): {audio_features['long_pause_count']}
- Total pause duration: {audio_features['long_pause_duration']:.2f} sec

## Voice Modulation
- Pitch (Mean / Std / Var): {audio_features['pitch_mean']:.2f} / {audio_features['pitch_std']:.2f} / {audio_features['pitch_var']:.2f}
- Jitter (local): {audio_features['jitter_local']:.3f}
- Shimmer (local): {audio_features['shimmer_local']:.3f}
- Harmonic-to-Noise Ratio (HNR): {audio_features['hnr']:.2f}

## Energy & Dynamics
- RMS Energy (Mean / Std / Var): {audio_features['rms_mean']:.2f} / {audio_features['rms_std']:.2f} / {audio_features['rms_var']:.2f}
- Zero Crossing Rate: {audio_features['zcr']:.3f}

## Timbre & Articulation
- MFCC Mean: {audio_features['mfcc_mean']:.2f}
- Delta MFCC Mean: {audio_features['delta_mean']:.6f}

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
📈 RELATIVE CHANGES FROM BASELINE
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

## Tempo & Fluency
- Speaking rate ratio: {speaking_rate_ratio:.2f}
- Syllable rate ratio: {syllable_rate_ratio:.2f}

## Modulation
- Pitch std delta: {audio_features['pitch_std_delta']:+.2f}
- Jitter delta: {audio_features['jitter_local_delta']:+.3f}
- Shimmer delta: {audio_features['shimmer_local_delta']:+.3f}
- HNR delta: {audio_features['hnr_delta']:+.2f}

## Energy
- RMS mean delta: {audio_features['rms_mean_delta']:+.2f}
- RMS std delta: {audio_features['rms_std_delta']:+.2f}
- ZCR delta: {audio_features['zcr_delta']:+.3f}

## Timbre
- MFCC mean delta: {audio_features['mfcc_mean_delta']:+.2f}
- Delta MFCC mean delta: {audio_features['delta_mean_delta']:+.6f}

🧠 **Interpretation Tips** (for internal use only):
- A **negative pitch_std_delta** might suggest monotony or nervousness; a positive value implies expressive modulation.
- **Decreased RMS or HNR** may imply loss of vocal energy or confidence.
- **Increased jitter/shimmer** may reflect stress or instability.
- A **low syllable rate ratio** suggests slowing down relative to their natural pace, which may imply hesitation or deliberate pacing.
- **ZCR changes** may reflect articulation style or clarity.

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
🧭 INSTRUCTIONS FOR FEEDBACK GENERATION
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Using the data above, write a highly personalized and supportive **narrative-style voice coaching paragraph**. Do not cite any specific numerical values. Your tone should be professional, encouraging, and practical.

Structure your feedback in **three sections**:

1. ✅ **What the speaker did well** — Highlight strengths or improvements in vocal control, energy, fluency, or confidence.
2. 🛠️ **What they can improve** — Tactfully mention areas that deviated from their baseline and might affect clarity or delivery.
3. 📊 **Confidence & fluency rating** — Conclude with your overall impression of their vocal confidence and fluency (e.g., low, moderate, high), based on relative metrics.

DO NOT compare to average speakers. DO NOT be generic. Focus only on deviations from this speaker's own baseline and the emotional/functional impact of those changes.
"""
    return prompt

In [34]:
from typing import List
from pydantic import BaseModel, Field

# Fluency coach verdict: a short comment plus a 0-100 score.
# (Plain comments are used instead of a docstring so the generated JSON schema is unaffected.)
class FluencyEvaluator(BaseModel):
    # Accept both the attribute name (`score`) and the `fluency_score` key that the
    # prompt's output format asks the LLM to emit.
    model_config = {"populate_by_name": True}

    comment: str = Field(..., description="3-5 sentence feedback on fluency, including pace, fillers, pauses, flow.")
    # Lower bound is 0 to match the documented 0-100 range
    score: int = Field(..., alias="fluency_score", description="Fluency score (0-100).", ge=0, le=100)

# Language coach verdict: 2-5 strengths, 2-5 improvements and two 0-100 scores.
# (Plain comments are used instead of a docstring so the generated JSON schema is unaffected.)
class ContentEvaluator(BaseModel):
    strengths: List[str] = Field(..., description="Strengths in content, structure, language, and grammar.", min_length=2, max_length=5)
    improvements: List[str] = Field(..., description="Suggestions for improvement in content, structure, language, grammar.", min_length=2, max_length=5)
    # Lower bounds are 0 to match the documented 0-100 range
    structure_score: int = Field(..., description="Score (0-100) for logical organization and transitions.", ge=0, le=100)
    grammar_score: int = Field(..., description="Score (0-100) for correctness of language use.", ge=0, le=100)

# Speech evaluator verdict: 2-5 strengths, 2-5 improvements and two 0-100 scores.
# (Plain comments are used instead of a docstring so the generated JSON schema is unaffected.)
class SpeechEvaluator(BaseModel):
    strengths: List[str] = Field(..., description="Strengths in clarity, delivery, and perceived confidence.", min_length=2, max_length=5)
    improvements: List[str] = Field(..., description="Suggestions for improvement in clarity, delivery, perceived confidence.", min_length=2, max_length=5)
    # Lower bounds are 0 to match the documented 0-100 range
    clarity_score: int = Field(..., description="Score (0-100) for clarity of speech.", ge=0, le=100)
    confidence_score: int = Field(..., description="Score (0-100) for perceived speaker confidence.", ge=0, le=100)

# Top-level response model: one nested result per evaluator persona.
# Its JSON schema is what get_prompt_with_schema embeds in the prompt by default.
class Feedback(BaseModel):
    fluency_evaluator: FluencyEvaluator      # fluency comment and score
    language_evaluator: ContentEvaluator     # content / structure / grammar feedback
    speech_evaluator: SpeechEvaluator        # clarity / delivery / confidence feedback

In [None]:
import json

def get_prompt_with_schema(audio_features, response_schema = Feedback):
    """Build the structured (JSON-output) coaching prompt from the extracted audio features.

    Args:
        audio_features: Mapping with the transcript, baseline / full-clip metrics
            and ``*_delta`` values referenced in the template below.
        response_schema: Model class whose ``model_json_schema()`` is appended to
            the prompt as the schema the reply must follow.

    Returns:
        The prompt as a single string.

    NOTE(review): the inline output example names the fluency field
    ``fluency_score``; make sure ``response_schema`` exposes the same key.
    """
    # Guard the ratios: a silent baseline has a rate of 0 and would otherwise
    # raise ZeroDivisionError. In that case the ratio is reported as "nan".
    baseline_speaking_rate = audio_features['baseline_speaking_rate']
    baseline_syllables_rate = audio_features['baseline_syllables_rate']
    speaking_rate_ratio = (
        audio_features['speaking_rate'] / baseline_speaking_rate if baseline_speaking_rate else float('nan')
    )
    syllable_rate_ratio = (
        audio_features['syllables_rate'] / baseline_syllables_rate if baseline_syllables_rate else float('nan')
    )

    prompt = f"""
You are a professional voice coach and delivery analyst tasked with evaluating the user's performance based on a variety of acoustic and prosodic features. Below is a detailed snapshot of the speaker's delivery — both baseline and full-clip — along with their changes. Use this to deliver personalized, context-aware feedback.

## NOTE:
- The **first {audio_features['baseline_duration']} seconds** of the speech are used to define the speaker's personal baseline.
- All relative metrics (e.g., deltas, ratios) are calculated with respect to this baseline.
- Interpret *changes* from baseline as signs of adaptation or stress — not necessarily flaws.
- **Avoid quoting any raw values** in your response. Use intuitive, narrative insights only.
- An 86% accurate ML model was used to rate the fluency of the speech, and that rating has also been provided to you.

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
📝 TRANSCRIPT
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
<transcript>
{audio_features['transcript']}
</transcript>

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
📏 BASELINE METRICS (First {audio_features['baseline_duration']} seconds)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

## Fluency & Tempo
- Fluency rating: {audio_features['baseline_fluency_rating']}
- Words/sec: {audio_features['baseline_speaking_rate']:.2f}
- Syllables/sec: {audio_features['baseline_syllables_rate']:.2f}

## Voice Modulation
- Pitch (Mean / Std / Var): {audio_features['baseline_pitch_mean']:.2f} / {audio_features['baseline_pitch_std']:.2f} / {audio_features['baseline_pitch_var']:.2f}
- Jitter (local): {audio_features['baseline_jitter_local']:.3f}
- Shimmer (local): {audio_features['baseline_shimmer_local']:.3f}
- Harmonic-to-Noise Ratio (HNR): {audio_features['baseline_hnr']:.2f}

## Energy & Dynamics
- RMS Energy (Mean / Std / Var): {audio_features['baseline_rms_mean']:.2f} / {audio_features['baseline_rms_std']:.2f} / {audio_features['baseline_rms_var']:.2f}
- Zero Crossing Rate: {audio_features['baseline_zcr']:.3f}

## Timbre & Articulation
- MFCC Mean: {audio_features['baseline_mfcc_mean']:.2f}
- Delta MFCC Mean: {audio_features['baseline_delta_mean']:.6f}

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
📊 FULL CLIP METRICS
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

## Fluency & Tempo
- Fluency rating: {audio_features['fluency_rating']}
- Words/sec: {audio_features['speaking_rate']:.2f}
- Syllables/sec: {audio_features['syllables_rate']:.2f}
- Long pauses (>1s): {audio_features['long_pause_count']}
- Total pause duration: {audio_features['long_pause_duration']:.2f} sec

## Voice Modulation
- Pitch (Mean / Std / Var): {audio_features['pitch_mean']:.2f} / {audio_features['pitch_std']:.2f} / {audio_features['pitch_var']:.2f}
- Jitter (local): {audio_features['jitter_local']:.3f}
- Shimmer (local): {audio_features['shimmer_local']:.3f}
- Harmonic-to-Noise Ratio (HNR): {audio_features['hnr']:.2f}

## Energy & Dynamics
- RMS Energy (Mean / Std / Var): {audio_features['rms_mean']:.2f} / {audio_features['rms_std']:.2f} / {audio_features['rms_var']:.2f}
- Zero Crossing Rate: {audio_features['zcr']:.3f}

## Timbre & Articulation
- MFCC Mean: {audio_features['mfcc_mean']:.2f}
- Delta MFCC Mean: {audio_features['delta_mean']:.6f}

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
📈 RELATIVE CHANGES FROM BASELINE
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

## Tempo & Fluency
- Speaking rate ratio: {speaking_rate_ratio:.2f}
- Syllable rate ratio: {syllable_rate_ratio:.2f}

## Modulation
- Pitch std delta: {audio_features['pitch_std_delta']:+.2f}
- Jitter delta: {audio_features['jitter_local_delta']:+.3f}
- Shimmer delta: {audio_features['shimmer_local_delta']:+.3f}
- HNR delta: {audio_features['hnr_delta']:+.2f}

## Energy
- RMS mean delta: {audio_features['rms_mean_delta']:+.2f}
- RMS std delta: {audio_features['rms_std_delta']:+.2f}
- ZCR delta: {audio_features['zcr_delta']:+.3f}

## Timbre
- MFCC mean delta: {audio_features['mfcc_mean_delta']:+.2f}
- Delta MFCC mean delta: {audio_features['delta_mean_delta']:+.6f}

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
🧭 INSTRUCTIONS FOR FEEDBACK GENERATION
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

You are expected to evaluate the user's delivery **by taking on three different professional roles, one at a time**, and provide clear, structured feedback and scoring for each role.

Below are your personas and what you focus on in each.

---

🎩 **1️⃣ Fluency Coach**
You are a delivery specialist who analyzes the *flow* of speech.

What to focus on:
- Speaking pace and rhythm
- Pauses and hesitations
- Use of filler words (e.g., “um,” “uh,” “like”)
- Smoothness and flow between sentences

Your goals:
- Provide a **concise, professional comment** on fluency in 3-5 sentences
- Suggest whether the delivery felt smooth, hesitant, rushed, or confident
- Assign a **fluency_score** (0-100), reflecting overall fluency

---

🎩 **2️⃣ Language Coach**
You are an expert in the *content and language* of speaking.

What to focus on:
- Quality, relevance, and organization of ideas (content)
- Logical structure and transitions between points
- Accuracy and appropriateness of grammar
- Vocabulary choice and variation
- Sentence structure and clarity

Your goals:
- List 2-5 **strengths** in content, structure, language, and grammar
- List 2-5 **areas for improvement** in those same areas
- Assign:
  - **structure_score** (0-100): Logical organization and flow
  - **grammar_score** (0-100): Correctness of language use

---

🎩 **3️⃣ Speech Evaluator**
You are a holistic evaluator of *communication impact*.

What to focus on:
- Clarity of pronunciation and articulation
- Ease of understanding for the listener
- Delivery style (tone, energy, vocal modulation)
- Signs of confidence or nervousness
- Audience engagement and persuasive power

Your goals:
- List 2-5 **strengths** in clarity, delivery, and perceived confidence
- List 2-5 **areas for improvement** in those same areas
- Assign:
  - **clarity_score** (0-100): How clear and understandable the speech is
  - **confidence_score** (0-100): How confident, convincing, and assured the speaker seems

---

✅ **IMPORTANT OUTPUT RULES**
- Do not necessarily interpret *relative changes from baseline* as flaws.
- Be supportive, specific, and context-aware.
- Avoid quoting or mentioning any raw numerical feature values.
- Avoid mentioning baseline changes or the baseline.

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
📄 OUTPUT FORMAT
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Strictly follow this JSON format:
{{
  "fluency_evaluator": {{
    "comment": str,
    "fluency_score": int
  }},
  "language_evaluator": {{
    "strengths": [str],
    "improvements": [str],
    "structure_score": int,
    "grammar_score": int
  }},
  "speech_evaluator": {{
    "strengths": [str],
    "improvements": [str],
    "clarity_score": int,
    "confidence_score": int
  }}
}}

This is the schema you must follow:
{json.dumps(response_schema.model_json_schema(), indent=2)}
"""
    return prompt

In [36]:
def generate_feedback(audio_features, posture_features = None, response_schema = None, llm_model : str = "llama-3.3-70b-versatile"):
    """Ask the LLM for structured feedback on the extracted audio features.

    Args:
        audio_features: Feature mapping passed to ``get_prompt_with_schema``.
        posture_features: Currently unused.
        response_schema: Optional model class forwarded to ``get_prompt_with_schema``;
            when ``None`` that function's default schema is used.
        llm_model: Name of the chat model to query.

    Returns:
        The ``message`` of the first completion choice; the request asks for a
        JSON object as the response format.
    """
    # Forward the caller's schema instead of silently ignoring it
    if response_schema is None:
        prompt = get_prompt_with_schema(audio_features)
    else:
        prompt = get_prompt_with_schema(audio_features, response_schema)

    # Separate synchronous client; named so it does not shadow the global async `client`
    groq_client = Groq()
    completion = groq_client.chat.completions.create(
        model=llm_model,
        messages=[
            {
                "role": "user",
                "content": prompt
            }
        ],
        temperature=0.5,
        max_completion_tokens=32768,
        top_p=1,
        response_format={"type": "json_object"},
        stream=False,
        stop=None,
    )

    return completion.choices[0].message

# Run Pipeline

In [39]:
# Test on your speech sample
path = "../../samples/tim-urban.wav"                                   # ENTER PATH TO YOUR AUDIO FILE HERE

if path:
    features = await extract_features(path)
    feedback = generate_feedback(features)
else:
    feedback = "No audio path provided, feedback cannot be given"

Started transcription job
Gotten full wave features
Gotten full wave features
Get fluency ratings
Gotten transcriptions


In [55]:
import IPython.display as ipd
# An LLM reply carries JSON text in `.content`; a plain string is the fallback message
if not isinstance(feedback, str):
    print(json.dumps(json.loads(feedback.content), indent=4))
else:
    ipd.display(ipd.Markdown(feedback))

{
    "fluency_evaluator": {
        "comment": "The speaker's delivery was marked by a notable decrease in speaking rate, suggesting a more deliberate and thoughtful pace. However, this change also introduced a sense of hesitancy, with the speaker seeming to choose words more carefully. The flow between sentences was generally smooth, but the overall impression was one of caution rather than confidence. The use of pauses was minimal, which helped maintain a sense of continuity.",
        "fluency_score": 70
    },
    "language_evaluator": {
        "strengths": [
            "The speaker demonstrated a strong ability to organize ideas in a logical and coherent manner.",
            "The use of anecdotes and personal experiences added depth and engagement to the content.",
            "Vocabulary choice was varied and appropriate, contributing to the overall clarity of the message."
        ],
        "improvements": [
            "Transitions between ideas could be smoother, with mor