# Install and Import Dependencies

In [52]:
!pip install numpy pandas librosa torch groq load_dotenv

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [53]:
!pip install tiktoken pytubefix

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [73]:
# Imports
import io
import os
import re
import joblib
import asyncio
import librosa
import tiktoken
import numpy as np
import parselmouth
from pydub import AudioSegment
from nltk.corpus import cmudict
from parselmouth.praat import call
from groq import Groq, AsyncClient
from groq.types.audio import Transcription

# Load environment file
from load_dotenv import load_dotenv
print(load_dotenv('.env.local'))

assert os.environ.get('GROQ_API_KEY'), "Groq API key not found in .env file, please set the key before starting this notebook"

# Global variables
client = AsyncClient()
encoder = tiktoken.get_encoding('gpt2')
fluency_model = joblib.load('fluency/models/weights/xgboost_model.pkl')

try:
    cmu_dict = cmudict.dict()
except:
    import nltk
    nltk.download('cmudict')
    cmu_dict = cmudict.dict()    

True


## Monitor CPU resources

In [74]:
import psutil
import time
import functools

def monitor_resources(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        process = psutil.Process(os.getpid())

        # Get memory and CPU before
        mem_before = process.memory_info().rss / (1024 ** 2)  # MB
        cpu_before = process.cpu_percent(interval=None)

        # Start time and CPU
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()

        # Get memory and CPU after
        mem_after = process.memory_info().rss / (1024 ** 2)  # MB
        cpu_after = process.cpu_percent(interval=0.1)

        # Get number of CPUs used
        cpu_affinity = process.cpu_affinity()
        
        print(f"Function: {func.__name__}")
        print(f"Execution Time: {end_time - start_time:.2f} sec")
        print(f"Memory Usage: {mem_after - mem_before:.2f} MB")
        print(f"CPU Usage: {cpu_after:.2f}%")
        print(f"CPU Cores Used: {cpu_affinity}")

        return result
    return wrapper

def limit_to_one_core(core_id=0):
    """
    Set process to run only on one CPU core (default: core 0).
    """
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            process = psutil.Process(os.getpid())

            # Store the current affinity to restore later
            original_affinity = process.cpu_affinity()
            
            try:
                # Set affinity to a single core
                process.cpu_affinity([core_id])
                print(f"Running {func.__name__} on CPU core {core_id}")
                return func(*args, **kwargs)
            finally:
                # Restore original affinity
                process.cpu_affinity(original_affinity)
        return wrapper
    return decorator

# # Limit NumPy, OpenBLAS etc to use only one CPU core
# os.environ["OMP_NUM_THREADS"] = "1"
# os.environ["OPENBLAS_NUM_THREADS"] = "1"
# os.environ["MKL_NUM_THREADS"] = "1"
# os.environ["VECLIB_MAXIMUM_THREADS"] = "1"
# os.environ["NUMEXPR_NUM_THREADS"] = "1"

# Feature Extraction
Features extracted:
* ZCR
* Pitch
* Jitter
* Shimmer
* Harmonic-to-Noise ratio
* RMS
* MFCC
* DeltaMFCC
* SpeakingRate
* PauseCount
* PauseDuration

In [75]:
# Async Transcription
def split_audio_in_memory(audio_path, max_mb=24):
    audio = AudioSegment.from_wav(audio_path)
    bytes_per_ms = (audio.frame_rate * audio.frame_width * audio.channels) / 1000
    max_bytes = max_mb * 1024 * 1024
    chunk_duration_ms = int(max_bytes / bytes_per_ms)

    chunks = []
    for i in range(0, len(audio), chunk_duration_ms):
        chunk = audio[i:i+chunk_duration_ms]
        buffer = io.BytesIO()
        chunk.export(buffer, format="wav")
        buffer.seek(0)
        chunks.append((f"chunk_{i//chunk_duration_ms}.wav", buffer))

    return chunks

async def transcribe_chunk(filename, audio_buffer):
    return await client.audio.transcriptions.create(
        file=(filename, audio_buffer.read()),
        model="distil-whisper-large-v3-en",
        response_format="verbose_json",
        timestamp_granularities=["word"]
    )


async def transcribe_audio(audio_path, client=client):
    """Transcribe an audio file without saving the chunks to disk"""
    chunks = split_audio_in_memory(audio_path)
    tasks = [transcribe_chunk(name, buffer) for name, buffer in chunks]
    all_transcripts = await asyncio.gather(*tasks)

    transcript_parts = []
    all_words = []
    total_duration = 0.0

    for chunk in all_transcripts:
        transcript_parts.append(chunk.text)
        all_words.extend(getattr(chunk, "words", []))
        total_duration += chunk.duration

    transcript = "".join(transcript_parts)
    
    return Transcription(text=transcript, words=all_words, duration=total_duration)


transcript = await transcribe_audio("samples/confident.wav")
transcript

Transcription(text=" Hi, my name is Adkarsh Malaya. I'm a student and right now what I'm trying to do is I'm trying to get a model and I'm trying to use it to transcribe some filler words. I am very scared. I don't know what will happen and I really really hope this works.", words=[{'word': 'Hi,', 'start': 0.78, 'end': 1.18}, {'word': 'my', 'start': 1.18, 'end': 1.52}, {'word': 'name', 'start': 1.52, 'end': 1.66}, {'word': 'is', 'start': 1.66, 'end': 1.82}, {'word': 'Adkarsh', 'start': 1.82, 'end': 2.04}, {'word': 'Malaya.', 'start': 2.04, 'end': 2.52}, {'word': "I'm", 'start': 2.52, 'end': 3.12}, {'word': 'a', 'start': 3.12, 'end': 3.26}, {'word': 'student', 'start': 3.26, 'end': 3.76}, {'word': 'and', 'start': 3.76, 'end': 4.22}, {'word': 'right', 'start': 4.22, 'end': 4.62}, {'word': 'now', 'start': 4.62, 'end': 4.84}, {'word': 'what', 'start': 4.84, 'end': 5.1}, {'word': "I'm", 'start': 5.1, 'end': 5.26}, {'word': 'trying', 'start': 5.26, 'end': 5.38}, {'word': 'to', 'start': 5.38,

In [76]:
# Helper functions for calculating syllables speaking rate
def get_word_syllable_count(word):
    word = word.lower().strip(".,?!;:")
    if word in cmu_dict:
        return len([p for p in cmu_dict[word][0] if p[-1].isdigit()])
    return max(1, len(re.findall(r'[aeiouy]+', word)))


def estimate_syllable_rate(transcript, duration_sec):
    words = transcript.split()
    total_syllables = sum(get_word_syllable_count(word) for word in words)
    return total_syllables / duration_sec if duration_sec > 0 else 0

In [77]:
# Extract Pitch statistics, Jitter, Shimmer, and HNR ratio through Parselmouth
@monitor_resources
def extract_parselmouth_features(data, sr):
    snd = parselmouth.Sound(values=data, sampling_frequency=sr)

    pitch_obj = snd.to_pitch()
    pitch_mean = call(pitch_obj, "Get mean", 0, 0, "Hertz")
    pitch_std = call(pitch_obj, "Get standard deviation", 0, 0, "Hertz")

    point_process = call(snd, "To PointProcess (periodic, cc)", 75, 500)
    jitter = call(point_process, "Get jitter (local)", 0, 0, 0.0001, 0.02, 1.3)
    shimmer = call([snd, point_process], "Get shimmer (local)", 0, 0, 0.0001, 0.02, 1.3, 1.6)

    harmonicity = call(snd, "To Harmonicity (cc)", 0.01, 75, 0.1, 1.0)
    hnr = call(harmonicity, "Get mean", 0, 0)

    return {
        "pitch_mean": pitch_mean,
        "pitch_std": pitch_std,
        "pitch_var": pitch_std**2,
        "jitter_local": jitter,
        "shimmer_local": shimmer,
        "hnr": hnr
    }

async def async_extract_parselmouth_features(data, sr, executor):
    return await asyncio.get_event_loop().run_in_executor(
        executor, extract_parselmouth_features, data, sr
    )

In [78]:
# Extract RMS Energy, ZCR, MFCC and Deltas using librosa
@monitor_resources
def extract_librosa_features(data, sr):
    zcr = np.mean(librosa.feature.zero_crossing_rate(data))
    
    rms = librosa.feature.rms(y=data)[0]
    rms_mean = np.mean(rms)
    rms_std = np.std(rms)
    rms_var = np.var(rms)

    mfcc = librosa.feature.mfcc(y=data, sr=sr, n_mfcc=13)
    delta = librosa.feature.delta(mfcc)
    mfcc_mean = np.mean(mfcc)
    delta_mean = np.mean(delta)

    return {
        "zcr": zcr,
        "rms_mean": rms_mean,
        "rms_std": rms_std,
        "rms_var": rms_var,
        "mfcc": mfcc.mean(axis=1),
        "delta_mfcc": delta.mean(axis=1),
        "mfcc_mean": mfcc_mean,
        "delta_mean": delta_mean
    }
    
async def async_extract_librosa_features(data, sr, executor):
    return await asyncio.get_event_loop().run_in_executor(
        executor, extract_librosa_features, data, sr
    )

In [79]:
def extract_features_from_wave(data, sr):
    return {
        **extract_librosa_features(data, sr),
        **extract_parselmouth_features(data, sr)
    }
    
async def async_extract_features_from_wave(data, sr, executor):
    # Start both tasks concurrently
    librosa_task = asyncio.create_task(async_extract_librosa_features(data, sr, executor))
    parselmouth_task = asyncio.create_task(async_extract_parselmouth_features(data, sr, executor))

    # Wait for both
    librosa_feats, parselmouth_feats = await asyncio.gather(librosa_task, parselmouth_task)

    return {**librosa_feats, **parselmouth_feats}


@monitor_resources
async def extract_features(audio_path, baseline_duration: float = 0.0, fluency_model=fluency_model):
    data, sr = librosa.load(audio_path)
    assert len(data) != 0, "Your audio file appears to contain no content. Please input a valid file"
    
    duration_sec = librosa.get_duration(y=data, sr=sr)
    baseline_duration = baseline_duration or max(10.0, duration_sec * 0.05)
    
    # Baseline from first few seconds
    baseline_data = data[:min(len(data), int(sr * baseline_duration))]
    baseline_feats = extract_features_from_wave(baseline_data, sr)
    full_feats = extract_features_from_wave(data, sr)

    # Get fluency ratings
    features = ['zcr', 'pitch_mean', 'pitch_std', 'rms_mean', 'rms_std', 'rms_var', 'mfcc_mean', 'delta_mean']
    rating_map = ['Low', 'Medium', 'High']
        
    baseline_fluency_features = np.array([baseline_feats[key] for key in baseline_feats if key in features])
    full_fluency_features = np.array([full_feats[key] for key in full_feats if key in features])

    res = fluency_model.predict(np.vstack((baseline_fluency_features, full_fluency_features)))
    baseline_fluency = rating_map[res[0].argmax()]
    full_fluency = rating_map[res[1].argmax()]

    relative_feats = {}
    for key in full_feats:
        if key not in ['mfcc', 'delta_mfcc']:
            base = baseline_feats.get(key, 0.0)
            full = full_feats[key]
            relative_feats[f'{key}_delta'] = full - base
            relative_feats[f'{key}_ratio'] = full / base if base != 0 else 0

    # Transcription and Speaking Rates
    transcription_json = await transcribe_audio(audio_path)
    duration_sec = transcription_json.duration # type: ignore

    assert duration_sec != 0, "File duration appears to be 0 after transcription?"
    
    # Full data speaking rate
    transcript = transcription_json.text
    word_count = len(transcript.split())
    speaking_rate = word_count / duration_sec
    syllables_rate = estimate_syllable_rate(transcript, duration_sec)
    
    # Baseline speaking rate
    baseline_transcript = [word_segment['word'] for word_segment in transcription_json.words if word_segment['start'] <= baseline_duration]  # type: ignore
    baseline_word_count = len(baseline_transcript)
    baseline_transcript = " ".join(baseline_transcript)
    baseline_speaking_rate = baseline_word_count / baseline_duration
    baseline_syllables_rate = estimate_syllable_rate(baseline_transcript, baseline_duration)
    
    # Pause detection
    intervals = librosa.effects.split(data, top_db=30)
    pauses = [(intervals[i][0] - intervals[i - 1][1]) / sr
              for i in range(1, len(intervals))
              if (intervals[i][0] - intervals[i - 1][1]) / sr > 1.0]
    
    long_pause_count = len(pauses)
    long_pause_total = sum(pauses)

    return {
        "transcript": transcript,
        "duration": duration_sec,
        "baseline_duration": baseline_duration,
        "speaking_rate": speaking_rate,
        "syllables_rate": syllables_rate,
        "baseline_speaking_rate": baseline_speaking_rate,
        "baseline_syllables_rate": baseline_syllables_rate,
        "long_pause_count": long_pause_count,
        "long_pause_duration": long_pause_total,
        "fluency_rating": full_fluency,
        "baseline_fluency_rating": baseline_fluency,
        **full_feats,
        **{f'baseline_{k}': v for k, v in baseline_feats.items()},
        **relative_feats,
    }


In [80]:
features = await extract_features('samples/tim-urban.wav')

Function: extract_features
Execution Time: 0.00 sec
Memory Usage: 0.00 MB
CPU Usage: 0.00%
CPU Cores Used: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
Function: extract_librosa_features
Execution Time: 0.07 sec
Memory Usage: 0.91 MB
CPU Usage: 0.00%
CPU Cores Used: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
Function: extract_parselmouth_features
Execution Time: 0.28 sec
Memory Usage: -2.73 MB
CPU Usage: 0.00%
CPU Cores Used: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
Function: extract_librosa_features
Execution Time: 1.24 sec
Memory Usage: 0.95 MB
CPU Usage: 0.00%
CPU Cores Used: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
Function: extract_parselmouth_features
Execution Time: 4.71 sec
Memory Usage: 2.18 MB
CPU Usage: 0.00%
CPU Cores Used: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]


In [81]:
features

{'transcript': " Reviewer.pxs. So in college, I was a government major, which means I had to write a lot of papers. Now, when a normal student writes a paper, they might spread the work out a little like this. So, you know, you get started maybe a little slowly, but you get enough done in the first week that with some heavier days later on, everything gets done and things stay civil. And I would want to do that like that. That would be the plan. I would have it all ready to go, but then actually the paper would come along, and then I would kind of do this. And that would happen every single paper. But then came my 90-page senior thesis, a paper you're supposed to spend a year on. I knew for a paper like that, my normal workflow was not an option, it was way too big a project. So I planned things out and I decided I kind of had to go something like this. This is how the year would go. So I'd start off light, and I'd bump it up in the middle months. And then at the end, I would kick it u

# Send to GPT for feedback

In [82]:
# def get_prompt(features):
#     prompt = f"""
# You are a professional voice coach and delivery analyst tasked with evaluating a speaker's performance based on a variety of acoustic and prosodic features. Below is an in-depth description of the speech signal, including baseline characteristics, absolute values, and relative shifts.

# ## NOTE:
# - The **first 10 seconds** of the speech are used to define the speaker's personal baseline.
# - All relative metrics (e.g., deltas, ratios) are calculated with respect to this baseline.
# - Your feedback should interpret the *changes* from baseline — not just absolute values — as indicators of intentional modulation or stress, not necessarily flaws.

# ## TRANSCRIPT
# <transcript> 
# {features['transcript']} 
# </transcript>

# ## BASELINE METRICS

# ### Fluency & Tempo
# - Words/sec: {features['baseline_speaking_rate']:.2f}
# - Syllables/sec: {features['baseline_syllables_rate']:.2f}

# ## Voice Modulation
# - Pitch (Mean / Std / Var): {features['baseline_pitch_mean']:.2f} / {features['baseline_pitch_std']:.2f} / {features['baseline_pitch_var']:.2f}
# - Jitter (local): {features['baseline_jitter_local']:.3f}
# - Shimmer (local): {features['baseline_shimmer_local']:.3f}
# - Harmonic-to-Noise Ratio (HNR): {features['baseline_hnr']:.2f}

# ## Energy & Dynamics
# - RMS Energy (Mean / Std / Var): {features['baseline_rms_mean']:.2f} / {features['baseline_rms_std']:.2f} / {features['baseline_rms_var']:.2f}
# - Zero Crossing Rate: {features['baseline_zcr']:.3f}

# ## Timbre & Articulation
# - MFCC Mean: {features['baseline_mfcc_mean']:.2f}
# - Delta MFCC Mean: {features['baseline_delta_mean']:.6f}

# ## RAW METRICS (FOR THE WHOLE SPEECH)
# - Speaking Rate: {features['speaking_rate']:.2f} words/sec
# - Speaking rate: {features['baseline_syllables_rate']:.2f} syllables/sec
# - Long Pauses: {features['long_pause_count']} (>1s)
# - Total Long Pause Duration: {features['long_pause_duration']:.2f} sec
# - Pitch (Mean; Standard deviation; Variation): {features['pitch_mean']:.2f}; {features['pitch_std']:.2f}; {features['pitch_var']:.2f}
# - RMS Energy (Mean; Standard deviation; Variation): {features['rms_mean']:.2f}; {features['rms_std']:.2f}; {features['rms_var']:.2f}
# - ZCR: {features['zcr']:.2f}
# - MFCC and Delta MFCC Mean: {features['mfcc_mean']:.2f}; {features['delta_mean']:.2f}

# ## RELATIVE CHANGES FROM BASELINE
# - Pitch variation change (std): {features['pitch_std_delta']:+.2f}
# - RMS Energy mean change: {features['rms_mean_delta']:+.2f}
# - Speaking rate ratio: {features['speaking_rate'] / features['baseline_speaking_rate']}
# - Interpretation Tip:
#     - A Pitch variation change > 0 may suggest more modulation than usual; < 0 may suggest flattening.
#     - RMS mean delta > 0 = more vocal energy than the beginning few seconds.
#     - Speaking rate ratio < 1 = speaker slowed down as compared to the start of their speech.
#     NOTE: This tip should not be used as an absolute, a speaking rate slowing could mean anxiety as well, infer that from the script

# ## INSTRUCTION

# Now, based on this input, write a narrative-style feedback giving clear, constructive, and context-aware feedback. 

# DO NOT judge the speaker based on universal norms; instead, use their own baseline as reference to detect signs of:
# - Increased or decreased vocal control,
# - Confidence shifts,
# - Monotony vs. modulation,
# - Hesitation or fluency issues.

# You are a closed source model. So you are expected not to reference any specific acoustic features and their values in your feedback.

# Split your feedback in 3 parts: What they did correctly, what they could improve on, and rate their confidence and fluency levels based on the relative metrics.
# """
#     return prompt
def get_prompt(features):
    prompt = f"""
You are a professional voice coach and delivery analyst tasked with evaluating a speaker's performance based on a variety of acoustic and prosodic features. Below is a detailed snapshot of the speaker’s delivery — both baseline and full-clip — along with their changes. Use this to deliver personalized, context-aware feedback.

## NOTE:
- The **first {int(features['baseline_duration'])} seconds** of the speech are used to define the speaker's personal baseline.
- All relative metrics (e.g., deltas, ratios) are calculated with respect to this baseline.
- Interpret *changes* from baseline as signs of adaptation or stress — not necessarily flaws.
- **Avoid quoting any raw values** in your response. Use intuitive, narrative insights only.
- An 86% accurate ML model was used to rate the fluency of the speech, and that rating has also been provided to you.

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
📝 TRANSCRIPT
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
<transcript>
{features['transcript']}
</transcript>

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
📏 BASELINE METRICS (First {int(features['baseline_duration'])} seconds)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

## Fluency & Tempo
- Fluency rating: {features['baseline_fluency_rating']}
- Words/sec: {features['baseline_speaking_rate']:.2f}
- Syllables/sec: {features['baseline_syllables_rate']:.2f}

## Voice Modulation
- Pitch (Mean / Std / Var): {features['baseline_pitch_mean']:.2f} / {features['baseline_pitch_std']:.2f} / {features['baseline_pitch_var']:.2f}
- Jitter (local): {features['baseline_jitter_local']:.3f}
- Shimmer (local): {features['baseline_shimmer_local']:.3f}
- Harmonic-to-Noise Ratio (HNR): {features['baseline_hnr']:.2f}

## Energy & Dynamics
- RMS Energy (Mean / Std / Var): {features['baseline_rms_mean']:.2f} / {features['baseline_rms_std']:.2f} / {features['baseline_rms_var']:.2f}
- Zero Crossing Rate: {features['baseline_zcr']:.3f}

## Timbre & Articulation
- MFCC Mean: {features['baseline_mfcc_mean']:.2f}
- Delta MFCC Mean: {features['baseline_delta_mean']:.6f}

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
📊 FULL CLIP METRICS
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

## Fluency & Tempo
- Fluency rating: {features['fluency_rating']}
- Words/sec: {features['speaking_rate']:.2f}
- Syllables/sec: {features['syllables_rate']:.2f}
- Long pauses (>1s): {features['long_pause_count']}
- Total pause duration: {features['long_pause_duration']:.2f} sec

## Voice Modulation
- Pitch (Mean / Std / Var): {features['pitch_mean']:.2f} / {features['pitch_std']:.2f} / {features['pitch_var']:.2f}
- Jitter (local): {features['jitter_local']:.3f}
- Shimmer (local): {features['shimmer_local']:.3f}
- Harmonic-to-Noise Ratio (HNR): {features['hnr']:.2f}

## Energy & Dynamics
- RMS Energy (Mean / Std / Var): {features['rms_mean']:.2f} / {features['rms_std']:.2f} / {features['rms_var']:.2f}
- Zero Crossing Rate: {features['zcr']:.3f}

## Timbre & Articulation
- MFCC Mean: {features['mfcc_mean']:.2f}
- Delta MFCC Mean: {features['delta_mean']:.6f}

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
📈 RELATIVE CHANGES FROM BASELINE
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

## Tempo & Fluency
- Speaking rate ratio: {features['speaking_rate'] / features['baseline_speaking_rate']:.2f}
- Syllable rate ratio: {features['syllables_rate'] / features['baseline_syllables_rate']:.2f}

## Modulation
- Pitch std delta: {features['pitch_std_delta']:+.2f}
- Jitter delta: {features['jitter_local_delta']:+.3f}
- Shimmer delta: {features['shimmer_local_delta']:+.3f}
- HNR delta: {features['hnr_delta']:+.2f}

## Energy
- RMS mean delta: {features['rms_mean_delta']:+.2f}
- RMS std delta: {features['rms_std_delta']:+.2f}
- ZCR delta: {features['zcr_delta']:+.3f}

## Timbre
- MFCC mean delta: {features['mfcc_mean_delta']:+.2f}
- Delta MFCC mean delta: {features['delta_mean_delta']:+.6f}

🧠 **Interpretation Tips** (for internal use only):
- A **negative pitch_std_delta** might suggest monotony or nervousness; a positive value implies expressive modulation.
- **Decreased RMS or HNR** may imply loss of vocal energy or confidence.
- **Increased jitter/shimmer** may reflect stress or instability.
- A **low syllable rate ratio** suggests slowing down relative to their natural pace, which may imply hesitation or deliberate pacing.
- **ZCR changes** may reflect articulation style or clarity.

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
🧭 INSTRUCTIONS FOR FEEDBACK GENERATION
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Using the data above, write a highly personalized and supportive **narrative-style voice coaching paragraph**. Do not cite any specific numerical values. Your tone should be professional, encouraging, and practical.

Structure your feedback in **three sections**:

1. ✅ **What the speaker did well** — Highlight strengths or improvements in vocal control, energy, fluency, or confidence.
2. 🛠️ **What they can improve** — Tactfully mention areas that deviated from their baseline and might affect clarity or delivery.
3. 📊 **Confidence & fluency rating** — Conclude with your overall impression of their vocal confidence and fluency (e.g., low, moderate, high), based on relative metrics.

DO NOT compare to average speakers. DO NOT be generic. Focus only on deviations from this speaker's own baseline and the emotional/functional impact of those changes.
"""
    return prompt


def generate_feedback(features):
    prompt = get_prompt(features)

    client = Groq()
    completion = client.chat.completions.create(
        model="llama-3.3-70b-versatile",
        messages=[
        {
            "role": "user",
            "content": prompt
        }
        ],
        temperature=0.5,
        max_completion_tokens=32768,
        top_p=1,
        stream=False,
        stop=None,
    )

    return completion.choices[0].message

# Run Pipeline

In [83]:
def get_n_tokens(features): return len(encoder.encode(get_prompt(features)))

In [84]:
# An unconfident speech
path = "samples/unconfident.wav"
features = await extract_features(path)
feedback = generate_feedback(features)

get_n_tokens(features)

Function: extract_features
Execution Time: 0.00 sec
Memory Usage: 0.00 MB
CPU Usage: 15.60%
CPU Cores Used: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
Function: extract_librosa_features
Execution Time: 0.02 sec
Memory Usage: 1.88 MB
CPU Usage: 0.00%
CPU Cores Used: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
Function: extract_parselmouth_features
Execution Time: 0.07 sec
Memory Usage: 1.53 MB
CPU Usage: 0.00%
CPU Cores Used: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
Function: extract_librosa_features
Execution Time: 0.05 sec
Memory Usage: 0.03 MB
CPU Usage: 0.00%
CPU Cores Used: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
Function: extract_parselmouth_features
Execution Time: 0.16 sec
Memory Usage: 1.88 MB
CPU Usage: 0.00%
CPU Cores Used: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]


1438

In [85]:
import IPython.display as ipd
ipd.Markdown(feedback.content)

As you began speaking, it was clear that you took a moment to settle into your pace, and once you did, you demonstrated a notable increase in your speaking rate, which suggests that you were able to find a rhythm that worked for you. Your voice showed a slight decrease in pitch variability, which may indicate a tendency towards a more monotone delivery when you're discussing topics that make you nervous, but this also shows that you're able to maintain a consistent tone. Additionally, your articulation remained relatively clear, which is a testament to your ability to enunciate even when feeling anxious.

One area where you might focus on improving is in managing your nervous energy. There were moments where your voice reflected a slight increase in instability, which could be a sign of stress or apprehension. You also had a brief pause, which, while not uncommon, might indicate a moment of hesitation. Working on techniques to manage your breath and calm your nerves could help you feel more grounded and confident in your delivery. Furthermore, paying attention to your vocal energy and trying to maintain a consistent level of engagement could help keep your audience more invested in what you're saying.

Overall, your vocal confidence and fluency rating is on the lower end, largely due to the noticeable effects of your nervousness on your delivery. However, it's essential to recognize that these are common challenges, especially when discussing a topic that makes you uncomfortable. With practice and patience, you can work on building your confidence and developing strategies to manage your nerves, which will, in turn, improve your fluency and overall delivery. Remember, the fact that you're acknowledging and working on your fears is a significant step forward, and with time, you'll see improvements in your public speaking skills.

In [86]:
# A confident speech
path = "samples/confident.wav"
features = await extract_features(path)
feedback = generate_feedback(features)

get_n_tokens(features)

Function: extract_features
Execution Time: 0.00 sec
Memory Usage: 0.00 MB
CPU Usage: 0.00%
CPU Cores Used: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
Function: extract_librosa_features
Execution Time: 0.02 sec
Memory Usage: 0.84 MB
CPU Usage: 0.00%
CPU Cores Used: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
Function: extract_parselmouth_features
Execution Time: 0.15 sec
Memory Usage: 0.04 MB
CPU Usage: 0.00%
CPU Cores Used: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
Function: extract_librosa_features
Execution Time: 0.04 sec
Memory Usage: 0.00 MB
CPU Usage: 0.00%
CPU Cores Used: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
Function: extract_parselmouth_features
Execution Time: 0.11 sec
Memory Usage: 1.00 MB
CPU Usage: 0.00%
CPU Cores Used: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]


1456

In [87]:
features

{'transcript': " Hi, my name is Adkarsh Malaya. I'm a student and right now what I'm trying to do is I'm trying to get a model and I'm trying to use it to transcribe some filler words. I am very scared. I don't know what will happen and I really really hope this works.",
 'duration': 15.72,
 'baseline_duration': 10.0,
 'speaking_rate': 3.3078880407124682,
 'syllables_rate': 4.134860050890585,
 'baseline_speaking_rate': 3.3,
 'baseline_syllables_rate': 4.1,
 'long_pause_count': 0,
 'long_pause_duration': 0,
 'fluency_rating': 'Low',
 'baseline_fluency_rating': 'Low',
 'zcr': np.float64(0.08035611403023599),
 'rms_mean': np.float32(0.0131712835),
 'rms_std': np.float32(0.010488089),
 'rms_var': np.float32(0.00011000002),
 'mfcc': array([-467.24243   ,  134.46071   ,  -29.663124  ,   35.92738   ,
           7.637987  ,    9.713081  ,    9.715767  ,    2.3445826 ,
         -13.212228  ,    0.70066255,   -0.62327105,  -11.099308  ,
          -2.377678  ], dtype=float32),
 'delta_mfcc': arra

In [88]:
ipd.Markdown(feedback.content)

As you began speaking, it was clear that you had a strong foundation to build upon, with a consistent pace that allowed your words to flow smoothly. Your voice modulation showed expressive qualities, suggesting a good range of emotional expression. Notably, your ability to maintain a relatively stable energy level throughout the speech was impressive, indicating a level of comfort with the material. 

However, there were moments where your delivery deviated from your baseline, potentially impacting the clarity and confidence of your message. For instance, your pitch variation became slightly more pronounced, which could be a sign of nervousness or an attempt to add emphasis to certain points. Additionally, the subtle changes in your articulation and timbre might have affected the overall crispness of your words. It's also worth exploring how you can leverage your natural speaking rate to enhance the engagement of your audience, as there were moments where the pace felt slightly hurried or cautious.

Overall, your vocal confidence and fluency came across as moderate, with a clear desire to convey your message effectively. The low fluency rating from the ML model suggests that there's room for improvement in terms of smoothness and natural flow. Nonetheless, your speech showed promising signs of expressive modulation and a stable energy level, which are valuable assets for any speaker. With practice and attention to these areas, you have the potential to enhance your delivery, making your messages even more impactful and engaging for your listeners.

In [89]:
tim_urban_path = "samples/tim-urban.wav"
tim_urban_features = await extract_features(tim_urban_path)
tim_urban_feedback = generate_feedback(tim_urban_features)

get_n_tokens(tim_urban_features)

Function: extract_features
Execution Time: 0.00 sec
Memory Usage: 0.00 MB
CPU Usage: 15.60%
CPU Cores Used: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
Function: extract_librosa_features
Execution Time: 0.08 sec
Memory Usage: 0.09 MB
CPU Usage: 0.00%
CPU Cores Used: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
Function: extract_parselmouth_features
Execution Time: 0.28 sec
Memory Usage: -11.90 MB
CPU Usage: 15.50%
CPU Cores Used: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
Function: extract_librosa_features
Execution Time: 1.18 sec
Memory Usage: 2.07 MB
CPU Usage: 0.00%
CPU Cores Used: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
Function: extract_parselmouth_features
Execution Time: 4.56 sec
Memory Usage: 2.50 MB
CPU Usage: 0.00%
CPU Cores Used: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]


4165

In [90]:
tim_urban_features

{'transcript': " Reviewer.pxs. So in college, I was a government major, which means I had to write a lot of papers. Now, when a normal student writes a paper, they might spread the work out a little like this. So, you know, you get started maybe a little slowly, but you get enough done in the first week that with some heavier days later on, everything gets done and things stay civil. And I would want to do that like that. That would be the plan. I would have it all ready to go, but then actually the paper would come along, and then I would kind of do this. And that would happen every single paper. But then came my 90-page senior thesis, a paper you're supposed to spend a year on. I knew for a paper like that, my normal workflow was not an option, it was way too big a project. So I planned things out and I decided I kind of had to go something like this. This is how the year would go. So I'd start off light, and I'd bump it up in the middle months. And then at the end, I would kick it u

In [91]:
import IPython.display as ipd
ipd.Markdown(tim_urban_feedback.content)

As you reflect on your speech, it's clear that you have a natural ability to engage your audience with your storytelling style, and your voice plays a significant role in conveying the emotional depth of your experiences. One of your strengths is your ability to modulate your pitch to emphasize key points, which helps to keep your listeners invested in your narrative. Additionally, you demonstrate a good sense of pacing, allowing your audience to follow your thoughts and reflect on the insights you're sharing. Your energy levels remain relatively consistent, which is commendable given the length and personal nature of your talk.

However, there are moments where your speaking rate slows down significantly compared to your natural pace, which might suggest hesitation or a deliberate attempt to emphasize certain points. This slowing down could also be related to the complexity of the topics you're discussing, as you navigate through explanations of procrastination and its effects. Your vocal modulation shows some signs of becoming more monotone at times, which could be a sign of nervousness or the challenge of maintaining expressive variation over the course of your talk. Furthermore, there are noticeable pauses throughout your speech, which, while sometimes effective for dramatic effect, at other times might disrupt the flow of your narrative. Being mindful of these pauses and working on smoother transitions between ideas could enhance the overall delivery of your message.

Overall, your vocal confidence and fluency are moderate, considering the deviations from your baseline metrics. While you show a strong ability to connect with your audience through your content, there are areas where your delivery could be refined to better match your natural fluency and energy. With practice and attention to pacing, pitch variation, and pause management, you have the potential to further engage your listeners and convey your messages with even greater impact. Your unique voice and perspective are considerable strengths, and with some focused development, you could enhance your already compelling storytelling ability.