# Install and Import Dependencies

In [1]:
%pip install numpy pandas librosa torch groq load_dotenv

^C
Note: you may need to restart the kernel to use updated packages.


In [2]:
%pip install tiktoken pytubefix

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# Imports
import os
import re
import numpy as np
import librosa
from groq import Groq
from nltk.corpus import cmudict

# Load environment file
from load_dotenv import load_dotenv
print(load_dotenv('../../.env.local'))

assert os.environ.get('GROQ_API_KEY'), "Groq API key not found in .env file, please set the key before starting this notebook"

# Global variables
client = Groq()

try:
    cmu_dict = cmudict.dict()
except:
    import nltk
    nltk.download('cmudict')
    cmu_dict = cmudict.dict()    

True


# Feature Extraction
Features extracted:
	•	ZCR
	•	Pitch
	•	RMS
	•	MFCC
	•	DeltaMFCC
	•	SpeakingRate
	•	PauseCount
	•	PauseDuration

In [3]:
def transcribe_audio(audio_path, client=client):
    with open(audio_path, "rb") as file:
        transcription = client.audio.transcriptions.create(
        file=(audio_path, file.read()),
        model="distil-whisper-large-v3-en",
        response_format="verbose_json",
        timestamp_granularities=['word', 'segment']
        )

    return transcription


def get_word_syllable_count(word):
    word = word.lower().strip(".,?!;:")
    if word in cmu_dict:
        return len([p for p in cmu_dict[word][0] if p[-1].isdigit()])
    return max(1, len(re.findall(r'[aeiouy]+', word)))


def estimate_syllable_rate(transcript, duration_sec):
    words = transcript.split()
    total_syllables = sum(get_word_syllable_count(word) for word in words)
    return total_syllables / duration_sec if duration_sec > 0 else 0


def extract_features_from_wave(data, sr, prefix: str = ""):
    zcr = np.mean(librosa.feature.zero_crossing_rate(data))

    pitch = librosa.yin(data, fmin=librosa.note_to_hz("C2"), fmax=librosa.note_to_hz("C7"), sr=sr)
    pitch = np.nan_to_num(pitch)
    pitch_mean = np.mean(pitch)
    pitch_std = np.std(pitch)
    pitch_var = np.var(pitch)

    rms = librosa.feature.rms(y=data)[0]
    rms_mean = np.mean(rms)
    rms_std = np.std(rms)
    rms_var = np.var(rms)

    mfcc = librosa.feature.mfcc(y=data, sr=sr, n_mfcc=13)
    delta = librosa.feature.delta(mfcc)
    mfcc_mean = np.mean(mfcc)
    delta_mean = np.mean(delta)

    return {
        "zcr": zcr,
        "pitch_mean": pitch_mean,
        "pitch_std": pitch_std,
        "pitch_var": pitch_var,
        "rms_mean": rms_mean,
        "rms_std": rms_std,
        "rms_var": rms_var,
        "mfcc_mean": mfcc_mean,
        "delta_mean": delta_mean
    }


def extract_features(audio_path, baseline_duration=10.0):
    data, sr = librosa.load(audio_path)

    assert len(data) != 0, "Your audio file appears to contain no content. Please input a valid file"
    assert baseline_duration != 0, "Baseline cannot be 0!"
    assert baseline_duration < len(data) / sr, "Baseline cannot be greater than the length of the audio file input"
    
    # Baseline from first few seconds
    baseline_data = data[:min(len(data), int(sr * baseline_duration))]
    baseline_feats = extract_features_from_wave(baseline_data, sr)
    full_feats = extract_features_from_wave(data, sr)

    relative_feats = {}
    for key in full_feats:
        base = baseline_feats.get(key, 0.0)
        full = full_feats[key]
        relative_feats[f'{key}_delta'] = full - base
        relative_feats[f'{key}_ratio'] = full / base if base != 0 else 0

    # Transcription and Speaking Rates
    transcription_json = transcribe_audio(audio_path)
    duration_sec = transcription_json.duration # type: ignore
    baseline_duration = max(10.0, duration_sec * 0.05)
    print(baseline_duration)

    assert duration_sec != 0, "File duration appears to be 0 after transcription?"
    
    # Full data speaking rate
    transcript = transcription_json.text
    word_count = len(transcript.split())
    speaking_rate = word_count / duration_sec
    syllables_rate = estimate_syllable_rate(transcript, duration_sec)
    
    # Baseline speaking rate
    baseline_transcript = [word_segment['word'] for word_segment in transcription_json.words if word_segment['start'] <= baseline_duration]  # type: ignore
    baseline_word_count = len(baseline_transcript)
    baseline_transcript = " ".join(baseline_transcript)
    baseline_speaking_rate = baseline_word_count / baseline_duration
    baseline_syllables_rate = estimate_syllable_rate(baseline_transcript, baseline_duration)
    
    # Pause detection
    intervals = librosa.effects.split(data, top_db=30)
    pauses = [(intervals[i][0] - intervals[i - 1][1]) / sr
              for i in range(1, len(intervals))
              if (intervals[i][0] - intervals[i - 1][1]) / sr > 1.0]
    
    long_pause_count = len(pauses)
    long_pause_total = sum(pauses)

    return {
        "transcript": transcript,
        "duration": duration_sec,
        "baseline_duration": baseline_duration,
        "speaking_rate": speaking_rate,
        "syllables_rate": syllables_rate,
        "baseline_speaking_rate": baseline_speaking_rate,
        "baseline_syllables_rate": baseline_syllables_rate,
        "long_pause_count": long_pause_count,
        "long_pause_duration": long_pause_total,
        **full_feats,
        **{f'baseline_{k}': v for k, v in baseline_feats.items()},
        **relative_feats,
    }


In [4]:
features = extract_features('../../samples/unconfident.m4a')

  data, sr = librosa.load(audio_path)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


10.0


# Send to GPT for feedback

In [5]:
def get_prompt(features):
    prompt = f"""
You are a professional voice coach and delivery analyst tasked with evaluating a speaker's performance based on a variety of acoustic and prosodic features. Below is an in-depth description of the speech signal, including baseline characteristics, absolute values, and relative shifts.

## NOTE:
- The **first 10 seconds** of the speech are used to define the speaker's personal baseline.
- All relative metrics (e.g., deltas, ratios) are calculated with respect to this baseline.
- Your feedback should interpret the *changes* from baseline — not just absolute values — as indicators of intentional modulation or stress, not necessarily flaws.

## TRANSCRIPT
<transcript> 
{features['transcript']} 
</transcript>

## BASELINE METRICS
- Speaking rate: {features['baseline_speaking_rate']:.2f} words/sec
- Speaking rate: {features['baseline_syllables_rate']:.2f} syllables/sec
- Pitch (Mean; Standard deviation; Variation): {features['baseline_pitch_mean']:.2f}; {features['baseline_pitch_std']:.2f}; {features['baseline_pitch_var']:.2f}
- RMS Energy (Mean; Standard deviation; Variation): {features['baseline_rms_mean']:.2f}; {features['baseline_rms_std']:.2f}; {features['baseline_rms_var']:.2f}
- ZCR: {features['baseline_zcr']:.2f}
- MFCC and Delta MFCC Mean: {features['baseline_mfcc_mean']:.2f}; {features['baseline_delta_mean']:.2f}

## RAW METRICS (FOR THE WHOLE SPEECH)
- Speaking Rate: {features['speaking_rate']:.2f} words/sec
- Speaking rate: {features['baseline_syllables_rate']:.2f} syllables/sec
- Long Pauses: {features['long_pause_count']} (>1s)
- Total Long Pause Duration: {features['long_pause_duration']:.2f} sec
- Pitch (Mean; Standard deviation; Variation): {features['pitch_mean']:.2f}; {features['pitch_std']:.2f}; {features['pitch_var']:.2f}
- RMS Energy (Mean; Standard deviation; Variation): {features['rms_mean']:.2f}; {features['rms_std']:.2f}; {features['rms_var']:.2f}
- ZCR: {features['zcr']:.2f}
- MFCC and Delta MFCC Mean: {features['mfcc_mean']:.2f}; {features['delta_mean']:.2f}

## RELATIVE CHANGES FROM BASELINE
- Pitch variation change (std): {features['pitch_std_delta']:+.2f}
- RMS Energy mean change: {features['rms_mean_delta']:+.2f}
- Speaking rate ratio: {features['speaking_rate'] / features['baseline_speaking_rate']}
- Interpretation Tip:
    - A Pitch variation change > 0 may suggest more modulation than usual; < 0 may suggest flattening.
    - RMS mean delta > 0 = more vocal energy than the beginning few seconds.
    - Speaking rate ratio < 1 = speaker slowed down as compared to the start of their speech.
    NOTE: This tip should not be used as an absolute, a speaking rate slowing could mean anxiety as well, infer that from the script

## INSTRUCTION

Now, based on this input, write a narrative-style feedback giving clear, constructive, and context-aware feedback. 

DO NOT judge the speaker based on universal norms; instead, use their own baseline as reference to detect signs of:
- Increased or decreased vocal control,
- Confidence shifts,
- Monotony vs. modulation,
- Hesitation or fluency issues.

You are a closed source model. So you are expected not to reference any specific acoustic features and their values in your feedback.

Split your feedback in 3 parts: What they did correctly, what they could improve on, and rate their confidence and fluency levels based on the relative metrics.
"""
    return prompt


def generate_feedback(features):
    prompt = get_prompt(features)

    client = Groq()
    completion = client.chat.completions.create(
        model="llama-3.3-70b-versatile",
        messages=[
        {
            "role": "user",
            "content": prompt
        }
        ],
        temperature=0.5,
        max_completion_tokens=32768,
        top_p=1,
        stream=False,
        stop=None,
    )

    return completion.choices[0].message

In [6]:
import tiktoken

In [7]:
encoder = tiktoken.get_encoding('gpt2')
len(encoder.encode(get_prompt(features)))

787

# Run Pipeline

In [8]:
# An unconfident speech
path = "../../samples/unconfident.m4a"
features = extract_features(path)
feedback = generate_feedback(features)

  data, sr = librosa.load(audio_path)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


10.0


In [9]:
import IPython.display as ipd
ipd.Markdown(feedback.content)

## Part 1: Strengths

The speaker demonstrates a notable ability to modulate their voice, suggesting an attempt to convey emotions and emphasize certain points. This modulation is a positive aspect, as it adds variety to the speech and helps maintain the listener's engagement. The speaker's overall pace, while slightly adjusted from their baseline, still allows for clear understanding of the message. It's commendable that despite admitting to a fear of public speaking, the speaker pushes through and delivers their message, showing resilience.

## Part 2: Areas for Improvement

There are moments where the speaker seems to hesitate, indicated by repetitive filler words and slight pauses. These hesitations could be interpreted as signs of nervousness or lack of confidence in what is being said. Working on reducing these filler words and pauses could help improve the overall fluency of the speech. Additionally, there are instances where the speaker's voice could benefit from more consistent energy, as there are subtle shifts that might suggest moments of heightened anxiety or doubt. Practicing speech in a comfortable environment could help in achieving a more balanced delivery.

## Part 3: Confidence and Fluency Assessment

Based on the relative changes from the speaker's baseline, it appears that there is a moderate level of confidence, albeit with noticeable dips. The speaker's attempt to modulate their voice suggests an effort to engage the audience, which is a confident trait. However, the presence of filler words and slight pauses indicates some hesitation, which might suggest that the speaker is not entirely comfortable with public speaking, as they themselves admitted. The fluency is somewhat affected by these hesitations but not severely impaired, allowing the core message to be understood. On a scale of 1 to 10, with 10 being the highest, I would rate the speaker's confidence a 6 and their fluency a 7. With practice and experience, these scores could potentially increase as the speaker becomes more comfortable with expressing themselves publicly.

In [10]:
# A confident speech
path = "../../samples/confident.m4a"
features = extract_features(path)
feedback = generate_feedback(features)

  data, sr = librosa.load(audio_path)


10.0


In [11]:
features

{'transcript': " Hi, my name is Adkarsh Malaya. I'm a student and right now what I'm trying to do is I'm trying to get a model and I'm trying to use it to transcribe some filler words. I am very scared. I don't know what will happen and I really really hope this works.",
 'duration': 15.72,
 'baseline_duration': 10.0,
 'speaking_rate': 3.3078880407124682,
 'syllables_rate': 4.134860050890585,
 'baseline_speaking_rate': 3.3,
 'baseline_syllables_rate': 4.1,
 'long_pause_count': 0,
 'long_pause_duration': 0,
 'zcr': 0.08035611403023599,
 'pitch_mean': 287.1815062705668,
 'pitch_std': 466.33901796330355,
 'pitch_var': 217472.07967497836,
 'rms_mean': 0.0131712835,
 'rms_std': 0.010488089,
 'rms_var': 0.00011000002,
 'mfcc_mean': -24.901373,
 'delta_mean': 0.01626812,
 'baseline_zcr': 0.07430711644431555,
 'baseline_pitch_mean': 272.55173088127896,
 'baseline_pitch_std': 445.4344329724631,
 'baseline_pitch_var': 198411.83407749975,
 'baseline_rms_mean': 0.013894684,
 'baseline_rms_std': 0.

In [12]:
ipd.Markdown(feedback.content)

## Part 1: Strengths
The speaker demonstrates a strong ability to maintain a consistent pace throughout their speech, showing minimal deviation from their initial speaking rate. This suggests a good level of comfort with the topic and an ability to articulate their thoughts without significant hesitation. Additionally, the speaker's vocal energy remains relatively stable, indicating a consistent level of engagement and enthusiasm for the subject matter. Overall, the speaker's baseline characteristics provide a solid foundation for effective communication.

## Part 2: Areas for Improvement
There are some subtle indications that the speaker may be experiencing a slight increase in emotional arousal or modulation during certain parts of the speech. This could be interpreted as a natural response to the topic, but it may also suggest that the speaker is not entirely at ease with the material or is experiencing some level of anxiety. Furthermore, while the speaker's pace remains relatively consistent, there are no significant pauses or breaks to allow for emphasis or dramatic effect, which could make the speech feel somewhat monotone or flat at times. By introducing more deliberate pauses or variations in pace, the speaker could add more nuance and interest to their delivery.

## Part 3: Confidence and Fluency Assessment
Based on the relative changes from the speaker's baseline, it appears that their confidence levels may be slightly impacted by the topic or the situation. The subtle increase in modulation suggests that the speaker may be experiencing some level of emotional arousal, which could be related to anxiety or nervousness. However, this does not seem to significantly impair their fluency, as they are still able to articulate their thoughts and maintain a consistent pace. I would rate the speaker's confidence level as moderately affected, with a score of 6 out of 10. Their fluency level, on the other hand, remains relatively high, with a score of 8 out of 10, indicating that they are able to communicate their ideas clearly and effectively, despite some minor hints of hesitation or emotional influence. Overall, the speaker demonstrates a good level of fluency and some areas for improvement in terms of confidence and emotional control.

In [13]:
# Installing the audio from Tim Urban's TED talk and using that to assess the model
from pytubefix import YouTube

if not os.path.exists("../../samples/tim-urban.m4a"):
    yt = YouTube('https://www.youtube.com/watch?v=arj7oStGLkU')
    yt.streams.get_audio_only().download('../../samples', 'tim-urban.m4a')

In [14]:
tim_urban_path = "../../samples/tim-urban.m4a"
tim_urban_features = extract_features(tim_urban_path)
tim_urban_feedback = generate_feedback(tim_urban_features)

  data, sr = librosa.load(audio_path)


42.1872517


In [15]:
len(encoder.encode(get_prompt(tim_urban_features)))

3478

In [16]:
tim_urban_features

{'transcript': " Reviewer.pxs. So in college, I was a government major, which means I had to write a lot of papers. Now, when a normal student writes a paper, they might spread the work out a little like this. So, you know, you get started maybe a little slowly, but you get enough done in the first week that with some heavier days later on, everything gets done and things stay civil. And I would want to do that like that. That would be the plan. I would have it all ready to go, but then actually the paper would come along, and then I would kind of do this. And that would happen every single paper. But then came my 90-page senior thesis, a paper you're supposed to spend a year on. I knew for a paper like that, my normal workflow was not an option, it was way too big a project. So I planned things out and I decided I kind of had to go something like this. This is how the year would go. So I'd start off light, and I'd bump it up in the middle months, and then at the end, I would kick it u

In [17]:
import IPython.display as ipd
ipd.Markdown(tim_urban_feedback.content)

## Part 1: Strengths

The speaker demonstrates a strong ability to engage with their audience through their narrative, showcasing a clear and relatable storytelling style. Their use of personal anecdotes and humorous examples effectively conveys their message, making the content more accessible and enjoyable for the listeners. The speaker's vocal delivery is generally fluent, with a good balance of pace and pauses that allows the audience to follow their thoughts easily. This suggests a level of comfort with the material, which is essential for maintaining audience interest. Furthermore, the speaker's ability to convey a sense of vulnerability and shared experience through their discussion of procrastination and personal struggles adds a layer of authenticity to their presentation.

## Part 2: Areas for Improvement

One area where the speaker could improve is in maintaining consistent vocal energy and modulation throughout the presentation. There are moments where the speaker's voice becomes somewhat flat, which could be due to a decrease in vocal control or a shift in confidence levels. These fluctuations might make certain sections of the speech less engaging than others. Additionally, while the speaker's use of pauses is generally effective for emphasis and clarity, there are instances where the pauses might be slightly too long or too frequent, potentially disrupting the flow of the narrative. Working on smoothing out these transitions could enhance the overall delivery.

## Part 3: Confidence and Fluency Assessment

Based on the speaker's performance, it's evident that they possess a good level of confidence in their material, which is reflected in their generally fluent delivery and engaging storytelling. However, there are subtle hints of hesitation or decreased vocal control in certain segments, which might suggest slight dips in confidence or comfort with specific topics. The speaker's fluency is notable, with a clear and coherent presentation of ideas, indicating strong preparation and familiarity with the subject matter. Overall, I would rate the speaker's confidence level as high, with minor fluctuations that do not significantly impact the effectiveness of their presentation. Their fluency level is also high, with a well-structured narrative that is easy to follow, suggesting strong communication skills. The minor areas for improvement are primarily related to fine-tuning their vocal delivery to maintain a consistent level of engagement throughout the speech.