# Install and Import Dependencies

In [1]:
!pip install numpy pandas librosa torch groq load_dotenv

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [2]:
!pip install tiktoken pytubefix

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [3]:
# Imports
import os
import re
import numpy as np
import librosa
from groq import Groq
from nltk.corpus import cmudict

# Load environment file
from load_dotenv import load_dotenv
print(load_dotenv('.env.local'))

assert os.environ.get('GROQ_API_KEY'), "Groq API key not found in .env file, please set the key before starting this notebook"

# Global variables
client = Groq()

try:
    cmu_dict = cmudict.dict()
except:
    import nltk
    nltk.download('cmudict')
    cmu_dict = cmudict.dict()    

True


# Feature Extraction
Features extracted:
	•	ZCR
	•	Pitch
	•	RMS
	•	MFCC
	•	DeltaMFCC
	•	SpeakingRate
	•	PauseCount
	•	PauseDuration

In [4]:
def transcribe_audio(audio_path, client=client):
    with open(audio_path, "rb") as file:
        transcription = client.audio.transcriptions.create(
        file=(audio_path, file.read()),
        model="distil-whisper-large-v3-en",
        response_format="verbose_json",
        timestamp_granularities=['word', 'segment']
        )

    return transcription


def get_word_syllable_count(word):
    word = word.lower().strip(".,?!;:")
    if word in cmu_dict:
        return len([p for p in cmu_dict[word][0] if p[-1].isdigit()])
    return max(1, len(re.findall(r'[aeiouy]+', word)))


def estimate_syllable_rate(transcript, duration_sec):
    words = transcript.split()
    total_syllables = sum(get_word_syllable_count(word) for word in words)
    return total_syllables / duration_sec if duration_sec > 0 else 0


def extract_features_from_wave(data, sr, prefix: str = ""):
    zcr = np.mean(librosa.feature.zero_crossing_rate(data))

    pitch = librosa.yin(data, fmin=librosa.note_to_hz("C2"), fmax=librosa.note_to_hz("C7"), sr=sr)
    pitch = np.nan_to_num(pitch)
    pitch_mean = np.mean(pitch)
    pitch_std = np.std(pitch)
    pitch_var = np.var(pitch)

    rms = librosa.feature.rms(y=data)[0]
    rms_mean = np.mean(rms)
    rms_std = np.std(rms)
    rms_var = np.var(rms)

    mfcc = librosa.feature.mfcc(y=data, sr=sr, n_mfcc=13)
    delta = librosa.feature.delta(mfcc)
    mfcc_mean = np.mean(mfcc)
    delta_mean = np.mean(delta)

    return {
        "zcr": zcr,
        "pitch_mean": pitch_mean,
        "pitch_std": pitch_std,
        "pitch_var": pitch_var,
        "rms_mean": rms_mean,
        "rms_std": rms_std,
        "rms_var": rms_var,
        "mfcc_mean": mfcc_mean,
        "delta_mean": delta_mean
    }


def extract_features(audio_path, baseline_duration=10.0):
    data, sr = librosa.load(audio_path)

    assert len(data) != 0, "Your audio file appears to contain no content. Please input a valid file"
    assert baseline_duration != 0, "Baseline cannot be 0!"
    assert baseline_duration < len(data) / sr, "Baseline cannot be greater than the length of the audio file input"
    
    # Baseline from first few seconds
    baseline_data = data[:min(len(data), int(sr * baseline_duration))]
    baseline_feats = extract_features_from_wave(baseline_data, sr)
    full_feats = extract_features_from_wave(data, sr)

    relative_feats = {}
    for key in full_feats:
        base = baseline_feats.get(key, 0.0)
        full = full_feats[key]
        relative_feats[f'{key}_delta'] = full - base
        relative_feats[f'{key}_ratio'] = full / base if base != 0 else 0

    # Transcription and Speaking Rates
    transcription_json = transcribe_audio(audio_path)
    duration_sec = transcription_json.duration # type: ignore
    baseline_duration = max(10.0, duration_sec * 0.05)
    print(baseline_duration)

    assert duration_sec != 0, "File duration appears to be 0 after transcription?"
    
    # Full data speaking rate
    transcript = transcription_json.text
    word_count = len(transcript.split())
    speaking_rate = word_count / duration_sec
    syllables_rate = estimate_syllable_rate(transcript, duration_sec)
    
    # Baseline speaking rate
    baseline_transcript = [word_segment['word'] for word_segment in transcription_json.words if word_segment['start'] <= baseline_duration]  # type: ignore
    baseline_word_count = len(baseline_transcript)
    baseline_transcript = " ".join(baseline_transcript)
    baseline_speaking_rate = baseline_word_count / baseline_duration
    baseline_syllables_rate = estimate_syllable_rate(baseline_transcript, baseline_duration)
    
    # Pause detection
    intervals = librosa.effects.split(data, top_db=30)
    pauses = [(intervals[i][0] - intervals[i - 1][1]) / sr
              for i in range(1, len(intervals))
              if (intervals[i][0] - intervals[i - 1][1]) / sr > 1.0]
    
    long_pause_count = len(pauses)
    long_pause_total = sum(pauses)

    return {
        "transcript": transcript,
        "duration": duration_sec,
        "baseline_duration": baseline_duration,
        "speaking_rate": speaking_rate,
        "syllables_rate": syllables_rate,
        "baseline_speaking_rate": baseline_speaking_rate,
        "baseline_syllables_rate": baseline_syllables_rate,
        "long_pause_count": long_pause_count,
        "long_pause_duration": long_pause_total,
        **full_feats,
        **{f'baseline_{k}': v for k, v in baseline_feats.items()},
        **relative_feats,
    }


In [5]:
features = extract_features('samples/unconfident.m4a')

  data, sr = librosa.load(audio_path)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


10.0


# Send to GPT for feedback

In [6]:
def get_prompt(features):
    prompt = f"""
You are a professional voice coach and delivery analyst tasked with evaluating a speaker's performance based on a variety of acoustic and prosodic features. Below is an in-depth description of the speech signal, including baseline characteristics, absolute values, and relative shifts.

## NOTE:
- The **first 10 seconds** of the speech are used to define the speaker's personal baseline.
- All relative metrics (e.g., deltas, ratios) are calculated with respect to this baseline.
- Your feedback should interpret the *changes* from baseline — not just absolute values — as indicators of intentional modulation or stress, not necessarily flaws.

## TRANSCRIPT
<transcript> 
{features['transcript']} 
</transcript>

## BASELINE METRICS
- Speaking rate: {features['baseline_speaking_rate']:.2f} words/sec
- Speaking rate: {features['baseline_syllables_rate']:.2f} syllables/sec
- Pitch (Mean; Standard deviation; Variation): {features['baseline_pitch_mean']:.2f}; {features['baseline_pitch_std']:.2f}; {features['baseline_pitch_var']:.2f}
- RMS Energy (Mean; Standard deviation; Variation): {features['baseline_rms_mean']:.2f}; {features['baseline_rms_std']:.2f}; {features['baseline_rms_var']:.2f}
- ZCR: {features['baseline_zcr']:.2f}
- MFCC and Delta MFCC Mean: {features['baseline_mfcc_mean']:.2f}; {features['baseline_delta_mean']:.2f}

## RAW METRICS (FOR THE WHOLE SPEECH)
- Speaking Rate: {features['speaking_rate']:.2f} words/sec
- Speaking rate: {features['baseline_syllables_rate']:.2f} syllables/sec
- Long Pauses: {features['long_pause_count']} (>1s)
- Total Long Pause Duration: {features['long_pause_duration']:.2f} sec
- Pitch (Mean; Standard deviation; Variation): {features['pitch_mean']:.2f}; {features['pitch_std']:.2f}; {features['pitch_var']:.2f}
- RMS Energy (Mean; Standard deviation; Variation): {features['rms_mean']:.2f}; {features['rms_std']:.2f}; {features['rms_var']:.2f}
- ZCR: {features['zcr']:.2f}
- MFCC and Delta MFCC Mean: {features['mfcc_mean']:.2f}; {features['delta_mean']:.2f}

## RELATIVE CHANGES FROM BASELINE
- Pitch variation change (std): {features['pitch_std_delta']:+.2f}
- RMS Energy mean change: {features['rms_mean_delta']:+.2f}
- Speaking rate ratio: {features['speaking_rate'] / features['baseline_speaking_rate']}
- Interpretation Tip:
    - A Pitch variation change > 0 may suggest more modulation than usual; < 0 may suggest flattening.
    - RMS mean delta > 0 = more vocal energy than the beginning few seconds.
    - Speaking rate ratio < 1 = speaker slowed down as compared to the start of their speech.
    NOTE: This tip should not be used as an absolute, a speaking rate slowing could mean anxiety as well, infer that from the script

## INSTRUCTION

Now, based on this input, write a narrative-style feedback giving clear, constructive, and context-aware feedback. 

DO NOT judge the speaker based on universal norms; instead, use their own baseline as reference to detect signs of:
- Increased or decreased vocal control,
- Confidence shifts,
- Monotony vs. modulation,
- Hesitation or fluency issues.

You are a closed source model. So you are expected not to reference any specific acoustic features and their values in your feedback.

Split your feedback in 3 parts: What they did correctly, what they could improve on, and rate their confidence and fluency levels based on the relative metrics.
"""
    return prompt


def generate_feedback(features):
    prompt = get_prompt(features)

    client = Groq()
    completion = client.chat.completions.create(
        model="llama-3.3-70b-versatile",
        messages=[
        {
            "role": "user",
            "content": prompt
        }
        ],
        temperature=0.5,
        max_completion_tokens=32768,
        top_p=1,
        stream=False,
        stop=None,
    )

    return completion.choices[0].message

In [None]:
import tiktoken

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [8]:
encoder = tiktoken.get_encoding('gpt2')
len(encoder.encode(get_prompt(features)))

787

# Run Pipeline

In [9]:
# An unconfident speech
path = "samples/unconfident.m4a"
features = extract_features(path)
feedback = generate_feedback(features)

  data, sr = librosa.load(audio_path)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


10.0


In [10]:
import IPython.display as ipd
ipd.Markdown(feedback.content)

## Part 1: Strengths

The speaker demonstrates a notable ability to convey their emotions and thoughts, despite admitting to a fear of public speaking. Their speech is filled with personal touches, such as expressing their fear, which shows a level of vulnerability and authenticity. This ability to connect with the audience on a personal level is a significant strength. Additionally, the speaker's attempts to articulate their goals, such as trying to find a model to recognize filler words, indicate a clear direction and purpose in their speech. These aspects suggest that the speaker has a good foundation in terms of content and personal connection, which are crucial elements of effective public speaking.

## Part 2: Areas for Improvement

There are areas where the speaker could enhance their delivery to improve the overall impact of their speech. One noticeable aspect is the speaker's tendency to hesitate, as evidenced by the frequent use of filler words. This could be an indication of nervousness or a lack of confidence in their speaking abilities. Furthermore, the speaker's vocal modulation could be more varied, as there are moments where the speech sounds somewhat flat or monotonous. Working on varying the tone and pitch could help keep the audience engaged and interested in the message being conveyed. It's also worth noting that the speaker's pace could be more consistent, as there are instances where the speech slows down, potentially due to anxiety or a lack of fluency.

## Part 3: Confidence and Fluency Assessment

Based on the relative changes from the speaker's baseline, it appears that there are shifts in confidence and fluency throughout the speech. The speaker's confidence level seems to waver, particularly when discussing their fear of public speaking, which is a sensitive topic for them. This emotional vulnerability, while commendable, also reveals a degree of nervousness that affects their fluency. The speaker's fluency level is somewhat impacted by their hesitation and the use of filler words, which disrupts the smooth flow of their speech. On a scale of 1 to 10, with 1 being the lowest and 10 being the highest, I would rate the speaker's confidence level at around 6, as they do show moments of clarity and purpose but are also clearly held back by their fear. Their fluency level would be around 5, as the frequent filler words and hesitations detract from the overall smoothness of their delivery. With practice and focusing on building confidence, the speaker has the potential to significantly improve both their confidence and fluency levels.

In [11]:
# A confident speech
path = "samples/confident.m4a"
features = extract_features(path)
feedback = generate_feedback(features)

  data, sr = librosa.load(audio_path)


10.0


In [12]:
features

{'transcript': " Hi, my name is Adkarsh Malaya. I'm a student and right now what I'm trying to do is I'm trying to get a model and I'm trying to use it to transcribe some filler words. I am very scared. I don't know what will happen and I really really hope this works.",
 'duration': 15.72,
 'baseline_duration': 10.0,
 'speaking_rate': 3.3078880407124682,
 'syllables_rate': 4.134860050890585,
 'baseline_speaking_rate': 3.3,
 'baseline_syllables_rate': 4.1,
 'long_pause_count': 0,
 'long_pause_duration': 0,
 'zcr': np.float64(0.08035611403023599),
 'pitch_mean': np.float64(287.1815062705668),
 'pitch_std': np.float64(466.33901796330355),
 'pitch_var': np.float64(217472.07967497836),
 'rms_mean': np.float32(0.0131712835),
 'rms_std': np.float32(0.010488089),
 'rms_var': np.float32(0.00011000002),
 'mfcc_mean': np.float32(-24.901373),
 'delta_mean': np.float32(0.016268123),
 'baseline_zcr': np.float64(0.07430711644431555),
 'baseline_pitch_mean': np.float64(272.55173088127896),
 'baseline

In [13]:
ipd.Markdown(feedback.content)

## Part 1: Strengths
The speaker demonstrates a strong ability to maintain a consistent pace throughout their speech, which suggests a good level of comfort with the topic they are discussing. This consistency is a notable strength, as it helps to engage the listener and convey the message more effectively. Additionally, the speaker's ability to articulate their thoughts and express their feelings in a clear manner is commendable. The initial introduction is smooth, and the speaker's personal baseline characteristics are well-established, providing a solid foundation for the rest of the speech.

## Part 2: Areas for Improvement
There are moments where the speaker's vocal modulation increases, which may indicate a heightened emotional state or a slight loss of control over their voice. This is particularly noticeable when the speaker expresses uncertainty or fear, such as when mentioning being scared or hoping something works. While this emotional expression can be engaging, it also slightly detracts from the overall fluency of the speech. Furthermore, the speaker could benefit from practicing ways to manage their vocal energy when discussing sensitive or uncertain topics, as this would help to maintain a more even tone throughout the speech.

## Part 3: Confidence and Fluency Assessment
Based on the relative changes from the speaker's baseline, it appears that their confidence levels are somewhat affected by the content of their speech. When discussing personal fears or uncertainties, the speaker's vocal characteristics shift in a way that suggests a decrease in confidence. However, this shift is not drastic and is likely a natural response to the topic at hand. In terms of fluency, the speaker performs well, with no significant hesitations or long pauses that would disrupt the flow of the speech. Overall, I would rate the speaker's confidence level as moderately affected by the topic, but still within a manageable range. Their fluency level is good, with only minor adjustments needed to achieve a smoother, more controlled delivery. On a scale of 1 to 10, with 10 being the highest, I would rate their confidence a 7 and their fluency an 8. With practice and experience, the speaker has the potential to develop even greater control over their voice and message, leading to more effective and engaging communications.

In [14]:
# Installing the audio from Tim Urban's TED talk and using that to assess the model
from pytubefix import YouTube

if not os.path.exists("samples/tim-urban.m4a"):
    yt = YouTube('https://www.youtube.com/watch?v=arj7oStGLkU')
    yt.streams.get_audio_only().download('samples', 'tim-urban.m4a')

In [15]:
tim_urban_path = "samples/tim-urban.m4a"
tim_urban_features = extract_features(tim_urban_path)
tim_urban_feedback = generate_feedback(tim_urban_features)

  data, sr = librosa.load(audio_path)


42.1872517


In [16]:
len(encoder.encode(get_prompt(tim_urban_features)))

3478

In [17]:
tim_urban_features

{'transcript': " Reviewer.pxs. So in college, I was a government major, which means I had to write a lot of papers. Now, when a normal student writes a paper, they might spread the work out a little like this. So, you know, you get started maybe a little slowly, but you get enough done in the first week that with some heavier days later on, everything gets done and things stay civil. And I would want to do that like that. That would be the plan. I would have it all ready to go, but then actually the paper would come along, and then I would kind of do this. And that would happen every single paper. But then came my 90-page senior thesis, a paper you're supposed to spend a year on. I knew for a paper like that, my normal workflow was not an option, it was way too big a project. So I planned things out and I decided I kind of had to go something like this. This is how the year would go. So I'd start off light, and I'd bump it up in the middle months, and then at the end, I would kick it u

In [18]:
import IPython.display as ipd
ipd.Markdown(tim_urban_feedback.content)

**Part 1: Strengths**
The speaker demonstrates a good level of vocal control, particularly in terms of modulation. There are noticeable variations in their tone, which helps to convey enthusiasm and emphasis on certain points. This is evident in the way they share personal anecdotes and explain complex concepts, such as the "instant gratification monkey" and the "panic monster." The speaker's ability to modulate their voice adds depth and engagement to their narrative, making it more enjoyable to listen to. Additionally, their speaking rate is well-balanced, allowing them to convey their ideas clearly without rushing or dragging.

**Part 2: Areas for Improvement**
One area where the speaker could improve is in maintaining a consistent level of vocal energy. At times, their voice seems to lack the energy and emphasis present at the beginning of the speech. This could be due to the speaker settling into a comfortable rhythm, but it also results in some sections feeling slightly flat. Furthermore, the speaker's use of long pauses, while sometimes effective for dramatic effect, can also disrupt the flow of their narrative. It would be beneficial for the speaker to practice varying their pause lengths and using them more strategically to enhance their message.

**Part 3: Confidence and Fluency Assessment**
Based on the relative metrics, I would rate the speaker's confidence level as moderate to high. Their ability to share personal stories and explain complex ideas suggests a good level of self-assurance. However, the decrease in vocal energy and the use of long pauses may indicate some hesitation or uncertainty. In terms of fluency, the speaker generally demonstrates a good level of articulation and coherence. Their speaking rate is well-balanced, and they are able to convey their ideas clearly. However, there are moments where their fluency is disrupted by pauses or a slight decrease in vocal energy. Overall, I would give the speaker a confidence rating of 7.5/10 and a fluency rating of 8/10. With some practice and attention to maintaining consistent vocal energy and strategic pause use, the speaker has the potential to deliver even more engaging and effective presentations.