# Install and Import Dependencies

In [None]:
# Install compatible versions
!pip install numpy==1.23.5 librosa==0.10.0.post2 transformers openai soundfile --quiet

In [None]:
# Imports
import os
import numpy as np
import librosa
import soundfile as sf
import torch
from transformers import pipeline
import openai
from datetime import timedelta

# Transcribe using Whisper

In [None]:
# HuggingFace Whisper transcription
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base")

def transcribe_audio(path):
    """Run the module-level ``transcriber`` on ``path`` and return its text.

    Args:
        path: Audio input forwarded unchanged to ``transcriber``.

    Returns:
        The value stored under the ``"text"`` key of the transcriber result.
    """
    result = transcriber(path)
    return result["text"]

# Feature Extraction
Features extracted:
	•	ZCR
	•	Pitch
	•	RMS
	•	MFCC
	•	DeltaMFCC
	•	SpeakingRate
	•	PauseCount
	•	PauseDuration

In [None]:
def extract_features(audio_path, transcribe_fn):
    """Compute acoustic and fluency features for a single audio file.

    Args:
        audio_path: Audio file location; passed to ``librosa.load`` and to
            ``transcribe_fn``.
        transcribe_fn: Callable that receives ``audio_path`` and returns the
            transcript as a string.

    Returns:
        dict with the keys ``transcript``, ``zcr``, ``pitch_mean``,
        ``pitch_std``, ``rms_mean``, ``rms_std``, ``rms_var``,
        ``speaking_rate`` (words per second), ``long_pause_count``,
        ``long_pause_duration`` (seconds), ``mfcc_mean`` and ``delta_mean``.
    """
    signal, sample_rate = librosa.load(audio_path)

    # Zero-crossing rate per frame; averaged when the result is assembled.
    zcr_frames = librosa.feature.zero_crossing_rate(signal)

    # YIN pitch track limited to C2..C7; any non-finite estimate becomes 0.
    lowest_hz = librosa.note_to_hz("C2")
    highest_hz = librosa.note_to_hz("C7")
    f0_track = librosa.yin(signal, fmin=lowest_hz, fmax=highest_hz, sr=sample_rate)
    f0_track = np.nan_to_num(f0_track, nan=0.0, posinf=0.0, neginf=0.0)

    # Frame-wise RMS energy (first row of the librosa result).
    energy = librosa.feature.rms(y=signal)[0]

    # 13 MFCCs and their deltas; each is reduced to one global mean below.
    coeffs = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=13)
    coeff_deltas = librosa.feature.delta(coeffs)

    # Transcript from the injected callable (local model or API call).
    text = transcribe_fn(audio_path)

    # Speaking rate in words per second; 0 when the clip has no duration.
    total_seconds = librosa.get_duration(y=signal, sr=sample_rate)
    if total_seconds > 0:
        words_per_second = len(text.split()) / total_seconds
    else:
        words_per_second = 0

    # Long pauses: gaps above 1 s between consecutive intervals returned by
    # librosa.effects.split (top_db=30), converted from samples to seconds.
    segments = librosa.effects.split(signal, top_db=30)
    gaps = (
        (upcoming[0] - finished[1]) / sample_rate
        for finished, upcoming in zip(segments[:-1], segments[1:])
    )
    long_gaps = [gap for gap in gaps if gap > 1.0]

    return {
        "transcript": text,
        "zcr": np.mean(zcr_frames),
        "pitch_mean": np.mean(f0_track),
        "pitch_std": np.std(f0_track),
        "rms_mean": np.mean(energy),
        "rms_std": np.std(energy),
        "rms_var": np.var(energy),
        "speaking_rate": words_per_second,
        "long_pause_count": len(long_gaps),
        "long_pause_duration": sum(long_gaps),
        "mfcc_mean": np.mean(coeffs),
        "delta_mean": np.mean(coeff_deltas),
    }

# Send to GPT for feedback

In [None]:
from dotenv import load_dotenv
import os

# Pull the variables defined in .env into the process environment, then give
# the OpenAI client its key (None when OPENAI_API_KEY is not set).
load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [None]:
def generate_feedback(features):
    """Ask GPT-4 for coaching feedback based on extracted speech features.

    Args:
        features: Mapping with the keys ``transcript``, ``zcr``,
            ``pitch_mean``, ``pitch_std``, ``rms_mean``, ``rms_std``,
            ``rms_var``, ``speaking_rate``, ``long_pause_count``,
            ``long_pause_duration``, ``mfcc_mean`` and ``delta_mean``.

    Returns:
        The message content of the first choice in the chat completion.

    Raises:
        KeyError: If ``features`` is missing one of the keys listed above.
    """
    prompt = f"""
You're a communication coach. Analyze the following features of a speaker:

- Transcript: {features['transcript']}
- ZCR: {features['zcr']}
- Mean pitch: {features['pitch_mean']}
- Std pitch: {features['pitch_std']}
- RMS (mean/std/var): {features['rms_mean']}, {features['rms_std']}, {features['rms_var']}
- Speaking rate: {features['speaking_rate']} words/sec
- Long pauses: {features['long_pause_count']} pauses totaling {features['long_pause_duration']} sec
- MFCC mean: {features['mfcc_mean']}
- Delta MFCC mean: {features['delta_mean']}

Based on this data, provide feedback on the user's fluency, confidence, and delivery.
"""

    request = {
        "model": "gpt-4",
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.7,
    }

    # openai>=1.0 removed openai.ChatCompletion (calling it raises). The
    # presence of the OpenAI client class marks the 1.x SDK, whose
    # module-level client still honours openai.api_key.
    if hasattr(openai, "OpenAI"):
        response = openai.chat.completions.create(**request)
        return response.choices[0].message.content

    # Legacy 0.x SDK: dict-style response.
    response = openai.ChatCompletion.create(**request)
    return response['choices'][0]['message']['content']

# Run Pipeline

In [None]:
# Placeholder location; point this at a real recording before running.
path = "/path/to/audio.wav"
# extract_features requires the transcription callable as its second
# argument; calling it with the path alone raises TypeError.
features = extract_features(path, transcribe_audio)
feedback = generate_feedback(features)
print("Generated Feedback:\n", feedback)