In [1]:
import os
from huggingface_hub import InferenceClient
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Initialize Hugging Face client
client = InferenceClient(
    provider="fal-ai",
    api_key=os.getenv("HF_TOKEN")
)

# Path to your audio file
audio_file_path = r"E:\gamified_public_speaking\test_gps_1.mp3"

try:
    # Basic transcription
    output = client.automatic_speech_recognition(
        audio_file_path, 
        model="openai/whisper-large-v3"
    )
    
    print("Transcription:")
    print("-" * 40)
    print(f"Text: {output.text}")
    
    print("\nDetailed Chunks with Timestamps:")
    print("-" * 40)
    for chunk in output.chunks:
        start_time = chunk.timestamp[0]
        end_time = chunk.timestamp[1]
        text = chunk.text
        print(f"[{start_time:4.1f}s - {end_time:4.1f}s]: {text}")
    
    # Check for filler words and speech patterns
    full_text = output.text.lower()
    filler_words = ['um', 'uh', 'ah', 'like', 'you know', 'so', 'well', 'actually', 'basically', 'er', 'hmm', 'yeah']
    
    found_fillers = [word for word in filler_words if word in full_text]
    
    print(f"\n📊 Analysis:")
    print(f"Languages detected: {output.inferred_languages}")
    
    if found_fillers:
        print(f"✓ Filler words/patterns detected: {found_fillers}")
    else:
        print("⚠️ No obvious filler words detected")
    
    # Look for speech patterns (hesitations, self-corrections)
    speech_patterns = []
    if "..." in output.text:
        speech_patterns.append("hesitation/pause (...)")
    if "meaning to" in output.text:
        speech_patterns.append("self-correction")
    if "so," in output.text.lower():
        speech_patterns.append("'so' as filler")
    if "yeah" in output.text.lower():
        speech_patterns.append("'yeah' as filler")
    
    if speech_patterns:
        print(f"✓ Speech patterns detected: {speech_patterns}")
    
except Exception as e:
    print(f"Error: {e}")

  from .autonotebook import tqdm as notebook_tqdm


Transcription:
----------------------------------------
Text:  Hey guys, this is Dharinesh. I'm here to talk about myself. So, I'm studying... meaning to study data science. Yeah, this is very nice.

Detailed Chunks with Timestamps:
----------------------------------------
[ 0.0s -  2.5s]:  Hey guys, this is Dharinesh.
[ 2.5s -  7.0s]:  I'm here to talk about myself.
[ 7.0s -  9.0s]:  So, I'm studying...
[ 9.0s - 13.0s]:  meaning to study data science.
[13.0s - 15.0s]:  Yeah, this is very nice.

📊 Analysis:
Languages detected: ['en']
✓ Filler words/patterns detected: ['ah', 'so', 'er', 'yeah']
✓ Speech patterns detected: ['hesitation/pause (...)', 'self-correction', "'so' as filler", "'yeah' as filler"]


In [3]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import librosa

# Load model locally (preserves more details)
model_id = "openai/whisper-large-v3"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True, use_safetensors=True
)
processor = AutoProcessor.from_pretrained(model_id)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    torch_dtype=torch.float16,
)

# Load and transcribe audio
audio, sr = librosa.load(r"E:\gamified_public_speaking\test_gps_1.mp3", sr=16000)
result = pipe(audio)

print("Raw transcription (should preserve 'uh', 'um'):")
print(result["text"])

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Device set to use cpu
Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.


Raw transcription (should preserve 'uh', 'um'):
 Hey guys, this is Dharanesh. I'm here to talk about myself. So I'm studying meaning to study data science. This is very nice.


In [4]:
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import librosa
import numpy as np

# Load model and processor
model_name = "openai/whisper-large-v3"
processor = WhisperProcessor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name)

# Load audio
audio, sr = librosa.load(r"E:\gamified_public_speaking\test_gps_1.mp3", sr=16000)

# Process audio
input_features = processor(audio, sampling_rate=16000, return_tensors="pt").input_features

# Generate with parameters that preserve filler words
with torch.no_grad():
    predicted_ids = model.generate(
        input_features,
        max_length=448,
        num_beams=1,  # Greedy decoding (less cleaning)
        do_sample=False,
        temperature=1.0,
        suppress_tokens=[],  # Don't suppress any tokens
        condition_on_prev_tokens=False,  # More literal
        compression_ratio_threshold=None,  # Disable compression filtering
        logprob_threshold=None,  # Include uncertain words
        no_speech_threshold=None  # Don't filter silence
    )

# Decode without skipping special tokens that might be filler words
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=False, normalize=False)[0]

print("Raw transcription (should include 'uh', 'um'):")
print(transcription)

# Also try with normalized version
transcription_clean = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
print("\nCleaned transcription:")
print(transcription_clean)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Raw transcription (should include 'uh', 'um'):
 Hey guys, this is Dharinesh. I'm here to talk about myself. So, I'm studying... meaning to study data science. Yeah, this is very nice.

Cleaned transcription:
 Hey guys, this is Dharinesh. I'm here to talk about myself. So, I'm studying... meaning to study data science. Yeah, this is very nice.


In [2]:
import os
import requests
import time
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

def transcribe_with_filler_words(audio_file_path):
    API_TOKEN = os.getenv("assembly_key")
    headers = {'authorization': API_TOKEN}
    
    print("Uploading audio file...")
    
    # Upload the audio file
    with open(audio_file_path, 'rb') as f:
        response = requests.post('https://api.assemblyai.com/v2/upload',
                               headers=headers, files={'file': f})
    
    if response.status_code != 200:
        return f"Upload failed: {response.text}"
    
    audio_url = response.json()['upload_url']
    print("Audio uploaded successfully!")
    
    # Request transcription with disfluencies enabled
    data = {
        'audio_url': audio_url,
        'disfluencies': True,  # Preserves "uh", "um", etc.
        'filter_profanity': False,
        'punctuate': True
    }
    
    print("Starting transcription...")
    response = requests.post('https://api.assemblyai.com/v2/transcript',
                           json=data, headers=headers)
    
    if response.status_code != 200:
        return f"Transcription request failed: {response.text}"
    
    transcript_id = response.json()['id']
    
    # Wait for completion
    while True:
        response = requests.get(f'https://api.assemblyai.com/v2/transcript/{transcript_id}',
                              headers=headers)
        result = response.json()
        
        if result['status'] == 'completed':
            return result['text']
        elif result['status'] == 'error':
            return f"Error: {result['error']}"
        
        print("Processing...")
        time.sleep(3)

# Your audio file path
audio_file_path = r"E:\gamified_public_speaking\user_1_1755468800.mp3"

# Transcribe
try:
    transcription = transcribe_with_filler_words(audio_file_path)
    print("\nTranscription with filler words:")
    print("-" * 50)
    print(transcription)
    print("-" * 50)
except Exception as e:
    print(f"Error: {e}")

Uploading audio file...
Audio uploaded successfully!
Starting transcription...
Processing...

Transcription with filler words:
--------------------------------------------------
Learning is like riding a bicycle. Because once you get the hang of it, you never forget.
--------------------------------------------------


In [1]:
import os
import requests
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

def test_assembly_ai_key():
    """Test if AssemblyAI API key is working"""
    API_TOKEN = os.getenv("assembly_key")
    headers = {'authorization': API_TOKEN}
    
    print(f"Testing API key: {API_TOKEN[:10]}..." if API_TOKEN else "No API key found!")
    
    # Test with a simple API call
    try:
        response = requests.get(
            'https://api.assemblyai.com/v2/transcript',
            headers=headers
        )
        
        print(f"Status Code: {response.status_code}")
        
        if response.status_code == 200:
            print("✅ API key is valid!")
            return True
        elif response.status_code == 401:
            print("❌ API key is invalid or expired")
            return False
        else:
            print(f"❌ Unexpected response: {response.text}")
            return False
            
    except Exception as e:
        print(f"❌ Error testing API: {e}")
        return False

if __name__ == "__main__":
    test_assembly_ai_key()

Testing API key: 3918b6714a...
Status Code: 200
✅ API key is valid!
