# Topic 7: Speech Recognition - SOLUTIONS

Complete solutions for Speech Recognition exercises using various approaches and libraries.

In [None]:
# Essential imports
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Try to import speech processing libraries
try:
    import soundfile as sf
    import librosa
    AUDIO_LIBS_AVAILABLE = True
    print("Audio processing libraries available!")
except ImportError:
    print("Audio libraries not available. Please install: pip install soundfile librosa")
    AUDIO_LIBS_AVAILABLE = False

try:
    from transformers import pipeline
    import torch
    TRANSFORMERS_AVAILABLE = True
    print("Transformers library available for ASR!")
except ImportError:
    print("Transformers not available. Please install: pip install transformers torch")
    TRANSFORMERS_AVAILABLE = False

try:
    import speech_recognition as sr
    SPEECH_RECOGNITION_AVAILABLE = True
    print("SpeechRecognition library available!")
except ImportError:
    print("SpeechRecognition not available. Please install: pip install SpeechRecognition")
    SPEECH_RECOGNITION_AVAILABLE = False

## Solution 1: Audio File Setup and Processing

In [None]:
def setup_audio_environment():
    """Ensure ``<cwd>/data/audio`` exists and return it.

    The directory (and any missing parents) is created relative to the
    current working directory; an already existing directory is left as is.

    Returns:
        Path: The audio directory.
    """
    audio_root = Path.cwd().joinpath('data', 'audio')
    audio_root.mkdir(parents=True, exist_ok=True)

    for message in ("Setting up audio environment...",
                    f"Audio directory: {audio_root}"):
        print(message)

    return audio_root

def create_sample_audio(audio_dir, filename="example_de.wav", duration=2.0):
    """Write a synthetic, speech-like WAV file unless it already exists.

    The signal is a mix of three exponentially decaying sinusoids plus
    Gaussian noise, peak-normalised to 0.8 and written at 16 kHz.

    Args:
        audio_dir: Directory (``Path``) in which the file is created.
        filename: Name of the WAV file inside ``audio_dir``.
        duration: Length of the generated signal in seconds.

    Returns:
        Path to the (new or pre-existing) file, or ``None`` when the audio
        libraries are not available.
    """
    if not AUDIO_LIBS_AVAILABLE:
        print("Audio libraries not available for creating sample audio.")
        return None

    target = audio_dir / filename
    if target.exists():
        # Never overwrite: an existing file is reused as-is.
        print(f"Audio file already exists: {target}")
        return target

    sample_rate = 16000  # Standard sample rate for speech
    n_samples = int(sample_rate * duration)
    t = np.linspace(0, duration, n_samples, False)

    # Three decaying partials at formant-like frequencies.
    low_formant, mid_formant, high_formant = 800, 1200, 2400
    signal = (0.3 * np.sin(2 * np.pi * low_formant * t) * np.exp(-2 * t) +
              0.2 * np.sin(2 * np.pi * mid_formant * t) * np.exp(-1.5 * t) +
              0.1 * np.sin(2 * np.pi * high_formant * t) * np.exp(-1 * t))

    # A little noise makes the result less artificially clean.
    signal += 0.05 * np.random.normal(0, 1, len(signal))

    # Peak-normalise to 0.8 so the written file cannot clip.
    peak = np.max(np.abs(signal))
    signal = signal / peak * 0.8

    sf.write(str(target), signal, sample_rate)
    print(f"Created sample audio: {target}")

    return target

def analyze_audio_file(audio_path):
    """Print basic properties of an audio file and plot three views of it.

    The file is loaded with ``librosa.load(..., sr=None)`` (native sample
    rate) and shown as a waveform, a dB-scaled STFT spectrogram and a
    13-coefficient MFCC map.

    Args:
        audio_path: ``Path`` of the audio file, or ``None``.

    Returns:
        dict with keys ``'audio'``, ``'sample_rate'``, ``'duration'`` and
        ``'mfccs'``, or ``None`` when the path is missing or the audio
        libraries are unavailable.
    """
    if not audio_path or not AUDIO_LIBS_AVAILABLE or not audio_path.exists():
        print("Cannot analyze audio file.")
        return None

    # ``librosa.display`` is a submodule that is not guaranteed to be loaded
    # by a plain ``import librosa``; import it explicitly. This must come
    # before any other use of ``librosa`` in this function because the
    # statement binds ``librosa`` as a local name.
    import librosa.display

    # Load audio file at its native sample rate
    audio, sr = librosa.load(str(audio_path), sr=None)
    duration = len(audio) / sr

    print(f"Audio Analysis for: {audio_path.name}")
    print("=" * 40)
    print(f"Duration: {duration:.2f} seconds")
    print(f"Sample rate: {sr} Hz")
    print("Channels: 1 (mono)")
    print(f"Total samples: {len(audio)}")
    print(f"Audio range: [{audio.min():.3f}, {audio.max():.3f}]")

    plt.figure(figsize=(12, 8))

    # Plot 1: Waveform. Sample i sits at i / sr seconds; an endpoint-inclusive
    # linspace would stretch the axis by one sample period.
    plt.subplot(3, 1, 1)
    time = np.arange(len(audio)) / sr
    plt.plot(time, audio, linewidth=0.5)
    plt.title('Audio Waveform')
    plt.xlabel('Time (seconds)')
    plt.ylabel('Amplitude')
    plt.grid(True, alpha=0.3)

    # Plot 2: Spectrogram (magnitude STFT in dB relative to its maximum)
    plt.subplot(3, 1, 2)
    D = librosa.amplitude_to_db(np.abs(librosa.stft(audio)), ref=np.max)
    librosa.display.specshow(D, sr=sr, x_axis='time', y_axis='hz')
    plt.title('Spectrogram')
    plt.colorbar(format='%+2.0f dB')

    # Plot 3: MFCC features
    plt.subplot(3, 1, 3)
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
    librosa.display.specshow(mfccs, sr=sr, x_axis='time')
    plt.title('MFCC Features')
    plt.colorbar()

    plt.tight_layout()
    plt.show()

    return {
        'audio': audio,
        'sample_rate': sr,
        'duration': duration,
        'mfccs': mfccs
    }

# Set up audio environment.
# `audio_dir` and `sample_audio` are module-level names that the later
# solution cells read; `sample_audio` is None when the audio libraries are
# missing (see create_sample_audio).
audio_dir = setup_audio_environment()
sample_audio = create_sample_audio(audio_dir)

# `audio_analysis` is only defined when a sample file is available.
if sample_audio:
    audio_analysis = analyze_audio_file(sample_audio)

## Solution 2: Speech Recognition with Transformers (Wav2Vec2)

In [None]:
def setup_wav2vec2_model(language='german'):
    """Create a Transformers ASR pipeline for the requested language.

    Args:
        language: ``'german'`` (default) selects the German XLSR model;
            ``'english'``, ``'en'`` or ``'en-US'`` (case-insensitive) selects
            the English base model directly. Any other value keeps the
            previous behaviour and uses the German model.

    Returns:
        The loaded ``automatic-speech-recognition`` pipeline, or ``None`` when
        Transformers is unavailable or no model could be loaded. If the
        primary model fails to load, the English base model is tried as a
        fallback (unless it already was the primary model).
    """
    if not TRANSFORMERS_AVAILABLE:
        print("Transformers library not available.")
        return None

    # German Wav2Vec2 models
    german_models = {
        'wav2vec2-large-xlsr-53-german': 'jonatasgrosman/wav2vec2-large-xlsr-53-german',
        'wav2vec2-base-german': 'maxidl/wav2vec2-large-xlsr-german'
    }

    # English models (also used as fallback)
    english_models = {
        'wav2vec2-base-960h': 'facebook/wav2vec2-base-960h',
        'wav2vec2-large-960h': 'facebook/wav2vec2-large-960h-lv60-self'
    }

    def _load(model_id):
        # Single place that builds the pipeline; first CUDA device if present.
        return pipeline(
            'automatic-speech-recognition',
            model=model_id,
            device=0 if torch.cuda.is_available() else -1
        )

    fallback_name = english_models['wav2vec2-base-960h']
    if str(language).strip().lower() in ('english', 'en', 'en-us'):
        model_name = fallback_name
    else:
        model_name = german_models['wav2vec2-large-xlsr-53-german']

    try:
        print(f"Loading {language} ASR model: {model_name}")
        asr_pipeline = _load(model_name)
        print("‚úì Model loaded successfully!")
        return asr_pipeline

    except Exception as e:
        print(f"Error loading model: {e}")

        if model_name == fallback_name:
            # Retrying the very same model would only repeat the failure.
            print(f"Could not load any model: {e}")
            return None

        print("Trying fallback English model...")

        try:
            asr_pipeline = _load(fallback_name)
            print("‚úì Fallback English model loaded!")
            return asr_pipeline
        except Exception as e2:
            print(f"Could not load any model: {e2}")
            return None

def transcribe_audio_transformers(asr_pipeline, audio_path):
    """Run a Transformers ASR pipeline on one audio file and print the result.

    Args:
        asr_pipeline: Callable pipeline that takes the waveform array.
        audio_path: ``Path`` of the audio file (resampled to 16 kHz on load).

    Returns:
        dict with keys ``'text'``, ``'confidence'`` and ``'model'``, or
        ``None`` if the pipeline/file is missing or any step raises.
    """
    if not (asr_pipeline and audio_path and audio_path.exists()):
        print("Cannot transcribe: missing model or audio file.")
        return None

    try:
        print(f"Transcribing: {audio_path.name}")

        # Wav2Vec2 expects 16 kHz input, so resample while loading.
        waveform, _ = librosa.load(str(audio_path), sr=16000)
        output = asr_pipeline(waveform)

        # The pipeline may hand back a dict or something else entirely;
        # only a dict can carry a text/score pair.
        if isinstance(output, dict):
            transcription = output['text']
            confidence = output.get('score', 'N/A')
        else:
            transcription = str(output)
            confidence = 'N/A'

        print("Transcription Results:")
        print("=" * 30)
        print(f"Text: {transcription}")
        print(f"Confidence: {confidence}")

        return {
            'text': transcription,
            'confidence': confidence,
            'model': 'Wav2Vec2'
        }

    except Exception as e:
        print(f"Transcription error: {e}")
        return None

def create_test_speech_samples(audio_dir):
    """Write three synthetic 16 kHz test WAV files into ``audio_dir``.

    Files written (existing files are overwritten):
        * ``test_numbers.wav``  - three consecutive 1 s tones (440/523/659 Hz)
        * ``test_noise.wav``    - 2 s of Gaussian noise
        * ``test_greeting.wav`` - 2 s decaying + amplitude-modulated tones

    Args:
        audio_dir: Directory (``Path``) the files are written to.

    Returns:
        list of the created ``Path`` objects, or ``[]`` when the audio
        libraries are unavailable.
    """
    if not AUDIO_LIBS_AVAILABLE:
        print("Cannot create test samples without audio libraries.")
        return []

    samples = []
    sr = 16000

    # Sample 1: Simple tone sequence (numbers)
    print("Creating test sample 1: Number sequence")
    t1 = np.linspace(0, 3, sr * 3, False)
    # Simulate counting "eins, zwei, drei" with one tone per second. The
    # segments must be concatenated: adding the three equally long slices
    # would play all tones at once for 1 s and push the peak above 1.0.
    signal1 = np.concatenate([
        0.5 * np.sin(2 * np.pi * 440 * t1[:sr]),        # "eins"
        0.5 * np.sin(2 * np.pi * 523 * t1[sr:2 * sr]),  # "zwei"
        0.5 * np.sin(2 * np.pi * 659 * t1[2 * sr:]),    # "drei"
    ])

    path1 = audio_dir / "test_numbers.wav"
    sf.write(str(path1), signal1 * 0.7, sr)
    samples.append(path1)

    # Sample 2: White noise (should not transcribe well)
    print("Creating test sample 2: White noise")
    noise = np.random.normal(0, 0.1, sr * 2)
    path2 = audio_dir / "test_noise.wav"
    sf.write(str(path2), noise, sr)
    samples.append(path2)

    # Sample 3: Mixed frequency (simulated greeting)
    print("Creating test sample 3: Greeting simulation")
    t3 = np.linspace(0, 2, sr * 2, False)
    greeting = (0.3 * np.sin(2 * np.pi * 200 * t3) * np.exp(-0.5 * t3) +
                0.2 * np.sin(2 * np.pi * 800 * t3) * (1 + 0.5 * np.sin(2 * np.pi * 3 * t3)))

    path3 = audio_dir / "test_greeting.wav"
    sf.write(str(path3), greeting * 0.6, sr)
    samples.append(path3)

    print(f"Created {len(samples)} test samples")
    return samples

# Set up Wav2Vec2 model
print("Setting up Speech Recognition with Transformers:")
print("=" * 50)

asr_model = setup_wav2vec2_model()

# Always defined, so later cells can use it without probing the namespace.
test_samples = []

if asr_model and sample_audio:
    # Transcribe the original sample
    result1 = transcribe_audio_transformers(asr_model, sample_audio)

    # Create and test additional samples
    test_samples = create_test_speech_samples(audio_dir)

    print("\nTesting multiple audio samples:")
    print("=" * 40)

    for i, test_file in enumerate(test_samples, 1):
        print(f"\nTest {i}: {test_file.name}")
        result = transcribe_audio_transformers(asr_model, test_file)
else:
    print("Wav2Vec2 transcription not available.")
    if sample_audio:
        # The synthetic test files do not depend on the ASR model; create
        # them anyway so the following solutions still have extra samples.
        test_samples = create_test_speech_samples(audio_dir)

## Solution 3: Speech Recognition with SpeechRecognition Library

In [None]:
def setup_speech_recognition():
    """Create and configure a ``speech_recognition.Recognizer``.

    Lists up to three microphone names as a best-effort diagnostic, then
    sets the energy threshold (300), dynamic thresholding (on) and pause
    threshold (0.8 s).

    Returns:
        The configured recognizer, or ``None`` when the SpeechRecognition
        library is not installed.
    """
    if not SPEECH_RECOGNITION_AVAILABLE:
        print("SpeechRecognition library not available.")
        return None

    recognizer = sr.Recognizer()

    # Test microphone availability. Listing is optional, so failures are
    # reported and ignored - but only ordinary exceptions: a bare ``except``
    # would also swallow KeyboardInterrupt and SystemExit.
    try:
        mic_list = sr.Microphone.list_microphone_names()
        print(f"Available microphones: {len(mic_list)}")
        for i, name in enumerate(mic_list[:3]):  # Show first 3
            print(f"  {i}: {name}")
    except Exception:
        print("No microphones detected or microphone access unavailable.")

    # Configure recognizer
    recognizer.energy_threshold = 300
    recognizer.dynamic_energy_threshold = True
    recognizer.pause_threshold = 0.8

    print("SpeechRecognition setup complete!")
    return recognizer

def transcribe_with_google(recognizer, audio_path, language='de-DE'):
    """Transcribe an audio file with ``recognizer.recognize_google``.

    The requested language is tried first; if the audio is not understood
    and the request was not already ``'en-US'``, one retry is made in
    English.

    Args:
        recognizer: A ``speech_recognition.Recognizer`` (or ``None``).
        audio_path: ``Path`` of the audio file (or ``None``).
        language: Language tag passed to the recognizer.

    Returns:
        dict with keys ``'text'``, ``'language'`` and ``'engine'``. ``'text'``
        is ``'[UNRECOGNIZED]'`` when no attempt understood the audio.
        ``None`` is returned for missing inputs, request failures and any
        other error.
    """
    if not recognizer or not audio_path or not audio_path.exists():
        print("Cannot transcribe with Google API.")
        return None

    try:
        # Load the complete file. adjust_for_ambient_noise() is deliberately
        # not called: on a file source it reads (and thereby discards) the
        # beginning of the recording, and the energy threshold it tunes is
        # not used by record().
        with sr.AudioFile(str(audio_path)) as source:
            audio_data = recognizer.record(source)

        print(f"Transcribing with Google API (language: {language})...")

        # Keep the historical "German" label for German requests; otherwise
        # show the language actually requested.
        label = "German" if str(language).lower().startswith('de') else language

        try:
            text = recognizer.recognize_google(audio_data, language=language)
            print(f"‚úì {label} transcription: {text}")
            return {'text': text, 'language': language, 'engine': 'Google'}
        except sr.UnknownValueError:
            # Only "not understood" justifies a second attempt; request
            # errors propagate to the handlers below.
            if language == 'en-US':
                raise
            # Fallback to English
            text = recognizer.recognize_google(audio_data, language='en-US')
            print(f"‚úì English fallback transcription: {text}")
            return {'text': text, 'language': 'en-US', 'engine': 'Google'}

    except sr.UnknownValueError:
        print("Google Speech Recognition could not understand the audio")
        return {'text': '[UNRECOGNIZED]', 'language': language, 'engine': 'Google'}
    except sr.RequestError as e:
        print(f"Could not request results from Google: {e}")
        return None
    except Exception as e:
        print(f"Error in Google transcription: {e}")
        return None

def transcribe_with_sphinx(recognizer, audio_path):
    """Transcribe an audio file offline with ``recognizer.recognize_sphinx``.

    Args:
        recognizer: A ``speech_recognition.Recognizer`` (or ``None``).
        audio_path: ``Path`` of the audio file (or ``None``).

    Returns:
        dict with keys ``'text'``, ``'language'`` (always ``'en-US'``) and
        ``'engine'``. ``'text'`` is ``'[UNRECOGNIZED]'`` when the audio was
        not understood. ``None`` is returned for missing inputs and for any
        other error.
    """
    if not recognizer or not audio_path or not audio_path.exists():
        print("Cannot transcribe with Sphinx.")
        return None

    try:
        # Record the whole file. Calling adjust_for_ambient_noise() first
        # would read and discard the beginning of the recording.
        with sr.AudioFile(str(audio_path)) as source:
            audio_data = recognizer.record(source)

        print("Transcribing with CMU Sphinx (offline)...")
        text = recognizer.recognize_sphinx(audio_data)
        print(f"‚úì Sphinx transcription: {text}")
        return {'text': text, 'language': 'en-US', 'engine': 'Sphinx'}

    except sr.UnknownValueError:
        print("Sphinx could not understand the audio")
        return {'text': '[UNRECOGNIZED]', 'language': 'en-US', 'engine': 'Sphinx'}
    except sr.RequestError as e:
        print(f"Sphinx error: {e}")
        return None
    except Exception as e:
        print(f"Error in Sphinx transcription: {e}")
        return None

def compare_recognition_engines(recognizer, audio_files):
    """Run every engine on every existing file and print a comparison.

    For each file that exists, Google (German), Google (English) and Sphinx
    are tried in that order; engines that return nothing are left out.

    Args:
        recognizer: Recognizer handed to the individual transcribe helpers.
        audio_files: Iterable of ``Path`` objects; missing files are skipped.

    Returns:
        list of ``{'file': name, 'results': [engine result dicts]}``, or
        ``None`` when there is no recognizer or no files.
    """
    if not (recognizer and audio_files):
        print("Cannot compare engines.")
        return

    print("Comparing Speech Recognition Engines:")
    print("=" * 50)

    results = []

    for audio_file in (candidate for candidate in audio_files if candidate.exists()):
        print(f"\nTesting: {audio_file.name}")
        print("-" * 30)

        # Evaluated left to right: Google (de), Google (en), Sphinx (offline).
        outcomes = [
            transcribe_with_google(recognizer, audio_file, 'de-DE'),
            transcribe_with_google(recognizer, audio_file, 'en-US'),
            transcribe_with_sphinx(recognizer, audio_file),
        ]

        results.append({
            'file': audio_file.name,
            'results': [outcome for outcome in outcomes if outcome],
        })

    # Summary comparison
    print("\n" + "=" * 50)
    print("COMPARISON SUMMARY:")
    print("=" * 50)

    for entry in results:
        print(f"\nFile: {entry['file']}")
        for outcome in entry['results']:
            engine = outcome['engine']
            lang = outcome.get('language', 'N/A')
            text = outcome['text']
            if len(text) > 50:
                # Keep the summary on one line.
                text = text[:50] + '...'
            print(f"  {engine} ({lang}): {text}")

    return results

# Set up SpeechRecognition
print("Setting up SpeechRecognition Library:")
print("=" * 50)

sr_recognizer = setup_speech_recognition()

if not (sr_recognizer and sample_audio):
    print("SpeechRecognition testing not available.")
else:
    # Test with original sample
    print("\nTesting original sample:")
    google_result = transcribe_with_google(sr_recognizer, sample_audio)
    sphinx_result = transcribe_with_sphinx(sr_recognizer, sample_audio)

    # Compare engines on all test files; `test_samples` only exists when
    # the Wav2Vec2 cell created it, hence the namespace lookup.
    all_files = [sample_audio, *locals().get('test_samples', [])]

    comparison_results = compare_recognition_engines(sr_recognizer, all_files)

## Solution 4: Advanced Audio Processing and Feature Extraction

In [None]:
def extract_advanced_features(audio_path):
    """Extract a set of librosa features from an audio file.

    The file is loaded at 16 kHz. A short summary is printed.

    Args:
        audio_path: ``Path`` of the audio file, or ``None``.

    Returns:
        dict with the keys ``'mfcc'``, ``'spectral'``, ``'zcr'``,
        ``'chroma'``, ``'tempo'`` and ``'energy'`` (each a dict of values),
        or ``None`` when the path is missing or the audio libraries are
        unavailable. ``features['tempo']['bpm']`` is always a plain number.
    """
    if not audio_path or not AUDIO_LIBS_AVAILABLE or not audio_path.exists():
        print("Cannot extract features.")
        return None

    # Load audio
    audio, sr = librosa.load(str(audio_path), sr=16000)

    print(f"Extracting features from: {audio_path.name}")
    print("=" * 40)

    features = {}

    # 1. MFCC (Mel-frequency cepstral coefficients)
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
    features['mfcc'] = {
        'values': mfccs,
        'mean': np.mean(mfccs, axis=1),
        'std': np.std(mfccs, axis=1)
    }

    # 2. Spectral features
    spectral_centroids = librosa.feature.spectral_centroid(y=audio, sr=sr)[0]
    spectral_rolloff = librosa.feature.spectral_rolloff(y=audio, sr=sr)[0]
    spectral_bandwidth = librosa.feature.spectral_bandwidth(y=audio, sr=sr)[0]

    features['spectral'] = {
        'centroid_mean': np.mean(spectral_centroids),
        'centroid_std': np.std(spectral_centroids),
        'rolloff_mean': np.mean(spectral_rolloff),
        'rolloff_std': np.std(spectral_rolloff),
        'bandwidth_mean': np.mean(spectral_bandwidth),
        'bandwidth_std': np.std(spectral_bandwidth)
    }

    # 3. Zero crossing rate
    zcr = librosa.feature.zero_crossing_rate(audio)[0]
    features['zcr'] = {
        'mean': np.mean(zcr),
        'std': np.std(zcr)
    }

    # 4. Chroma features
    chroma = librosa.feature.chroma_stft(y=audio, sr=sr)
    features['chroma'] = {
        'values': chroma,
        'mean': np.mean(chroma, axis=1)
    }

    # 5. Tempo and beat (best effort: fall back to zeros on failure)
    try:
        tempo, beats = librosa.beat.beat_track(y=audio, sr=sr)
        features['tempo'] = {
            # Depending on the librosa version the tempo is a scalar or a
            # one-element array; an array cannot be formatted with ':.1f',
            # so normalise to a Python float.
            'bpm': float(np.atleast_1d(tempo)[0]),
            'n_beats': len(beats)
        }
    except Exception:
        features['tempo'] = {'bpm': 0, 'n_beats': 0}

    # 6. Energy and RMS
    rms = librosa.feature.rms(y=audio)[0]
    features['energy'] = {
        'rms_mean': np.mean(rms),
        'rms_std': np.std(rms),
        'total_energy': np.sum(audio ** 2)
    }

    # Print summary
    print(f"MFCC coefficients: {mfccs.shape}")
    print(f"Spectral centroid (avg): {features['spectral']['centroid_mean']:.2f} Hz")
    print(f"Zero crossing rate (avg): {features['zcr']['mean']:.4f}")
    print(f"Tempo: {features['tempo']['bpm']:.1f} BPM")
    print(f"RMS energy (avg): {features['energy']['rms_mean']:.4f}")

    return features

def visualize_speech_features(audio_path, features):
    """Plot an eight-panel overview of the features of one audio file.

    Args:
        audio_path: ``Path`` of the audio file; it is reloaded at 16 kHz for
            the time-domain panels.
        features: Feature dict with the keys ``'mfcc'``, ``'spectral'``,
            ``'zcr'``, ``'chroma'``, ``'tempo'`` and ``'energy'`` as read
            below.

    Returns:
        None. Nothing is drawn when ``features`` is empty or the audio
        libraries are unavailable.
    """
    if not features or not AUDIO_LIBS_AVAILABLE:
        print("Cannot create visualizations.")
        return

    # ``librosa.display`` is a submodule that is not guaranteed to be loaded
    # by a plain ``import librosa``; import it explicitly. This must come
    # before any other use of ``librosa`` in this function because the
    # statement binds ``librosa`` as a local name.
    import librosa.display

    # Load audio for time axis
    audio, sr = librosa.load(str(audio_path), sr=16000)

    plt.figure(figsize=(15, 12))

    # Plot 1: Waveform and RMS Energy
    plt.subplot(4, 2, 1)
    time = np.arange(len(audio)) / sr  # sample i sits at i / sr seconds
    plt.plot(time, audio, alpha=0.6, label='Waveform')
    rms = librosa.feature.rms(y=audio)[0]
    times_rms = librosa.frames_to_time(range(len(rms)), sr=sr)
    plt.plot(times_rms, rms * 5, 'r-', linewidth=2, label='RMS Energy (x5)')
    plt.title('Waveform and RMS Energy')
    plt.xlabel('Time (s)')
    plt.ylabel('Amplitude')
    plt.legend()
    plt.grid(True, alpha=0.3)

    # Plot 2: MFCC
    plt.subplot(4, 2, 2)
    librosa.display.specshow(features['mfcc']['values'], sr=sr, x_axis='time')
    plt.title('MFCC Features')
    plt.colorbar()

    # Plot 3: Spectral Centroid and Rolloff
    plt.subplot(4, 2, 3)
    cent = librosa.feature.spectral_centroid(y=audio, sr=sr)[0]
    rolloff = librosa.feature.spectral_rolloff(y=audio, sr=sr)[0]
    times = librosa.frames_to_time(range(len(cent)), sr=sr)
    plt.plot(times, cent, label='Spectral Centroid')
    plt.plot(times, rolloff, label='Spectral Rolloff', alpha=0.7)
    plt.title('Spectral Features')
    plt.xlabel('Time (s)')
    plt.ylabel('Hz')
    plt.legend()
    plt.grid(True, alpha=0.3)

    # Plot 4: Zero Crossing Rate
    plt.subplot(4, 2, 4)
    zcr = librosa.feature.zero_crossing_rate(audio)[0]
    times_zcr = librosa.frames_to_time(range(len(zcr)), sr=sr)
    plt.plot(times_zcr, zcr)
    plt.title('Zero Crossing Rate')
    plt.xlabel('Time (s)')
    plt.ylabel('ZCR')
    plt.grid(True, alpha=0.3)

    # Plot 5: Chroma Features
    plt.subplot(4, 2, 5)
    librosa.display.specshow(features['chroma']['values'], sr=sr, x_axis='time', y_axis='chroma')
    plt.title('Chroma Features')
    plt.colorbar()

    # Plot 6: Mel Spectrogram
    plt.subplot(4, 2, 6)
    mel_spec = librosa.feature.melspectrogram(y=audio, sr=sr)
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
    librosa.display.specshow(mel_spec_db, sr=sr, x_axis='time', y_axis='mel')
    plt.title('Mel Spectrogram')
    plt.colorbar(format='%+2.0f dB')

    # Plot 7: Feature Summary (bar chart)
    plt.subplot(4, 2, 7)
    # The stored tempo may be a scalar or a one-element array; the bar
    # heights and the ':.2f' labels below need a plain float.
    tempo_bpm = float(np.atleast_1d(features['tempo']['bpm'])[0])
    feature_names = ['MFCC_mean', 'Spectral_Cent', 'ZCR_mean', 'RMS_mean', 'Tempo']
    feature_values = [
        np.mean(features['mfcc']['mean']),
        features['spectral']['centroid_mean'] / 1000,  # Scale to kHz
        features['zcr']['mean'] * 100,  # Scale for visibility
        features['energy']['rms_mean'] * 100,  # Scale for visibility
        tempo_bpm / 100  # Scale for visibility
    ]

    bars = plt.bar(feature_names, feature_values, color='skyblue')
    plt.title('Feature Summary (Scaled)')
    plt.xticks(rotation=45)
    plt.ylabel('Scaled Values')

    # Add value labels on bars
    for bar, val in zip(bars, feature_values):
        plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.01,
                 f'{val:.2f}', ha='center', va='bottom')

    # Plot 8: MFCC Statistics
    plt.subplot(4, 2, 8)
    mfcc_mean = features['mfcc']['mean']
    mfcc_std = features['mfcc']['std']
    coeffs = range(len(mfcc_mean))

    plt.errorbar(coeffs, mfcc_mean, yerr=mfcc_std, marker='o', capsize=5)
    plt.title('MFCC Coefficients (Mean ¬± Std)')
    plt.xlabel('MFCC Coefficient')
    plt.ylabel('Value')
    plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

def speech_quality_assessment(features):
    """Score a feature dict on five heuristics worth 20 points each.

    Checks, in order: RMS energy above 0.01, spectral bandwidth above
    1000, zero crossing rate strictly between 0.01 and 0.3, mean MFCC
    standard deviation below 2.0, tempo strictly between 60 and 180.

    Args:
        features: dict providing ``['energy']['rms_mean']``,
            ``['spectral']['bandwidth_mean']``, ``['zcr']['mean']``,
            ``['mfcc']['std']`` and ``['tempo']['bpm']``.

    Returns:
        tuple ``(quality_score, assessments)``: the score (0-100 in steps of
        20) and one message per check, in the order listed above.
    """
    print("\nSpeech Quality Assessment:")
    print("=" * 40)

    # Evaluate every heuristic first (same order as they are reported) ...
    energy_ok = features['energy']['rms_mean'] > 0.01
    bandwidth_ok = features['spectral']['bandwidth_mean'] > 1000
    zcr_ok = 0.01 < features['zcr']['mean'] < 0.3
    mfcc_ok = np.mean(features['mfcc']['std']) < 2.0
    tempo_ok = 60 < features['tempo']['bpm'] < 180

    # ... then pair each outcome with its pass / fail message.
    verdicts = (
        (energy_ok, "‚úì Good energy level", "‚ö† Low energy level"),
        (bandwidth_ok, "‚úì Good spectral bandwidth", "‚ö† Limited spectral content"),
        (zcr_ok, "‚úì Appropriate voice activity", "‚ö† Unusual voice activity pattern"),
        (mfcc_ok, "‚úì Consistent spectral features", "‚ö† High spectral variability"),
        (tempo_ok, "‚úì Reasonable speech tempo", "‚ö† Unusual tempo detected"),
    )

    quality_score = 0
    assessments = []
    for passed, praise, warning in verdicts:
        if passed:
            quality_score += 20
            assessments.append(praise)
        else:
            assessments.append(warning)

    # Overall assessment
    print(f"Overall Quality Score: {quality_score}/100")
    print()
    for line in assessments:
        print(line)

    if quality_score >= 80:
        closing = "\nüéâ High quality speech signal!"
    elif quality_score >= 60:
        closing = "\nüëç Moderate quality speech signal"
    else:
        closing = "\n‚ö†Ô∏è Low quality speech signal - consider re-recording"
    print(closing)

    return quality_score, assessments

# Extract and analyze features for all available audio files
print("Advanced Audio Feature Extraction:")
print("=" * 50)

if not (sample_audio and AUDIO_LIBS_AVAILABLE):
    print("Feature extraction not available.")
else:
    # Extract features from main sample
    main_features = extract_advanced_features(sample_audio)

    if main_features:
        visualize_speech_features(sample_audio, main_features)
        quality_score, assessments = speech_quality_assessment(main_features)

        # Test additional samples if available (`test_samples` only exists
        # when an earlier cell created it, hence the namespace lookup).
        if locals().get('test_samples'):
            print(f"\nTesting {len(test_samples)} additional samples:")
            print("=" * 50)

            # Limit to first 2
            for i, test_file in enumerate(test_samples[:2], 1):
                print(f"\nSample {i}: {test_file.name}")
                features = extract_advanced_features(test_file)
                if not features:
                    continue
                score, _ = speech_quality_assessment(features)

## Solution 5: Real-time Speech Recognition (Optional)

In [None]:
def setup_realtime_recognition():
    """Create a ``Recognizer`` configured for microphone input.

    Returns:
        The configured recognizer, or ``None`` when the SpeechRecognition
        library is not installed.
    """
    if not SPEECH_RECOGNITION_AVAILABLE:
        print("SpeechRecognition library required for real-time recognition.")
        return None

    recognizer = sr.Recognizer()

    # Configure for real-time use; applied in this order.
    realtime_settings = (
        ('energy_threshold', 4000),
        ('dynamic_energy_threshold', True),
        ('pause_threshold', 0.5),
        ('phrase_threshold', 0.3),
    )
    for attribute, value in realtime_settings:
        setattr(recognizer, attribute, value)

    return recognizer

def test_microphone_setup(recognizer):
    """List microphones and calibrate the recognizer on the default one.

    Args:
        recognizer: A ``speech_recognition.Recognizer`` (or ``None``).

    Returns:
        bool: ``True`` when listing and the 2 s ambient-noise calibration
        both succeeded; ``False`` when there is no recognizer or any step
        raised.
    """
    if not recognizer:
        print("No recognizer available.")
        return False

    try:
        print("Testing microphone setup...")

        # List available microphones (first 5 only)
        available = sr.Microphone.list_microphone_names()
        print(f"Found {len(available)} microphone(s):")
        for index, mic_name in enumerate(available[:5]):
            print(f"  {index}: {mic_name}")

        # Calibrate against the default microphone
        with sr.Microphone() as source:
            print("\nCalibrating for ambient noise... (please be quiet)")
            recognizer.adjust_for_ambient_noise(source, duration=2)
            print(f"‚úì Energy threshold set to: {recognizer.energy_threshold}")

    except Exception as e:
        print(f"Microphone test failed: {e}")
        return False
    else:
        return True

def simulate_realtime_recognition(recognizer, duration=10):
    """Print example real-time recognition code and demo it on a file.

    No microphone is used: the function prints a code sample and then, if the
    module-level ``sample_audio`` file exists, transcribes that file (German
    first, English if the audio was not understood).

    Args:
        recognizer: A ``speech_recognition.Recognizer`` (or ``None``).
        duration: Currently unused; kept for interface compatibility.

    Returns:
        None.
    """
    if not recognizer:
        print("No recognizer available for real-time simulation.")
        return

    print("Real-time Speech Recognition Simulation")
    print("=" * 50)
    print("Note: This is a simulation. For actual real-time recognition:")
    print("1. Ensure microphone permissions are granted")
    print("2. Run in a proper Python environment (not in notebook)")
    print("3. Have stable internet connection for Google API")
    print()

    # Show example code for real-time recognition
    realtime_code = '''
# Real-time speech recognition example code:
import speech_recognition as sr
import queue
import threading

def realtime_recognition():
    recognizer = sr.Recognizer()
    microphone = sr.Microphone()
    
    # Adjust for ambient noise
    with microphone as source:
        recognizer.adjust_for_ambient_noise(source)
    
    print("Listening... (speak now)")
    
    def callback(recognizer, audio):
        try:
            # Use threading to avoid blocking
            text = recognizer.recognize_google(audio, language='de-DE')
            print(f"Recognized: {text}")
        except sr.UnknownValueError:
            print("Could not understand audio")
        except sr.RequestError as e:
            print(f"Error: {e}")
    
    # Start listening in the background
    stop_listening = recognizer.listen_in_background(microphone, callback)
    
    # Keep program running
    import time
    try:
        while True:
            time.sleep(0.1)
    except KeyboardInterrupt:
        print("Stopping...")
        stop_listening(wait_for_stop=False)

# Run the real-time recognition
# realtime_recognition()
'''

    print("Example Real-time Recognition Code:")
    print("=" * 40)
    print(realtime_code)

    # Demonstrate with audio file instead. The sample path is a module-level
    # name set by an earlier cell; look it up defensively so this function
    # does not raise NameError when that cell has not been run.
    demo_audio = globals().get('sample_audio')
    if demo_audio and demo_audio.exists():
        print("\nDemonstrating with audio file instead:")
        print("-" * 40)

        try:
            with sr.AudioFile(str(demo_audio)) as source:
                audio = recognizer.record(source)

            # Simulate real-time processing
            import time
            print("Processing audio... ", end="", flush=True)
            for _ in range(3):
                time.sleep(0.5)
                print(".", end="", flush=True)
            print(" Done!")

            # Transcribe. Only "audio not understood" triggers the English
            # retry / the "no speech" message; request failures fall through
            # to the handler below instead of being reported as silence.
            try:
                text = recognizer.recognize_google(audio, language='de-DE')
                print(f"Simulated real-time result: {text}")
            except sr.UnknownValueError:
                try:
                    text = recognizer.recognize_google(audio, language='en-US')
                    print(f"Simulated real-time result (EN): {text}")
                except sr.UnknownValueError:
                    print("No speech recognized in simulation")

        except Exception as e:
            print(f"Simulation error: {e}")

# Set up and test real-time recognition
print("Real-time Speech Recognition Setup:")
print("=" * 50)

rt_recognizer = setup_realtime_recognition()

if not rt_recognizer:
    print("Real-time recognition not available.")
else:
    mic_test = test_microphone_setup(rt_recognizer)
    simulate_realtime_recognition(rt_recognizer)

# Closing summary of the notebook
print("\n\nüéâ All speech recognition solutions completed!")
print("Key techniques covered:")
for covered_topic in (
    "‚Ä¢ Audio file processing and analysis",
    "‚Ä¢ Wav2Vec2 transformer-based ASR",
    "‚Ä¢ Multiple recognition engines comparison",
    "‚Ä¢ Advanced audio feature extraction",
    "‚Ä¢ Speech quality assessment",
    "‚Ä¢ Real-time recognition concepts",
):
    print(covered_topic)