In [None]:
# Notebook placeholder for 07_speech_recognition.ipynb

# Exercise 7: Speech Recognition

Welcome to speech recognition! You'll learn how to convert spoken German into text using modern deep learning models.

## Learning Objectives
By the end of this exercise, you will be able to:
1. **Audio Processing**: Handle audio files, sampling rates, and preprocessing
2. **German ASR Models**: Use pre-trained German speech recognition models
3. **Real-time Processing**: Implement streaming speech recognition
4. **Audio Feature Extraction**: Understand MFCCs, spectrograms, and mel-scale features
5. **Model Comparison**: Compare different ASR approaches (Wav2Vec2, Whisper, DeepSpeech)
6. **Post-processing**: Clean and improve transcription results

## What You'll Build
- German speech-to-text system
- Audio file batch processor
- Real-time speech recognition interface
- Transcription quality evaluator
- Multi-speaker recognition system

## Applications
- **Voice Assistants**: Convert voice commands to text
- **Meeting Transcription**: Automatic meeting minutes generation
- **Accessibility**: Voice-controlled interfaces for users with disabilities
- **Content Creation**: Podcast and video transcription services

**Ready to give voice to your applications?** 🎤🔊

## Exercise 1: German Speech Recognition Pipeline

**Goal**: Build a complete German speech recognition system using pre-trained models.

**Your Tasks**: 
1. Set up audio processing pipeline
2. Load and test German ASR models
3. Process different audio types and qualities
4. Evaluate transcription accuracy

**Hints**:
- Use a 16 kHz sampling rate; most ASR models expect it
- Wav2Vec2 models often work best for German
- Longer audio files may need chunking
- Background noise significantly affects accuracy

### Setup and Imports

In [None]:
# Essential imports for speech recognition
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import warnings
# Blanket-silence library warnings so the exercise output stays readable.
# NOTE: this also hides deprecation notices; disable it when debugging.
warnings.filterwarnings('ignore')

# Try to import audio processing libraries.
# soundfile raises OSError (not ImportError) when the native libsndfile
# library cannot be loaded, so both are treated as "not available".
try:
    import soundfile as sf
    import librosa
    AUDIO_LIBS_AVAILABLE = True
    print("✅ Audio processing libraries (soundfile, librosa) available!")
except (ImportError, OSError):
    print("❌ Audio libraries not available. Install with: pip install soundfile librosa")
    AUDIO_LIBS_AVAILABLE = False

# Try to import speech recognition libraries.
# OSError is caught as well because a torch install with broken native
# libraries can fail that way at import time.
try:
    from transformers import pipeline, AutoProcessor, AutoModelForSpeechSeq2Seq
    import torch
    TRANSFORMERS_AVAILABLE = True
    print("✅ Transformers library available for ASR!")

    # Check device availability
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"   Using device: {device}")
except (ImportError, OSError):
    print("❌ Transformers not available. Install with: pip install transformers torch")
    TRANSFORMERS_AVAILABLE = False
    # Plain string fallback: torch.device cannot be built without torch.
    device = 'cpu'

# Try speech_recognition library as alternative.
# The module is bound to the alias `sr`; avoid reusing that name for sample rates.
try:
    import speech_recognition as sr
    SPEECH_RECOGNITION_AVAILABLE = True
    print("✅ SpeechRecognition library available!")
except ImportError:
    print("❌ SpeechRecognition not available. Install with: pip install SpeechRecognition")
    SPEECH_RECOGNITION_AVAILABLE = False

# Setup directories (relative to the directory the notebook is run from)
PROJECT_ROOT = Path.cwd()
AUDIO_DIR = PROJECT_ROOT / 'data' / 'audio'
AUDIO_DIR.mkdir(parents=True, exist_ok=True)

# Summarize which optional toolkits were found.
print("\n🎤 Speech Recognition Toolkit Status:")
print(f"   Audio processing: {'Available' if AUDIO_LIBS_AVAILABLE else 'Not available'}")
print(f"   Transformers ASR: {'Available' if TRANSFORMERS_AVAILABLE else 'Not available'}")
print(f"   SpeechRecognition: {'Available' if SPEECH_RECOGNITION_AVAILABLE else 'Not available'}")
print(f"   Audio directory: {AUDIO_DIR}")

def create_sample_audio_files(sample_rate=16000, seed=42):
    """Create small synthetic WAV files for smoke-testing the audio pipeline.

    Three clips are written to ``AUDIO_DIR``: two sine tones and one
    white-noise clip. A file that already exists is reused untouched, so
    ``sample_rate`` and ``seed`` only affect files that are missing.

    Args:
        sample_rate (int): Sampling rate in Hz for newly created files.
            Defaults to 16000, the rate used throughout this exercise.
        seed (int | None): Seed for the noise generator so the noise clip is
            reproducible. Pass ``None`` for fresh noise on every call.

    Returns:
        list[Path]: Paths of the sample files in creation order, or an empty
        list when the audio libraries are not available.
    """
    if not AUDIO_LIBS_AVAILABLE:
        print("Cannot create sample audio - audio libraries not available")
        return []

    print("\n🔊 Creating Sample Audio Files...")

    amplitude = 0.1  # keep the synthetic signals well below full scale
    rng = np.random.default_rng(seed)

    # 'freq' of None marks the white-noise clip; otherwise a sine tone in Hz.
    sample_configs = [
        {'name': 'tone_440hz.wav', 'freq': 440, 'duration': 1.0, 'description': '440Hz tone (1 second)'},
        {'name': 'tone_880hz.wav', 'freq': 880, 'duration': 0.5, 'description': '880Hz tone (0.5 seconds)'},
        {'name': 'noise_sample.wav', 'freq': None, 'duration': 0.8, 'description': 'White noise sample'}
    ]

    sample_files = []

    for config in sample_configs:
        file_path = AUDIO_DIR / config['name']

        if file_path.exists():
            print(f"   Found existing: {config['name']}")
        else:
            duration = config['duration']
            n_samples = int(sample_rate * duration)

            if config['freq'] is None:
                # Create white noise
                signal = amplitude * rng.standard_normal(n_samples)
            else:
                # Create sine wave tone; endpoint excluded so samples are
                # spaced exactly 1 / sample_rate apart.
                t = np.linspace(0, duration, n_samples, False)
                signal = amplitude * np.sin(2 * np.pi * config['freq'] * t)

            sf.write(str(file_path), signal, sample_rate)
            print(f"   Created: {config['name']} - {config['description']}")

        sample_files.append(file_path)

    return sample_files

# Create (or reuse) the sample audio files. The guard keeps this step silent
# when the audio libraries are missing; the list is then simply empty.
if AUDIO_LIBS_AVAILABLE:
    sample_audio_files = create_sample_audio_files()
else:
    sample_audio_files = []
print(f"\n🎵 Ready with {len(sample_audio_files)} sample audio files!")

In [None]:
def load_german_asr_models(model_ids=None):
    """
    Load German ASR pipelines and report which ones are usable.

    Each model is loaded independently; a failure is recorded in the result
    instead of aborting the remaining loads.

    Args:
        model_ids (dict | None): Optional mapping of a short model name to a
            Hugging Face model id. Defaults to the built-in Wav2Vec2 selection.

    Returns:
        dict: Model name -> entry with the keys 'pipeline' (None on failure),
        'model_id' and 'status' ('loaded' or 'failed'); failed entries also
        carry 'error'. Empty when transformers is not available.
    """
    # TODO (exercise): extend this baseline with Whisper and other German
    # speech models; only Wav2Vec2 checkpoints are loaded by default.

    if not TRANSFORMERS_AVAILABLE:
        print("Transformers library not available for ASR model loading")
        return {}

    print("🤖 Loading German ASR Models...")
    print("=" * 50)

    if model_ids is None:
        model_ids = {
            'wav2vec2_german': 'jonatasgrosman/wav2vec2-large-xlsr-53-german',
            'wav2vec2_german_cv': 'facebook/wav2vec2-large-xlsr-53-german',
            # NOTE(review): this looks like the pre-training checkpoint without
            # a fine-tuned CTC head, so it may fail to load or give unusable
            # text - confirm against the model card.
            'wav2vec2_multilingual': 'facebook/wav2vec2-large-xlsr-53',
        }

    # The pipeline takes a device index: 0 is the first GPU, -1 is the CPU.
    device_index = 0 if torch.cuda.is_available() else -1

    loaded_models = {}

    for model_name, model_id in model_ids.items():
        print(f"\nLoading {model_name} ({model_id})...")

        try:
            # Create ASR pipeline
            asr_pipeline = pipeline(
                "automatic-speech-recognition",
                model=model_id,
                device=device_index,
                return_timestamps=False
            )
        except Exception as e:
            # Best effort: record the failure and keep loading the others.
            print(f"❌ Failed to load {model_name}: {e}")
            loaded_models[model_name] = {
                'pipeline': None,
                'model_id': model_id,
                'status': 'failed',
                'error': str(e)
            }
            continue

        loaded_models[model_name] = {
            'pipeline': asr_pipeline,
            'model_id': model_id,
            'status': 'loaded'
        }
        print(f"✅ Successfully loaded {model_name}")

    loaded_count = sum(1 for entry in loaded_models.values() if entry['status'] == 'loaded')
    print(f"\n🎯 Successfully loaded {loaded_count} out of {len(model_ids)} models")

    return loaded_models

def analyze_audio_file(audio_path):
    """
    Report basic properties of an audio file and plot waveform and spectrogram.

    The file is loaded with librosa at its native sampling rate (``sr=None``).
    Any error during loading, analysis or plotting is reported on stdout and
    results in an empty dict rather than an exception.

    Args:
        audio_path (Path | str): Path to the audio file.

    Returns:
        dict: 'duration' (seconds), 'sample_rate' (Hz), 'samples',
        'max_amplitude', 'rms_energy' and the loaded 'audio_data'; empty when
        the audio libraries are unavailable or the analysis failed.
    """
    # TODO (exercise): channel count and silence/speech segmentation are not
    # implemented yet.

    if not AUDIO_LIBS_AVAILABLE:
        print("Audio libraries not available for audio analysis")
        return {}

    try:
        audio_path = Path(audio_path)  # accept plain string paths as well

        print(f"🎵 Analyzing Audio: {audio_path.name}")
        print("=" * 40)

        # Load audio file without resampling
        audio_data, sample_rate = librosa.load(str(audio_path), sr=None)
        n_samples = len(audio_data)

        # Calculate properties
        duration = n_samples / sample_rate
        max_amplitude = np.max(np.abs(audio_data))
        rms_energy = np.sqrt(np.mean(audio_data**2))

        print("📊 Audio Properties:")
        print(f"   Duration: {duration:.2f} seconds")
        print(f"   Sample rate: {sample_rate} Hz")
        print(f"   Samples: {n_samples}")
        print(f"   Max amplitude: {max_amplitude:.4f}")
        print(f"   RMS energy: {rms_energy:.4f}")

        # Create visualizations
        fig, axes = plt.subplots(2, 1, figsize=(12, 8))
        waveform_ax, spectrogram_ax = axes

        # Waveform plot. Sample i sits at i / sample_rate; an
        # endpoint-inclusive linspace would stretch the axis by one sample.
        time_axis = np.arange(n_samples) / sample_rate
        waveform_ax.plot(time_axis, audio_data)
        waveform_ax.set_title(f'Waveform - {audio_path.name}')
        waveform_ax.set_xlabel('Time (seconds)')
        waveform_ax.set_ylabel('Amplitude')
        waveform_ax.grid(True)

        # Spectrogram: STFT magnitude in dB, frequency axis up to Nyquist
        spectrogram = librosa.stft(audio_data)
        spectrogram_db = librosa.amplitude_to_db(np.abs(spectrogram))

        img = spectrogram_ax.imshow(spectrogram_db, aspect='auto', origin='lower',
                                    extent=[0, duration, 0, sample_rate/2])
        spectrogram_ax.set_title('Spectrogram')
        spectrogram_ax.set_xlabel('Time (seconds)')
        spectrogram_ax.set_ylabel('Frequency (Hz)')
        plt.colorbar(img, ax=spectrogram_ax, label='Magnitude (dB)')

        plt.tight_layout()
        plt.show()

        return {
            'duration': duration,
            'sample_rate': sample_rate,
            'samples': n_samples,
            'max_amplitude': max_amplitude,
            'rms_energy': rms_energy,
            'audio_data': audio_data
        }

    except Exception as e:
        # Best effort for a teaching notebook: report and carry on.
        print(f"❌ Audio analysis failed: {e}")
        return {}

def transcribe_audio_multiple_models(audio_path, asr_models):
    """
    Transcribe one audio file with every loaded ASR model and collect the results.

    Args:
        audio_path (Path | str): Path to the audio file. It is handed to each
            pipeline as a string.
        asr_models (dict): Mapping of model name to an entry with the keys
            'pipeline', 'model_id' and 'status'. Entries whose status is not
            'loaded' are skipped and do not appear in the result.

    Returns:
        dict: Model name -> {'transcription', 'confidence', 'model_id',
        'status'}. A model that raised gets status 'failed', an empty
        transcription and an extra 'error' message. 'confidence' is 'N/A'
        unless the pipeline result is a dict containing a 'confidence' key.
    """
    # TODO (exercise): chunk long recordings and handle other audio formats.

    audio_path = Path(audio_path)  # accept plain string paths as well

    print("🎙️  Multi-Model Speech Recognition")
    print("=" * 50)
    print(f"Audio file: {audio_path.name}")
    print()

    results = {}

    for model_name, model_data in asr_models.items():
        if model_data['status'] != 'loaded':
            print(f"⏭️  Skipping {model_name}: {model_data['status']}")
            continue

        try:
            print(f"🔄 Transcribing with {model_name}...")

            # Named asr_pipeline so the imported `pipeline` factory is not shadowed.
            asr_pipeline = model_data['pipeline']
            result = asr_pipeline(str(audio_path))

            # Extract text and confidence (if available)
            if isinstance(result, dict):
                transcription = result.get('text', '')
                confidence = result.get('confidence', 'N/A')
            else:
                # Unexpected result shape: keep its string form for inspection.
                transcription = str(result)
                confidence = 'N/A'

            results[model_name] = {
                'transcription': transcription,
                'confidence': confidence,
                'model_id': model_data['model_id'],
                'status': 'success'
            }

            print(f"✅ {model_name}: '{transcription}'")
            if confidence != 'N/A':
                print(f"   Confidence: {confidence}")

        except Exception as e:
            # One failing model must not stop the comparison.
            print(f"❌ {model_name} failed: {e}")
            results[model_name] = {
                'transcription': '',
                'confidence': 'N/A',
                'model_id': model_data['model_id'],
                'status': 'failed',
                'error': str(e)
            }

        print()

    return results

# Load German ASR models (downloads the checkpoints on first use).
print("🚀 Initializing German Speech Recognition System...")
asr_models = load_german_asr_models()

# Analyze the first sample audio file only, to keep the demo short.
if sample_audio_files and AUDIO_LIBS_AVAILABLE:
    print("\n🔍 Analyzing Sample Audio Files...")
    audio_file = sample_audio_files[0]
    audio_analysis = analyze_audio_file(audio_file)

# Test speech recognition on the first sample file.
# asr_models also holds entries for models that failed to load; those are
# skipped inside transcribe_audio_multiple_models.
if asr_models and sample_audio_files:
    print("\n🎯 Testing Speech Recognition...")
    audio_file = sample_audio_files[0]
    transcription_results = transcribe_audio_multiple_models(audio_file, asr_models)
else:
    print("\n⚠️  No models or audio files available for testing")
    print("This is expected for synthetic audio - real speech audio needed for meaningful transcription")

## Next steps (simple)
- Replace the placeholder audio with a real recording in `data/audio/`
- Install `transformers`, `torch`, `librosa`, and `soundfile` in your virtual environment, then re-run the transcription cell