In [None]:
# Install required packages
!pip install SpeechRecognition transformers pydub librosa
!apt install libasound2-dev portaudio19-dev libportaudio2 libportaudiocpp0 ffmpeg -y

import base64
import html
import io
import json

import librosa
import numpy as np
import soundfile as sf
import speech_recognition as sr
from google.colab import output
from IPython.display import HTML, Javascript, display
from transformers import pipeline

# Text classifier used to score the emotions expressed in a transcript.
# top_k=None asks the pipeline for the score of every label instead of only
# the best-scoring one.
emotion_analyzer = pipeline(
    task="text-classification",
    model="SamLowe/roberta-base-go_emotions",
    top_k=None,
)

# Display names for the classifier labels: every raw lowercase label maps to
# the same word with its first letter capitalised.  Insertion order follows
# the tuple below.
EMOTION_MAPPING = {
    raw_label: raw_label.capitalize()
    for raw_label in (
        'admiration', 'amusement', 'anger', 'annoyance',
        'approval', 'caring', 'confusion', 'curiosity',
        'desire', 'disappointment', 'disapproval', 'disgust',
        'embarrassment', 'excitement', 'fear', 'gratitude',
        'grief', 'joy', 'love', 'nervousness',
        'optimism', 'pride', 'realization', 'relief',
        'remorse', 'sadness', 'surprise', 'neutral',
    )
}

# Shared speech recognizer used by analyze_audio.
recognizer = sr.Recognizer()

# Recognizer tuning.
# NOTE(review): these look like microphone-listening settings; confirm they
# have any effect when recognising pre-recorded audio as this notebook does.
recognizer.pause_threshold = 0.8
recognizer.dynamic_energy_threshold = True
recognizer.energy_threshold = 4000

# Recorder UI.
# Renders a record/stop button, a status line and a results container, and
# wires them to RecordRTC.  Clicking "Stop & Analyze" base64-encodes the WAV
# recording and sends it to the kernel callback registered as 'analyzeAudio'.
#
# Compared with a naive implementation this version:
# - keeps a reference to the microphone MediaStream so its tracks can be
#   stopped once recording ends (the recorder object does not expose them);
# - does not collect per-slice blobs, because only the final blob is used;
# - rejects the promise if the FileReader fails instead of hanging forever;
# - disables the button while the kernel is analysing, preventing overlapping
#   requests.
display(HTML("""
<div id="recorder">
  <button id="recordBtn" style="padding: 10px 20px; font-size: 16px;">Start Recording</button>
  <p id="status" style="font-weight: bold; color: #333;">Ready to record</p>
  <div id="results" style="margin-top: 20px;"></div>
</div>

<script src="https://cdn.jsdelivr.net/npm/recordrtc@5.6.2/RecordRTC.min.js"></script>
<script>
let recorder = null;
let micStream = null;

// Update the status line text and colour in one place.
function setStatus(message, color) {
  const status = document.getElementById('status');
  status.textContent = message;
  status.style.color = color;
}

// Stop every microphone track so the browser releases the device.
function releaseMicrophone() {
  if (micStream) {
    micStream.getTracks().forEach(track => track.stop());
    micStream = null;
  }
}

async function startRecording() {
  try {
    micStream = await navigator.mediaDevices.getUserMedia({
      audio: {
        echoCancellation: true,
        noiseSuppression: true,
        sampleRate: 44100
      }
    });

    recorder = RecordRTC(micStream, {
      type: 'audio',
      mimeType: 'audio/wav',
      sampleRate: 44100,
      desiredSampRate: 44100,
      recorderType: RecordRTC.StereoAudioRecorder,
      numberOfAudioChannels: 1
    });

    recorder.startRecording();
    document.getElementById('recordBtn').textContent = 'Stop & Analyze';
    setStatus('Recording... Speak clearly now!', '#d35400');
    return true;
  } catch (err) {
    // Do not keep the microphone open if the recorder could not be started.
    releaseMicrophone();
    recorder = null;
    setStatus('Error: ' + err.message, '#c0392b');
    return false;
  }
}

function stopRecording() {
  return new Promise((resolve, reject) => {
    recorder.stopRecording(() => {
      const blob = recorder.getBlob();
      releaseMicrophone();

      const reader = new FileReader();
      // The data URL looks like "data:<mime>;base64,<payload>"; keep the payload.
      reader.onload = () => resolve(reader.result.split(',')[1]);
      reader.onerror = () => reject(reader.error || new Error('Could not read the recording'));
      reader.readAsDataURL(blob);

      setStatus('Processing audio...', '#16a085');
    });
  });
}

document.getElementById('recordBtn').onclick = async function() {
  if (this.textContent === 'Start Recording') {
    await startRecording();
    return;
  }

  // Block further clicks until the kernel has finished analysing this clip.
  this.disabled = true;
  this.textContent = 'Start Recording';
  try {
    const audioData = await stopRecording();
    await google.colab.kernel.invokeFunction('analyzeAudio', [audioData], {});
  } catch (err) {
    setStatus('Error: ' + (err && err.message ? err.message : err), '#c0392b');
  } finally {
    this.disabled = false;
  }
};
</script>
"""))

def enhance_audio(audio_bytes):
    """Return a pre-emphasised, normalised WAV rendering of *audio_bytes*.

    The encoded audio is loaded through librosa with ``sr=44100``, passed
    through ``librosa.effects.preemphasis`` and ``librosa.util.normalize``,
    and written back out with soundfile.

    Args:
        audio_bytes: Encoded audio file contents.

    Returns:
        The bytes of a complete WAV file (container header included, not
        bare samples).
    """
    source = io.BytesIO(audio_bytes)
    # Named sample_rate rather than sr so the speech_recognition alias is not
    # shadowed inside this function.
    samples, sample_rate = librosa.load(source, sr=44100)
    samples = librosa.util.normalize(librosa.effects.preemphasis(samples))

    wav_out = io.BytesIO()
    sf.write(wav_out, samples, sample_rate, format='WAV')
    return wav_out.getvalue()

# Dominant-emotion display names rendered in the positive / negative colour.
# Any other dominant emotion is rendered in the neutral colour.
_POSITIVE_EMOTIONS = frozenset(
    {'Joy', 'Love', 'Excitement', 'Gratitude', 'Amusement', 'Pride'}
)
_NEGATIVE_EMOTIONS = frozenset(
    {'Anger', 'Sadness', 'Fear', 'Disgust', 'Grief', 'Remorse'}
)
_EMOTION_COLORS = {
    'Positive': '#2ecc71',
    'Negative': '#e74c3c',
    'Neutral': '#3498db',
}


def _transcribe(wav_bytes):
    """Return the text spoken in *wav_bytes*, which must be a complete WAV file.

    The file is parsed with ``sr.AudioFile`` so that only the sample frames,
    not the container header, reach the recognizer.  Google recognition is
    tried first; if it fails for any reason Whisper is used instead.
    """
    with sr.AudioFile(io.BytesIO(wav_bytes)) as source:
        audio_data = recognizer.record(source)

    try:
        return recognizer.recognize_google(audio_data, language='en-US')
    except Exception as google_error:
        # Best-effort fallback, but say why instead of hiding the failure.
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
        print(f"\n⚠️ Google recognition failed ({google_error!r}); falling back to Whisper")
        return recognizer.recognize_whisper(audio_data, model='base.en')


def _rank_emotions(text):
    """Return ``{'label', 'score'}`` dicts for *text*, highest score first.

    Labels are translated through EMOTION_MAPPING; labels missing from that
    table are dropped.
    """
    raw_results = emotion_analyzer(text)[0]
    ranked = [
        {'label': EMOTION_MAPPING[item['label']], 'score': item['score']}
        for item in raw_results
        if item['label'] in EMOTION_MAPPING
    ]
    ranked.sort(key=lambda item: item['score'], reverse=True)
    return ranked


def _build_results_html(dominant_emotion, breakdown, text):
    """Return the results panel markup with every dynamic value HTML-escaped."""
    if dominant_emotion['label'] in _POSITIVE_EMOTIONS:
        emotion_type = 'Positive'
    elif dominant_emotion['label'] in _NEGATIVE_EMOTIONS:
        emotion_type = 'Negative'
    else:
        emotion_type = 'Neutral'

    color = _EMOTION_COLORS[emotion_type]
    dominant_label = html.escape(dominant_emotion['label'])
    dominant_percent = round(dominant_emotion['score'] * 100)
    bar_width = dominant_emotion['score'] * 100
    breakdown_items = ''.join(
        f"<li>{html.escape(item['label'])}: {round(item['score'] * 100)}%</li>"
        for item in breakdown
    )
    safe_text = html.escape(text)

    return f"""
            <div style="background: #f8f9fa; padding: 15px; border-radius: 5px; margin-top: 10px;">
                <h3 style="margin-top: 0;">Emotion Analysis Results</h3>

                <div style="display: flex; align-items: center; margin-bottom: 15px;">
                    <div style="font-size: 24px; margin-right: 15px;">
                        <strong>Final Emotion:</strong>
                        <span style="color: {color}">
                            {dominant_label} ({dominant_percent}%)
                        </span>
                    </div>
                    <div style="background: #e9ecef; height: 10px; flex-grow: 1; border-radius: 5px;">
                        <div style="background: {color}; width: {bar_width:.2f}%; height: 100%; border-radius: 5px;"></div>
                    </div>
                </div>

                <p><strong>Emotion Breakdown:</strong></p>
                <ul style="padding-left: 20px; margin-bottom: 15px;">
                    {breakdown_items}
                </ul>

                <div style="background: #f1f1f1; padding: 10px; border-radius: 5px;">
                    <p style="margin: 0;"><strong>Recognized Text:</strong> {safe_text}</p>
                </div>
            </div>
        """


def analyze_audio(audio_base64):
    """Transcribe a base64-encoded recording and display its emotions.

    The recording is decoded, passed through enhance_audio, transcribed, and
    scored by emotion_analyzer.  The transcript, the five highest-scoring
    emotions and the dominant emotion are printed, and the same information
    is rendered into the page's ``results`` element.

    Args:
        audio_base64: Base64 text of the recorded audio file.

    Returns:
        None.  Any failure is caught, printed, and reflected in the page's
        status line rather than raised to the caller.
    """
    try:
        audio_bytes = base64.b64decode(audio_base64)
        enhanced_audio = enhance_audio(audio_bytes)

        text = _transcribe(enhanced_audio)
        if not text or not text.strip():
            # Scoring an empty transcript would report meaningless emotions.
            raise ValueError("No speech was recognized in the recording")

        print(f"\n🎤 Recognized text: {text}")

        results = _rank_emotions(text)
        if not results:
            raise ValueError("The emotion model returned no known labels")

        dominant_emotion = results[0]
        top_five = results[:5]

        emotion_results = "\n".join(
            f"• {result['label']}: {result['score']:.2%}" for result in top_five
        )
        print(f"\n🎭 Detected Emotions:\n{emotion_results}")
        print(f"\n🌟 Dominant Emotion: {dominant_emotion['label']} ({dominant_emotion['score']:.2%} confidence)")

        results_html = _build_results_html(dominant_emotion, top_five, text)

        # json.dumps yields a valid JavaScript string literal, so quotes,
        # backticks, "${" or newlines in the markup cannot break the script.
        js_code = (
            "document.getElementById('status').textContent = 'Analysis complete!';\n"
            "document.getElementById('status').style.color = '#27ae60';\n"
            f"document.getElementById('results').innerHTML = {json.dumps(results_html)};\n"
        )
        display(Javascript(js_code))

    except Exception as e:
        print(f"\n❌ Error: {str(e)}")
        display(Javascript("""
        document.getElementById('status').textContent = 'Error in analysis';
        document.getElementById('status').style.color = '#c0392b';
        document.getElementById('results').innerHTML = '';
        """))

# Expose analyze_audio to the browser under the name 'analyzeAudio'; the record
# button's click handler calls it via google.colab.kernel.invokeFunction with
# the base64-encoded recording as its only argument.
output.register_callback('analyzeAudio', analyze_audio)

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
libasound2-dev is already the newest version (1.2.6.1-1ubuntu1).
libportaudio2 is already the newest version (19.6.0-1.1).
libportaudiocpp0 is already the newest version (19.6.0-1.1).
portaudio19-dev is already the newest version (19.6.0-1.1).
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


Device set to use cpu



🎤 Recognized text: I am feeling very tired today

🎭 Detected Emotions:
• Sadness: 49.79%
• Neutral: 20.37%
• Disappointment: 14.84%
• Annoyance: 3.68%
• Approval: 2.11%

🌟 Dominant Emotion: Sadness (49.79% confidence)


<IPython.core.display.Javascript object>


🎤 Recognized text: I am feeling very tired today

🎭 Detected Emotions:
• Sadness: 49.79%
• Neutral: 20.37%
• Disappointment: 14.84%
• Annoyance: 3.68%
• Approval: 2.11%

🌟 Dominant Emotion: Sadness (49.79% confidence)


<IPython.core.display.Javascript object>