# THE EMPATHY ENGINE

## Import Libraries and Setup

In [1]:
import os
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'sample.json'


In [None]:

# Libraries
%pip install --upgrade google-cloud-texttospeech transformers torch numpy pandas tqdm


In [2]:
import warnings
warnings.filterwarnings('ignore')
from pathlib import Path
from typing import Dict, List
from datetime import datetime
import numpy as np
import torch
from transformers import pipeline
from google.cloud import texttospeech
from IPython.display import Audio


In [10]:

class EmotionToVoiceMapper:
    """Maps detected emotions to expressive prosody using moderate (recommended) ranges."""
    def __init__(self):
        self.emotion_profiles = {
            'joy':      {'speed': 1.35, 'pitch_shift': 5,   'volume_db': 5,   'description': 'Extremely bright, highly energetic'},
            'anger':    {'speed': 1.28, 'pitch_shift': 2,   'volume_db': 8,   'description': 'Very forceful, aggressive'},
            'sadness':  {'speed': 0.75, 'pitch_shift': -3,  'volume_db': -3,  'description': 'Slow, low, withdrawn'},
            'fear':     {'speed': 1.42, 'pitch_shift': 4,   'volume_db': 3,   'description': 'Very rapid, panicked, urgent'},
            'surprise': {'speed': 1.38, 'pitch_shift': 7,   'volume_db': 4,   'description': 'Highly elevated, very expressive'},
            'disgust':  {'speed': 0.70, 'pitch_shift': -3,  'volume_db': -4,  'description': 'Very flat, heavily muted'},
            'neutral':  {'speed': 1.0,  'pitch_shift': 0,   'volume_db': 0,   'description': 'Balanced, even baseline'}
        }

    def get_parameters(self, emotion: str, intensity: float) -> Dict:
        profile = self.emotion_profiles.get(
            emotion.lower(), self.emotion_profiles['neutral']
        )
        intensity_factor = np.clip((intensity - 0.3) / 0.7, 0.3, 1.0)
        return {
            'speed': 1.0 + (profile['speed'] - 1.0) * intensity_factor,
            'pitch_shift': int(profile['pitch_shift'] * intensity_factor),
            'volume_db': float(profile['volume_db'] * intensity_factor),
            'emotion': emotion, 'intensity': intensity,
            'description': profile['description']
        }
    def display_profiles(self):
        import pandas as pd
        data = []
        for emotion, profile in self.emotion_profiles.items():
            data.append({
                'Emotion': emotion.capitalize(),
                'Speed': f"{profile['speed']:.2f}x",
                'Pitch': f"{profile['pitch_shift']:+d} ST",
                'Volume': f"{profile['volume_db']:+.1f} dB",
                'Description': profile['description']
            })
        return pd.DataFrame(data)

class EmpathyEngine:
    """Emotion-Aware Text-to-Speech (Google Cloud TTS)"""
    def __init__(self, output_dir: str = 'outputs'):
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(exist_ok=True)
        device = 0 if torch.cuda.is_available() else -1
        print(f"   Loading emotion detection model on {'GPU' if device == 0 else 'CPU'}...")
        self.emotion_classifier = pipeline(
            "text-classification",
            model="j-hartmann/emotion-english-distilroberta-base",
            return_all_scores=True,
            device=device
        )
        self.voice_mapper = EmotionToVoiceMapper()
    def detect_emotion(self, text: str) -> tuple:
        results = self.emotion_classifier(text)[0]
        top_emotion = max(results, key=lambda x: x['score'])
        return (
            top_emotion['label'],
            top_emotion['score'],
            sorted(results, key=lambda x: x['score'], reverse=True)
        )
    def synthesize(
        self,
        text: str,
        output_filename: str = 'output.mp3',
        verbose: bool = True,
        display_audio: bool = True
    ) -> Dict:
        if verbose:
            print(f"{'='*70}")
            print(f"Input: '{text}'")
            print(f"{'='*70}")
        try:
            emotion, intensity, all_scores = self.detect_emotion(text)
            if verbose:
                print(f"\nEmotion Analysis:")
                print(f"   Primary Emotion: {emotion.upper()}")
                print(f"   Confidence: {intensity:.2%}")
                print(f"\n   Top 3 Emotions:")
                for score in all_scores[:3]:
                    bar = '█' * int(score['score'] * 20)
                    print(f"      {score['label']:>10}: {bar} {score['score']:.2%}")
            params = self.voice_mapper.get_parameters(emotion, intensity)
            if verbose:
                print(f"\n Voice Configuration:")
                print(f"   Speed: {params['speed']:.2f}x ({params['description']})")
                print(f"   Pitch Shift: {params['pitch_shift']:+d} semitones")
                print(f"   Volume: {params['volume_db']:+.1f} dB")
            # Google Cloud TTS Synthesis
            client = texttospeech.TextToSpeechClient()
            synthesis_input = texttospeech.SynthesisInput(text=text)
            voice = texttospeech.VoiceSelectionParams(
                language_code="en-US",
                name="en-US-Neural2-G", # You may choose any supported voice
            )
            audio_config = texttospeech.AudioConfig(
                audio_encoding=texttospeech.AudioEncoding.MP3,
                speaking_rate=params['speed'],
                pitch=params['pitch_shift'],
                volume_gain_db=params['volume_db']
            )
            response = client.synthesize_speech(
                input=synthesis_input,
                voice=voice,
                audio_config=audio_config
            )
            output_path = self.output_dir / output_filename
            with open(output_path, "wb") as out:
                out.write(response.audio_content)
            if verbose:
                print(f"Audio saved: {output_path}")
                print(f"{'='*70}\n")
            if display_audio:
                display(Audio(str(output_path)))
            return {
                'emotion': emotion,
                'intensity': intensity,
                'parameters': params,
                'output_file': str(output_path),
                'all_scores': all_scores,
                'success': True,
                'timestamp': datetime.now().isoformat()
            }
        except Exception as e:
            print(f"Error: {str(e)}")
            return {
                'success': False,
                'error': str(e)
            }
    def batch_synthesize(self, texts: List[str], prefix: str = 'output') -> List[Dict]:
        results = []
        print(f"Processing {len(texts)} texts...\n")
        for i, text in enumerate(texts, 1):
            print(f"[{i}/{len(texts)}]")
            result = self.synthesize(
                text,
                output_filename=f'{prefix}_{i}.mp3',
                verbose=True,
                display_audio=False
            )
            results.append(result)
        return results
    def compare_emotions(self, text: str):
        emotion, _, _ = self.detect_emotion(text)
        print(f"Detected emotion: {emotion}\n")
        print("How different emotions would modulate this text:\n")
        for emo in ['joy', 'anger', 'sadness', 'fear', 'surprise', 'neutral', 'disgust']:
            params = self.voice_mapper.get_parameters(emo, 0.9)
            print(f"{emo.capitalize():>10}: Speed={params['speed']:.2f}x, "
                  f"Pitch={params['pitch_shift']:+d}ST, "
                  f"Volume={params['volume_db']:+.1f}dB")


In [5]:
engine = EmpathyEngine(output_dir='outputs')
text = "This is the best news ever!"
engine.synthesize(text)


   Loading emotion detection model on CPU...


Device set to use cpu


Input: 'This is the best news ever!'

Emotion Analysis:
   Primary Emotion: JOY
   Confidence: 59.61%

   Top 3 Emotions:
             joy: ███████████ 59.61%
        surprise: ███ 19.65%
         neutral: ███ 17.30%

 Voice Configuration:
   Speed: 1.05x (Bright, energetic)
   Pitch Shift: +0 semitones
   Volume: +0.8 dB
Audio saved: outputs/output.mp3



{'emotion': 'joy',
 'intensity': 0.5961117148399353,
 'parameters': {'speed': np.float64(1.0507620082582747),
  'pitch_shift': 0,
  'volume_db': 0.8460334709712438,
  'emotion': 'joy',
  'intensity': 0.5961117148399353,
  'description': 'Bright, energetic'},
 'output_file': 'outputs/output.mp3',
 'all_scores': [{'label': 'joy', 'score': 0.5961117148399353},
  {'label': 'surprise', 'score': 0.19654116034507751},
  {'label': 'neutral', 'score': 0.17297565937042236},
  {'label': 'anger', 'score': 0.021357888355851173},
  {'label': 'disgust', 'score': 0.008448890410363674},
  {'label': 'sadness', 'score': 0.003268597414717078},
  {'label': 'fear', 'score': 0.0012960518943145871}],
 'success': True,
 'timestamp': '2025-11-07T20:08:41.934257'}

In [11]:

emotion_tests = [
    ("After months of hard work and anticipation, I can't express how unbelievably excited I am about this incredible opportunity.", "Joy"),
    ("I'm furious about what just happened. The fact that someone could act so irresponsibly is unacceptable!", "Anger"),
    ("I've felt an overwhelming sense of loneliness. It's been ages since anyone truly listened to me.", "Sadness"),
    ("Ever since I read the news, I've been feeling anxious. Something terrifying could happen soon.", "Fear"),
    ("Seeing all my friends gathered for a surprise celebration truly astonished me.", "Surprise"),
    ("The trash piled up with that foul smell was absolutely disgusting. I can't believe anyone would let things get so bad.", "Disgust"),
    ("The next company meeting is at 3 PM. Please submit your reports ahead of time.", "Neutral")
]
results = []
for i, (text, expected) in enumerate(emotion_tests, 1):
    print(f"\n[{i}/{len(emotion_tests)}] Expected: {expected}")
    result = engine.synthesize(
        text,
        output_filename=f'emotion_test_{i}.mp3',
        display_audio=True
    )
    results.append(result)



[1/7] Expected: Joy
Input: 'After months of hard work and anticipation, I can't express how unbelievably excited I am about this incredible opportunity.'

Emotion Analysis:
   Primary Emotion: JOY
   Confidence: 60.39%

   Top 3 Emotions:
             joy: ████████████ 60.39%
        surprise: ██████ 31.83%
         neutral:  3.75%

 Voice Configuration:
   Speed: 1.05x (Bright, energetic)
   Pitch Shift: +0 semitones
   Volume: +0.9 dB
Audio saved: outputs/emotion_test_1.mp3




[2/7] Expected: Anger
Input: 'I'm furious about what just happened. The fact that someone could act so irresponsibly is unacceptable!'

Emotion Analysis:
   Primary Emotion: ANGER
   Confidence: 98.93%

   Top 3 Emotions:
           anger: ███████████████████ 98.93%
         disgust:  0.28%
            fear:  0.27%

 Voice Configuration:
   Speed: 1.13x (Forceful, assertive)
   Pitch Shift: +0 semitones
   Volume: +3.9 dB
Audio saved: outputs/emotion_test_2.mp3




[3/7] Expected: Sadness
Input: 'I've felt an overwhelming sense of loneliness. It's been ages since anyone truly listened to me.'

Emotion Analysis:
   Primary Emotion: FEAR
   Confidence: 66.03%

   Top 3 Emotions:
            fear: █████████████ 66.03%
         sadness: ████ 23.23%
         neutral: █ 5.01%

 Voice Configuration:
   Speed: 1.09x (Tense, anxious)
   Pitch Shift: +1 semitones
   Volume: +0.5 dB
Audio saved: outputs/emotion_test_3.mp3




[4/7] Expected: Fear
Input: 'Ever since I read the news, I've been feeling anxious. Something terrifying could happen soon.'

Emotion Analysis:
   Primary Emotion: FEAR
   Confidence: 99.32%

   Top 3 Emotions:
            fear: ███████████████████ 99.32%
        surprise:  0.16%
         neutral:  0.16%

 Voice Configuration:
   Speed: 1.17x (Tense, anxious)
   Pitch Shift: +2 semitones
   Volume: +1.0 dB
Audio saved: outputs/emotion_test_4.mp3




[5/7] Expected: Surprise
Input: 'Seeing all my friends gathered for a surprise celebration truly astonished me.'

Emotion Analysis:
   Primary Emotion: SURPRISE
   Confidence: 97.66%

   Top 3 Emotions:
        surprise: ███████████████████ 97.66%
             joy:  0.98%
         neutral:  0.50%

 Voice Configuration:
   Speed: 1.14x (Elevated, expressive)
   Pitch Shift: +2 semitones
   Volume: +1.9 dB
Audio saved: outputs/emotion_test_5.mp3




[6/7] Expected: Disgust
Input: 'The trash piled up with that foul smell was absolutely disgusting. I can't believe anyone would let things get so bad.'

Emotion Analysis:
   Primary Emotion: DISGUST
   Confidence: 98.85%

   Top 3 Emotions:
         disgust: ███████████████████ 98.85%
            fear:  0.30%
           anger:  0.30%

 Voice Configuration:
   Speed: 0.92x (Flat, muted)
   Pitch Shift: +0 semitones
   Volume: -1.0 dB
Audio saved: outputs/emotion_test_6.mp3




[7/7] Expected: Neutral
Input: 'The next company meeting is at 3 PM. Please submit your reports ahead of time.'

Emotion Analysis:
   Primary Emotion: NEUTRAL
   Confidence: 87.29%

   Top 3 Emotions:
         neutral: █████████████████ 87.29%
        surprise: █ 5.98%
            fear:  3.67%

 Voice Configuration:
   Speed: 1.00x (Balanced, even)
   Pitch Shift: +0 semitones
   Volume: +0.0 dB
Audio saved: outputs/emotion_test_7.mp3



In [7]:
def test_custom_text(text):
    return engine.synthesize(text, output_filename='custom.mp3')
# Example:
test_custom_text("You are amazing! What a wonderful surprise!")


Input: 'You are amazing! What a wonderful surprise!'

Emotion Analysis:
   Primary Emotion: SURPRISE
   Confidence: 48.41%

   Top 3 Emotions:
        surprise: █████████ 48.41%
             joy: █████████ 46.58%
         neutral:  3.32%

 Voice Configuration:
   Speed: 1.04x (Elevated, expressive)
   Pitch Shift: +0 semitones
   Volume: +0.6 dB
Audio saved: outputs/custom.mp3



{'emotion': 'surprise',
 'intensity': 0.4840720295906067,
 'parameters': {'speed': np.float64(1.045),
  'pitch_shift': 0,
  'volume_db': 0.6,
  'emotion': 'surprise',
  'intensity': 0.4840720295906067,
  'description': 'Elevated, expressive'},
 'output_file': 'outputs/custom.mp3',
 'all_scores': [{'label': 'surprise', 'score': 0.4840720295906067},
  {'label': 'joy', 'score': 0.46582773327827454},
  {'label': 'neutral', 'score': 0.0332486517727375},
  {'label': 'sadness', 'score': 0.005888821557164192},
  {'label': 'anger', 'score': 0.005263343453407288},
  {'label': 'fear', 'score': 0.004395237658172846},
  {'label': 'disgust', 'score': 0.001304212724789977}],
 'success': True,
 'timestamp': '2025-11-07T20:09:39.097761'}