In [5]:
!pip uninstall -y elevenlabs
!pip install -U elevenlabs


Found existing installation: elevenlabs 1.54.0
Uninstalling elevenlabs-1.54.0:
  Successfully uninstalled elevenlabs-1.54.0
Collecting elevenlabs
  Using cached elevenlabs-1.54.0-py3-none-any.whl.metadata (7.3 kB)
Using cached elevenlabs-1.54.0-py3-none-any.whl (347 kB)
Installing collected packages: elevenlabs
Successfully installed elevenlabs-1.54.0


In [6]:
import elevenlabs
# Sanity check: show which SDK version the kernel actually imported after the
# uninstall/reinstall cell above.
print(elevenlabs.__version__)  # Should be the latest version


1.54.0


In [7]:
! python -m pip install pydub



In [8]:
import json
from elevenlabs.client import ElevenLabs
from elevenlabs import VoiceSettings
from pydub import AudioSegment
import os


In [None]:
# Prefer a key supplied through the environment so a real secret never has to be
# pasted into (and saved with) the notebook. The literal is only a placeholder,
# kept as the fallback so the cell behaves as before when the variable is unset.
api_key = os.environ.get("ELEVENLABS_API_KEY", "eleven_labs_api_key")
api = ElevenLabs(api_key=api_key)


In [10]:
from elevenlabs.client import VoiceSettings

# Logical voice slot -> ElevenLabs voice ID (passed as `voice_id` to
# api.text_to_speech.convert). The trailing comments give the voice's display
# name and, where noted, its character.
VOICE_MAPPING = {
    "male_1": "pqHfZKP75CvOlQylNhV4",  # Bill
    "male_2": "iP95p4xoKVk53GoZ742B",  # Chris - casual
    "male_3": "bVMeCyTHy58xNoL34h3p",  # Jeremy - excited
    "female_1": "Xb7hH8MSUJpSbSDYk0k2",  # Alice - confident
    "female_2": "MF3mGyEYCl7XYWbV9V6O",  # Ellie - emotional
    "female_3": "XB0fDUnXU5powFXDhCwa",   # Charlotte - seductive
    "narrator": "nPczCjzI2devNBz1zQrb" # Brian - reserved for the "narrator" character
}


# Emotion label -> VoiceSettings preset sent with each text-to-speech request.
# generate_speech looks the emotion up here and falls back to "neutral" for any
# label that is not listed.
# NOTE(review): the values presumably sit in the 0.0-1.0 range the API accepts
# for stability / similarity_boost / style - confirm against the SDK docs.
EMOTION_SETTINGS = {
    "neutral": VoiceSettings(stability=0.5, similarity_boost=0.8, style=0.5),
    "happy": VoiceSettings(stability=0.3, similarity_boost=0.9, style=0.9),
    "sad": VoiceSettings(stability=0.6, similarity_boost=0.5, style=0.7),
    "angry": VoiceSettings(stability=0.2, similarity_boost=1.0, style=0.8),
    "fearful": VoiceSettings(stability=0.4, similarity_boost=0.9, style=0.9),
    "surprised": VoiceSettings(stability=0.3, similarity_boost=1.0, style=1.0),
    "disgusted": VoiceSettings(stability=0.7, similarity_boost=0.5, style=0.6),
}



In [11]:
import random

# Cache of character name -> assigned ElevenLabs voice ID, filled lazily by
# get_unique_character_voice so a character keeps the same voice on every line.
character_voice_mapping = {}

# Pools of VOICE_MAPPING keys that have not been handed out yet. A key is popped
# when a new character is assigned and the pool is refilled once it runs empty.
available_male_voices = ["male_1", "male_2", "male_3"]
available_female_voices = ["female_1", "female_2", "female_3"]

def get_unique_character_voice(character, gender):
    """Return the ElevenLabs voice ID assigned to ``character``.

    The character named ``"narrator"`` always gets the dedicated narrator voice.
    Any other character is given a randomly chosen voice from the pool matching
    ``gender`` the first time it is seen; the choice is remembered in
    ``character_voice_mapping`` and returned on every later call.

    Args:
        character: Character name used as the cache key.
        gender: Gender label. ``"male"`` (ignoring case and surrounding
            whitespace) selects the male pool; every other value, including
            ``None`` or a non-string, selects the female pool.

    Returns:
        The voice ID string from ``VOICE_MAPPING``.
    """
    if character == "narrator":
        return VOICE_MAPPING["narrator"]

    if character in character_voice_mapping:
        return character_voice_mapping[character]

    # The label may be missing or padded (it is read from a "predicted_gender"
    # field by the caller), so normalise it instead of calling .lower() on
    # whatever arrives - None used to raise AttributeError here.
    normalized_gender = gender.strip().lower() if isinstance(gender, str) else ""

    if normalized_gender == "male":
        pool, all_voices = available_male_voices, ("male_1", "male_2", "male_3")
    else:
        pool, all_voices = available_female_voices, ("female_1", "female_2", "female_3")

    # Once every voice of this gender is taken, start handing them out again.
    # Refilling in place keeps the module-level list object intact.
    if not pool:
        pool.extend(all_voices)

    selected_voice = pool.pop(random.randrange(len(pool)))

    character_voice_mapping[character] = VOICE_MAPPING[selected_voice]
    return character_voice_mapping[character]

In [12]:
def load_json(file_path):
    """Read the UTF-8 encoded file at ``file_path`` and return its decoded JSON content."""
    with open(file_path, mode="r", encoding="utf-8") as handle:
        parsed = json.loads(handle.read())
    return parsed


In [13]:
def generate_speech(character, gender, emotion, dialogue, output_dir, index):
    """Synthesize one line of dialogue and store it as an MP3 file.

    Args:
        character: Speaker name; selects the voice and is part of the file name.
        gender: Gender label forwarded to the voice selection.
        emotion: Emotion label; picks the voice-settings preset and is part of
            the file name. Labels without a preset use the "neutral" one.
        dialogue: Text to be spoken.
        output_dir: Directory the MP3 is written into.
        index: Position of the line in the story, used to keep file names unique.

    Returns:
        Path of the written file, ``<output_dir>/<character><index>_<emotion>.mp3``.
    """
    speaker_voice = get_unique_character_voice(character=character, gender=gender)
    preset = EMOTION_SETTINGS.get(emotion, EMOTION_SETTINGS["neutral"])

    # convert() hands the audio back as an iterable of byte chunks.
    audio_chunks = api.text_to_speech.convert(
        voice_id=speaker_voice,
        text=dialogue,
        voice_settings=preset,
    )

    target_path = os.path.join(output_dir, f"{character}{index}_{emotion}.mp3")
    with open(target_path, "wb") as sink:
        # Stream the chunks to disk in the order they arrive.
        sink.writelines(audio_chunks)

    return target_path


In [14]:
! python -m pip install ffmpeg whisperx



In [15]:
! pip install Whisper



In [16]:
import librosa
import soundfile as sf
import numpy as np
import whisper
import spacy

def detect_important_words(text):
    """Flag which tokens of ``text`` are nouns, verbs or adjectives.

    Args:
        text: Sentence (or any string) to analyse with spaCy's
            ``en_core_web_sm`` pipeline.

    Returns:
        dict mapping each token's text to ``True`` when its part-of-speech tag
        is NOUN, VERB or ADJ and ``False`` otherwise. When a token text occurs
        more than once, the flag of its last occurrence is the one kept.
    """
    # Loading the spaCy pipeline is expensive and this function is called once
    # per dialogue line, so load it on first use and keep it on the function.
    nlp = getattr(detect_important_words, "_nlp", None)
    if nlp is None:
        nlp = spacy.load("en_core_web_sm")
        detect_important_words._nlp = nlp

    doc = nlp(text)
    return {token.text: token.pos_ in {"NOUN", "VERB", "ADJ"} for token in doc}

# Emotion label -> (lowest, highest) gain multiplier used for that emotion.
emotion_modulation_ranges = dict(
    neutral=(1.0, 1.0),
    happy=(1.1, 1.3),
    angry=(1.3, 1.7),
    sad=(0.7, 0.9),
    fearful=(1.2, 1.4),
    disgusted=(1.1, 1.3),
    surprised=(1.3, 1.5),
)

def compute_modulation_factor(emotion, is_important, sentence_weight):
    """Return the gain multiplier for one word.

    The emotion's (low, high) range is interpolated linearly by
    ``sentence_weight``; emotions without an entry use a flat (1.0, 1.0) range.
    Words marked important receive an extra 10% on top.
    """
    low, high = emotion_modulation_ranges.get(emotion, (1.0, 1.0))
    span = high - low
    base_gain = low + span * sentence_weight
    emphasis = 1.1 if is_important else 1.0
    return base_gain * emphasis

def align_audio_with_text(audio_path):
    """Use OpenAI Whisper to generate word timestamps.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        list of dicts with the keys ``"text"`` (the recognised word with
        surrounding whitespace removed), ``"start"`` and ``"end"`` (taken from
        Whisper's word entry, defaulting to 0 when absent). Segments without
        word-level data are reported with a warning and contribute nothing.
    """
    # Loading the model is by far the slowest step and this function runs once
    # per dialogue line, so load it on first use and reuse it afterwards.
    model = getattr(align_audio_with_text, "_model", None)
    if model is None:
        model = whisper.load_model("base")  # Try "tiny" if RAM is an issue
        align_audio_with_text._model = model

    result = model.transcribe(audio_path, word_timestamps=True)

    word_timestamps = []
    for segment in result.get("segments", []):  # Use .get() to avoid KeyError
        if "words" in segment:
            for word_data in segment["words"]:
                # Whisper stores the token under "word" (usually with a leading
                # space). Reading "text" always produced an empty string, so
                # fall back to it only if "word" is absent.
                spoken = word_data.get("word", word_data.get("text", ""))
                word_timestamps.append({
                    "text": spoken.strip(),
                    "start": word_data.get("start", 0),
                    "end": word_data.get("end", 0)
                })
        else:
            print(f"Warning: No word-level timestamps in segment: {segment}")

    return word_timestamps


def fine_tune_audio(input_file, output_file, text, emotion):
    """Apply dynamic modulation to emphasize important words.

    The audio is loaded at its native sample rate, each word's time span (from
    ``align_audio_with_text``) is multiplied by the gain returned by
    ``compute_modulation_factor``, and the result is written to ``output_file``.

    Args:
        input_file: Path of the audio file to process.
        output_file: Path the modulated audio is written to.
        text: The spoken text; used to decide which words are important and,
            through its word count (capped at 10 words -> weight 1.0), how
            strongly the emotion's gain range is applied.
        emotion: Emotion label selecting the gain range.

    Returns:
        ``output_file``.
    """
    import string  # local import: only needed to trim punctuation off words

    y, sr = librosa.load(input_file, sr=None)
    word_timestamps = align_audio_with_text(input_file)

    # detect_important_words maps *every* token to a bool, so plain key
    # membership is true for any word of the text. Keep only the tokens that
    # are actually flagged, lower-cased for a case-insensitive comparison.
    important_words = {
        token.lower()
        for token, flagged in detect_important_words(text).items()
        if flagged
    }
    sentence_weight = min(len(text.split()) / 10, 1.0)
    new_audio = np.copy(y)

    for word_data in word_timestamps:
        # Aligned words can carry whitespace and attached punctuation
        # ("Hello," vs. the token "Hello"); normalise before the lookup.
        word = word_data["text"].strip().strip(string.punctuation).lower()
        start_time, end_time = word_data["start"], word_data["end"]
        start_sample, end_sample = int(start_time * sr), int(end_time * sr)

        modulation_factor = compute_modulation_factor(emotion, word in important_words, sentence_weight)
        new_audio[start_sample:end_sample] *= modulation_factor

    # Gains above 1.0 can push samples outside the [-1, 1] float range; clamp
    # them explicitly rather than relying on whatever the encoder does with
    # out-of-range values.
    np.clip(new_audio, -1.0, 1.0, out=new_audio)

    sf.write(output_file, new_audio, sr)
    return output_file


In [23]:
def process_story_with_finetuning(json_file, output_dir="audio_output"):
    """Turn every entry of a story JSON file into a fine-tuned audio clip.

    Each entry must provide the keys ``name``, ``predicted_gender``,
    ``emotion`` and ``dialogue``. For every entry the raw speech is generated
    first and then post-processed into ``tuned_<name><index>_<emotion>.mp3``
    inside ``output_dir`` (created when missing).

    Returns:
        list of the fine-tuned file paths, in story order.
    """
    os.makedirs(output_dir, exist_ok=True)

    tuned_paths = []
    for position, entry in enumerate(load_json(json_file)):
        speaker = entry["name"]
        gender = entry["predicted_gender"]
        mood = entry["emotion"]
        line = entry["dialogue"]

        # Step 1: synthesize the raw clip for this line.
        raw_clip = generate_speech(speaker, gender, mood, line, output_dir, index=position)

        # Step 2: write the emphasis-adjusted version next to it.
        tuned_clip = os.path.join(output_dir, f"tuned_{speaker}{position}_{mood}.mp3")
        fine_tune_audio(raw_clip, tuned_clip, line, mood)

        tuned_paths.append(tuned_clip)

    return tuned_paths



In [18]:
from pydub import AudioSegment
import os

def get_dynamic_pause(audio_length_ms):
    """Pick the silence (in ms) to insert after a clip of the given length.

    Clips under 2 s get 400 ms, clips under 4 s get 700 ms and anything
    longer gets 1000 ms.
    """
    # (exclusive upper bound of the clip length, pause) pairs, shortest first.
    tiers = ((2000, 400), (4000, 700))
    for upper_bound_ms, pause_ms in tiers:
        if audio_length_ms < upper_bound_ms:
            return pause_ms
    return 1000

def merge_audio(audio_files, output_file="final_story.mp3"):
    """Concatenate MP3 clips into one file with a pause after each clip.

    Files that do not exist are skipped with a warning. The pause after each
    clip comes from ``get_dynamic_pause``; the last clip that is actually
    merged gets an extra 500 ms so the story does not end abruptly.

    Args:
        audio_files: Iterable of MP3 file paths, in playback order.
        output_file: Path of the merged MP3 to write.
    """
    # Filter first so "last clip" means the last clip that is really merged.
    # Keying the closing pause to the last index of the input list dropped it
    # whenever the final listed file was missing.
    existing_files = []
    for file in audio_files:
        if os.path.exists(file):
            existing_files.append(file)
        else:
            print(f"Warning: Skipping missing file {file}")

    final_audio = AudioSegment.empty()
    last_index = len(existing_files) - 1

    for i, file in enumerate(existing_files):
        audio = AudioSegment.from_file(file, format="mp3")
        pause_duration = get_dynamic_pause(len(audio))

        # Increase pause at the end for a natural ending
        if i == last_index:
            pause_duration += 500

        final_audio += audio + AudioSegment.silent(duration=pause_duration)

    final_audio.export(output_file, format="mp3")
    print(f"Final story audio saved as {output_file}")


In [19]:
! pip install --upgrade pip
! pip install openai-whisper librosa soundfile spacy
! python -m spacy download en_core_web_sm




Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


CONNECTION TO EMOTION DETECTION

In [20]:
! pip install pyngrok



In [None]:
import torch
import json
import os
import flask
from flask import Flask, request, jsonify, send_file
from pyngrok import ngrok

# Initialize the Flask app that serves the /process_story endpoint defined below.
app = Flask(__name__)

@app.route('/process_story', methods=['POST'])
def process_story():
    """Handle a story upload and answer with the narrated MP3.

    Expects a multipart POST whose ``file`` part holds the story JSON. The
    upload is stored as ``audio_output/story.json`` under the current working
    directory, run through ``process_story_with_finetuning`` and
    ``merge_audio``, and the merged ``final_story.mp3`` is sent back as an
    attachment. Requests without a ``file`` part get a 400 JSON error.
    """
    # Guard clause: nothing to do without an uploaded file.
    if 'file' not in request.files:
        return jsonify({"error": "No file uploaded"}), 400
    uploaded_story = request.files['file']

    work_dir = os.path.join(os.getcwd(), "audio_output")
    os.makedirs(work_dir, exist_ok=True)

    # Persist the upload so the pipeline can read it from disk.
    story_path = os.path.join(work_dir, "story.json")
    uploaded_story.save(story_path)

    # One fine-tuned clip per story entry, in story order.
    clip_paths = process_story_with_finetuning(story_path, work_dir)

    # Stitch the clips together into the final narration.
    merged_path = os.path.join(work_dir, "final_story.mp3")
    merge_audio(clip_paths, merged_path)

    return send_file(merged_path, as_attachment=True, mimetype='audio/mp3')

# Start ngrok and Flask.
# The tunnel is opened (and its public URL printed) first, pointing at the same
# local port the Flask server is about to listen on.
public_url = ngrok.connect(5000).public_url
print(f"Public URL: {public_url}")

# NOTE(review): app.run presumably blocks this cell until the server is
# stopped, so anything placed after it will not run while serving - confirm.
app.run(port=5000)

Public URL: https://d7ad-34-81-106-9.ngrok-free.app
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [23/Mar/2025 08:26:04] "[33mGET / HTTP/1.1[0m" 404 -
100%|███████████████████████████████████████| 139M/139M [00:01<00:00, 98.3MiB/s]
INFO:werkzeug:127.0.0.1 - - [23/Mar/2025 08:33:20] "POST /process_story HTTP/1.1" 200 -


Final story audio saved as /content/audio_output/final_story.mp3
