# Neural Speech Generation for a Shakespeare Play

In [3]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before code runs and edits to local .py files take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

## Set Up Environment

In [2]:
from dotenv import load_dotenv

# Load environment variables from a .env file. The speech cells further down
# read SPEECH_KEY and SPEECH_REGION via os.getenv -- presumably defined there.
load_dotenv()

True

## Load Data

In [15]:
import json

# Structured transcript of the scene (characters, entrances and dialogues).
fname = "../data/The-Comedy-of-Errors-ACT-I-Scene-I-output.json"

# Decode explicitly as UTF-8 (the JSON interchange encoding): the text contains
# typographic apostrophes (’) that would be mis-decoded on platforms whose
# default text encoding is not UTF-8.
with open(fname, 'r', encoding='utf-8') as f:
    json_output = json.load(f)

json_output

{'Act 1': {'Scene 1': {'Characters': ['Solinus',
    'Egeon',
    'Duke',
    'Jailer',
    'Attendants'],
   'Enter': ['Solinus', 'Egeon', 'Jailer', 'Attendants'],
   'Dialogues': [{'Speaker': 'Egeon',
     'Text': 'Proceed, Solinus, to procure my fall, And by the doom of death end woes and all.'},
    {'Speaker': 'Duke',
     'Text': 'Merchant of Syracusa, plead no more. I am not partial to infringe our laws. The enmity and discord which of late Sprung from the rancorous outrage of your duke To merchants, our well-dealing countrymen, Who, wanting guilders to redeem their lives, Have sealed his rigorous statutes with their bloods, Excludes all pity from our threat’ning looks. For since the mortal and intestine jars ’Twixt thy seditious countrymen and us, It hath in solemn synods been decreed, Both by the Syracusians and ourselves, To admit no traffic to our adverse towns. Nay, more, if any born at Ephesus Be seen at Syracusian marts and fairs; Again, if any Syracusian born Come to the

In [28]:
import pandas as pd

# Each entry of the scene's "Dialogues" list is a {"Speaker": ..., "Text": ...}
# record, so the list maps directly onto a two-column frame (one row per speech).
dialogues = json_output["Act 1"]["Scene 1"]["Dialogues"]
df = pd.DataFrame(dialogues)
df

Unnamed: 0,Speaker,Text
0,Egeon,"Proceed, Solinus, to procure my fall, And by t..."
1,Duke,"Merchant of Syracusa, plead no more. I am not ..."
2,Egeon,"Yet this my comfort: when your words are done,..."
3,Duke,"Well, Syracusian, say in brief the cause Why t..."
4,Egeon,A heavier task could not have been imposed Tha...
5,Duke,"Nay, forward, old man. Do not break off so, Fo..."
6,Egeon,"O, had the gods done so, I had not now Worthil..."
7,Duke,"And for the sake of them thou sorrowest for, D..."
8,Egeon,"My youngest boy, and yet my eldest care, At ei..."
9,Duke,"Hapless Egeon, whom the fates have marked To b..."


In [29]:
# Distinct speakers in the scene; each one needs a voice in speech_synthesis_to_mp3_file.
df['Speaker'].unique()

array(['Egeon', 'Duke', 'Jailer'], dtype=object)

## Generate Neural Speech

In [None]:
import os
import azure.cognitiveservices.speech as speechsdk

# Smoke test for the Speech credentials: synthesise one short sentence.
# SPEECH_KEY and SPEECH_REGION are read from the environment.
speech_config = speechsdk.SpeechConfig(subscription=os.getenv('SPEECH_KEY'), region=os.getenv('SPEECH_REGION'))
# No audio_config is given, so the SDK's default audio output is used.
speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config)

# Text to synthesise
text = "Hey, how are you?"

# Synthesise speech
result = speech_synthesizer.speak_text_async(text).get()

# Report a failed synthesis explicitly; without this check a wrong key or
# region would go unnoticed.
if result.reason == speechsdk.ResultReason.Canceled:
    cancellation_details = result.cancellation_details
    print("Speech synthesis canceled: {}".format(cancellation_details.reason))
    if cancellation_details.reason == speechsdk.CancellationReason.Error:
        print("Error details: {}".format(cancellation_details.error_details))

In [38]:
import azure.cognitiveservices.speech as speechsdk

def speech_synthesis_to_mp3_file(speaker, text, fname):
    """Synthesize ``text`` in the voice assigned to ``speaker`` and save it as an MP3 file.

    Args:
        speaker: Character name; one of "Egeon", "Duke" or "Jailer".
        text: The text to synthesize.
        fname: Path of the MP3 file to write.

    Returns:
        None. The outcome (success, or cancellation with its error details)
        is reported with ``print``.

    Raises:
        ValueError: If no voice is assigned to ``speaker``.
    """
    # Imported locally so this cell does not rely on another cell having
    # imported ``os`` into the notebook namespace.
    import os

    # Resolve the voice first, so an unknown speaker fails with a clear message
    # before any SDK object or output file configuration is created.
    voice_by_speaker = {
        "Egeon": "en-GB-EthanNeural",
        "Duke": "en-GB-ElliotNeural",
        "Jailer": "en-GB-NoahNeural",
    }
    voice_name = voice_by_speaker.get(speaker)
    if voice_name is None:
        raise ValueError(
            "No voice assigned to speaker {!r}; expected one of {}".format(
                speaker, sorted(voice_by_speaker)
            )
        )

    # Creates an instance of a speech config with specified subscription key and service region.
    speech_config = speechsdk.SpeechConfig(subscription=os.getenv('SPEECH_KEY'), region=os.getenv('SPEECH_REGION'))

    # Sets the synthesis output format.
    # The full list of supported format can be found here:
    # https://docs.microsoft.com/azure/cognitive-services/speech-service/rest-text-to-speech#audio-outputs
    speech_config.set_speech_synthesis_output_format(speechsdk.SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3)

    # Sets the synthesis voice name.
    speech_config.speech_synthesis_voice_name = voice_name

    # Creates a speech synthesizer using the target file as audio output.
    file_config = speechsdk.audio.AudioOutputConfig(filename=fname)
    speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=file_config)

    # Block until the asynchronous synthesis call has finished.
    result = speech_synthesizer.speak_text_async(text).get()

    # Check result
    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        print("Speech synthesized for text [{}], and the audio was saved to [{}]".format(text, fname))
    elif result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = result.cancellation_details
        print("Speech synthesis canceled: {}".format(cancellation_details.reason))
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            print("Error details: {}".format(cancellation_details.error_details))

In [39]:
import os

# Create the output directory up front so a fresh run does not depend on it
# having been created by hand.
out_dir = "../output/audio/"
os.makedirs(out_dir, exist_ok=True)

# One MP3 per speech, named "<row index>-<speaker>-audio.mp3".
for index, row in df.iterrows():
    speaker = row["Speaker"]
    text = row["Text"]
    fname_out = out_dir + str(index) + "-" + str(speaker) + "-" + "audio.mp3"
    print(fname_out)
    speech_synthesis_to_mp3_file(speaker=speaker, text=text, fname=fname_out)

../output/audio/0-Egeon-audio.mp3
Speech synthesized for text [Proceed, Solinus, to procure my fall, And by the doom of death end woes and all.], and the audio was saved to [11-Egeon-audio.mp3]
../output/audio/1-Duke-audio.mp3
Speech synthesized for text [Merchant of Syracusa, plead no more. I am not partial to infringe our laws. The enmity and discord which of late Sprung from the rancorous outrage of your duke To merchants, our well-dealing countrymen, Who, wanting guilders to redeem their lives, Have sealed his rigorous statutes with their bloods, Excludes all pity from our threat’ning looks. For since the mortal and intestine jars ’Twixt thy seditious countrymen and us, It hath in solemn synods been decreed, Both by the Syracusians and ourselves, To admit no traffic to our adverse towns. Nay, more, if any born at Ephesus Be seen at Syracusian marts and fairs; Again, if any Syracusian born Come to the bay of Ephesus, he dies, His goods confiscate to the Duke’s dispose, Unless a thou