# Summary

* Implemented a dependency-parsing based algorithm to simulate the Tone 3 Sandhi (T3S) process in Mandarin Chinese, which yields different patterns of realization depending on the syntactic structure of the expression. 

* Produced more natural output than Google Text-to-Speech (gTTS).

# Background

* Mandarin is a tonal language with four distinctive tones: T1 (high), T2 (low-high), T3 (low), and T4 (high-low).

* T3S is a phonological process by which a T3 is changed into a T2 when it is immediately followed by another T3.

* The patterns of realization of T3S depend on the syntactic structure of the expression.

In [None]:
!pip install -U spacy
!python -m spacy download zh_core_web_lg # https://spacy.io/models/zh

import zh_core_web_lg
nlp = zh_core_web_lg.load()

In [None]:
from spacy import displacy

In [None]:
# Gloss: 'two-hundred bowls of wine'
# Underlying tones: 3333; expected surface tones: 2223 (other patterns are unacceptable)
sent = nlp('两百碗酒')
# For each token, show its text, POS tag, dependency label, and its ancestors in the parse
for token in sent:
    print(token.text, token.pos_, token.dep_, list(token.ancestors))

# Draw the dependency parse as a diagram in the notebook
displacy.render(sent, style='dep', jupyter=True)

两百 NUM nummod [酒]
碗 NUM mark:clf [两百, 酒]
酒 NOUN ROOT []


In [None]:
# Gloss: 'want to buy good wine'
# Underlying tones: 3333; expected surface tones: 2323 or 2223 (among other acceptable patterns)
sent = nlp('想买好酒')
# For each token, show its text, POS tag, dependency label, and its ancestors in the parse
for token in sent:
    print(token.text, token.pos_, token.dep_, list(token.ancestors))

# Draw the dependency parse as a diagram in the notebook
displacy.render(sent, style='dep', jupyter=True)

想 VERB ROOT []
买 VERB ccomp [想]
好 ADJ amod [酒, 买, 想]
酒 NOUN dobj [买, 想]


In [None]:
# Gloss: 'I want to buy two bowls of wine'
# Underlying tones: 333333; expected surface tones: 223223 or 222223 (among other acceptable patterns)
sent = nlp('我想买两碗酒')
# For each token, show its text, POS tag, dependency label, and its ancestors in the parse
for token in sent:
    print(token.text, token.pos_, token.dep_, list(token.ancestors))

# Draw the dependency parse as a diagram in the notebook
displacy.render(sent, style='dep', jupyter=True)

我 PRON nsubj [想]
想 VERB ROOT []
买 VERB ccomp [想]
两 NUM nummod [酒, 买, 想]
碗 NUM mark:clf [两, 酒, 买, 想]
酒 NOUN dobj [买, 想]


In [None]:
# Gloss: 'I really want to buy good wine'
# Underlying tones: 333333; expected surface tones: 322323 or 222323 or 222223 (among other acceptable patterns)
sent = nlp('我很想买好酒')
# For each token, show its text, POS tag, dependency label, and its ancestors in the parse
for token in sent:
    print(token.text, token.pos_, token.dep_, list(token.ancestors))

# Draw the dependency parse as a diagram in the notebook
displacy.render(sent, style='dep', jupyter=True)

我 PRON nsubj [想]
很 ADV advmod [想]
想 VERB ROOT []
买 VERB ccomp [想]
好 ADJ amod [酒, 买, 想]
酒 NOUN dobj [买, 想]


In [None]:
# Gloss: 'Old Li really wants to buy two-hundred bowls of good wine'
# Underlying tones: 3333333333; expected surface tones: 2322322323 (among other acceptable patterns)
sent = nlp('老李很想买两百碗好酒')
# For each token, show its text, POS tag, dependency label, and its ancestors in the parse
for token in sent:
    print(token.text, token.pos_, token.dep_, list(token.ancestors))

# Draw the dependency parse as a diagram in the notebook
displacy.render(sent, style='dep', jupyter=True)

老李 PROPN nsubj [想]
很 ADV advmod [想]
想 VERB ROOT []
买 VERB ccomp [想]
两百 NUM nummod [酒, 买, 想]
碗 NUM mark:clf [两百, 酒, 买, 想]
好 ADJ amod [酒, 买, 想]
酒 NOUN dobj [买, 想]


# Implementation

In [None]:
def T3S(text, tone_list):
    """Apply Tone 3 Sandhi (T3S) to ``tone_list`` in place and print the result.

    A '3' that is immediately followed by another '3' is rewritten to '2'.
    The rule is applied in three passes, from the tightest domain outwards:

    1. between adjacent characters inside a single token;
    2. across the boundary of two adjacent tokens when one of them is an
       ancestor of the other in the dependency parse of ``text``;
    3. left-to-right over any adjacent '3' pairs that are still left.

    Args:
        text: The expression to parse with the module-level ``nlp`` pipeline.
        tone_list: Mutable list of tone strings (e.g. ``'3'``), one entry per
            character of ``text``, in order. It is modified in place.

    Returns:
        None. The resulting tone list is printed.

    Raises:
        ValueError: If ``tone_list`` does not have exactly one entry per
            character of the parsed text.
    """
    sent = nlp(text)

    # Token.idx is a character offset into the parsed text, and every lookup
    # below uses it as an index into tone_list. A character without a tone
    # entry (punctuation, whitespace, ...) would shift all later positions, so
    # reject misaligned input instead of failing with an IndexError or
    # silently changing the wrong syllable.
    if len(tone_list) != len(sent.text):
        raise ValueError(
            'tone_list must contain exactly one tone per character of text: '
            'got %d tones for %d characters' % (len(tone_list), len(sent.text))
        )

    # Pass 1: apply T3S within tokens
    for token in sent:
        first = token.idx
        for pos in range(first, first + len(token) - 1):
            if tone_list[pos] == tone_list[pos + 1] == '3':
                tone_list[pos] = '2'

    # Pass 2: apply T3S to structurally adjacent T3s, with the possibility of
    # acceptable over-application
    for token in sent:
        for token_anc in token.ancestors:
            if token_anc.i == token.i + 1:
                # token_anc immediately follows token
                boundary = token_anc.idx
            elif token.i == token_anc.i + 1:
                # token immediately follows token_anc
                boundary = token.idx
            else:
                continue
            # boundary is the first character of the later token;
            # boundary - 1 is the last character of the earlier one.
            if tone_list[boundary] == tone_list[boundary - 1] == '3':
                tone_list[boundary - 1] = '2'

    # Pass 3: apply T3S to remaining adjacent T3s, left-to-right
    for pos in range(len(tone_list) - 1):
        if tone_list[pos] == tone_list[pos + 1] == '3':
            tone_list[pos] = '2'

    print(tone_list)

# Results

In [None]:
! pip install pinyin # https://pypi.org/project/pinyin/
import pinyin

In [None]:
# Gloss: 'two-hundred bowls of wine'
# Underlying tones: 3333; expected surface tones: 2223 (other patterns unacceptable)
text = '两百碗酒'
# Get the pinyin in 'numerical' format, then keep only its numeric characters as the tone sequence
py = pinyin.get(text, format = 'numerical')
tone_list = [s for s in py if s.isnumeric()]
# T3S modifies tone_list in place and prints the resulting tones
T3S(text, tone_list)

['2', '2', '2', '3']


In [None]:
# Gloss: 'want to buy good wine'
# Underlying tones: 3333; expected surface tones: 2323 or 2223 (among other acceptable patterns)
text = '想买好酒'
# Get the pinyin in 'numerical' format, then keep only its numeric characters as the tone sequence
py = pinyin.get(text, format = 'numerical')
tone_list = [s for s in py if s.isnumeric()]
# T3S modifies tone_list in place and prints the resulting tones
T3S(text, tone_list)

['2', '3', '2', '3']


In [None]:
# Gloss: 'I want to buy two bowls of wine'
# Underlying tones: 333333; expected surface tones: 223223 or 222223 (among other acceptable patterns)
text = '我想买两碗酒'
# Get the pinyin in 'numerical' format, then keep only its numeric characters as the tone sequence
py = pinyin.get(text, format = 'numerical')
tone_list = [s for s in py if s.isnumeric()]
# T3S modifies tone_list in place and prints the resulting tones
T3S(text, tone_list)

['2', '2', '2', '2', '2', '3']


In [None]:
# Gloss: 'I really want to buy good wine'
# Underlying tones: 333333; expected surface tones: 322323 or 222323 or 222223 (among other acceptable patterns)
text = '我很想买好酒'
# Get the pinyin in 'numerical' format, then keep only its numeric characters as the tone sequence
py = pinyin.get(text, format = 'numerical')
tone_list = [s for s in py if s.isnumeric()]
# T3S modifies tone_list in place and prints the resulting tones
T3S(text, tone_list)

['3', '2', '2', '3', '2', '3']


In [None]:
# Gloss: 'Old Li really wants to buy two-hundred bowls of good wine'
# Underlying tones: 3333333333; expected surface tones: 2322322323 (among other acceptable patterns)
text = '老李很想买两百碗好酒'
# Get the pinyin in 'numerical' format, then keep only its numeric characters as the tone sequence
py = pinyin.get(text, format = 'numerical')
tone_list = [s for s in py if s.isnumeric()]
# T3S modifies tone_list in place and prints the resulting tones
T3S(text, tone_list)

['2', '3', '2', '2', '3', '2', '2', '3', '2', '3']


# gTTS

* Compared with the current implementation, gTTS features more instances of acceptable over-application of T3S.
* The current implementation of T3S can help make the output more natural. 

In [None]:
!pip install gTTS 
import gtts
import IPython

In [None]:
# Gloss: 'two-hundred bowls of wine'
tts = gtts.gTTS('两百碗酒', lang='zh-cn')
# Save the synthesized speech as an MP3 file, then embed an audio player for it
tts.save('两百碗酒.mp3')
# gTTS output tones: 2223 (acceptable)
IPython.display.Audio('两百碗酒.mp3')

In [None]:
# Gloss: 'want to buy good wine'
tts = gtts.gTTS('想买好酒', lang='zh-cn')
# Save the synthesized speech as an MP3 file, then embed an audio player for it
tts.save('想买好酒.mp3')
# gTTS output tones: 2223 (acceptable); cf. 2323 (more natural)
IPython.display.Audio('想买好酒.mp3')

In [None]:
# Gloss: 'I want to buy two bowls of wine'
tts = gtts.gTTS('我想买两碗酒', lang='zh-cn')
# Save the synthesized speech as an MP3 file, then embed an audio player for it
tts.save('我想买两碗酒.mp3')
# gTTS output tones: 222223 (acceptable)
IPython.display.Audio('我想买两碗酒.mp3')

In [None]:
# Gloss: 'I really want to buy good wine'
tts = gtts.gTTS('我很想买好酒', lang='zh-cn')
# Save the synthesized speech as an MP3 file, then embed an audio player for it
tts.save('我很想买好酒.mp3')
# gTTS output tones: 222223 (acceptable); cf. 322323 (more natural)
IPython.display.Audio('我很想买好酒.mp3')

In [None]:
# Gloss: 'Old Li really wants to buy two-hundred bowls of good wine'
tts = gtts.gTTS('老李很想买两百碗好酒', lang='zh-cn')
# Save the synthesized speech as an MP3 file, then embed an audio player for it
tts.save('老李很想买两百碗好酒.mp3')
# gTTS output tones: 2222322223 (acceptable); cf. 2322322323 (more natural)
IPython.display.Audio('老李很想买两百碗好酒.mp3')