# Summary

* Implements Mandarin Tone 3 Sandhi (T3S), a process whose surface realization depends on the syntactic structure of the expression.

# Background

* Mandarin is a tonal language with four distinctive tones: T1 (high), T2 (low-high), T3 (low), and T4 (high-low).

* T3S is a phonological process by which a T3 changes into a T2 when it is immediately followed by another T3 (e.g. 你好 /ni3 hao3/ is pronounced [ni2 hao3]).

* The pattern of realization of T3S depends on the syntactic structure of the expression.

In [None]:
# Notebook setup: upgrade spaCy and download the zh_core_web_lg Chinese
# pipeline. Both lines are shell commands run through the notebook's `!`.
!pip install -U spacy
!python -m spacy download zh_core_web_lg # https://spacy.io/models/zh

import zh_core_web_lg
# Load the pipeline once at module level; the cells below and T3S() call
# this `nlp` object to tokenize and parse text.
nlp = zh_core_web_lg.load()

In [None]:
from spacy import displacy

In [None]:
# Gloss: 'two-hundred bowls of water'
# Underlying tones: 3333; expected surface tones: 2223
sent = nlp('两百碗水')

# One line per token: text, part of speech, dependency label, ancestors.
for token in sent:
    print(f"{token.text} {token.pos_} {token.dep_} {list(token.ancestors)}")

# Draw the dependency parse inline in the notebook.
displacy.render(sent, style='dep', jupyter=True)

两百 NUM nummod [水]
碗 NUM mark:clf [两百, 水]
水 NOUN ROOT []


In [None]:
# Gloss: 'want to buy good wine'
# Underlying tones: 3333; expected surface tones: 2323
sent = nlp('想买好酒')

# One line per token: text, part of speech, dependency label, ancestors.
for token in sent:
    print(f"{token.text} {token.pos_} {token.dep_} {list(token.ancestors)}")

# Draw the dependency parse inline in the notebook.
displacy.render(sent, style='dep', jupyter=True)

想 VERB ROOT []
买 VERB ccomp [想]
好 ADJ amod [酒, 买, 想]
酒 NOUN dobj [买, 想]


In [None]:
# Gloss: 'Old Li wants to leave a bit earlier'
# Underlying tones: 333333; expected surface tones: 223223
sent = nlp('老李想早点走')

# One line per token: text, part of speech, dependency label, ancestors.
for token in sent:
    print(f"{token.text} {token.pos_} {token.dep_} {list(token.ancestors)}")

# Draw the dependency parse inline in the notebook.
displacy.render(sent, style='dep', jupyter=True)

老李 PROPN nsubj [想]
想 VERB ROOT []
早点 ADV advmod [走, 想]
走 VERB ccomp [想]


# Implementation

In [None]:
def T3S(text, tone_list):
    """Apply Mandarin Tone 3 Sandhi (T3S) to ``tone_list`` and print it.

    ``text`` is parsed with the module-level spaCy pipeline ``nlp``. A '3'
    is rewritten to '2' whenever the tone right after it is also '3', in
    three successive passes:

    1. between adjacent characters inside each token;
    2. across the boundary between a token and any of its syntactic
       ancestors that is its immediate neighbour in the sentence;
    3. between any adjacent '3's that are still left in the list.

    Args:
        text: The expression to analyse.
        tone_list: List of tone strings (e.g. '3'), one per character of
            ``text`` and in the same order. It is modified in place.

    Returns:
        None. The resulting tones are printed and left in ``tone_list``.

    Raises:
        ValueError: If ``tone_list`` does not contain exactly one entry per
            character of ``text``.
    """
    # token.idx is a character offset into the text and is used below
    # directly as an index into tone_list, so the two must line up
    # one-to-one. Checking up front replaces an obscure IndexError (or a
    # silently misaligned result) with a clear message.
    if len(tone_list) != len(text):
        raise ValueError(
            'tone_list has {} entries but text has {} characters; '
            'exactly one tone per character is required'.format(
                len(tone_list), len(text)
            )
        )

    sent = nlp(text)

    # Pass 1: sandhi between adjacent characters inside a single token.
    for token in sent:
        first = token.idx  # character offset of the token's first character
        for pos in range(first, first + len(token) - 1):
            if tone_list[pos] == tone_list[pos + 1] == '3':
                tone_list[pos] = '2'

    # Pass 2: sandhi across the boundary between a token and one of its
    # ancestors when the two are direct neighbours (token.i is the token's
    # position in the sentence).
    for token in sent:
        for token_anc in token.ancestors:
            if token_anc.i == token.i + 1:
                right = token_anc  # the ancestor immediately follows token
            elif token.i == token_anc.i + 1:
                right = token  # token immediately follows the ancestor
            else:
                continue
            # Compare the first character of the right-hand token with the
            # character just before it.
            boundary = right.idx
            if tone_list[boundary] == tone_list[boundary - 1] == '3':
                tone_list[boundary - 1] = '2'

    # Pass 3: sandhi for any adjacent T3s that remain.
    for pos in range(len(tone_list) - 1):
        if tone_list[pos] == tone_list[pos + 1] == '3':
            tone_list[pos] = '2'

    print(tone_list)

# Results

In [None]:
! pip install pinyin # https://pypi.org/project/pinyin/
import pinyin

In [None]:
# Gloss: 'I love the Chinese language'
# Underlying tones: 3443; expected surface tones: 3443
text = '我爱汉语'
py = pinyin.get(text, format='numerical')
# Keep only the numeric characters of the pinyin string as the tone list.
tone_list = list(filter(str.isnumeric, py))
T3S(text, tone_list)

['3', '4', '4', '3']


In [None]:
# Gloss: 'two-hundred bowls of water'
# Underlying tones: 3333; expected surface tones: 2223
text = '两百碗水'
py = pinyin.get(text, format='numerical')
# Keep only the numeric characters of the pinyin string as the tone list.
tone_list = list(filter(str.isnumeric, py))
T3S(text, tone_list)

['2', '2', '2', '3']


In [None]:
# Gloss: 'want to buy good wine'
# Underlying tones: 3333; expected surface tones: 2323
text = '想买好酒'
py = pinyin.get(text, format='numerical')
# Keep only the numeric characters of the pinyin string as the tone list.
tone_list = list(filter(str.isnumeric, py))
T3S(text, tone_list)

['2', '3', '2', '3']


In [None]:
# Gloss: 'Old Li wants to leave a bit earlier'
# Underlying tones: 333333; expected surface tones: 223223
text = '老李想早点走'
py = pinyin.get(text, format='numerical')
# Keep only the numeric characters of the pinyin string as the tone list.
tone_list = list(filter(str.isnumeric, py))
T3S(text, tone_list)

['2', '2', '3', '2', '2', '3']
