In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, Subset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm

import pandas as pd

In [148]:
df = pd.read_csv('./large-corpus/other.tsv', sep='\t')
df.head(1)

  df = pd.read_csv('./large-corpus/other.tsv', sep='\t')


Unnamed: 0,client_id,path,sentence_id,sentence,sentence_domain,up_votes,down_votes,age,gender,accents,variant,locale,segment
0,25bc975d06200b7b1c9135db090561cb0d9b28d172e51c...,common_voice_zh-CN_19703883.mp3,33bf426e7162122b062710c1aee44949efea3fef159bbb...,模式种采样自台湾龟山岛。,,1,0,thirties,female_feminine,出生地：31 上海市,,zh-CN,


In [149]:
df = df[['path', 'sentence', 'age', 'gender', 'accents']]

In [150]:
df

Unnamed: 0,path,sentence,age,gender,accents
0,common_voice_zh-CN_19703883.mp3,模式种采样自台湾龟山岛。,thirties,female_feminine,出生地：31 上海市
1,common_voice_zh-CN_19706151.mp3,后者娶天之瓮主神。,thirties,female_feminine,出生地：31 上海市
2,common_voice_zh-CN_19961025.mp3,贝尔卢。,twenties,male_masculine,出生地：32 江苏省
3,common_voice_zh-CN_20051292.mp3,日本与东帝汶的关系可追溯至第二次世界大战期间。,twenties,male_masculine,出生地：13 河北省
4,common_voice_zh-CN_20051309.mp3,单县各乡镇设有卫生院或医院。,twenties,male_masculine,出生地：13 河北省
...,...,...,...,...,...
600242,common_voice_zh-CN_41891247.mp3,炫家军来也！,thirties,,出生地：51 四川省
600243,common_voice_zh-CN_41891248.mp3,索伊贝尔斯多尔夫是德国巴伐利亚州的一个市镇。,thirties,,出生地：51 四川省
600244,common_voice_zh-CN_41891249.mp3,情报侦察：乌克兰军队通过加强情报侦察工作，及时获得敌方无人机的动向和部署情况，从而能够提前采...,thirties,,出生地：51 四川省
600245,common_voice_zh-CN_41907107.mp3,三,,,


In [9]:
from pypinyin import lazy_pinyin, Style
import re

In [63]:
new_df = df[['path', 'sentence', 'age', 'gender', 'accents']]

In [30]:
def breakdown_pinyin(phrase):
  clean_phrase = re.sub(r'[^\w]', '', phrase)
  initial = lazy_pinyin(clean_phrase, style=Style.INITIALS, strict=True)
  final = lazy_pinyin(clean_phrase, style=Style.FINALS, strict=True)
  tone = [ word[-1] for word in lazy_pinyin(clean_phrase, style=Style.FINALS_TONE3, strict=False, neutral_tone_with_five=True, tone_sandhi=True)]
  
  initial = [init if init != '' else "EMPTY" for init in initial ]

  return list(zip(initial, final, tone))

In [46]:
pd.Series([ (path, word) for entry in df.loc[:500, 'sentence'].apply(lambda x: breakdown_pinyin(x)) for path, word in zip(df.loc[:500, 'path'], entry)])

0            (common_voice_zh-CN_19703883.mp3, (m, o, 2))
1           (common_voice_zh-CN_19706151.mp3, (sh, i, 4))
2         (common_voice_zh-CN_19961025.mp3, (zh, ong, 3))
3           (common_voice_zh-CN_20051292.mp3, (c, ai, 3))
4          (common_voice_zh-CN_20051309.mp3, (y, ang, 4))
                              ...                        
7439      (common_voice_zh-CN_19703883.mp3, (sh, eng, 4))
7440        (common_voice_zh-CN_19706151.mp3, (l, ai, 2))
7441    (common_voice_zh-CN_19961025.mp3, (EMPTY, ao, 4))
7442         (common_voice_zh-CN_20051292.mp3, (n, a, 4))
7443    (common_voice_zh-CN_20051309.mp3, (EMPTY, er, 3))
Length: 7444, dtype: object

In [146]:
def breakdown_pinyin_v_to_u(phrase):
  clean_phrase = re.sub(r'[^\w]', '', phrase)
  initial = lazy_pinyin(clean_phrase, style=Style.INITIALS, strict=True)
  final = lazy_pinyin(clean_phrase, style=Style.FINALS, strict=True)
  tone = [ word[-1] for word in lazy_pinyin(clean_phrase, style=Style.FINALS_TONE3, strict=False, v_to_u=True, neutral_tone_with_five=True, tone_sandhi=True)]
  
  initial = [init if init != '' else "EMPTY" for init in initial ]

  return list(zip(initial, final, tone))

In [64]:
new_df['pinyin_breakdown'] = new_df['sentence'].apply(breakdown_pinyin)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['pinyin_breakdown'] = new_df['sentence'].apply(breakdown_pinyin)


In [147]:
myDF = new_df.copy()
myDF['pinyin_breakdown'] = myDF['sentence'].apply(breakdown_pinyin_v_to_u)

In [163]:
exploded = myDF.explode('pinyin_breakdown', ignore_index=True)
exploded[['initial', 'final', 'tone']] = pd.DataFrame(exploded['pinyin_breakdown'].tolist(), index=exploded.index)
# exploded.drop(columns=['sentence', 'pinyin_breakdown'], inplace=True)
exploded

Unnamed: 0,path,sentence,age,gender,accents,pinyin_breakdown,initial,final,tone
0,common_voice_zh-CN_19703883.mp3,模式种采样自台湾龟山岛。,thirties,female_feminine,出生地：31 上海市,"(m, o, 2)",m,o,2
1,common_voice_zh-CN_19703883.mp3,模式种采样自台湾龟山岛。,thirties,female_feminine,出生地：31 上海市,"(sh, i, 4)",sh,i,4
2,common_voice_zh-CN_19703883.mp3,模式种采样自台湾龟山岛。,thirties,female_feminine,出生地：31 上海市,"(zh, ong, 3)",zh,ong,3
3,common_voice_zh-CN_19703883.mp3,模式种采样自台湾龟山岛。,thirties,female_feminine,出生地：31 上海市,"(c, ai, 3)",c,ai,3
4,common_voice_zh-CN_19703883.mp3,模式种采样自台湾龟山岛。,thirties,female_feminine,出生地：31 上海市,"(EMPTY, iang, 4)",EMPTY,iang,4
...,...,...,...,...,...,...,...,...,...
8802803,common_voice_zh-CN_41907108.mp3,数据集包含真实、仿真和干净的录音。,,,,"(g, an, 1)",g,an,1
8802804,common_voice_zh-CN_41907108.mp3,数据集包含真实、仿真和干净的录音。,,,,"(j, ing, 4)",j,ing,4
8802805,common_voice_zh-CN_41907108.mp3,数据集包含真实、仿真和干净的录音。,,,,"(d, e, 5)",d,e,5
8802806,common_voice_zh-CN_41907108.mp3,数据集包含真实、仿真和干净的录音。,,,,"(l, u, 4)",l,u,4


In [186]:
valid_initials = ['EMPTY', 'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'j',
       'q', 'x', 'z', 'c', 's', 'zh', 'ch', 'sh', 'r']
valid_finals = ['i', 'a', 'ai', 'an', 'ang', 'ao', 'e', 'ei', 'en', 'eng', 'er', 'i', 'ia', 'ian', 'iang', 'iao', 'ie', 'in', 'ing', 'iong', 'iou', 'o', 'ong', 'ou', 'u', 'ua', 'uai', 'uan', 'uang', 'uei', 'uen', 'ueng', 'uo', 'v', 'van', 've', 'vn']

def breakdown_pinyin_v_to_u(phrase):
  clean_phrase = re.sub(r'[^\w]', '', phrase)
  initial = lazy_pinyin(clean_phrase, style=Style.INITIALS, strict=True)
  final = lazy_pinyin(clean_phrase, style=Style.FINALS, strict=True)
  tone = [ word[-1] for word in lazy_pinyin(clean_phrase, style=Style.FINALS_TONE3, strict=False, v_to_u=True, neutral_tone_with_five=True, tone_sandhi=True)]
  
  initial = [init if init != '' else "EMPTY" for init in initial ]

  return list(zip(initial, final, tone))

def is_valid_pinyin(pinyin_breakdown):
    for initial, final, tone in pinyin_breakdown:
        if initial not in valid_initials or final not in valid_finals:
            return False
    return True

In [None]:

  df = df[['path', 'sentence', 'age', 'gender', 'accents']]

  df['pinyin_breakdown'] = df['sentence'].apply(breakdown_pinyin_v_to_u)
  
  # filter out all non real initials and
  df = df[df['pinyin_breakdown'].apply(is_valid_pinyin)]

In [169]:
exploded.loc[exploded['initial'] == 'IanWallace']

Unnamed: 0,path,sentence,age,gender,accents,pinyin_breakdown,initial,final,tone
40045,common_voice_zh-CN_26053563.mp3,班轮笔记由鼓手Ian Wallace撰写。,twenties,male_masculine,出生地：37 山东省,"(IanWallace, IanWallace, e)",IanWallace,IanWallace,e


In [178]:
exploded['final'].value_counts()

final
i                                 1529172
e                                  865223
u                                  659344
ian                                361759
ong                                356640
a                                  325995
uei                                312894
ai                                 302749
an                                 289719
ing                                289115
uo                                 287096
v                                  257268
en                                 255355
eng                                248971
ang                                208464
ao                                 207957
ou                                 186126
iou                                175038
ie                                 170298
in                                 167540
ei                                 159012
ia                                 152339
iao                                144911
iang                        

In [156]:
exploded['initial'].value_counts()

initial
EMPTY                             1341483
d                                  871520
sh                                 720602
zh                                 587004
j                                  563002
x                                  506032
l                                  479921
g                                  453627
b                                  375771
h                                  374515
t                                  305160
z                                  272813
m                                  270349
q                                  259968
ch                                 247094
f                                  223178
s                                  207207
k                                  200013
n                                  171785
r                                  159482
c                                  119249
p                                   92882
A                                      15
ㄟＰＰ                       

# VALID PINYIN MAPPINGS

In [182]:
from utils.constants import DATA_DIR


pinyin_combos = pd.read_csv(DATA_DIR / "pinyin-combos.csv")

In [183]:
pinyin_combos.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,b,p,m,f,d,t,n,l,...,j,q,x,z,c,s,zh,ch,sh,r
0,i,,,,,,,,,,...,,,,zi,ci,si,zhi,chi,shi,ri
1,a,a,ba,pa,ma,fa,da,ta,na,la,...,,,,za,ca,sa,zha,cha,sha,
2,ai,ai,bai,pai,mai,,dai,tai,nai,lai,...,,,,zai,cai,sai,zhai,chai,shai,
3,an,an,ban,pan,man,fan,dan,tan,,lan,...,,,,zan,can,san,zhan,chan,shan,ran
4,ang,ang,bang,pang,mang,fang,dang,tang,nang,lang,...,,,,zang,cang,sang,zhang,chang,shang,rang


In [184]:
pinyin_combos = pinyin_combos.rename(columns={'Unnamed: 0': 'finals', 'Unnamed: 1': 'EMPTY'})
pinyin_combos.loc[:, 'finals'] = pinyin_combos.loc[:, 'finals'].apply(lambda x: x.replace('ü', 'v'))
pinyin_combos

Unnamed: 0,finals,EMPTY,b,p,m,f,d,t,n,l,...,j,q,x,z,c,s,zh,ch,sh,r
0,i,,,,,,,,,,...,,,,zi,ci,si,zhi,chi,shi,ri
1,a,a,ba,pa,ma,fa,da,ta,na,la,...,,,,za,ca,sa,zha,cha,sha,
2,ai,ai,bai,pai,mai,,dai,tai,nai,lai,...,,,,zai,cai,sai,zhai,chai,shai,
3,an,an,ban,pan,man,fan,dan,tan,,lan,...,,,,zan,can,san,zhan,chan,shan,ran
4,ang,ang,bang,pang,mang,fang,dang,tang,nang,lang,...,,,,zang,cang,sang,zhang,chang,shang,rang
5,ao,ao,bao,pao,mao,,dao,tao,nao,lao,...,,,,zao,cao,sao,zhao,chao,shao,rao
6,e,e,,,me,,de,te,ne,le,...,,,,ze,ce,se,zhe,che,she,re
7,ei,ei,bei,pei,mei,fei,dei,,nei,lei,...,,,,zei,,,zhei,,shei,
8,en,en,ben,pen,men,fen,,,nen,,...,,,,zen,cen,sen,zhen,chen,shen,ren
9,eng,eng,beng,peng,meng,feng,deng,teng,neng,leng,...,,,,zeng,ceng,seng,zheng,cheng,sheng,reng


In [174]:
pinyin_combos.columns[1:]

Index(['EMPTY', 'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'j',
       'q', 'x', 'z', 'c', 's', 'zh', 'ch', 'sh', 'r'],
      dtype='object')

In [185]:
print(list(pinyin_combos['finals']))

['i', 'a', 'ai', 'an', 'ang', 'ao', 'e', 'ei', 'en', 'eng', 'er', 'i', 'ia', 'ian', 'iang', 'iao', 'ie', 'in', 'ing', 'iong', 'iou', 'o', 'ong', 'ou', 'u', 'ua', 'uai', 'uan', 'uang', 'uei', 'uen', 'ueng', 'uo', 'v', 'van', 've', 'vn']


In [104]:
import json

In [181]:
def replace_umlauts(s):
    if isinstance(s, str):
        return s.replace('ü', 'v')
    return s

# Apply the function to the DataFrame
pinyin_combos = pinyin_combos.apply(replace_umlauts)

# Initialize an empty dictionary to store the mappings
mapping = {}
# Iterate through each row
for index, row in pinyin_combos.iterrows():
    final_value = row['finals']
    for col in pinyin_combos.columns[1:]:  # Skip the first column
        if not pd.isna(row[col]):
            if col not in mapping:
                mapping[col] = []
            mapping[col].append(final_value)

print(mapping)

with open(DATA_DIR / 'valid_pinyin_mappings.json', 'w') as f:
    json.dump(mapping, f, indent=2)

print("Mapping saved to valid_pinyin_mappings.json")

{'z': ['i', 'a', 'ai', 'an', 'ang', 'ao', 'e', 'ei', 'en', 'eng', 'ong', 'ou', 'u', 'uan', 'uei', 'uen', 'uo'], 'c': ['i', 'a', 'ai', 'an', 'ang', 'ao', 'e', 'en', 'eng', 'ong', 'ou', 'u', 'uan', 'uei', 'uen', 'uo'], 's': ['i', 'a', 'ai', 'an', 'ang', 'ao', 'e', 'en', 'eng', 'ong', 'ou', 'u', 'uan', 'uei', 'uen', 'uo'], 'zh': ['i', 'a', 'ai', 'an', 'ang', 'ao', 'e', 'ei', 'en', 'eng', 'ong', 'ou', 'u', 'ua', 'uai', 'uan', 'uang', 'uei', 'uen', 'uo'], 'ch': ['i', 'a', 'ai', 'an', 'ang', 'ao', 'e', 'en', 'eng', 'ong', 'ou', 'u', 'uai', 'uan', 'uang', 'uei', 'uen', 'uo'], 'sh': ['i', 'a', 'ai', 'an', 'ang', 'ao', 'e', 'ei', 'en', 'eng', 'ou', 'u', 'ua', 'uai', 'uan', 'uang', 'uei', 'uen', 'uo'], 'r': ['i', 'an', 'ang', 'ao', 'e', 'en', 'eng', 'ong', 'ou', 'u', 'uan', 'uei', 'uen', 'uo'], 'EMPTY': ['a', 'ai', 'an', 'ang', 'ao', 'e', 'ei', 'en', 'eng', 'er', 'i', 'ia', 'ian', 'iang', 'iao', 'ie', 'in', 'ing', 'iong', 'iou', 'o', 'ou', 'u', 'ua', 'uai', 'uan', 'uang', 'uei', 'uen', 'ueng', '

In [None]:
special_combinations = {"SPECIAL_COMBINATIONS": [special for special in list(pinyin_combos['EMPTY']) if not pd.isna(special)]}

In [144]:
def womp(phrase):
  clean_phrase = re.sub(r'[^\w]', '', phrase)
  initial = lazy_pinyin(clean_phrase, style=Style.INITIALS, strict=True)
  final = lazy_pinyin(clean_phrase, style=Style.FINALS, strict=True)
  tone = [ word[-1] for word in lazy_pinyin(clean_phrase, style=Style.FINALS_TONE3, strict=False, v_to_u=True, neutral_tone_with_five=True, tone_sandhi=True)]
  
  initial = [init if init != '' else "EMPTY" for init in initial ]

  return list(zip(initial, final, tone))

In [145]:
womp("一样")

[('EMPTY', 'i', '1'), ('EMPTY', 'iang', '4')]