In [23]:
import numpy as np
import pandas as pd
import json
import ast
from tqdm import tqdm

from pypinyin import lazy_pinyin, Style
from utils.data_processing import breakdown_pinyin, is_valid_pinyin, proportional_sample
import re
from utils.constants import *


import re
import IPython
import torch
import torchaudio
from torchaudio import transforms as T
import pandas as pd
import numpy as np

from tqdm import tqdm

from IPython.display import Audio
from matplotlib.patches import Rectangle
import librosa

tqdm.pandas(desc="Processing data")

In [2]:
df = pd.read_csv(f"{DATA_DIR / 'metadata.csv'}")

In [3]:
df['word_files'] = df['word_files'].apply(ast.literal_eval)

In [4]:
df['pinyin_breakdown'] = df['sentence'].progress_apply(breakdown_pinyin)

Processing data: 100%|██████████| 187314/187314 [01:23<00:00, 2241.63it/s]


In [5]:
def split_chinese_non_chinese(text):
    # Regular expression to match Chinese characters
    chinese_pattern = re.compile(r'[\u4e00-\u9fff]')
    
    result = []
    buffer = ""  # Buffer to collect non-Chinese characters

    for char in text:
        if chinese_pattern.match(char):  
            if buffer:
                result.append(buffer)  # Append the previous non-Chinese sequence
                buffer = ""  # Reset buffer
            result.append(char)  # Append the Chinese character itself
        else:
            buffer += char  # Group non-Chinese characters together
    
    if buffer:
        result.append(buffer)  # Append any remaining non-Chinese characters
    
    return result

df['character'] = df['sentence'].progress_apply(split_chinese_non_chinese)


Processing data: 100%|██████████| 187314/187314 [00:01<00:00, 168709.62it/s]


In [6]:
df.head()

Unnamed: 0,word_files,sentence,pinyin_breakdown,character
0,[/home/connor/audio/data/word_tensors/common_v...,黑身准裂腹鱼为辐鳍鱼纲鲤形目鲤科的其中一种,"[(h, ei, 1), (sh, en, 1), (zh, uen, 3), (l, ie...","[黑, 身, 准, 裂, 腹, 鱼, 为, 辐, 鳍, 鱼, 纲, 鲤, 形, 目, 鲤, ..."
1,[/home/connor/audio/data/word_tensors/common_v...,否,"[(f, ou, 3)]",[否]
2,[/home/connor/audio/data/word_tensors/common_v...,宋朝末年年间定居粉岭围,"[(s, ong, 4), (ch, ao, 2), (m, o, 4), (n, ian,...","[宋, 朝, 末, 年, 年, 间, 定, 居, 粉, 岭, 围]"
3,[/home/connor/audio/data/word_tensors/common_v...,油小路通是京都市主要的南北向道路之一,"[(EMPTY, iou, 2), (x, iao, 3), (l, u, 4), (t, ...","[油, 小, 路, 通, 是, 京, 都, 市, 主, 要, 的, 南, 北, 向, 道, ..."
4,[/home/connor/audio/data/word_tensors/common_v...,富尔马诺夫出生在科斯特罗马省谢列达的一个农民家庭,"[(f, u, 4), (EMPTY, er, 3), (m, a, 3), (n, uo,...","[富, 尔, 马, 诺, 夫, 出, 生, 在, 科, 斯, 特, 罗, 马, 省, 谢, ..."


In [81]:
df['sentence'].progress_apply(
    lambda row: clean_pinyin(lazy_pinyin(row)),
    desc='New custom description'
)

Processing data:   0%|          | 1/187312 [00:00<07:31, 414.62it/s]


TypeError: <lambda>() got an unexpected keyword argument 'desc'

In [7]:
valid_pinyin = set("abcdefghijklmnopqrstuvwxyz")  # Allowed characters

def clean_pinyin(pinyin_list):
  return [''.join(c for c in word if c in valid_pinyin) for word in pinyin_list]

df['pinyin'] = df['sentence'].apply(lambda row: clean_pinyin(lazy_pinyin(row)))


In [8]:
valid_lengths = df.progress_apply(lambda row: len({
    len(row['word_files']), 
    len(row['pinyin_breakdown']), 
    len(row['pinyin']), 
    len(row['character'])
}) == 1, axis=1)

Processing data: 100%|██████████| 187314/187314 [00:01<00:00, 151146.83it/s]


In [9]:
df = df[valid_lengths]

In [10]:
exploded = df.explode(['word_files', 'pinyin_breakdown', 'pinyin', 'character'], ignore_index=True)

In [11]:
exploded[['initial', 'final', 'tone']] = pd.DataFrame(exploded['pinyin_breakdown'].tolist(), index=exploded.index)

In [32]:
exploded.drop(columns=['pinyin_breakdown'], inplace=True)

In [33]:
exploded

Unnamed: 0,word_files,sentence,character,pinyin,initial,final,tone
0,/home/connor/audio/data/word_tensors/common_vo...,黑身准裂腹鱼为辐鳍鱼纲鲤形目鲤科的其中一种,黑,hei,h,ei,1
1,/home/connor/audio/data/word_tensors/common_vo...,黑身准裂腹鱼为辐鳍鱼纲鲤形目鲤科的其中一种,身,shen,sh,en,1
2,/home/connor/audio/data/word_tensors/common_vo...,黑身准裂腹鱼为辐鳍鱼纲鲤形目鲤科的其中一种,准,zhun,zh,uen,3
3,/home/connor/audio/data/word_tensors/common_vo...,黑身准裂腹鱼为辐鳍鱼纲鲤形目鲤科的其中一种,裂,lie,l,ie,4
4,/home/connor/audio/data/word_tensors/common_vo...,黑身准裂腹鱼为辐鳍鱼纲鲤形目鲤科的其中一种,腹,fu,f,u,4
...,...,...,...,...,...,...,...
2533517,/home/connor/audio/data/word_tensors/common_vo...,古希腊作家之一,腊,la,l,a,4
2533518,/home/connor/audio/data/word_tensors/common_vo...,古希腊作家之一,作,zuo,z,uo,4
2533519,/home/connor/audio/data/word_tensors/common_vo...,古希腊作家之一,家,jia,j,ia,1
2533520,/home/connor/audio/data/word_tensors/common_vo...,古希腊作家之一,之,zhi,zh,i,1


In [34]:
valid_chinese_mask = exploded.progress_apply(lambda x: is_valid_pinyin(x['initial'], x['final']), axis=1)

Processing data: 100%|██████████| 2533522/2533522 [00:11<00:00, 221929.33it/s]


In [35]:
better = exploded[valid_chinese_mask]

In [36]:
better

Unnamed: 0,word_files,sentence,character,pinyin,initial,final,tone
0,/home/connor/audio/data/word_tensors/common_vo...,黑身准裂腹鱼为辐鳍鱼纲鲤形目鲤科的其中一种,黑,hei,h,ei,1
1,/home/connor/audio/data/word_tensors/common_vo...,黑身准裂腹鱼为辐鳍鱼纲鲤形目鲤科的其中一种,身,shen,sh,en,1
2,/home/connor/audio/data/word_tensors/common_vo...,黑身准裂腹鱼为辐鳍鱼纲鲤形目鲤科的其中一种,准,zhun,zh,uen,3
3,/home/connor/audio/data/word_tensors/common_vo...,黑身准裂腹鱼为辐鳍鱼纲鲤形目鲤科的其中一种,裂,lie,l,ie,4
4,/home/connor/audio/data/word_tensors/common_vo...,黑身准裂腹鱼为辐鳍鱼纲鲤形目鲤科的其中一种,腹,fu,f,u,4
...,...,...,...,...,...,...,...
2533517,/home/connor/audio/data/word_tensors/common_vo...,古希腊作家之一,腊,la,l,a,4
2533518,/home/connor/audio/data/word_tensors/common_vo...,古希腊作家之一,作,zuo,z,uo,4
2533519,/home/connor/audio/data/word_tensors/common_vo...,古希腊作家之一,家,jia,j,ia,1
2533520,/home/connor/audio/data/word_tensors/common_vo...,古希腊作家之一,之,zhi,zh,i,1


In [37]:
word_tensor = torch.load(Path(better.loc[0, 'word_files']), weights_only=False)
print(f"sentence: {better.loc[0, 'character']}")
IPython.display.Audio(word_tensor, rate=16000)

sentence: 黑


In [38]:
better['tone'].value_counts()

tone
4    886854
2    589754
1    537002
3    402777
5    117049
Name: count, dtype: int64

In [39]:
better['final'].value_counts()

final
i       439395
e       243655
u       188159
ian     105564
ong     101971
a        94080
uei      90189
ai       87856
ing      84922
an       84739
uo       82193
v        73861
en       73705
eng      72334
ang      59389
ao       58953
ou       54525
iou      51656
in       49008
ie       47754
ei       46398
ia       43681
iao      42784
iang     38275
uan      34147
van      28320
ve       25081
er       24025
uen      22560
uang     20582
ua       20141
vn       14574
o        13163
iong     10462
uai       5160
ueng       175
Name: count, dtype: int64

In [40]:
sampled_data = proportional_sample(better)

In [41]:
sampled_data

Unnamed: 0,word_files,sentence,character,pinyin,initial,final,tone,sanity,augment
948561,/home/connor/audio/data/word_tensors/common_vo...,担当箱根旅游的旅客输送,旅,lv,l,v,3,1,0
2408431,/home/connor/audio/data/word_tensors/common_vo...,芹菜作为蔬菜食用历史悠久,芹,qin,q,in,2,1,0
1855587,/home/connor/audio/data/word_tensors/common_vo...,龟兹乐是古龟兹的音乐,音,yin,EMPTY,in,1,1,0
1517846,/home/connor/audio/data/word_tensors/common_vo...,芬兰自然史博物馆是位于芬兰首都赫尔辛基的一座博物馆,博,bo,b,o,2,1,0
394945,/home/connor/audio/data/word_tensors/common_vo...,道光十四年出生,光,guang,g,uang,1,1,0
...,...,...,...,...,...,...,...,...,...
1990052,/home/connor/audio/data/word_tensors/common_vo...,建设洛茂铁路的目的是为了沿线煤炭外运,的,di,d,i,4,1,0
1025767,/home/connor/audio/data/word_tensors/common_vo...,日本首都制药公司在大脑机能领域开发出了新药,开,kai,k,ai,1,1,0
1533192,/home/connor/audio/data/word_tensors/common_vo...,创办于泰王拉玛五世时,拉,la,l,a,1,1,0
1317589,/home/connor/audio/data/word_tensors/common_vo...,是一种常见的食用贻贝,是,shi,sh,i,4,1,0


In [59]:
og = better['final'].value_counts(normalize=True)
samp = sampled_data['final'].value_counts(normalize=True)

In [73]:
samp[(abs(samp - og) > 0.05)]

final
i    0.09773
Name: proportion, dtype: float64

In [None]:
if abs(og - samp) > 0.05:
 print(f"initial has imbalanced proprotions")


In [78]:
def check_proportions(original, sampled, slack=0.05):
  for col in ['initial', 'final', 'tone']:
    og = original[col].value_counts(normalize=True)
    samp = sampled[col].value_counts(normalize=True)

    if sum(abs(og - samp) > slack):
      print(f"{col} has imbalanced proprotions\noriginal: \n{og[(abs(samp - og) > 0.05)]}\n\nsampled: \n{samp[(abs(samp - og) > 0.05)]}")


In [79]:
check_proportions(better, sampled_data)

final has imbalanced proprotions
original: 
final
i    0.173438
Name: proportion, dtype: float64

sampled: 
final
i    0.09773
Name: proportion, dtype: float64


In [19]:
from sklearn.utils import compute_class_weight

from utils.constants import RANDOM_SEED


def better_sample(df: pd.DataFrame):
  clean_df = df.copy()
  for label in ['initial', 'final']:
    unique_classes = np.unique(clean_df[label])
    class_weights = compute_class_weight('balanced', classes=unique_classes, y=clean_df[label])
    class_weight_dict = dict(zip(unique_classes, class_weights))
    clean_df[f"{label}_weight"] = clean_df[label].map(class_weight_dict)

  clean_df["sample_weight"] = clean_df[['initial_weight', 'final_weight']].mean(axis=1)

  # Sample using computed weights
  sampled_df = clean_df.sample(n=500_000, weights=clean_df["sample_weight"], random_state=RANDOM_SEED)
  
  # drop some cols
  sampled_df.drop(columns=['pinyin_breakdown', 'initial_weight', 'final_weight', 'sample_weight'], inplace=True)
  sampled_df['sanity'] = 1
  sampled_df['augment'] = 0

  return sampled_df

In [None]:
sampled_data =  better_sample(clean_better)

In [28]:
sampled_data.shape

(500000, 11)

In [23]:
from sklearn.model_selection import train_test_split
from utils.augmentations import generate_labels

X = sampled_data['path']
y = generate_labels(sampled_data)


In [29]:
X.shape

(500000,)

In [62]:
# test train split
train_df, temp_df = train_test_split(sampled_data, test_size=0.2, stratify=sampled_data[['final']], random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df[['final']], random_state=42)


In [55]:
train_df

Unnamed: 0,path,sentence,age,gender,accents,words,initial,final,tone,sanity,augment
204797,common_voice_zh-CN_32918927.mp3,发现真相的诗音决定除去铁平但被圭一制止了。,,,,相,x,iang,4,1,0
66829,common_voice_zh-CN_32564279.mp3,马尔尼莱孔皮耶尼。,,,,耶,EMPTY,ie,2,1,0
586652,common_voice_zh-CN_38022143.mp3,目前公平交易委员会已经对三星进行处分。,,,,行,x,ing,2,1,0
223241,common_voice_zh-CN_32949484.mp3,肿管蚜为常蚜科肿管蚜属下的一个种。,twenties,male_masculine,出生地：23 黑龙江省,科,k,e,1,1,0
570462,common_voice_zh-CN_33964399.mp3,此外还有一些其他情况下的类似的论点都没有得到法院的支持。,,,,外,EMPTY,uai,4,1,0
...,...,...,...,...,...,...,...,...,...,...,...
103108,common_voice_zh-CN_32632916.mp3,傍晚儿子回来了,,,,儿,EMPTY,er,2,1,0
421139,common_voice_zh-CN_33402863.mp3,根据国际特赦组织。,,,,赦,sh,e,4,1,0
461014,common_voice_zh-CN_33505548.mp3,零索引惯例的另一个特性是在现代计算机中实作的模运算。,,,,算,s,uan,4,1,0
272342,common_voice_zh-CN_33062651.mp3,黄巢领导的农民起义军陷桂州。,,,,领,l,ing,3,1,0


In [65]:
import json
from random import shuffle
from sklearn.utils import resample

from utils.constants import DATA_DIR


def mark_augments(df: pd.DataFrame, sample_frac=0.7, augment_frac=0.2, insane_frac=0.1, ueng_boost=0.5):
  # resampling calcs
  unaugmented = len(df)
  total_size = int(unaugmented / sample_frac)
  to_augment = int(total_size * augment_frac)
  to_insane = int(total_size * insane_frac)

  # augment sampling
  augmented = resample(df, replace=True, n_samples=to_augment, random_state=RANDOM_SEED)
  augmented['augment'] = 1
  augmented['sanity'] = 1

  # insane sampling
  insane = resample(df, replace=True, n_samples=to_insane, random_state=RANDOM_SEED)

  with open(DATA_DIR / 'invalid_initial_final_mappings.json', 'r') as file:
    invalid_mappings = json.load(file)

  insane['final'] = insane['initial'].map(lambda x: np.random.choice(invalid_mappings.get(x, [None])))
  insane['augment'] = 1
  insane['sanity'] = 0

  overall = pd.concat([df, augmented, insane], ignore_index=True)

  # getting more ueng bc not many uengs
  ueng = overall.loc[overall['final'] == 'ueng']
  more_ueng = resample(ueng, replace=True, n_samples=int(len(ueng) * ueng_boost), random_state=RANDOM_SEED)
  more_ueng['augment'] = 1
  
  done = pd.concat([overall, more_ueng], ignore_index=True)

  return done

In [81]:
new_train_df = mark_augments(train_df)

In [82]:
new_train_df = new_train_df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

In [83]:
new_train_df

Unnamed: 0,path,sentence,age,gender,accents,words,initial,final,tone,sanity,augment
0,common_voice_zh-CN_33361319.mp3,武雄温泉车站铁路车站。,,,,铁,t,ie,3,1,0
1,common_voice_zh-CN_32479855.mp3,幼苗怕冻。,,,,苗,m,iao,2,1,1
2,common_voice_zh-CN_33553815.mp3,棕竹为棕榈科棕竹属下的一个种。,,,,棕,z,ong,1,1,0
3,common_voice_zh-CN_32609558.mp3,蒸气动力压路机可藉齿比调整来执行不同任务。,,,,调,t,iao,2,1,0
4,common_voice_zh-CN_33247995.mp3,蓝德尔星孔珊瑚为轴孔珊瑚科星孔珊瑚下的一个种。,,,,德,d,e,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...
573115,common_voice_zh-CN_33168243.mp3,索泼查尼修道院在中世纪塞尔维亚美术中拥有重要地位。,,,,中,zh,ong,1,1,0
573116,common_voice_zh-CN_33316854.mp3,后由张世则接任。,,,,任,r,en,4,1,0
573117,common_voice_zh-CN_33322782.mp3,该组织致力于塞浦路斯的统一以及塞浦路斯希腊人和塞浦路斯土耳其人的和解。,,,,和,h,e,2,1,0
573118,common_voice_zh-CN_33615279.mp3,匍匐旬子为蔷薇科旬子属的植物。,,,,的,d,e,5,1,0


In [100]:
new_train_df[['augment']].value_counts()

augment
0          400000
1          173120
Name: count, dtype: int64

In [125]:
X_train, train_augs, y_train = new_train_df['path'].to_list(), new_train_df['augment'], generate_labels(new_train_df)

In [146]:
def feature_extraction(args):
  wav, labels, aug= args
  if aug:
    wav = 'skibiit'

  
  return wav, labels
  return {"converted": wav, "labels": labels}


In [147]:
inputt = list(zip(X_train, y_train, train_augs))[0]

In [150]:
t = [ feature_extraction(inputt) for i in range(5)] 

In [152]:
t

[('common_voice_zh-CN_33361319.mp3', (18, 15, 2, 1)),
 ('common_voice_zh-CN_33361319.mp3', (18, 15, 2, 1)),
 ('common_voice_zh-CN_33361319.mp3', (18, 15, 2, 1)),
 ('common_voice_zh-CN_33361319.mp3', (18, 15, 2, 1)),
 ('common_voice_zh-CN_33361319.mp3', (18, 15, 2, 1))]

In [153]:
t = [(feat, label) for feat, label in t if feat is not None]

In [168]:
train_features, train_label = zip(*t)

In [170]:
train_label

((18, 15, 2, 1),
 (18, 15, 2, 1),
 (18, 15, 2, 1),
 (18, 15, 2, 1),
 (18, 15, 2, 1))

In [171]:
train_label = np.array(train_label)

In [174]:
train_label[:, 3]

array([1, 1, 1, 1, 1])

In [161]:
for entry in np.array(train_label):
  print(f'init: {entry[0]}, final: {entry[1]}, tone: {entry[2]}, sanity: {entry[3]}')

init: 18, final: 15, tone: 2, sanity: 1
init: 18, final: 15, tone: 2, sanity: 1
init: 18, final: 15, tone: 2, sanity: 1
init: 18, final: 15, tone: 2, sanity: 1
init: 18, final: 15, tone: 2, sanity: 1


# ---------------------------------------------------------------

In [None]:
exploded = myDF.explode('pinyin_breakdown', ignore_index=True)
exploded[['initial', 'final', 'tone']] = pd.DataFrame(exploded['pinyin_breakdown'].tolist(), index=exploded.index)
# exploded.drop(columns=['sentence', 'pinyin_breakdown'], inplace=True)
exploded

ValueError: columns must have matching element counts

In [12]:
valid_initials = ['EMPTY', 'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'j',
       'q', 'x', 'z', 'c', 's', 'zh', 'ch', 'sh', 'r']
valid_finals = ['i', 'a', 'ai', 'an', 'ang', 'ao', 'e', 'ei', 'en', 'eng', 'er', 'i', 'ia', 'ian', 'iang', 'iao', 'ie', 'in', 'ing', 'iong', 'iou', 'o', 'ong', 'ou', 'u', 'ua', 'uai', 'uan', 'uang', 'uei', 'uen', 'ueng', 'uo', 'v', 'van', 've', 'vn']

def is_valid_pinyin(initial, final, tone):
    if initial not in valid_initials or final not in valid_finals:
        return False
    return True

In [13]:
mask = exploded.apply(lambda x: is_valid_pinyin(x['initial'], x['final'], x['tone']), axis=1)

In [14]:
clean_df = exploded[mask]

In [15]:
clean_df['tone'].value_counts()

tone
4    3076351
2    2062752
1    1863196
3    1389505
5     410838
Name: count, dtype: int64

In [21]:
len(clean_df['initial'].unique())

22

In [20]:
len(clean_df['final'].unique())

36

In [22]:
len(clean_df['tone'].unique())

5

In [24]:
from sklearn.utils import resample
from utils.constants import *

In [40]:
total_dist = clean_df['final'].value_counts() / len(clean_df['final']) 

In [41]:
smaller_dist = clean_df['final'].value_counts() / len(clean_df['final']) * 0.9

In [52]:
(abs(1 - smaller_dist / total_dist ) > 0.4).all()

False

Index(['i', 'e', 'u', 'ian', 'ong', 'a', 'uei', 'ai', 'an', 'ing', 'uo', 'v',
       'en', 'eng', 'ang', 'ao', 'ou', 'iou', 'ie', 'in', 'ei', 'ia', 'iao',
       'iang', 'uan', 'van', 've', 'uen', 'er', 'uang', 'ua', 'vn', 'o',
       'iong', 'uai', 'ueng'],
      dtype='object', name='final')

In [16]:
clean_df['final'].value_counts()

final
i       1529172
e        865223
u        659344
ian      361759
ong      356640
a        325995
uei      312894
ai       302749
an       289719
ing      289115
uo       287096
v        257268
en       255355
eng      248971
ang      208464
ao       207957
ou       186126
iou      175038
ie       170298
in       167540
ei       159012
ia       152339
iao      144911
iang     130922
uan      121414
van       97907
ve        88311
uen       79196
er        78593
uang      70541
ua        68976
vn        52180
o         46856
iong      36775
uai       17265
ueng        721
Name: count, dtype: int64

In [17]:
clean_df['initial'].value_counts()

initial
EMPTY    1341468
d         871520
sh        720602
zh        587004
j         563002
x         506032
l         479921
g         453627
b         375771
h         374515
t         305160
z         272813
m         270349
q         259968
ch        247094
f         223178
s         207207
k         200013
n         171785
r         159482
c         119249
p          92882
Name: count, dtype: int64

In [118]:
clean_df.head(3)

Unnamed: 0,path,sentence,age,gender,accents,pinyin_breakdown,initial,final,tone
0,common_voice_zh-CN_19703883.mp3,模式种采样自台湾龟山岛。,thirties,female_feminine,出生地：31 上海市,"(m, o, 2)",m,o,2
1,common_voice_zh-CN_19703883.mp3,模式种采样自台湾龟山岛。,thirties,female_feminine,出生地：31 上海市,"(sh, i, 4)",sh,i,4
2,common_voice_zh-CN_19703883.mp3,模式种采样自台湾龟山岛。,thirties,female_feminine,出生地：31 上海市,"(zh, ong, 3)",zh,ong,3


In [119]:
from sklearn.utils.class_weight import compute_class_weight
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

In [120]:
for label in ['initial', 'final']:
    class_weights = compute_class_weight('balanced', classes=np.unique(clean_df[label]), y=clean_df[label])
    class_weight_dict = dict(zip(np.unique(clean_df[label]), class_weights))
    clean_df[f"{label}_weight"] = clean_df[label].map(class_weight_dict)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df[f"{label}_weight"] = clean_df[label].map(class_weight_dict)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df[f"{label}_weight"] = clean_df[label].map(class_weight_dict)


In [121]:
clean_df.head(3)

Unnamed: 0,path,sentence,age,gender,accents,pinyin_breakdown,initial,final,tone,initial_weight,final_weight
0,common_voice_zh-CN_19703883.mp3,模式种采样自台湾龟山岛。,thirties,female_feminine,出生地：31 上海市,"(m, o, 2)",m,o,2,1.480013,5.218496
1,common_voice_zh-CN_19703883.mp3,模式种采样自台湾龟山岛。,thirties,female_feminine,出生地：31 上海市,"(sh, i, 4)",sh,i,4,0.555258,0.159902
2,common_voice_zh-CN_19703883.mp3,模式种采样自台湾龟山岛。,thirties,female_feminine,出生地：31 上海市,"(zh, ong, 3)",zh,ong,3,0.681631,0.685615


In [122]:
clean_df["sample_weight"] = clean_df[['initial_weight', 'final_weight']].mean(axis=1)

# Sample using computed weights
sampled_df = clean_df.sample(n=500_000, weights=clean_df["sample_weight"], random_state=42)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df["sample_weight"] = clean_df[['initial_weight', 'final_weight']].mean(axis=1)


In [125]:
print(len(sampled_df['initial'].value_counts()))
sampled_df['initial'].value_counts()

22


initial
EMPTY    67509
j        32006
x        31238
d        26759
h        26151
b        23838
zh       23747
g        23557
l        23406
sh       22169
m        20740
q        19281
t        18337
ch       18297
f        16784
z        16487
k        16288
n        15315
s        15079
r        14978
c        14084
p        13950
Name: count, dtype: int64

In [129]:
sampled_df[sampled_df['final'] == 'ueng']['pinyin_breakdown'].value_counts()

pinyin_breakdown
(EMPTY, ueng, 1)    692
(EMPTY, ueng, 4)     29
Name: count, dtype: int64

In [126]:
print(len(sampled_df['final'].value_counts()))
sampled_df['final'].value_counts()

36


final
i       46900
e       29763
u       29131
en      19000
ai      18892
a       18821
an      18140
ian     18011
ong     17923
eng     16332
ing     16087
uo      15906
ang     14196
ei      13767
ao      13669
v       13084
uei     12875
ou      12417
in      11811
iao     10701
iou     10575
ie      10513
uan     10132
ia       9912
iang     9704
uen      9460
o        8698
ve       8684
ua       8617
van      8572
uang     8545
vn       7808
er       7582
iong     6975
uai      6076
ueng      721
Name: count, dtype: int64

In [135]:
sampled_df.head()

Unnamed: 0,path,sentence,age,gender,accents,pinyin_breakdown,initial,final,tone,initial_weight,final_weight,sample_weight
3296630,common_voice_zh-CN_32943920.mp3,大花甘肃紫堇为罂粟科紫堇属下的一个变种。,,,,"(g, an, 1)",g,an,1,0.882046,0.843983,0.863015
8367665,common_voice_zh-CN_33732667.mp3,而当时很多人无法支付一片披萨的价格便提供他们可以提供的物品交换相应大小的披萨。,,,,"(p, in, 3)",p,in,3,4.307832,1.459459,2.883646
6442068,common_voice_zh-CN_33422789.mp3,报道里的内容并不能证明桑兰撒谎。,,,,"(h, uang, 3)",h,uang,3,1.068369,3.466322,2.267345
5268782,common_voice_zh-CN_33233990.mp3,抑制震荡子在数学上的应用可能有助于从昼夜节律到内分泌等领域的研究。,,,,"(EMPTY, iong, 4)",EMPTY,iong,4,0.29827,6.649023,3.473647
1371528,common_voice_zh-CN_32618891.mp3,布里翁河畔苏塞。,,,,"(EMPTY, ueng, 1)",EMPTY,ueng,1,0.29827,339.137078,169.717674


In [136]:
sampled_df['sanity'] = 1
sampled_df['augment'] = 0

In [142]:
unaugmented = len(sampled_df)
total_size = int(unaugmented / 0.7)
to_augment = int(total_size * 0.2)
to_insane = int(total_size * 0.1)

print(total_size, unaugmented, to_augment, to_insane)


714285 500000 142857 71428


In [151]:
augmented = resample(sampled_df, replace=True, n_samples=to_augment, random_state=RANDOM_SEED)
augmented['sanity'] = 1
augmented.head()

Unnamed: 0,path,sentence,age,gender,accents,pinyin_breakdown,initial,final,tone,initial_weight,final_weight,sample_weight,sanity,augment
6439322,common_voice_zh-CN_33422524.mp3,段氏出身于段部鲜卑家族。,,,,"(sh, en, 1)",sh,en,1,0.555258,0.95756,0.756409,1,0
6411201,common_voice_zh-CN_33419859.mp3,回旅馆没有发现我的帽子,,,,"(m, ei, 2)",m,ei,2,1.480013,1.537732,1.508873,1,0
5083509,common_voice_zh-CN_33201874.mp3,挪威政府的一些主要行事也在奥斯陆主教座堂举行。,,,,"(j, iao, 4)",j,iao,4,0.71069,1.687366,1.199028,1,0
7710663,common_voice_zh-CN_33610360.mp3,中华人民共和国女政治家。,,,,"(m, in, 2)",m,in,2,1.480013,1.459459,1.469736,1,0
4985534,common_voice_zh-CN_33182131.mp3,这样可以确保雷达波的一部分能量能够从微粒表面反射回雷达站所在方向。,,,,"(l, ei, 2)",l,ei,2,0.833721,1.537732,1.185726,1,0


In [None]:
insane = resample(sampled_df, replace=True, n_samples=to_insane, random_state=RANDOM_SEED)
insane['sanity'] = 0
insane.head()


Unnamed: 0,path,sentence,age,gender,accents,pinyin_breakdown,initial,final,tone,initial_weight,final_weight,sample_weight,sanity,augment
6439322,common_voice_zh-CN_33422524.mp3,段氏出身于段部鲜卑家族。,,,,"(sh, en, 1)",sh,en,1,0.555258,0.95756,0.756409,0,0
6411201,common_voice_zh-CN_33419859.mp3,回旅馆没有发现我的帽子,,,,"(m, ei, 2)",m,ei,2,1.480013,1.537732,1.508873,0,0
5083509,common_voice_zh-CN_33201874.mp3,挪威政府的一些主要行事也在奥斯陆主教座堂举行。,,,,"(j, iao, 4)",j,iao,4,0.71069,1.687366,1.199028,0,0
7710663,common_voice_zh-CN_33610360.mp3,中华人民共和国女政治家。,,,,"(m, in, 2)",m,in,2,1.480013,1.459459,1.469736,0,0
4985534,common_voice_zh-CN_33182131.mp3,这样可以确保雷达波的一部分能量能够从微粒表面反射回雷达站所在方向。,,,,"(l, ei, 2)",l,ei,2,0.833721,1.537732,1.185726,0,0


In [159]:
import json

In [161]:
with open(DATA_DIR / 'invalid_initial_final_mappings.json', 'r') as file:
  invalid_mappings = json.load(file)

np.random.seed(RANDOM_SEED)
insane['final'] = insane.apply(lambda row: np.random.choice(invalid_mappings[row['initial']]) if row['initial'] in invalid_mappings else None, axis=1)


In [162]:
new = pd.concat([augmented, insane])
new['augment'] = 1
new.head(2)

Unnamed: 0,path,sentence,age,gender,accents,pinyin_breakdown,initial,final,tone,initial_weight,final_weight,sample_weight,sanity,augment
6439322,common_voice_zh-CN_33422524.mp3,段氏出身于段部鲜卑家族。,,,,"(sh, en, 1)",sh,en,1,0.555258,0.95756,0.756409,1,1
6411201,common_voice_zh-CN_33419859.mp3,回旅馆没有发现我的帽子,,,,"(m, ei, 2)",m,ei,2,1.480013,1.537732,1.508873,1,1


In [163]:
new.tail(2)

Unnamed: 0,path,sentence,age,gender,accents,pinyin_breakdown,initial,final,tone,initial_weight,final_weight,sample_weight,sanity,augment
3035991,common_voice_zh-CN_32915905.mp3,他也没有得到正弦定律。,,,,"(zh, eng, 4)",zh,i,4,0.681631,0.982114,0.831872,0,1
7540098,common_voice_zh-CN_33592100.mp3,萨莫韦茨农村居民点是俄罗斯联邦沃罗涅日州埃尔季利区所属的一个农村居民点。,,,,"(c, uen, 1)",c,vn,1,3.355333,3.087502,3.221418,0,1


In [165]:
overall = pd.concat([sampled_df, new])

In [168]:
ueng = overall[overall['final'] == 'ueng']


In [170]:
more_ueng = resample(ueng, replace=True, n_samples=int(len(ueng) / 2), random_state=RANDOM_SEED)
more_ueng['augment'] = 1


In [171]:
done = pd.concat([overall, more_ueng])

In [193]:
done.head(1)

Unnamed: 0,path,sentence,age,gender,accents,pinyin_breakdown,initial,final,tone,initial_weight,final_weight,sample_weight,sanity,augment
3296630,common_voice_zh-CN_32943920.mp3,大花甘肃紫堇为罂粟科紫堇属下的一个变种。,,,,"(g, an, 1)",g,an,1,0.882046,0.843983,0.863015,1,0


In [198]:
done.drop(columns=['pinyin_breakdown', 'initial_weight', 'final_weight', 'sample_weight'])

Unnamed: 0,path,sentence,age,gender,accents,initial,final,tone,sanity,augment
3296630,common_voice_zh-CN_32943920.mp3,大花甘肃紫堇为罂粟科紫堇属下的一个变种。,,,,g,an,1,1,0
8367665,common_voice_zh-CN_33732667.mp3,而当时很多人无法支付一片披萨的价格便提供他们可以提供的物品交换相应大小的披萨。,,,,p,in,3,1,0
6442068,common_voice_zh-CN_33422789.mp3,报道里的内容并不能证明桑兰撒谎。,,,,h,uang,3,1,0
5268782,common_voice_zh-CN_33233990.mp3,抑制震荡子在数学上的应用可能有助于从昼夜节律到内分泌等领域的研究。,,,,EMPTY,iong,4,1,0
1371528,common_voice_zh-CN_32618891.mp3,布里翁河畔苏塞。,,,,EMPTY,ueng,1,1,0
...,...,...,...,...,...,...,...,...,...,...
2725463,common_voice_zh-CN_32783660.mp3,格里翁的总面积为平方公里。,twenties,male_masculine,出生地：11 北京市,EMPTY,ueng,1,1,1
8088281,common_voice_zh-CN_33673946.mp3,潘训福是清朝末年最后一科生员。,,,,ch,ueng,2,0,1
5833873,common_voice_zh-CN_33311731.mp3,乡试第七名。,,,,x,ueng,1,0,1
514073,common_voice_zh-CN_32476710.mp3,此后红九军撤离古浪西进。,,,,c,ueng,3,0,1


In [192]:
abs(sampled_df['final'].value_counts(normalize=True) - done['final'].value_counts(normalize=True))

final
a       0.002970
ai      0.002897
an      0.002836
ang     0.002147
ao      0.002014
e       0.004573
ei      0.001125
en      0.002327
eng     0.002509
er      0.002960
i       0.002559
ia      0.000990
ian     0.001587
iang    0.001145
iao     0.000059
ie      0.000036
in      0.000273
ing     0.001262
iong    0.002232
iou     0.000380
o       0.002107
ong     0.004805
ou      0.001368
u       0.005228
ua      0.001609
uai     0.001909
uan     0.000699
uang    0.001364
uei     0.000659
uen     0.000275
ueng    0.007394
uo      0.001764
v       0.000781
van     0.002060
ve      0.001485
vn      0.002212
Name: proportion, dtype: float64

In [177]:
done['final'].value_counts(normalize=True)

final
i       0.096359
e       0.054953
u       0.053034
ong     0.040651
en      0.035673
ai      0.034887
a       0.034672
ian     0.034435
an      0.033444
ing     0.030912
eng     0.030155
uo      0.030048
v       0.026949
ei      0.026409
ang     0.026245
ao      0.025324
uei     0.025091
in      0.023895
ou      0.023466
iou     0.021530
iao     0.021343
ie      0.021062
ia      0.020814
iang    0.020553
uan     0.019565
o       0.019503
van     0.019204
ve      0.018853
ua      0.018843
uen     0.018645
uang    0.018454
er      0.018124
vn      0.017828
iong    0.016182
uai     0.014061
ueng    0.008836
Name: proportion, dtype: float64

In [127]:
print(len(sampled_df['tone'].value_counts()))
sampled_df['tone'].value_counts()

5


tone
4    165882
2    132377
1    110401
3     80656
5     10684
Name: count, dtype: int64

# VALID PINYIN MAPPINGS

In [54]:
from utils.constants import DATA_DIR


pinyin_combos = pd.read_csv(DATA_DIR / "pinyin-combos.csv")

In [55]:
pinyin_combos.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,b,p,m,f,d,t,n,l,...,j,q,x,z,c,s,zh,ch,sh,r
0,i,,,,,,,,,,...,,,,zi,ci,si,zhi,chi,shi,ri
1,a,a,ba,pa,ma,fa,da,ta,na,la,...,,,,za,ca,sa,zha,cha,sha,
2,ai,ai,bai,pai,mai,,dai,tai,nai,lai,...,,,,zai,cai,sai,zhai,chai,shai,
3,an,an,ban,pan,man,fan,dan,tan,,lan,...,,,,zan,can,san,zhan,chan,shan,ran
4,ang,ang,bang,pang,mang,fang,dang,tang,nang,lang,...,,,,zang,cang,sang,zhang,chang,shang,rang


In [56]:
pinyin_combos = pinyin_combos.rename(columns={'Unnamed: 0': 'finals', 'Unnamed: 1': 'EMPTY'})
pinyin_combos.loc[:, 'finals'] = pinyin_combos.loc[:, 'finals'].apply(lambda x: x.replace('ü', 'v'))
pinyin_combos

Unnamed: 0,finals,EMPTY,b,p,m,f,d,t,n,l,...,j,q,x,z,c,s,zh,ch,sh,r
0,i,,,,,,,,,,...,,,,zi,ci,si,zhi,chi,shi,ri
1,a,a,ba,pa,ma,fa,da,ta,na,la,...,,,,za,ca,sa,zha,cha,sha,
2,ai,ai,bai,pai,mai,,dai,tai,nai,lai,...,,,,zai,cai,sai,zhai,chai,shai,
3,an,an,ban,pan,man,fan,dan,tan,,lan,...,,,,zan,can,san,zhan,chan,shan,ran
4,ang,ang,bang,pang,mang,fang,dang,tang,nang,lang,...,,,,zang,cang,sang,zhang,chang,shang,rang
5,ao,ao,bao,pao,mao,,dao,tao,nao,lao,...,,,,zao,cao,sao,zhao,chao,shao,rao
6,e,e,,,me,,de,te,ne,le,...,,,,ze,ce,se,zhe,che,she,re
7,ei,ei,bei,pei,mei,fei,dei,,nei,lei,...,,,,zei,,,zhei,,shei,
8,en,en,ben,pen,men,fen,,,nen,,...,,,,zen,cen,sen,zhen,chen,shen,ren
9,eng,eng,beng,peng,meng,feng,deng,teng,neng,leng,...,,,,zeng,ceng,seng,zheng,cheng,sheng,reng


In [174]:
pinyin_combos.columns[1:]

Index(['EMPTY', 'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'j',
       'q', 'x', 'z', 'c', 's', 'zh', 'ch', 'sh', 'r'],
      dtype='object')

In [185]:
print(list(pinyin_combos['finals']))

['i', 'a', 'ai', 'an', 'ang', 'ao', 'e', 'ei', 'en', 'eng', 'er', 'i', 'ia', 'ian', 'iang', 'iao', 'ie', 'in', 'ing', 'iong', 'iou', 'o', 'ong', 'ou', 'u', 'ua', 'uai', 'uan', 'uang', 'uei', 'uen', 'ueng', 'uo', 'v', 'van', 've', 'vn']


In [57]:
import json

In [59]:
def replace_umlauts(s):
    if isinstance(s, str):
        return s.replace('ü', 'v')
    return s

# Apply the function to the DataFrame
pinyin_combos = pinyin_combos.apply(replace_umlauts)

# Initialize an empty dictionary to store the mappings
invalid_mapping = {}
# Iterate through each row
for index, row in pinyin_combos.iterrows():
    final_value = row['finals']
    for initial in pinyin_combos.columns[1:]:  # Skip the first column
        if not pd.isna(row[initial]):
            if initial not in invalid_mapping:
                invalid_mapping[initial] = []
            invalid_mapping[initial].append(final_value)

print(invalid_mapping)

with open(DATA_DIR / 'valid_initial_final_mappings.json', 'w') as f:
    json.dump(invalid_mapping, f, indent=2)

print("Mapping saved to valid_initial_final_mappings.json")

{'z': ['i', 'a', 'ai', 'an', 'ang', 'ao', 'e', 'ei', 'en', 'eng', 'ong', 'ou', 'u', 'uan', 'uei', 'uen', 'uo'], 'c': ['i', 'a', 'ai', 'an', 'ang', 'ao', 'e', 'en', 'eng', 'ong', 'ou', 'u', 'uan', 'uei', 'uen', 'uo'], 's': ['i', 'a', 'ai', 'an', 'ang', 'ao', 'e', 'en', 'eng', 'ong', 'ou', 'u', 'uan', 'uei', 'uen', 'uo'], 'zh': ['i', 'a', 'ai', 'an', 'ang', 'ao', 'e', 'ei', 'en', 'eng', 'ong', 'ou', 'u', 'ua', 'uai', 'uan', 'uang', 'uei', 'uen', 'uo'], 'ch': ['i', 'a', 'ai', 'an', 'ang', 'ao', 'e', 'en', 'eng', 'ong', 'ou', 'u', 'uai', 'uan', 'uang', 'uei', 'uen', 'uo'], 'sh': ['i', 'a', 'ai', 'an', 'ang', 'ao', 'e', 'ei', 'en', 'eng', 'ou', 'u', 'ua', 'uai', 'uan', 'uang', 'uei', 'uen', 'uo'], 'r': ['i', 'an', 'ang', 'ao', 'e', 'en', 'eng', 'ong', 'ou', 'u', 'uan', 'uei', 'uen', 'uo'], 'EMPTY': ['a', 'ai', 'an', 'ang', 'ao', 'e', 'ei', 'en', 'eng', 'er', 'i', 'ia', 'ian', 'iang', 'iao', 'ie', 'in', 'ing', 'iong', 'iou', 'o', 'ou', 'u', 'ua', 'uai', 'uan', 'uang', 'uei', 'uen', 'ueng', '

In [60]:
def replace_umlauts(s):
    if isinstance(s, str):
        return s.replace('ü', 'v')
    return s

# Apply the function to the DataFrame
pinyin_combos = pinyin_combos.apply(replace_umlauts)

# Initialize an empty dictionary to store the mappings
invalid_mapping = {}
# Iterate through each row
for index, row in pinyin_combos.iterrows():
    final_value = row['finals']
    for initial in pinyin_combos.columns[1:]:  # Skip the first column
        if pd.isna(row[initial]):
            if initial not in invalid_mapping:
                invalid_mapping[initial] = []
            invalid_mapping[initial].append(final_value)

print(invalid_mapping)

with open(DATA_DIR / 'invalid_initial_final_mappings.json', 'w') as f:
    json.dump(invalid_mapping, f, indent=2)

print("Mapping saved to invalid_initial_final_mappings.json")



{'EMPTY': ['i', 'ong'], 'b': ['i', 'e', 'er', 'ia', 'iang', 'iong', 'iou', 'ong', 'ou', 'ua', 'uai', 'uan', 'uang', 'uei', 'uen', 'ueng', 'uo', 'v', 'van', 've', 'vn'], 'p': ['i', 'e', 'er', 'ia', 'iang', 'iong', 'iou', 'ong', 'ua', 'uai', 'uan', 'uang', 'uei', 'uen', 'ueng', 'uo', 'v', 'van', 've', 'vn'], 'm': ['i', 'er', 'ia', 'iang', 'iong', 'ong', 'ua', 'uai', 'uan', 'uang', 'uei', 'uen', 'ueng', 'uo', 'v', 'van', 've', 'vn'], 'f': ['i', 'ai', 'ao', 'e', 'er', 'i', 'ia', 'ian', 'iang', 'iao', 'ie', 'in', 'ing', 'iong', 'iou', 'ong', 'ua', 'uai', 'uan', 'uang', 'uei', 'uen', 'ueng', 'uo', 'v', 'van', 've', 'vn'], 'd': ['i', 'en', 'er', 'iang', 'in', 'iong', 'o', 'ua', 'uai', 'uang', 'ueng', 'v', 'van', 've', 'vn'], 't': ['i', 'ei', 'en', 'er', 'ia', 'iang', 'in', 'iong', 'iou', 'o', 'ua', 'uai', 'uang', 'ueng', 'v', 'van', 've', 'vn'], 'n': ['i', 'an', 'er', 'ia', 'iong', 'o', 'ou', 'ua', 'uai', 'uang', 'uei', 'uen', 'ueng', 'van', 'vn'], 'l': ['i', 'en', 'er', 'iong', 'o', 'ua', 'u

In [None]:
special_combinations = {"SPECIAL_COMBINATIONS": [special for special in list(pinyin_combos['EMPTY']) if not pd.isna(special)]}

In [144]:
def womp(phrase):
  clean_phrase = re.sub(r'[^\w]', '', phrase)
  initial = lazy_pinyin(clean_phrase, style=Style.INITIALS, strict=True)
  final = lazy_pinyin(clean_phrase, style=Style.FINALS, strict=True)
  tone = [ word[-1] for word in lazy_pinyin(clean_phrase, style=Style.FINALS_TONE3, strict=False, v_to_u=True, neutral_tone_with_five=True, tone_sandhi=True)]
  
  initial = [init if init != '' else "EMPTY" for init in initial ]

  return list(zip(initial, final, tone))

In [145]:
womp("一样")

[('EMPTY', 'i', '1'), ('EMPTY', 'iang', '4')]

In [61]:
with open(DATA_DIR / 'invalid_initial_final_mappings.json', 'r') as file:
  data = json.load(file)


In [65]:
np.random.choice(data['b'])

've'

In [66]:
import pandas as pd

In [68]:
df = pd.read_csv(DATA_DIR / 'result.csv')

In [70]:
from sklearn.utils import resample

In [104]:
small = resample(df, n_samples=1000)

In [115]:
np.random.seed(RANDOM_SEED)
small.apply(lambda row: np.random.choice(data[row['initial']], ) if row['initial'] in data else None, axis=1)

2282        i
1856        v
7980     iong
1998       ei
5476       vn
         ... 
9865     ueng
10919     uai
6534      uei
13926      ua
14245    None
Length: 1000, dtype: object

Unnamed: 0,path,sentence,wav_path,initial,final,tone
0,common_voice_zh-CN_31167946.mp3,三,31167946.wav,s,an,1
1,common_voice_zh-CN_31196810.mp3,三,31196810.wav,s,an,1
2,common_voice_zh-CN_32705211.mp3,二,32705211.wav,EMPTY,er,4
3,common_voice_zh-CN_32705242.mp3,二,32705242.wav,EMPTY,er,4
4,common_voice_zh-CN_32706533.mp3,否,32706533.wav,f,ou,3
...,...,...,...,...,...,...
14926,common_voice_zh-CN_41882588.mp3,八,41882588.wav,b,a,1
14927,common_voice_zh-CN_41882589.mp3,六,41882589.wav,l,iu,4
14928,common_voice_zh-CN_41882590.mp3,一,41882590.wav,y,i,1
14929,common_voice_zh-CN_41882591.mp3,五,41882591.wav,w,u,3
