In [1]:
import os
import pandas as pd

In [2]:
# Resolve the project layout relative to where the notebook is running:
#   this_dir -> current working directory (the notebook's folder)
#   work_dir -> its parent directory
#   data_dir -> <work_dir>/data, where every generated file is written
this_dir = os.getcwd()
work_dir, _ = os.path.split(this_dir)
data_dir = os.path.join(work_dir, 'data')

# Create the output folder up front; a no-op when it already exists.
os.makedirs(data_dir, exist_ok=True)

In [3]:
##### setting area #####
#
# Only `consonant`, `vowel` and `sample_size` need to be edited. The numeric
# targets (`cog`, `fri_dur`, `f_vals`) are looked up from the tables below so
# that the label used in file names can never disagree with the values that
# are actually synthesised.

# Consonant to synthesise, written with its file-name code.
consonant = 'c'

# Mean centre of gravity (cog) and mean frication duration (fri_dur)
# for every consonant, keyed by the transcription used in file names.
CONSONANT_MEANS = {
    'ts': (4000, 96),    # tʂ
    'tc': (7000, 96),    # tɕ
    's': (4000, 174),    # ʂ
    'c': (7000, 174),    # ɕ
}

# Vowel to synthesise, written with its file-name code.
vowel = 'i'

# Mean formants [f3, f2, f1] for every vowel, keyed by the transcription
# used in file names.
VOWEL_FORMANTS = {
    'i': [3372, 2761, 437],    # i  (used in training)
    'L': [3053, 2365, 483],    # ɪ
    'e': [3047, 2530, 536],    # e  (used in training)
    'F': [2979, 2058, 731],    # ɛ
    'u': [2735, 1105, 459],    # u  (used in training)
    'W': [2827, 1225, 519],    # ʊ
    'o': [2828, 1035, 555],    # o  (used in training)
    'D': [2824, 1136, 781],    # ɔ
}

# Fail early with a readable message instead of a bare KeyError.
if consonant not in CONSONANT_MEANS:
    raise ValueError(
        f'unknown consonant code {consonant!r}; expected one of {sorted(CONSONANT_MEANS)}'
    )
if vowel not in VOWEL_FORMANTS:
    raise ValueError(
        f'unknown vowel code {vowel!r}; expected one of {sorted(VOWEL_FORMANTS)}'
    )

# mean values of the consonant
cog, fri_dur = CONSONANT_MEANS[consonant]

# formants
f_vals = list(VOWEL_FORMANTS[vowel]) # f3, f2, f1

# The synthesised word is vowel-consonant-vowel with the same vowel twice.
word = vowel + consonant + vowel

# no. of tokens for each word
sample_size = 8000

### Consonant synthesis


In [4]:
import numpy as np
from scipy.stats import truncnorm

In [5]:
# Means / standard deviations of the two features that distinguish the
# consonants: centre of gravity and frication duration.
c_f_means = np.array([cog, fri_dur])
c_f_stds = np.array([500, 13])

# Remaining consonant features share a standard deviation of 5 % of the mean.
con_means = np.array([200, 0.5, 1, 50, 60]) # sta_dev, skewness, kurtosis, bur_int, fri_int
con_stds = con_means * 0.05

# Final column order: cog, fri_dur, sta_dev, skewness, kurtosis, bur_int, fri_int
con_means = np.concatenate((c_f_means, con_means))
con_stds = np.concatenate((c_f_stds, con_stds))

# Seeded generator so a Restart & Run All reproduces the same tokens.
# The word's bytes are mixed into the seed so different words do not share
# the same noise; the middle 0 is the consonant stream index, keeping this
# generator distinct from any other one seeded the same way for this word.
con_rng = np.random.default_rng([2025, 0, *word.encode('utf-8')])

# truncnorm takes its bounds in standard-deviation units relative to loc,
# so "mean +/- 2 std" is simply (-2, 2) for every feature.
a, b = -2, 2

consonants = np.zeros((sample_size, len(con_means)))
for i, (mu, sigma) in enumerate(zip(con_means, con_stds)):
    dist = truncnorm(a, b, loc=mu, scale=sigma)
    consonants[:, i] = dist.rvs(size=sample_size, random_state=con_rng)

In [6]:
# total duration of fixed value 200 for consonants
con_dur = np.full((sample_size, 1), 200)

# zero values for all other features
zeros = np.full((sample_size, 9), 0)

# all concatenated
# Only the sampled feature columns are taken from `consonants`, so running
# this cell a second time rebuilds the same 17-column array instead of
# appending the duration and zero columns again.
consonants = np.hstack((consonants[:, :len(con_means)], con_dur, zeros))

# 7 sampled features + 1 duration + 9 zero-filled vowel features
assert consonants.shape == (sample_size, 17), consonants.shape

print(consonants[0])

[7.10909227e+03 1.63178269e+02 2.05436293e+02 5.14848799e-01
 9.08360303e-01 4.77657712e+01 6.48044470e+01 2.00000000e+02
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00]


### Vowel synthesis

In [7]:
# Mean of every sampled vowel feature; the formant means come from the
# settings cell. Each standard deviation is 5 % of its mean.
vow_means = np.array([80, 200, 170, 110 ,90]) # voc_int, f0, b3, b2, b1
vow_means = np.append(vow_means, f_vals) # *, f3, f2, f1
vow_stds = vow_means * 0.05

# Seeded generator so a Restart & Run All reproduces the same tokens.
# The word's bytes are mixed into the seed so different words do not share
# the same noise; the middle 1 is the vowel stream index, keeping this
# generator distinct from any other one seeded the same way for this word.
vow_rng = np.random.default_rng([2025, 1, *word.encode('utf-8')])

# truncnorm takes its bounds in standard-deviation units relative to loc,
# so "mean +/- 2 std" is simply (-2, 2) for every feature.
a, b = -2, 2

vowels = np.zeros((sample_size, len(vow_means)))
for i, (mu, sigma) in enumerate(zip(vow_means, vow_stds)):
    dist = truncnorm(a, b, loc=mu, scale=sigma)
    vowels[:, i] = dist.rvs(size=sample_size, random_state=vow_rng)

In [8]:
# vocalic duration and total duration of fixed value 400 for vowels
vow_dur = np.full((sample_size, 2), 400)

# zero values for all other features
zeros = np.full((sample_size, 7), 0)

# all concatenated
# Only the trailing sampled feature columns are taken from `vowels`, so
# running this cell a second time rebuilds the same 17-column array instead
# of prepending the zero and duration columns again.
vowels = np.hstack((zeros, vow_dur, vowels[:, -len(vow_means):]))

# 7 zero-filled consonant features + 2 durations + 8 sampled features
assert vowels.shape == (sample_size, 17), vowels.shape

print(vowels[0])

[   0.            0.            0.            0.            0.
    0.            0.          400.          400.           72.61917825
  212.39485422  161.94203674  109.72841264   87.74978366 3172.51458626
 2777.76843001  393.94701612]


### A CSV file as a guideline

In [9]:
# One row per feature, in the column order of the synthesised arrays:
# (element, explanation, mean, random)
guideline_rows = [
    ('cog', 'center of gravity', '4000, 7000', 'gaussian'),
    ('fri_dur', 'frication duration', '96, 174', 'gaussian'),
    ('sta_dev', 'standard deviation', '200', 'gaussian'),
    ('skewness', 'skewness', '0.5', 'gaussian'),
    ('kurtosis', 'kurtosis', '1', 'gaussian'),
    ('bur_int', 'burst intensity', '50', 'gaussian'),
    ('fri_int', 'frication intensity', '60', 'gaussian'),
    ('tot_dur', 'total duration', '200 for con, 400 for vow', 'fixed'),
    ('voc_dur', 'vocalic duration', '0 for con, 400 for vow', 'fixed'),
    ('voc_int', 'vocalic intensity', '0 for con, 80 for vow', 'gaussian'),
    ('f0', 'fundamental frequency', '200', 'gaussian'),
    ('b3', 'bandwidth of f3', '170', 'gaussian'),
    ('b2', 'bandwidth of f2', '110', 'gaussian'),
    ('b1', 'bandwidth of f1', '90', 'gaussian'),
    ('f3', 'f3', '', 'gaussian'),
    ('f2', 'f2', '', 'gaussian'),
    ('f1', 'f1', '', 'gaussian'),
]

# Column-wise views of the table, kept under their original names.
element, explanation, mean, random = (list(column) for column in zip(*guideline_rows))

# Attach the first synthesised consonant / vowel frame as worked examples,
# together with the labels they were generated for.
structure = pd.DataFrame(
    guideline_rows,
    columns=['element', 'explanation', 'mean', 'random'],
).assign(
    consonant_sample=consonants[0],
    consonant=consonant,
    vowel_sample=vowels[0],
    vowel=vowel,
)

# Write the guideline next to the generated data and show where it went.
file_name = os.path.join(data_dir, 'structure_sample.csv')
print(file_name)
structure.to_csv(file_name, index=True)

/Users/shuhaoz19/Desktop/Workspace/2025_Allophone/gradient/data/structure_sample.csv


### Save each token as a 3×17 .npy array

In [10]:
# Write one .npy file per token into <data_dir>/<word>/ and collect one
# metadata record per file.
metadata = []

subdata_dir = os.path.join(data_dir, word)
os.makedirs(subdata_dir, exist_ok=True)

for i in range(sample_size):
    # Token ids are 1-based and zero-padded, e.g. ici_0001
    uid = word + f'_{i+1:04d}'
    filename = f'{uid}.npy'
    save_path = os.path.join(subdata_dir, filename)

    # Three frames per token: vowel, consonant, and the same vowel again.
    vcv = np.vstack([vowels[i], consonants[i], vowels[i]])

    np.save(save_path, vcv)

    # The sampled cog / fri_dur are read straight from the consonant frame
    # (row 1, columns 0 and 1). They are deliberately NOT assigned to `cog`
    # and `fri_dur`: those names hold the configured means, and overwriting
    # them here would corrupt a later re-run of the consonant sampling cell.
    metadata.append({
        'uid': uid,
        'path': os.path.relpath(save_path, start=work_dir),
        'cog': vcv[1][0],
        'fri_dur': vcv[1][1],
        'word': word
    })

metaframe = pd.DataFrame(metadata)

csv_name = word + '_meta.csv'
csv_path = os.path.join(data_dir, csv_name)
metaframe.to_csv(csv_path, index=False)

In [11]:
# Round-trip check: reload the last file written by the save loop
# (`save_path` still holds the path from its final iteration).
savetest = np.load(save_path)

# Each file must hold three frames of 17 features, and because the word is
# built as vowel-consonant-vowel with the same vowel, the first and last
# frames must be identical.
assert savetest.shape == (3, 17), savetest.shape
assert np.array_equal(savetest[0], savetest[2]), 'first and last vowel frames differ'

print(savetest)

[[0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 4.00000000e+02
  4.00000000e+02 7.62651996e+01 1.96561572e+02 1.63102878e+02
  1.15253069e+02 8.59183158e+01 3.32123695e+03 2.83428375e+03
  4.17138881e+02]
 [6.53557829e+03 1.77942891e+02 1.92521676e+02 5.02965259e-01
  1.00111816e+00 4.57867281e+01 6.17065703e+01 2.00000000e+02
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 4.00000000e+02
  4.00000000e+02 7.62651996e+01 1.96561572e+02 1.63102878e+02
  1.15253069e+02 8.59183158e+01 3.32123695e+03 2.83428375e+03
  4.17138881e+02]]
