In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
# Read the Japanese phoneme dictionary file: one entry per line.
def load_phone_dict(path):
  """Return the lines of the text file at ``path`` as a list, in file order.

  Leading and trailing whitespace (including the newline) is stripped from
  every line; blank lines are kept as empty strings.
  """
  with open(path, 'r') as handle:
    return [entry.strip() for entry in handle]

In [3]:
def phone_to_one_hot(phone, phone_dict):
  """Encode ``phone`` as a one-hot list over ``phone_dict``.

  Parameters:
    phone: phoneme symbol to encode. The sentinel string 'None' means "no
      phoneme" (e.g. the context before the first / after the last phoneme).
      ``None`` and NaN are treated the same way, because a missing context
      can come back as NaN when a dataframe is reloaded from CSV.
    phone_dict: list of known phoneme symbols; the position of a symbol in
      this list is its index in the returned vector.

  Returns:
    A list of ``len(phone_dict)`` floats: all 0.0 for a missing phoneme,
    otherwise 1.0 at the position of ``phone`` and 0.0 elsewhere.

  Raises:
    ValueError: if ``phone`` is a real symbol that is not in ``phone_dict``.
  """
  one_hot = [0.0] * len(phone_dict)
  # np.float64 subclasses float, so this also catches NaN taken from a dataframe.
  is_missing = phone is None or (isinstance(phone, float) and np.isnan(phone))
  if is_missing or phone == 'None':
    return one_hot
  one_hot[phone_dict.index(phone)] = 1.0
  return one_hot

In [4]:
def time_to_frame(time, window_size, step_size):
  """Map a timestamp to a frame number.

  Computes ``(time - 2 * window_size) // step_size`` and returns it when it is
  non-negative; otherwise returns 0. All three arguments must be expressed in
  the same time unit.
  """
  frame = (time - 2 * window_size) // step_size
  return frame if frame >= 0 else 0

In [5]:
#### Convert the /mono/*.lab files to csv.
#### A folder /mono/csv/ is created (if it does not exist yet) and holds one csv file per .lab
#### file with the columns beg, end, current, before, next.
#### The format of one example is shown below.

# NOTE(review): absolute, machine-specific path -- point this at your own copy of the data.
label_path = '/Users/ShiyuMu/Desktop/HTS-demo_NIT-SONG070-F001/data/labels/mono/'
# sorted() because os.listdir() returns the entries in arbitrary order.
fileList = sorted(label_path + f for f in os.listdir(label_path) if f.endswith('.lab'))

# Create the output folder only when it is missing, so that a real failure
# (wrong path, no permission) raises instead of being silently swallowed.
csv_dir = label_path + 'csv/'
if not os.path.isdir(csv_dir):
  os.makedirs(csv_dir)
  print('csv dictionary created')

for file in fileList:
  df = pd.read_csv(file, sep=" ", header=None)
  df.columns = ["beg", "end", "current"]
  # before/next are the neighbouring phonemes; the string 'None' marks "no neighbour"
  # at the first and last row. shift() replaces Series.append, which was removed in pandas 2.0.
  df['before'] = df['current'].shift(1, fill_value='None')
  df['next'] = df['current'].shift(-1, fill_value='None')
  df.to_csv(csv_dir + os.path.basename(file) + '.csv', sep='\t', index=False)

### example of loading csv files 

In [6]:
## this is the folder that the conversion cell created
csv_path = label_path + 'csv/'
# get all csv files; sorted() keeps the order (and therefore df_list[i]) stable between runs,
# because os.listdir() returns the entries in arbitrary order
csvList = sorted(csv_path + f for f in os.listdir(csv_path) if f.endswith('.csv'))
df_list = list()
## add all csv files as dataframes to a list, in the same order as csvList
for file in csvList:
  # Only empty fields count as missing. With pandas' default NA list the literal string
  # 'None' (the "no neighbour" marker written into before/next) can be parsed as NaN.
  df = pd.read_csv(file, sep='\t', keep_default_na=False, na_values=[''])
  df_list.append(df)

In [7]:
## df_list contains one dataframe per label file (31 for this dataset)
## one dataframe will look like this. beg and end are the raw timestamps read from the .lab file
## NOTE(review): the original comment called them nanoseconds; HTS label files normally use 100 ns units -- confirm
df_list[0].head()

Unnamed: 0,beg,end,current,before,next
0,0,23800000,pau,,n
1,23800000,24400000,n,pau,e
2,24400000,30450000,e,n,N
3,30450000,35800000,N,e,n
4,35800000,36200000,n,N,e


### load phoneme dictionary

In [8]:
# Relative path: the file is looked up in the notebook's current working directory.
phone_dict_path = 'JPN_phone_dict.txt'
# phone_dict is the list of stripped lines of that file, in file order.
phone_dict = load_phone_dict(phone_dict_path)

### convert timestamps to frame numbers

In [9]:
#### define window size and step size to convert timestamp to frame id
#### (both must be in the same time unit as the beg/end columns)
window_size = 250000
step_size = 100000

# NOTE: this overwrites beg/end of every dataframe in df_list in place, so run this cell
# exactly once per load -- a second run would convert the frame numbers again.
for df in df_list:
  for column in ('beg', 'end'):
    # Assign a new column instead of writing through df[column].values[i]: that only
    # works while .values is a writable view, which copy-on-write pandas does not guarantee.
    df[column] = df[column].map(lambda t: time_to_frame(t, window_size, step_size))

In [57]:
## df_list still contains 31 dataframes; beg and end now hold frame numbers instead of timestamps
## one of them would be like this
df_list[0].head()

Unnamed: 0,beg,end,current,before,next
0,0,233,pau,,n
1,233,239,n,pau,e
2,239,299,e,n,N
3,299,353,N,e,n
4,353,357,n,N,e


### Unfold all frames and list them one by one

In [75]:
# NOTE: corase_encoding() is defined in a later cell of this notebook; that cell has to be
# executed before this one (after a kernel restart, run it first).
#
# Every phoneme contributes one row per frame from beg to end inclusive. A boundary frame
# shared by two phonemes (end of one == beg of the next) therefore appears twice, and a
# phoneme with beg == end contributes two rows.
unfolded_df_list = list()
for dataset in df_list:
  unfold_list = list()
  last_index = dataset.shape[0] - 1
  # Iterate the columns in lockstep instead of calling dataset.iloc[index][col] per cell.
  rows = zip(dataset['beg'], dataset['end'], dataset['before'], dataset['current'], dataset['next'])
  for index, (beg_, end_, before_, current_, next_) in enumerate(rows):
    # The first phoneme has no predecessor and the last one no successor. Use the 'None'
    # marker for ALL of their frames, not only the very first / very last frame.
    if index == 0:
      before_ = 'None'
    if index == last_index:
      next_ = 'None'
    for frame in [beg_, *range(beg_ + 1, end_), end_]:
      unfold_list.append([beg_, end_, before_, current_, next_, corase_encoding(frame, beg_, end_)])
  unfold_df = pd.DataFrame(unfold_list, columns=['beg', 'end', 'before', 'current', 'next', 'coarse'])
  unfolded_df_list.append(unfold_df)

In [66]:
def corase_encoding(current_index, beg, end):
  """Coarse position code of a frame inside a phoneme.

  Parameters:
    current_index: absolute frame index to encode.
    beg: absolute index of the first frame of the phoneme.
    end: absolute index of the last frame of the phoneme.

  Returns:
    A list ``[p_beg, p_middle, p_end]``. A frame that coincides with the
    beginning, middle or end anchor (checked in that order) gets the matching
    one-hot list. Any other frame gets weights proportional to the inverse
    distance to each anchor, normalised so that they sum to 1.
  """
  # The middle anchor must be an absolute frame index like beg and end; the bare offset
  # (end - beg) / 2 is only correct for a phoneme that starts at frame 0.
  middle = beg + int((end - beg) / 2)
  if current_index == beg:
    return [1, 0, 0]
  elif current_index == middle:
    return [0, 1, 0]
  elif current_index == end:
    return [0, 0, 1]
  # current_index differs from all three anchors here, so no distance is zero.
  weights = [1 / np.abs(current_index - anchor) for anchor in (beg, middle, end)]
  total = sum(weights)
  return [weight / total for weight in weights]

In [89]:
## see how corase_encoding works.
## Input: current position, beg of this phoneme, end of this phoneme
## Output: three weights for the [beginning, middle, end] of the current phoneme, describing where the current frame lies
print(corase_encoding(110, 0, 233))
## The unfolding cell calls this function for every frame, so you do not need to call it yourself.

[0.04943729903536978, 0.9063504823151126, 0.044212218649517694]


In [87]:
## now unfolded_df_list is a list that contains 31 dataframes (one per label file),
## one example would be like this; every row describes one frame of one phoneme
## NOTE(review): the index at the front is the row number. It equals the frame id only as long as
## no frame was emitted twice (the unfolding cell emits both the beg and the end frame of every phoneme) -- confirm
unfolded_df_list[0]

Unnamed: 0,beg,end,before,current,next,coarse
0,0,233,,pau,n,"[1, 0, 0]"
1,0,233,,pau,n,"[0.9871609871609871, 0.008584008584008583, 0.0..."
2,0,233,,pau,n,"[0.9744671403197158, 0.017095914742451153, 0.0..."
3,0,233,,pau,n,"[0.961915688959621, 0.025537584662644806, 0.01..."
4,0,233,,pau,n,"[0.949503924181845, 0.03391085443506589, 0.016..."
5,0,233,,pau,n,"[0.9372291967559161, 0.04221753138540162, 0.02..."
6,0,233,,pau,n,"[0.9250889152341434, 0.05045939537640782, 0.02..."
7,0,233,,pau,n,"[0.9130805441269135, 0.05863820008154491, 0.02..."
8,0,233,,pau,n,"[0.9012016021361816, 0.06675567423230976, 0.03..."
9,0,233,,pau,n,"[0.8894496604445764, 0.07481352284113259, 0.03..."


### save the unfolded dataframe to disk

In [90]:
### a folder under /mono/csv/ called /unfolded/ will be created which holds all dataframes like above
unfolded_dir = label_path + 'csv/unfolded/'
# Check for the folder explicitly instead of catching every exception, so that a real
# failure (wrong path, no permission) is not reported as "folder already exist".
if os.path.isdir(unfolded_dir):
  print('folder already exist')
else:
  os.makedirs(unfolded_dir)
  print('unfolded folder created')

# unfolded_df_list[i] was built from csvList[i]; refuse to save if the two lists are out of sync.
if len(csvList) != len(unfolded_df_list):
  raise ValueError('csvList and unfolded_df_list have different lengths; re-run the loading and unfolding cells')
for csv_file, unfolded_df in zip(csvList, unfolded_df_list):
  # NOTE: the 'coarse' column holds Python lists; to_csv writes them as their string form.
  unfolded_df.to_csv(unfolded_dir + os.path.basename(csv_file), sep='\t', index=False)

folder already exist


### After you created all the files, you can load them back to memory easily


## NOTICE, if you already have all the files saved, the code below is the only thing you need to load the file and start to work with your training. The code before is only for generating. 

In [91]:
## this is the folder that the saving cell created
unfolded_path = label_path + 'csv/unfolded/'
# get all csv files; sorted() keeps fileNameList[i] / unfolded_df_list_[i] stable between runs,
# because os.listdir() returns the entries in arbitrary order
fileNameList = sorted(unfolded_path + f for f in os.listdir(unfolded_path) if f.endswith('.csv'))
unfolded_df_list_ = list()
## add all csv files as dataframes to a list, in the same order as fileNameList
for file in fileNameList:
  # Only empty fields count as missing, so the literal string 'None' used as the
  # "no neighbour" marker in before/next is kept as a string instead of becoming NaN.
  # NOTE: the 'coarse' column is read back as text (e.g. "[1, 0, 0]"), not as a list.
  df = pd.read_csv(file, sep='\t', keep_default_na=False, na_values=[''])
  unfolded_df_list_.append(df)

In [92]:
# first rows of the first dataframe that was loaded back from disk
unfolded_df_list_[0].head()

Unnamed: 0,beg,end,before,current,next,coarse
0,0,233,,pau,n,"[1, 0, 0]"
1,0,233,,pau,n,"[0.9871609871609871, 0.008584008584008583, 0.0..."
2,0,233,,pau,n,"[0.9744671403197158, 0.017095914742451153, 0.0..."
3,0,233,,pau,n,"[0.961915688959621, 0.025537584662644806, 0.01..."
4,0,233,,pau,n,"[0.949503924181845, 0.03391085443506589, 0.016..."


### of course you may also want the filename of a specific dataframe, so that you can match it with the raw files

In [93]:
# fileNameList and unfolded_df_list_ share the same index: fileNameList[i] is the file that
# unfolded_df_list_[i] was loaded from. The split keeps only the part after the last '/'.
fileNameList[0].split('/')[-1]

'nitech_jp_song070_f001_010.lab.csv'

### to get a one-hot vector, call phone_to_one_hot() on a specific phoneme

In [94]:
# encode the phoneme 'sil' over phone_dict and print the label next to its vector
one_hot = phone_to_one_hot('sil', phone_dict)
print('sil', one_hot)

sil [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


### to access one specific position

In [95]:
### the first dataframe in the list, the row at position 0, and its 'current' label
unfolded_df_list_[0].iloc[0]['current']

'pau'