In [4]:
import os
import glob
import argparse
import json
import jieba



class CHINESE(object):
    def __init__(self,config):
        super(CHINESE, self).__init__()
        self.chinese_root = config.chinese_dataset_root
        self.store_path = config.chinese_save_root
        self.vocab_path = config.vocab_path
        
    
    def extract_words(self):
        word_list=[]
        self.train_dir = os.path.join(self.chinese_root,'TRAIN')
        WAV_files = sorted(glob.glob(self.train_dir+'/*.WAV'))
        i=0
        for audio_filepath in WAV_files:
            print(i)
            i+=1
            #print(audio_filepath)
            txt_file = audio_filepath[:-4]+'.txt'
            lines = open(txt_file, 'r',encoding='utf-8') 
            for line in lines:
                #print(line)
                words = jieba.lcut(line)
                for item in words:
                    if item not in word_list:
                        word_list.append(item)
 

        #print(word_list)
        #print(len(word_list))
        return word_list
    
    def create_vocab(self):
        word_list = self.extract_words()
        full_chars_list = []
        for word in word_list:
            full_chars_list.append(word)
        full_chars_list.append('<s>')
        full_chars_list.append('</s>')
        full_chars_list.append('_')
        vocab = sorted(list(set(full_chars_list)))
        fid_vocab = open(self.vocab_path,'w',encoding='utf-8')
        for item in vocab:
            fid_vocab.write(item+'\n')
        fid_vocab.close()
    
    def create_vocab_dict(self):
        vocab_list = [line.rstrip('\n') for line in open(self.vocab_path,,encoding='utf-8')]
        i=0
        vocab_dict={}
        for item in vocab_list:
            vocab_dict[item] = i
            i+=1
        return vocab_dict
    
        
    def create_char_mapping(self,word_file,vocab_dict):
        lines = open(word_file, 'r',encoding='utf-8')
        for line in lines:
            words = jieba.lcut(line)
            char_mapped=[]
            chars_list=[]
            chars_list.append('<s>')
            char_mapped.append(vocab_dict['<s>'])
            for item in words:
                word =item
                char = item
                try:
                    phns = vocab_dict[char]
                except:
                    continue
                char_mapped.append(phns)
                chars_list.append(char)
                char_mapped.append(vocab_dict['_'])
                chars_list.append('_')
            char_mapped.append(vocab_dict['</s>'])
            chars_list.append('</s>')
        #print(chars_list)
        return char_mapped,chars_list
    
    
            
    def process_data_train(self):
        if not os.path.exists(self.store_path):
            os.makedirs(self.store_path)
        self.train_store_path = os.path.join(self.store_path,'TRAIN')
        if not os.path.exists(self.train_store_path):
            os.makedirs(self.train_store_path)
        vocab_dict = self.create_vocab_dict()
        self.train_dir = os.path.join(self.chinese_root,'TRAIN')
        WAV_files = sorted(glob.glob(self.train_dir+'/*.WAV'))
        i = 1
        for audio_filepath in WAV_files:
            print(i)
            i += 1
            txt_file = audio_filepath[:-4]+'.txt'
            char_mapped,chars_list = self.create_char_mapping(txt_file,vocab_dict)
            json_write_filepath =self.train_store_path+'/'+audio_filepath.split('\\')[-1][:-4]+'.json'
            data_frame = {}
            data_frame['audio_filepath'] = audio_filepath.replace('\\','/').replace('..','.')
            data_frame['char_map_seq'] = ' '.join([str(char_item) for char_item in char_mapped])
            data_frame['chars'] = ' '.join([str(char_item) for char_item in chars_list])
            #print(chars_list)
            data_frame['char_seq_len']=len(char_mapped)
            with open(json_write_filepath, 'w',encoding='utf-8') as fid:
                json.dump(data_frame, fid,ensure_ascii=False,indent=4)
                        
        
    def process_data_test(self):
        if not os.path.exists(self.store_path):
            os.makedirs(self.store_path)
        self.test_store_path = os.path.join(self.store_path,'TEST')
        if not os.path.exists(self.test_store_path):
            os.makedirs(self.test_store_path)
        vocab_dict = self.create_vocab_dict()
        self.test_dir = os.path.join(self.chinese_root,'TEST')
        WAV_files = sorted(glob.glob(self.test_dir+'/*.WAV'))
        for audio_filepath in WAV_files:
            txt_file = audio_filepath[:-4]+'.txt'
            char_mapped,chars_list = self.create_char_mapping(txt_file,vocab_dict)
            json_write_filepath =self.test_store_path+'/'+audio_filepath.split('\\')[-1][:-4]+'.json'
            data_frame = {}
            data_frame['audio_filepath'] = audio_filepath.replace('\\','/').replace('..','.')
            data_frame['char_map_seq'] = ' '.join([str(char_item) for char_item in char_mapped])
            data_frame['chars'] = ' '.join([str(char_item) for char_item in chars_list])
            #print(chars_list)
            data_frame['char_seq_len']=len(char_mapped)
            with open(json_write_filepath, 'w',encoding='utf-8') as fid:
                json.dump(data_frame, fid,ensure_ascii=False,indent=4)
        


parser = argparse.ArgumentParser("Configuration for data preparation")
parser.add_argument("--chinese_dataset_root", default="../CHINESE", type=str,help='Dataset path')
parser.add_argument("--chinese_save_root", default="../CHINESE/processed_data", type=str,help='Save directory after processing')
parser.add_argument("--vocab_path",default='../data_utils/vocab.txt',type=str, help='Filepath to write vocabulary')

print('start')
config = parser.parse_known_args()[0]#parser.parse_args()
chinese = CHINESE(config)

chinese.create_vocab() #创建语料表
chinese.process_data_train()
chinese.process_data_test()


SyntaxError: invalid syntax (<ipython-input-4-e632a2196fc9>, line 55)

In [6]:
import glob

train_files = glob.glob("../CHINESE/processed_data/TRAIN"+'/*.json')
test_files = glob.glob("../CHINESE/processed_data/TEST"+'/*.json')

write_train_list = open("../data_utils/training.txt",'w',encoding='utf-8')
for train_file in train_files:
    train_file = './'+'/'.join(train_file.split('/')[1:])
    write_train_list.write(train_file+'\n')
write_train_list.close()

write_test_list = open("../data_utils/testing.txt",'w',encoding='utf-8')
for test_file in test_files:
    test_file = './'+'/'.join(test_file.split('/')[1:])
    write_test_list.write(test_file+'\n')
write_test_list.close()