In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import IPython.display as ipd

import os
import json
import math
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader
from tqdm import tqdm

import commons
import utils
from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence

from scipy.io.wavfile import write

from pydub import AudioSegment

def num2timestr( num ):
    return str(int(int(num)/3600))+':'+str(int(int(num)/60))+':'+str(int(int(num)%60))+'.'+str(int(int(num*100)%100))
def get_text(text, hps):
    text_norm = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        text_norm = commons.intersperse(text_norm, 0)
    text_norm = torch.LongTensor(text_norm)
    return text_norm

def single(words, index):
    print(words[index][1])
    audios = []
    
#训练员单人模型
    hps = utils.get_hparams_from_file(trainerConfigPath)
    net_g_2 = SynthesizerTrn(
        len(symbols),
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        **hps.model).cuda()
    _ = net_g_2.eval()

    _ = utils.load_checkpoint(trainerPthPath, net_g_2, None)

    #马娘多人模型
    hps_ms = utils.get_hparams_from_file(configPath)
    net_g = SynthesizerTrn(
        len(hps_ms.symbols),
        hps_ms.data.filter_length // 2 + 1,
        hps_ms.train.segment_size // hps_ms.data.hop_length,
        n_speakers=hps_ms.data.n_speakers,
        **hps_ms.model).cuda()
    _ = net_g.eval()
    
    _ = utils.load_checkpoint(pthPath, net_g, None)


    print("ID: "+str(index) +" "+ words[index][0]+"：   "+words[index][1])
    for i in range(10):
        if words[index][0]=="T":
            stn_tst = get_text(words[index][1], hps)
            with torch.no_grad():
                x_tst = stn_tst.cuda().unsqueeze(0)
                x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()
                audio = net_g_2.infer(x_tst, x_tst_lengths, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][0,0].data.cpu().float().numpy()
        else:
            stn_tst = get_text(words[index][1], hps_ms)
            with torch.no_grad():
                x_tst = stn_tst.cuda().unsqueeze(0)
                x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()
                sid = torch.LongTensor([umaDict[words[index][0]]]).cuda()
                audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][0,0].data.cpu().float().numpy()
                audios.append(audio)
                print("count: "+str(i))
                ipd.display(ipd.Audio(audio, rate=hps_ms.data.sampling_rate, normalize=False))
    return audios
            



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
umaDict = {'特别周': 0, '无声铃鹿': 1, '东海帝皇': 2, '丸善斯基': 3, '富士奇迹': 4,
           '小栗帽': 5, '黄金船': 6, '伏特加': 7, '大和赤骥': 8, '大树快车': 9, '草上飞': 10,
           '菱亚马逊': 11, '目白麦昆': 12, '神鹰': 13, '好歌剧': 14, '成田白仁': 15, '鲁道夫象征': 16, 
           '气槽': 17, '爱丽数码': 18, '星云天空': 19, '玉藻十字': 20, '美妙姿势': 21, '琵琶晨光': 22, '摩耶重炮': 23, 
           '曼城茶座': 24, '美浦波旁': 25, '目白赖恩': 26, '菱曙': 27, '雪中美人': 28, '米浴': 29, '艾尼斯风神': 30,
           '爱丽速子': 31, '爱慕织姬': 32, '稻荷一': 33, '胜利奖券': 34, '空中神宫': 35, '荣进闪耀': 36, '真机伶': 37,
           '川上公主': 38, '黄金城': 39, '樱花进王': 40, '采珠': 41, '新光风': 42, '东商变革': 43, '超级小海湾': 44,
           '醒目飞鹰': 45, '荒漠英雄': 46, '东瀛佐敦': 47, '中山庆典': 48, '成田大进': 49, '西野花': 50, '春乌拉拉': 51, 
           '青竹回忆': 52, '微光飞驹': 53, '美丽周日': 54, '待兼福来': 55, 'Mr CB': 56, '名将怒涛': 57, '目白多伯': 58,
           '优秀素质': 59, '帝王光辉': 60, '待兼诗歌剧': 61, '生野狄杜斯': 62, '目白善信': 63, '大拓太阳神': 64, 
           '双涡轮': 65, '里见光钻': 66, '北部玄驹': 67, '樱花千代王': 68, '天狼星象征': 69, '目白阿尔丹': 70,
           '八重无敌': 71, '鹤丸刚志': 72, '目白光明': 73, '成田路': 74, '也文摄辉': 75, '小林力奇': 76,
           '北港火山': 77, '奇瑞骏': 78, '苦涩糖霜': 79, '小小蚕茧': 80, '骏川手纲': 81, '秋川弥生': 82,
           '乙名史悦子': 83, '桐生院葵': 84, '安心泽刺刺美': 85, '樫本理子': 86}
configPath = "./logs/uma.json"
trainerConfigPath="./logs/single.json"
pthPath = "./logs/uma900.pth"
trainerPthPath="./logs/kkr.pth"
#读取角色
#角色名


In [3]:
# 说话人|句子的结构 一行一行地读。
# 根据每句话之间的\n的数量来控制空白语音的数量 每一个\n加500ms
# 把CPU推理 GPU推理整合成一个函数 直接调用 算了吧 有点复杂
#每一句话将由三部分组成： 说话人,句子,句尾留白长度(500ms*n) 完成
words__ = open("words.txt",'r',encoding="UTF-8")
words_ = words__.readlines()
words = []
for i in range(len(words_)):
    if words_[i].find("|") != -1:
        words.append(words_[i].split('|'))
        words[-1].append(0)
    else:
        if words_[i] == '\n':
            words[-1][2]+=1
            continue
        words[-1][1]=words[-1][1]+words_[i]
for word in words:
    word[1].replace('','')
    print(word)

['优秀素质', '「チョコが一番ですわ」。\n', 0]
['优秀素质', 'わふると言えばコレですわ。\n', 0]
['优秀素质', '種類いっぱいありますけども。\n', 0]
['优秀素质', 'これだけあれば。\n', 0]
['优秀素质', '勝ちですわ。\n', 2]
['优秀素质', '「待望の新しょうひん」。\n', 0]
['优秀素质', '今回はラスクですわ。\n', 0]
['优秀素质', 'ランチパックの耳なんで何をどうしても美味いに決まってますわ。\n', 0]
['优秀素质', '美味すぎて手が止まりませんわ。\n', 0]
['优秀素质', 'パクパクですわ。\n', 2]
['优秀素质', '「毎夜コレですわ」。\n', 0]
['优秀素质', 'そのままつまんで食べられるこのしょうひん。\n', 0]
['优秀素质', 'ほんのり塩味がいい感じ！\n', 0]
['优秀素质', 'これ食べて酒呑んで永久コンボですわ。\n', 0]
['优秀素质', '永久機関の完成ですわ。', 0]


In [69]:
#批量生成 GPU推理
txt_str=[]

txt_str.append("[V4+ Styles]")
txt_str.append("Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour,OutlineColour,BackColour,Bold,Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment,MarginL, MarginR, MarginV, Encoding")
txt_str.append("Style: Default,MS Gothic,20,&H00FFFFFF,&H000000FF,&H00000000,&H00000000,0,0,0,0,100,100,0,0,1,2,2,2,10,10,10,1\n")

txt_str.append("")
txt_str.append("[Events]")
txt_str.append("Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text")
total_duration=0
output_sound=[]
#上面是新加的

#训练员单人模型
hps = utils.get_hparams_from_file(trainerConfigPath)
net_g_2 = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model).cuda()
_ = net_g_2.eval()

_ = utils.load_checkpoint(trainerPthPath, net_g_2, None)

#马娘多人模型
hps_ms = utils.get_hparams_from_file(configPath)
net_g = SynthesizerTrn(
    len(hps_ms.symbols),
    hps_ms.data.filter_length // 2 + 1,
    hps_ms.train.segment_size // hps_ms.data.hop_length,
    n_speakers=hps_ms.data.n_speakers,
    **hps_ms.model).cuda()
_ = net_g.eval()

_ = utils.load_checkpoint(pthPath, net_g, None)
#为了方便计数 从0开始
cnt=0
blank = AudioSegment.from_file("blank.wav")
blankDuration = blank.duration_seconds
for word in words:
    if word[0]=="T":
        stn_tst = get_text(word[1], hps)
        with torch.no_grad():
            x_tst = stn_tst.cuda().unsqueeze(0)
            x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()
            audio = net_g_2.infer(x_tst, x_tst_lengths, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][0,0].data.cpu().float().numpy()
    else:
        stn_tst = get_text(word[1], hps_ms)
        with torch.no_grad():
            x_tst = stn_tst.cuda().unsqueeze(0)
            x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()
            sid = torch.LongTensor([umaDict[word[0]]]).cuda()
            audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][0,0].data.cpu().float().numpy()
    print("ID: "+str(cnt) +" "+ word[0]+"：   "+word[1])
    ipd.display(ipd.Audio(audio, rate=hps_ms.data.sampling_rate, normalize=False))
    #后面是新加的
    write("./output/output"+str(cnt)+".wav",hps_ms.data.sampling_rate,audio)
    sound = AudioSegment.from_file("./output/output"+str(cnt)+".wav")
    duration = sound.duration_seconds  # 音频时长（ms）
    print(total_duration)
    print(duration)
    if output_sound==[]:
        output_sound=sound
    else:
        output_sound = output_sound + sound
    print(word[2]+1)
    for i in range(word[2]):
        output_sound = output_sound + blank
    txt_str.append("Dialogue: 0,"+num2timestr(total_duration)+','+(num2timestr(total_duration+duration))+','+'Default'+',,0,0,0,,'+str(word[1]).replace("\n","\\N"))
    total_duration=total_duration+blankDuration*word[2]+duration
    cnt+=1
with open("./output/output.ass", "w", encoding="UTF-8") as f:
    f.write("\n".join(txt_str))
output_sound.export("./output/output.wav", format="wav")  # 保存文件
temp = open("./output/output.wav")
temp.close()
print("COMPLETE!!!!!")

INFO:root:Loaded checkpoint './logs/kkr.pth' (iteration 856)
INFO:root:Loaded checkpoint './logs/uma900.pth' (iteration 414)
ID: 0 优秀素质：   「チョコが一番ですわ」。



DEBUG:pydub.converter:subprocess.call(['ffmpeg', '-y', '-i', './output/output0.wav', '-acodec', 'pcm_s32le', '-vn', '-f', 'wav', '-'])
0
1.4628571428571429
1
ID: 1 优秀素质：   わふると言えばコレですわ。



DEBUG:pydub.converter:subprocess.call(['ffmpeg', '-y', '-i', './output/output1.wav', '-acodec', 'pcm_s32le', '-vn', '-f', 'wav', '-'])
1.4628571428571429
2.020136054421769
1
ID: 2 优秀素质：   種類いっぱいありますけども。



DEBUG:pydub.converter:subprocess.call(['ffmpeg', '-y', '-i', './output/output2.wav', '-acodec', 'pcm_s32le', '-vn', '-f', 'wav', '-'])
3.482993197278912
1.7647165532879818
1
ID: 3 优秀素质：   これだけあれば。



DEBUG:pydub.converter:subprocess.call(['ffmpeg', '-y', '-i', './output/output3.wav', '-acodec', 'pcm_s32le', '-vn', '-f', 'wav', '-'])
5.2477097505668935
1.207437641723356
1
ID: 4 优秀素质：   勝ちですわ。



DEBUG:pydub.converter:subprocess.call(['ffmpeg', '-y', '-i', './output/output4.wav', '-acodec', 'pcm_s32le', '-vn', '-f', 'wav', '-'])
6.455147392290249
0.7430385487528345
3
ID: 5 优秀素质：   「待望の新しょうひん」。



DEBUG:pydub.converter:subprocess.call(['ffmpeg', '-y', '-i', './output/output5.wav', '-acodec', 'pcm_s32le', '-vn', '-f', 'wav', '-'])
8.198185941043084
1.486077097505669
1
ID: 6 优秀素质：   今回はラスクですわ。



DEBUG:pydub.converter:subprocess.call(['ffmpeg', '-y', '-i', './output/output6.wav', '-acodec', 'pcm_s32le', '-vn', '-f', 'wav', '-'])
9.684263038548753
1.3119274376417234
1
ID: 7 优秀素质：   ランチパックの耳なんで何をどうしても美味いに決まってますわ。



DEBUG:pydub.converter:subprocess.call(['ffmpeg', '-y', '-i', './output/output7.wav', '-acodec', 'pcm_s32le', '-vn', '-f', 'wav', '-'])
10.996190476190476
4.051882086167801
1
ID: 8 优秀素质：   美味すぎて手が止まりませんわ。



DEBUG:pydub.converter:subprocess.call(['ffmpeg', '-y', '-i', './output/output8.wav', '-acodec', 'pcm_s32le', '-vn', '-f', 'wav', '-'])
15.048072562358277
2.020136054421769
1
ID: 9 优秀素质：   パクパクですわ。



DEBUG:pydub.converter:subprocess.call(['ffmpeg', '-y', '-i', './output/output9.wav', '-acodec', 'pcm_s32le', '-vn', '-f', 'wav', '-'])
17.068208616780044
1.1609977324263039
3
ID: 10 优秀素质：   「毎夜コレですわ」。



DEBUG:pydub.converter:subprocess.call(['ffmpeg', '-y', '-i', './output/output10.wav', '-acodec', 'pcm_s32le', '-vn', '-f', 'wav', '-'])
19.229206349206347
1.253877551020408
1
ID: 11 优秀素质：   そのままつまんで食べられるこのしょうひん。



DEBUG:pydub.converter:subprocess.call(['ffmpeg', '-y', '-i', './output/output11.wav', '-acodec', 'pcm_s32le', '-vn', '-f', 'wav', '-'])
20.483083900226756
2.786394557823129
1
ID: 12 优秀素质：   ほんのり塩味がいい感じ！



DEBUG:pydub.converter:subprocess.call(['ffmpeg', '-y', '-i', './output/output12.wav', '-acodec', 'pcm_s32le', '-vn', '-f', 'wav', '-'])
23.269478458049885
2.5425850340136056
1
ID: 13 优秀素质：   これ食べて酒呑んで永久コンボですわ。



DEBUG:pydub.converter:subprocess.call(['ffmpeg', '-y', '-i', './output/output13.wav', '-acodec', 'pcm_s32le', '-vn', '-f', 'wav', '-'])
25.81206349206349
2.9373242630385485
1
ID: 14 优秀素质：   永久機関の完成ですわ。


DEBUG:pydub.converter:subprocess.call(['ffmpeg', '-y', '-i', './output/output14.wav', '-acodec', 'pcm_s32le', '-vn', '-f', 'wav', '-'])
28.74938775510204
2.182675736961451
1
COMPLETE!!!!!


In [4]:
#对不满意的语句重新生成
#一次生成10句 选好的在下个单元格保存

    
wordID = 12
audios = single(words,wordID)


ほんのり塩味がいい感じ！

INFO:root:Loaded checkpoint './logs/kkr.pth' (iteration 856)
INFO:root:Loaded checkpoint './logs/uma900.pth' (iteration 414)
ID: 12 优秀素质：   ほんのり塩味がいい感じ！

count: 0


count: 1


count: 2


count: 3


count: 4


count: 5


count: 6


count: 7


count: 8


count: 9


In [73]:
#选择上面想要的保存
print(wordID)
wantedAudio =6
write("./output/output"+str(wordID)+".wav",hps_ms.data.sampling_rate,audios[wantedAudio])
print("Saved!")

12
Saved!


In [74]:
#重整合
txt_str=[]
txt_str.append("[V4+ Styles]")
txt_str.append("Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour,OutlineColour,BackColour,Bold,Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment,MarginL, MarginR, MarginV, Encoding")
txt_str.append("Style: Default,MS Gothic,20,&H00FFFFFF,&H000000FF,&H00000000,&H00000000,0,0,0,0,100,100,0,0,1,2,2,2,10,10,10,1\n")
txt_str.append("")
txt_str.append("[Events]")
txt_str.append("Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text")
total_duration=0
output_sound=[]

blank = AudioSegment.from_file("blank.wav")
blankDuration = blank.duration_seconds

for i in range(len(words)):
    sound = AudioSegment.from_file("./output/output"+str(i)+".wav")
    duration = sound.duration_seconds  # 音频时长（ms）
    #print(total_duration)
    #print(duration)
    print("ID: "+str(i) +" "+ words[i][0]+"：   "+words[i][1])
    if output_sound==[]:
        output_sound=sound
    else:
        output_sound = output_sound + sound
    for j in range(words[i][2]):
        output_sound = output_sound+blank
    txt_str.append("Dialogue: 0,"+num2timestr(total_duration)+','+(num2timestr(total_duration+duration))+','+'Default'+',,0,0,0,,'+str(words[i][1]).replace("\n","\\N"))
    total_duration=total_duration+blankDuration*words[i][2]+duration
with open("./output/output.ass", "w", encoding="utf8") as f:
    f.write("\n".join(txt_str))
output_sound.export("./output/output.wav", format="wav")  # 保存文件
temp = open("./output/output.wav")
temp.close()
print("DONE!")

DEBUG:pydub.converter:subprocess.call(['ffmpeg', '-y', '-i', './output/output0.wav', '-acodec', 'pcm_s32le', '-vn', '-f', 'wav', '-'])
ID: 0 优秀素质：   「チョコが一番ですわ」。

DEBUG:pydub.converter:subprocess.call(['ffmpeg', '-y', '-i', './output/output1.wav', '-acodec', 'pcm_s32le', '-vn', '-f', 'wav', '-'])
ID: 1 优秀素质：   わふると言えばコレですわ。

DEBUG:pydub.converter:subprocess.call(['ffmpeg', '-y', '-i', './output/output2.wav', '-acodec', 'pcm_s32le', '-vn', '-f', 'wav', '-'])
ID: 2 优秀素质：   種類いっぱいありますけども。

DEBUG:pydub.converter:subprocess.call(['ffmpeg', '-y', '-i', './output/output3.wav', '-acodec', 'pcm_s32le', '-vn', '-f', 'wav', '-'])
ID: 3 优秀素质：   これだけあれば。

DEBUG:pydub.converter:subprocess.call(['ffmpeg', '-y', '-i', './output/output4.wav', '-acodec', 'pcm_s32le', '-vn', '-f', 'wav', '-'])
ID: 4 优秀素质：   勝ちですわ。

DEBUG:pydub.converter:subprocess.call(['ffmpeg', '-y', '-i', './output/output5.wav', '-acodec', 'pcm_s32le', '-vn', '-f', 'wav', '-'])
ID: 5 优秀素质：   「待望の新しょうひん」。

DEBUG:pydub.converter:subproc

In [None]:
#单句 CPU推理
print(words[0][1])
#CPU推理
stn_tst = get_text(words[0][1], hps_ms)
with torch.no_grad():
    x_tst = stn_tst.unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
    sid = torch.LongTensor([umaDict[words[0][0]]])
    audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][0,0].data.cpu().float().numpy()
ipd.display(ipd.Audio(audio, rate=hps_ms.data.sampling_rate, normalize=False))
print(audio)
write("./output/output.wav",hps_ms.data.sampling_rate,audio_)

In [None]:
blank = AudioSegment.from_file("blank.wav")
blankDuration = blank.duration_seconds
print(blankDuration)