In [119]:
import pandas as pd
import os
import torch
from tqdm import tqdm
import numpy as np
import shutil
from TTS.api import TTS
from transformers import pipeline
import soundfile as sf
from librosa import resample
from subprocess import check_call
import random

In [120]:
import sys
print(sys.version)


3.10.13 (main, Nov 21 2023, 14:24:09) [GCC 11.4.0]


In [121]:
df = pd.read_csv('TWATC_processed.csv')
df

Unnamed: 0,idx,type,character,sentence,j-hartmann/emotion-english-distilroberta-base,SamLowe/roberta-base-go_emotion,michellejieli/emotion_text_classifier
0,,speech,Madison,No!,anger,disapproval,anger
1,,consigne,,"Surprised, Alexandre jumps up and points his g...",,,
2,,speech,Alexandre,One move and you're dead!,anger,neutral,anger
3,,speech,Madison,Please don't shoot!,fear,neutral,fear
4,,speech,Alexandre,Hands up!,joy,neutral,anger
...,...,...,...,...,...,...,...
551,,consigne,,"He looks towards the audience, towards the win...",,,
552,,speech,Madison,So? should I tell her to come over?,surprise,curiosity,neutral
553,,speech,Alexandre,She crossed the Atlantic to find me. I can cro...,neutral,neutral,neutral
554,,consigne,,Alexander kisses Madison.,,,


In [122]:
list_character = df['character'].dropna().unique().tolist()
list_character

['Madison', 'Alexandre']

In [123]:
dico = {
    "Madison" : "bea",
    "Alexandre" : "sam",
}
emotion_dico = {
    "anger": "Angry",
    "disgust": "Disgusted",
    "neutral": "Neutral",
    "sadness": "Sleepy", 
    "joy": "Amused",
    "fear": "Neutral",
    "surprise": "Neutral",
}

In [124]:
df['audio'] = df['character'].map(dico)
df['emotion'] = df['michellejieli/emotion_text_classifier'].map(emotion_dico)
df

Unnamed: 0,idx,type,character,sentence,j-hartmann/emotion-english-distilroberta-base,SamLowe/roberta-base-go_emotion,michellejieli/emotion_text_classifier,audio,emotion
0,,speech,Madison,No!,anger,disapproval,anger,bea,Angry
1,,consigne,,"Surprised, Alexandre jumps up and points his g...",,,,,
2,,speech,Alexandre,One move and you're dead!,anger,neutral,anger,sam,Angry
3,,speech,Madison,Please don't shoot!,fear,neutral,fear,bea,Neutral
4,,speech,Alexandre,Hands up!,joy,neutral,anger,sam,Angry
...,...,...,...,...,...,...,...,...,...
551,,consigne,,"He looks towards the audience, towards the win...",,,,,
552,,speech,Madison,So? should I tell her to come over?,surprise,curiosity,neutral,bea,Neutral
553,,speech,Alexandre,She crossed the Atlantic to find me. I can cro...,neutral,neutral,neutral,sam,Neutral
554,,consigne,,Alexander kisses Madison.,,,,,


In [125]:
# Drop non audio lines
df = df.dropna(subset=['audio'])
df

Unnamed: 0,idx,type,character,sentence,j-hartmann/emotion-english-distilroberta-base,SamLowe/roberta-base-go_emotion,michellejieli/emotion_text_classifier,audio,emotion
0,,speech,Madison,No!,anger,disapproval,anger,bea,Angry
2,,speech,Alexandre,One move and you're dead!,anger,neutral,anger,sam,Angry
3,,speech,Madison,Please don't shoot!,fear,neutral,fear,bea,Neutral
4,,speech,Alexandre,Hands up!,joy,neutral,anger,sam,Angry
6,,speech,Madison,"Calm down, you see, I'm unarmed . Now please p...",fear,neutral,fear,bea,Neutral
...,...,...,...,...,...,...,...,...,...
549,,speech,Madison,"Yes, but she hasn't read it yet. She didn't kn...",surprise,neutral,sadness,bea,Sleepy
550,,speech,Alexandre,I still love her. This book is the proof of that.,joy,love,joy,sam,Amused
552,,speech,Madison,So? should I tell her to come over?,surprise,curiosity,neutral,bea,Neutral
553,,speech,Alexandre,She crossed the Atlantic to find me. I can cro...,neutral,neutral,neutral,sam,Neutral


In [126]:
df[df["emotion"].isna()]

Unnamed: 0,idx,type,character,sentence,j-hartmann/emotion-english-distilroberta-base,SamLowe/roberta-base-go_emotion,michellejieli/emotion_text_classifier,audio,emotion


In [127]:

# Create a new column 'count_column' using a loop
counts = {}
count_column = []

for value in df['character']:
    if value in counts:
        counts[value] += 1
    else:
        counts[value] = 1
    count_column.append(counts[value])

df['count_column'] = count_column

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['count_column'] = count_column


In [128]:
folder_path = 'test_dataset'
if not os.path.exists(folder_path):
    os.makedirs(folder_path)

In [129]:
def get_fname(s):
    return s.split("\t")[0]

def get_emotion(s):
    return get_fname(s).split("_")[0].split("/")[1].lower()
def get_spker_id(s):
    return get_fname(s).split("_")[0].split("/")[0].lower()

def get_utt_id(s):
    return get_fname(s).split(".")[0].split("_")[-1]
def get_all_different_utt_id(tsv_lines) -> list:
    utts_per_speaker = {
    "sam": [],
    "bea": [],
    "josh": [],
    "jenie": [],
    }
    for line in tsv_lines:
        spkr = get_spker_id(line)
        utt_id = get_utt_id(line)
        if utt_id not in utts_per_speaker[spkr]:
            utts_per_speaker[spkr].append(utt_id)
    return utts_per_speaker
def tsv_per_emotion(tsv_lines, emotion) -> list:
    lines = []
    for line in tsv_lines:
        if emotion.lower() == get_emotion(line):      
            lines.append(line)      
    return lines
def get_tsv_lines_for_utt_ids(tsv_lines, specific_utt_id):
    utts_per_speaker = []
    for line in tsv_lines:
        spkr = get_spker_id(line)
        utt_id = get_utt_id(line)
        if utt_id == specific_utt_id:
            # Assuming the audio file name is also part of the line, extract it
            utts_per_speaker.append(line)

    return utts_per_speaker
def get_tsv_lines_for_emotion(tsv_lines, emotion):
    audio_files_name = []

    for line in tsv_lines:
        if emotion == get_emotion(line):
            # Assuming the audio file name is also part of the line, extract it
            audio_files_name.append(line)

    return audio_files_name


tsv_lines = open("data.tsv", "r").readlines()
root, tsv_lines = tsv_lines[0], tsv_lines[1:]
utts = get_all_different_utt_id(tsv_lines)

In [130]:
def get_number_audio_per_emotion(df, emotion):
    return len(df[df['emotion'] == emotion])

In [131]:
# Function to randomly sample and remove rows from DataFrame
def sample_and_remove_rows(df, num_rows):
    # Randomly sample rows
    sampled_rows = df.sample(n=num_rows)

    # Remove sampled rows from the original DataFrame
    df.drop(sampled_rows.index, inplace=True)

    return sampled_rows, df

In [132]:
def decompose_base_2(number):
    powers = []
    remainder = number
    power = 0

    # Find the highest power of 2 less than or equal to the number
    while 2 ** power <= number:
        power += 1

    # Subtract powers of 2 from the number and store them
    for i in range(power - 1, -1, -1):
        if 2 ** i <= remainder:
            powers.append(2 ** i)
            remainder -= 2 ** i

    return powers

In [133]:
def call(
    model_dir,
    data,
    split,
    output_path,
    src_emotion,
    trg_emotion,
    dict,
    user_dir,
    dataset
):
    cmd = f"""python3 fairseq/examples/emotion_conversion/preprocessing.py \
    --model-dir {model_dir} \
    --data {data} \
    --split {split} \
    --output-path {output_path} \
    --src-emotion {src_emotion} \
    --trg-emotion {trg_emotion} \
    --dict {dict} \
    --user-dir {user_dir} \
    --dataset {dataset}"""
    
    check_call(cmd, shell=True)

In [134]:
file_path = "fairseq/examples/emotion_conversion/data"

In [135]:
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"

In [136]:
tts = TTS("tts_models/en/ljspeech/fast_pitch").to(device)

 > tts_models/en/ljspeech/fast_pitch is already downloaded.
 > vocoder_models/en/ljspeech/hifigan_v2 is already downloaded.
 > Using model: fast_pitch
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:2.718281828459045
 | > hop_length:256
 | > win_length:1024
 > Vocoder Model: hifigan
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resam

In [137]:
root = "/home/utilisateur/createch/project/emotion/dataset"

In [138]:
num_audios = len(os.listdir("/home/utilisateur/createch/project/emotion/dataset_final"))

In [139]:
print(get_all_different_utt_id(tsv_lines))

{'sam': ['0150', '0100', '0201', '0306', '0036', '0312', '0423', '0399', '0501', '0390', '0151', '0309', '0438', '0101', '0200', '0037', '0313', '0405', '0422', '0356', '0398', '0391', '0224', '0355', '0369', '0388', '0503', '0373', '0392', '0152', '0219', '0102', '0148', '0203', '0034', '0310', '0421', '0336', '0354', '0368', '0389', '0420', '0502', '0372', '0153', '0218', '0103', '0149', '0202', '0305', '0035', '0311', '0407', '0349', '0375', '0222', '0331', '0353', '0248', '0302', '0032', '0316', '0400', '0427', '0154', '0252', '0104', '0348', '0374', '0168', '0223', '0330', '0352', '0504', '0204', '0249', '0303', '0033', '0426', '0155', '0029', '0105', '0300', '0030', '0314', '0402', '0425', '0250', '0418', '0106', '0329', '0377', '0199', '0220', '0333', '0351', '0206', '0301', '0031', '0315', '0403', '0424', '0157', '0251', '0419', '0107', '0328', '0376', '0198', '0221', '0350', '0411', '0436', '0145', '0243', '0162', '0358', '0396', '0342', '0215', '0364', '0410', '0437', '0144',

In [145]:
emotion_include = df["emotion"].unique()
emotion_include = emotion_include[~pd.isnull(emotion_include)]
print(emotion_include)
emotion_include = [emotion for emotion in emotion_include.tolist() if emotion != "Neutral"]
utts = get_all_different_utt_id(tsv_lines)
dataset_path = "/home/utilisateur/createch/project/emotion/dataset_test"
        
#clear the Neutral folder
for emov_charac in dico.values():
    if os.path.exists(os.path.join(dataset_path, emov_charac, "Neutral")):
        shutil.rmtree(os.path.join(dataset_path, emov_charac, "Neutral"))
    os.makedirs(os.path.join(dataset_path, emov_charac, "Neutral"), exist_ok=True)
print(tsv_lines)
emotion_df = df[df['emotion'].isin(emotion_include)]
plus_df = df[df["emotion"] == "Neutral"].sample(n = int(len(emotion_df)/2))
emotion_df = pd.concat([emotion_df, plus_df])
lines = []
for index, row in emotion_df.iterrows():
    
    #Create the wav name
    character = row["character"]

    specific_utt = utts[dico[character]].pop()
    name_audio = f"{character}_{row['count_column']}_{specific_utt}.wav"
    audio_path = os.path.join(dataset_path, f"{dico[character]}/Neutral",name_audio)
    print(audio_path)
    
    #Creation of the audio file
    sentence = row['sentence']
    print(sentence)
    tts.tts_to_file(sentence, file_path=audio_path)
    
    #Resample the audio file
    data, samplerate = sf.read(audio_path)
    data = resample(data,orig_sr = samplerate, target_sr = 16000)
    sf.write(audio_path, data, 16000)
    name = os.path.join(f"{dico[character]}/Neutral",name_audio)
    line = f"{name}\t{len(data)}\t\n"
    lines.append(line)

    for emotion in emotion_include:
        #save the line for the tsv
        tsv_lines_emotion = tsv_per_emotion(tsv_lines, emotion)
        tsv_lines_utt = get_tsv_lines_for_utt_ids(tsv_lines_emotion,specific_utt)
        for x in tsv_lines_utt:
            if x[-2:]!="\n" and x[:3]=="sam":
                x+="\n"
        print(tsv_lines_utt)
    
        #copy the audio of the tsv
        for line in tsv_lines_utt:
            if get_spker_id(line) == dico[character]:
                lines += [line]
                os.makedirs(os.path.join(dataset_path, dico[character], emotion), exist_ok=True)
                shutil.copy(os.path.join(root, line.split("\t")[0]), os.path.join(dataset_path, line.split("\t")[0]))
    
#save the tsv
lines = [dataset_path + "\t\n"] + lines
print(lines)
lines = lines[:-1] + [lines[-1][:-1]]
open(os.path.join(file_path, "data.tsv"), "w").writelines(lines)
for emotion in emotion_include:
    call(
        model_dir="/home/utilisateur/createch/project/emotion/fairseq/examples/emotion_conversion/save ",
        data="/home/utilisateur/createch/project/emotion/fairseq/examples/emotion_conversion/data",
        split="data",
        output_path="/home/utilisateur/createch/project/emotion/fairseq/examples/emotion_conversion/processed_data_test",
        src_emotion="neutral",
        trg_emotion=emotion.lower(),
        dict="/home/utilisateur/createch/project/emotion/fairseq/examples/emotion_conversion/data/dict.txt",
        user_dir="/home/utilisateur/createch/project/emotion/fairseq/examples/emotion_conversion/fairseq_models",
        dataset=f"/home/utilisateur/createch/project/emotion/dataset_final/{emotion.lower()}"
    )
        
    #save the good tsv

['Angry' 'Neutral' 'Amused' 'Sleepy' 'Disgusted']
['sam/Disgusted/Disgust_141-168_0150.wav\t75293\n', 'sam/Disgusted/Disgust_85-112_0100.wav\t60026\n', 'sam/Disgusted/Disgust_197-224_0201.wav\t115258\n', 'sam/Disgusted/disgust_281-308_0306.wav\t91293\n', 'sam/Disgusted/Disgust_29-56_0036.wav\t86106\n', 'sam/Disgusted/disgust_309-336_0312.wav\t119915\n', 'sam/Disgusted/disgust_421-448_0423.wav\t116247\n', 'sam/Disgusted/disgust_393-420_0399.wav\t63877\n', 'sam/Disgusted/disgust_477-504_0501.wav\t104297\n', 'sam/Disgusted/disgust_367-392_0390.wav\t74930\n', 'sam/Disgusted/Disgust_141-168_0151.wav\t119778\n', 'sam/Disgusted/disgust_309-336_0309.wav\t72949\n', 'sam/Disgusted/disgust_421-448_0438.wav\t136852\n', 'sam/Disgusted/Disgust_85-112_0101.wav\t68491\n', 'sam/Disgusted/Disgust_197-224_0200.wav\t127790\n', 'sam/Disgusted/Disgust_29-56_0037.wav\t88157\n', 'sam/Disgusted/disgust_309-336_0313.wav\t100057\n', 'sam/Disgusted/disgust_393-420_0405.wav\t108974\n', 'sam/Disgusted/disgust_421-4

 > Processing time: 0.08425688743591309
 > Real-time factor: 0.040756940329104145
['bea/Angry/anger_309-336_0317.wav\t34617\n', 'jenie/Angry/anger_309-336_0317.wav\t67096\n', 'sam/Angry/anger_309-336_0317.wav\t36701\n']
['sam/Amused/amused_309-336_0317.wav\t66777\n', 'jenie/Amused/amused_309-336_0317.wav\t66520\n']
['sam/Sleepy/sleepiness_309-336_0317.wav\t66962\n', 'bea/Sleepy/sleepiness_309-336_0317.wav\t53766\n', 'jenie/Sleepy/sleepiness_309-336_0317.wav\t49227\n']
['bea/Disgusted/disgust_309-336_0317.wav\t44936\n', 'sam/Disgusted/disgust_309-336_0317.wav\t58959\n']
/home/utilisateur/createch/project/emotion/dataset_test/sam/Neutral/Alexandre_2_0256.wav
Hands up!
 > Text splitted to sentences.
['Hands up!']
 > Processing time: 0.10108661651611328
 > Real-time factor: 0.08530924273500834
['sam/Angry/anger_253-280_0256.wav\t65845\n', 'bea/Angry/anger_253-280_0256.wav\t53967\n', 'jenie/Angry/anger_253-280_0256.wav\t70894\n']
['sam/Amused/amused_253-280_0256.wav\t106119\n', 'josh/Amused

2023-12-07 16:16:08 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX
2023-12-07 16:16:12 | INFO | fairseq.tasks.hubert_pretraining | current directory is /home/utilisateur/createch/project/emotion
2023-12-07 16:16:12 | INFO | fairseq.tasks.hubert_pretraining | HubertPretrainingTask Config {'_name': 'hubert_pretraining', 'data': '/checkpoint/wnhsu/data/librispeech/960h/iter/250K_50hz_km100_mp0_65_v2', 'fine_tuning': False, 'labels': ['layer6.km500'], 'label_dir': None, 'label_rate': 50.0, 'sample_rate': 16000, 'normalize': False, 'enable_padding': False, 'max_keep_size': None, 'max_sample_size': 250000, 'min_sample_size': 32000, 'single_target': False, 'random_crop': True, 'pad_audio': False}
2023-12-07 16:16:12 | INFO | fairseq.models.hubert.hubert | HubertModel Config: {'_name': 'hubert', 'label_rate': 50.0, 'extractor_mode': default, 'encoder_layers': 12, 'encoder_embed_dim': 768, 'encoder_ffn_embed_dim': 3072, 'encoder_attention_heads': 1

loading /home/utilisateur/createch/project/emotion/fairseq/examples/emotion_conversion/data/data.tsv and /home/utilisateur/createch/project/emotion/fairseq/examples/emotion_conversion/data/data.km
---
src emotions: neutral
trg emotions: angry
defaultdict(<function main.<locals>.<lambda> at 0x7f6997b34040>, {})
defaultdict(<function main.<locals>.<lambda> at 0x7f6997b34040>, {'SAME': defaultdict(<class 'list'>, {'0421': [0, 1], '0317': [2, 3, 4, 5, 6], '0256': [7, 8, 9, 10], '0344': [11, 12, 13, 14], '0402': [15, 16], '0437': [17, 18], '0436': [19, 20], '0411': [21, 22], '0216': [23, 24, 25, 26, 27], '0412': [28, 29], '0401': [30, 31, 32, 33, 34], '0447': [35, 36, 37, 38, 39], '0414': [40, 41], '0493': [42, 43, 44], '0449': [45, 46], '0455': [47, 48], '0192': [49, 50, 51, 52, 53], '0495': [54, 55, 56], '0443': [57, 58, 59, 60, 61], '0473': [62, 63], '0172': [64, 65, 66, 67, 68], '0226': [69, 70, 71, 72, 73], '0337': [74, 75, 76, 77, 78], '0263': [79, 80, 81, 82], '0484': [83, 84, 85, 86

2023-12-07 16:16:29 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX
2023-12-07 16:16:29 | INFO | fairseq_cli.preprocess | Namespace(no_progress_bar=False, log_interval=100, log_format=None, log_file=None, aim_repo=None, aim_run_hash=None, tensorboard_logdir=None, wandb_project=None, azureml_logging=False, seed=1, cpu=False, tpu=False, bf16=False, memory_efficient_bf16=False, fp16=False, memory_efficient_fp16=False, fp16_no_flatten_grads=False, fp16_init_scale=128, fp16_scale_window=None, fp16_scale_tolerance=0.0, on_cpu_convert_precision=False, min_loss_scale=0.0001, threshold_loss_scale=None, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, user_dir=None, empty_cache_freq=0, all_gather_list_size=16384, model_parallel_size=1, quantization_config_path=None, profile=False, reset_logging=False, suppress_crashes=False, use_plasma_view=False, plasma_path='/tmp/plasma', criterion='cross_entropy', tokenizer=None, bpe=None

----------------create mafnifest finish----------------


2023-12-07 16:16:31 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX
2023-12-07 16:16:33 | INFO | fairseq_cli.generate | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': False, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': '/home/utilisateur/createch/project/emotion/fairseq/examples/emotion_conversion/fairseq_models', 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'mod

loading pre-trained emotion translation model


2023-12-07 16:16:33 | INFO | fairseq.data.data_utils | loaded 142 examples from: /home/utilisateur/createch/project/emotion/fairseq/examples/emotion_conversion/processed_data_test/tokenized/test.neutral-angry.neutral
2023-12-07 16:16:33 | INFO | fairseq.data.data_utils | loaded 142 examples from: /home/utilisateur/createch/project/emotion/fairseq/examples/emotion_conversion/processed_data_test/tokenized/test.neutral-angry.angry
2023-12-07 16:16:33 | INFO | fairseq.tasks.translation | /home/utilisateur/createch/project/emotion/fairseq/examples/emotion_conversion/processed_data_test/tokenized test neutral-angry 142 examples
2023-12-07 16:16:33 | INFO | fairseq.tasks.fairseq_task | can_reuse_epoch_itr = True
2023-12-07 16:16:33 | INFO | fairseq.tasks.fairseq_task | reuse_dataloader = True
2023-12-07 16:16:33 | INFO | fairseq.tasks.fairseq_task | rebuild_batches = False
2023-12-07 16:16:33 | INFO | fairseq.tasks.fairseq_task | creating new batches for epoch 1
2023-12-07 16:16:47 | INFO | f

Loading '/home/utilisateur/createch/project/emotion/fairseq/examples/emotion_conversion/save/hifigan/g_00400000'
Complete.


2023-12-07 16:16:53 | INFO | synthesize_perso | loaded duration prediction model from /home/utilisateur/createch/project/emotion/fairseq/examples/emotion_conversion/save/duration_predictor
2023-12-07 16:16:53 | INFO | synthesize_perso | loaded f0 prediction model from CnnPredictor(
  (token_emb): Embedding(201, 256, padding_idx=200)
  (gst_emb): Embedding(20, 8)
  (conv_layer): ModuleList(
    (0): Sequential(
      (0): Rearrange('b t c -> b c t')
      (1): Conv1d(264, 256, kernel_size=(5,), stride=(1,), padding=(2,))
      (2): Rearrange('b c t -> b t c')
      (3): ReLU()
      (4): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (5): Dropout(p=0.1, inplace=False)
    )
    (1-5): 5 x Sequential(
      (0): Rearrange('b t c -> b c t')
      (1): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,))
      (2): Rearrange('b c t -> b t c')
      (3): ReLU()
      (4): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (5): Dropout(p=0.1, inplace=False)
   

/home/utilisateur/createch/project/emotion/dataset_final/angry
outdir: /home/utilisateur/createch/project/emotion/dataset_final/angry
140
#######speaker#######
bea
bea
0


1it [00:00,  2.65it/s]

#######speaker#######
sam
sam
3


2it [00:00,  2.36it/s]

#######speaker#######
sam
sam
3


3it [00:01,  1.52it/s]

#######speaker#######
sam
sam
3


4it [00:02,  1.78it/s]

#######speaker#######
sam
sam
3


5it [00:02,  2.29it/s]

#######speaker#######
sam
sam
3


6it [00:03,  1.92it/s]

#######speaker#######
sam
sam
3


7it [00:03,  2.35it/s]

#######speaker#######
sam
sam
3


8it [00:04,  1.41it/s]

#######speaker#######
sam
sam
3


9it [00:06,  1.04s/it]

#######speaker#######
sam
sam
3


10it [00:07,  1.06it/s]

#######speaker#######
bea
bea
0


11it [00:07,  1.25it/s]

#######speaker#######
bea
bea
0


12it [00:07,  1.48it/s]

#######speaker#######
sam
sam
3


13it [00:08,  1.54it/s]

#######speaker#######
bea
bea
0


14it [00:08,  1.73it/s]

#######speaker#######
sam
sam
3


15it [00:09,  2.08it/s]

#######speaker#######
bea
bea
0


16it [00:09,  2.43it/s]

#######speaker#######
bea
bea
0


17it [00:10,  1.63it/s]

#######speaker#######
bea
bea
0


18it [00:10,  1.84it/s]

#######speaker#######
sam
sam
3


19it [00:11,  1.83it/s]

#######speaker#######
sam
sam
3


20it [00:11,  1.99it/s]

#######speaker#######
sam
sam
3


21it [00:12,  2.38it/s]

#######speaker#######
bea
bea
0


22it [00:12,  1.84it/s]

#######speaker#######
sam
sam
3


23it [00:13,  1.58it/s]

#######speaker#######
sam
sam
3


24it [00:14,  1.59it/s]

#######speaker#######
bea
bea
0


25it [00:14,  1.95it/s]

#######speaker#######
bea
bea
0


26it [00:15,  1.89it/s]

#######speaker#######
bea
bea
0


27it [00:16,  1.45it/s]

#######speaker#######
bea
bea
0


28it [00:16,  1.54it/s]

#######speaker#######
sam
sam
3


29it [00:17,  1.86it/s]

#######speaker#######
sam
sam
3


30it [00:18,  1.54it/s]

#######speaker#######
sam
sam
3


31it [00:18,  1.53it/s]

#######speaker#######
sam
sam
3


32it [00:19,  1.38it/s]

#######speaker#######
bea
bea
0


33it [00:20,  1.38it/s]

#######speaker#######
sam
sam
3


34it [00:21,  1.19it/s]

#######speaker#######
sam
sam
3


35it [00:23,  1.20s/it]

#######speaker#######
sam
sam
3


36it [00:24,  1.08s/it]

#######speaker#######
bea
bea
0


37it [00:24,  1.13it/s]

#######speaker#######
sam
sam
3


39it [00:26,  1.34it/s]

#######speaker#######
sam
sam
3
#######speaker#######
sam
sam
3


40it [00:26,  1.49it/s]

#######speaker#######
sam
sam
3


41it [00:27,  1.44it/s]

#######speaker#######
sam
sam
3


42it [00:27,  1.78it/s]

#######speaker#######
sam
sam
3


43it [00:27,  1.97it/s]

#######speaker#######
sam
sam
3


44it [00:28,  1.68it/s]

#######speaker#######
sam
sam
3


45it [00:29,  2.01it/s]

#######speaker#######
sam
sam
3


46it [00:29,  1.69it/s]

#######speaker#######
bea
bea
0


47it [00:30,  2.03it/s]

#######speaker#######
sam
sam
3


48it [00:30,  2.27it/s]

#######speaker#######
sam
sam
3


49it [00:31,  1.93it/s]

#######speaker#######
sam
sam
3


50it [00:31,  1.91it/s]

#######speaker#######
bea
bea
0


51it [00:31,  2.20it/s]

#######speaker#######
bea
bea
0


52it [00:32,  2.51it/s]

#######speaker#######
bea
bea
0


53it [00:32,  2.61it/s]

#######speaker#######
sam
sam
3


54it [00:33,  2.49it/s]

#######speaker#######
bea
bea
0


55it [00:33,  2.83it/s]

#######speaker#######
sam
sam
3


56it [00:33,  2.48it/s]

#######speaker#######
sam
sam
3


57it [00:34,  1.91it/s]

#######speaker#######
sam
sam
3


58it [00:34,  2.22it/s]

#######speaker#######
bea
bea
0


59it [00:35,  2.26it/s]

#######speaker#######
sam
sam
3


60it [00:37,  1.09it/s]

#######speaker#######
sam
sam
3


61it [00:38,  1.06it/s]

#######speaker#######
bea
bea
0


62it [00:38,  1.28it/s]

#######speaker#######
bea
bea
0


63it [00:39,  1.53it/s]

#######speaker#######
bea
bea
0


65it [00:40,  1.87it/s]

#######speaker#######
bea
bea
0
#######speaker#######
sam
sam
3


66it [00:40,  1.82it/s]

#######speaker#######
sam
sam
3


67it [00:41,  1.53it/s]

#######speaker#######
bea
bea
0


68it [00:41,  1.87it/s]

#######speaker#######
bea
bea
0


69it [00:42,  2.18it/s]

#######speaker#######
bea
bea
0


70it [00:42,  1.88it/s]

#######speaker#######
bea
bea
0


71it [00:43,  1.40it/s]

#######speaker#######
sam
sam
3


72it [00:44,  1.46it/s]

#######speaker#######
sam
sam
3


73it [00:45,  1.46it/s]

#######speaker#######
bea
bea
0


74it [00:46,  1.31it/s]

#######speaker#######
sam
sam
3


75it [00:46,  1.64it/s]

#######speaker#######
sam
sam
3


76it [00:46,  1.84it/s]

#######speaker#######
sam
sam
3


77it [00:47,  1.58it/s]

#######speaker#######
bea
bea
0


78it [00:48,  1.54it/s]

#######speaker#######
sam
sam
3


79it [00:48,  1.84it/s]

#######speaker#######
sam
sam
3


80it [00:49,  1.35it/s]

#######speaker#######
sam
sam
3


82it [00:50,  2.13it/s]

#######speaker#######
sam
sam
3
#######speaker#######
sam
sam
3


83it [00:50,  1.97it/s]

#######speaker#######
bea
bea
0


84it [00:51,  1.81it/s]

#######speaker#######
bea
bea
0


85it [00:52,  1.82it/s]

#######speaker#######
bea
bea
0


86it [00:52,  1.94it/s]

#######speaker#######
sam
sam
3


87it [00:52,  2.27it/s]

#######speaker#######
sam
sam
3


88it [00:53,  2.49it/s]

#######speaker#######
bea
bea
0


89it [00:53,  2.00it/s]

#######speaker#######
bea
bea
0


90it [00:54,  1.45it/s]

#######speaker#######
bea
bea
0


91it [00:55,  1.80it/s]

#######speaker#######
sam
sam
3


92it [00:55,  1.71it/s]

#######speaker#######
bea
bea
0


93it [00:56,  1.81it/s]

#######speaker#######
bea
bea
0


94it [00:57,  1.64it/s]

#######speaker#######
sam
sam
3


95it [00:57,  1.74it/s]

#######speaker#######
sam
sam
3


96it [00:58,  1.66it/s]

#######speaker#######
sam
sam
3


97it [00:58,  2.00it/s]

#######speaker#######
sam
sam
3


98it [00:58,  2.38it/s]

#######speaker#######
sam
sam
3


99it [01:00,  1.10it/s]

#######speaker#######
sam
sam
3


100it [01:01,  1.03it/s]

#######speaker#######
bea
bea
0


101it [01:02,  1.10it/s]

#######speaker#######
sam
sam
3


102it [01:03,  1.30it/s]

#######speaker#######
sam
sam
3


104it [01:03,  1.81it/s]

#######speaker#######
bea
bea
0
#######speaker#######
sam
sam
3


105it [01:04,  1.83it/s]

#######speaker#######
sam
sam
3


106it [01:05,  1.74it/s]

#######speaker#######
sam
sam
3


107it [01:06,  1.33it/s]

#######speaker#######
sam
sam
3


108it [01:06,  1.69it/s]

#######speaker#######
bea
bea
0


109it [01:06,  1.86it/s]

#######speaker#######
bea
bea
0


110it [01:07,  1.96it/s]

#######speaker#######
sam
sam
3


111it [01:07,  1.73it/s]

#######speaker#######
sam
sam
3


112it [01:08,  1.81it/s]

#######speaker#######
sam
sam
3


113it [01:10,  1.13it/s]

#######speaker#######
bea
bea
0


115it [01:11,  1.39it/s]

#######speaker#######
bea
bea
0
#######speaker#######
sam
sam
3


116it [01:11,  1.53it/s]

#######speaker#######
bea
bea
0


117it [01:12,  1.84it/s]

#######speaker#######
bea
bea
0


118it [01:12,  2.21it/s]

#######speaker#######
bea
bea
0


120it [01:14,  1.69it/s]

#######speaker#######
bea
bea
0
#######speaker#######
bea
bea
0


121it [01:14,  2.06it/s]

#######speaker#######
bea
bea
0


122it [01:14,  2.04it/s]

#######speaker#######
sam
sam
3


123it [01:15,  2.04it/s]

#######speaker#######
bea
bea
0


124it [01:15,  1.96it/s]

#######speaker#######
sam
sam
3


125it [01:16,  2.35it/s]

#######speaker#######
bea
bea
0


126it [01:16,  2.66it/s]

#######speaker#######
sam
sam
3


127it [01:16,  2.27it/s]

#######speaker#######
sam
sam
3


128it [01:17,  1.70it/s]

#######speaker#######
bea
bea
0


129it [01:18,  1.63it/s]

#######speaker#######
bea
bea
0


130it [01:18,  1.83it/s]

#######speaker#######
sam
sam
3


131it [01:19,  1.55it/s]

#######speaker#######
sam
sam
3


132it [01:20,  1.78it/s]

#######speaker#######
sam
sam
3


133it [01:20,  2.04it/s]

#######speaker#######
sam
sam
3


134it [01:20,  2.37it/s]

#######speaker#######
bea
bea
0


135it [01:21,  1.85it/s]

#######speaker#######
bea
bea
0


136it [01:22,  1.72it/s]

#######speaker#######
bea
bea
0


137it [01:23,  1.43it/s]

#######speaker#######
sam
sam
3


138it [01:24,  1.19it/s]

#######speaker#######
bea
bea
0


139it [01:24,  1.52it/s]

#######speaker#######
sam
sam
3


140it [01:25,  1.64it/s]
2023-12-07 16:18:18 | INFO | synthesize_perso | Done.
2023-12-07 16:18:20 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX
2023-12-07 16:18:23 | INFO | fairseq.tasks.hubert_pretraining | current directory is /home/utilisateur/createch/project/emotion
2023-12-07 16:18:23 | INFO | fairseq.tasks.hubert_pretraining | HubertPretrainingTask Config {'_name': 'hubert_pretraining', 'data': '/checkpoint/wnhsu/data/librispeech/960h/iter/250K_50hz_km100_mp0_65_v2', 'fine_tuning': False, 'labels': ['layer6.km500'], 'label_dir': None, 'label_rate': 50.0, 'sample_rate': 16000, 'normalize': False, 'enable_padding': False, 'max_keep_size': None, 'max_sample_size': 250000, 'min_sample_size': 32000, 'single_target': False, 'random_crop': True, 'pad_audio': False}
2023-12-07 16:18:23 | INFO | fairseq.models.hubert.hubert | HubertModel Config: {'_name': 'hubert', 'label_rate': 50.0, 'extractor_mode': default, 'encoder_layers': 12, 'encod

loading /home/utilisateur/createch/project/emotion/fairseq/examples/emotion_conversion/data/data.tsv and /home/utilisateur/createch/project/emotion/fairseq/examples/emotion_conversion/data/data.km
---
src emotions: neutral
trg emotions: amused
defaultdict(<function main.<locals>.<lambda> at 0x7f989727c040>, {})
defaultdict(<function main.<locals>.<lambda> at 0x7f989727c040>, {'SAME': defaultdict(<class 'list'>, {'0421': [0, 1], '0317': [2, 3, 4, 5, 6], '0256': [7, 8, 9, 10], '0344': [11, 12, 13, 14], '0402': [15, 16], '0437': [17, 18], '0436': [19, 20], '0411': [21, 22], '0216': [23, 24, 25, 26, 27], '0412': [28, 29], '0401': [30, 31, 32, 33, 34], '0447': [35, 36, 37, 38, 39], '0414': [40, 41], '0493': [42, 43, 44], '0449': [45, 46], '0455': [47, 48], '0192': [49, 50, 51, 52, 53], '0495': [54, 55, 56], '0443': [57, 58, 59, 60, 61], '0473': [62, 63], '0172': [64, 65, 66, 67, 68], '0226': [69, 70, 71, 72, 73], '0337': [74, 75, 76, 77, 78], '0263': [79, 80, 81, 82], '0484': [83, 84, 85, 8

2023-12-07 16:18:40 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX
2023-12-07 16:18:41 | INFO | fairseq_cli.preprocess | Namespace(no_progress_bar=False, log_interval=100, log_format=None, log_file=None, aim_repo=None, aim_run_hash=None, tensorboard_logdir=None, wandb_project=None, azureml_logging=False, seed=1, cpu=False, tpu=False, bf16=False, memory_efficient_bf16=False, fp16=False, memory_efficient_fp16=False, fp16_no_flatten_grads=False, fp16_init_scale=128, fp16_scale_window=None, fp16_scale_tolerance=0.0, on_cpu_convert_precision=False, min_loss_scale=0.0001, threshold_loss_scale=None, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, user_dir=None, empty_cache_freq=0, all_gather_list_size=16384, model_parallel_size=1, quantization_config_path=None, profile=False, reset_logging=False, suppress_crashes=False, use_plasma_view=False, plasma_path='/tmp/plasma', criterion='cross_entropy', tokenizer=None, bpe=None

----------------create mafnifest finish----------------


2023-12-07 16:18:43 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX
2023-12-07 16:18:44 | INFO | fairseq_cli.generate | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': False, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': '/home/utilisateur/createch/project/emotion/fairseq/examples/emotion_conversion/fairseq_models', 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'mod

loading pre-trained emotion translation model


2023-12-07 16:18:45 | INFO | fairseq.data.data_utils | loaded 146 examples from: /home/utilisateur/createch/project/emotion/fairseq/examples/emotion_conversion/processed_data_test/tokenized/test.neutral-amused.neutral
2023-12-07 16:18:45 | INFO | fairseq.data.data_utils | loaded 146 examples from: /home/utilisateur/createch/project/emotion/fairseq/examples/emotion_conversion/processed_data_test/tokenized/test.neutral-amused.amused
2023-12-07 16:18:45 | INFO | fairseq.tasks.translation | /home/utilisateur/createch/project/emotion/fairseq/examples/emotion_conversion/processed_data_test/tokenized test neutral-amused 146 examples
2023-12-07 16:18:45 | INFO | fairseq.tasks.fairseq_task | can_reuse_epoch_itr = True
2023-12-07 16:18:45 | INFO | fairseq.tasks.fairseq_task | reuse_dataloader = True
2023-12-07 16:18:45 | INFO | fairseq.tasks.fairseq_task | rebuild_batches = False
2023-12-07 16:18:45 | INFO | fairseq.tasks.fairseq_task | creating new batches for epoch 1
2023-12-07 16:19:07 | INFO

Loading '/home/utilisateur/createch/project/emotion/fairseq/examples/emotion_conversion/save/hifigan/g_00400000'
Complete.


2023-12-07 16:19:13 | INFO | synthesize_perso | loaded duration prediction model from /home/utilisateur/createch/project/emotion/fairseq/examples/emotion_conversion/save/duration_predictor
2023-12-07 16:19:13 | INFO | synthesize_perso | loaded f0 prediction model from CnnPredictor(
  (token_emb): Embedding(201, 256, padding_idx=200)
  (gst_emb): Embedding(20, 8)
  (conv_layer): ModuleList(
    (0): Sequential(
      (0): Rearrange('b t c -> b c t')
      (1): Conv1d(264, 256, kernel_size=(5,), stride=(1,), padding=(2,))
      (2): Rearrange('b c t -> b t c')
      (3): ReLU()
      (4): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (5): Dropout(p=0.1, inplace=False)
    )
    (1-5): 5 x Sequential(
      (0): Rearrange('b t c -> b c t')
      (1): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,))
      (2): Rearrange('b c t -> b t c')
      (3): ReLU()
      (4): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (5): Dropout(p=0.1, inplace=False)
   

/home/utilisateur/createch/project/emotion/dataset_final/amused
outdir: /home/utilisateur/createch/project/emotion/dataset_final/amused
144
#######speaker#######
bea
bea
0


1it [00:00,  4.30it/s]

#######speaker#######
sam
sam
3


2it [00:00,  2.07it/s]

#######speaker#######
sam
sam
3


3it [00:01,  2.72it/s]

#######speaker#######
sam
sam
3


4it [00:02,  1.35it/s]

#######speaker#######
sam
sam
3


5it [00:03,  1.42it/s]

#######speaker#######
sam
sam
3


6it [00:03,  1.78it/s]

#######speaker#######
sam
sam
3


7it [00:04,  1.54it/s]

#######speaker#######
sam
sam
3


9it [00:04,  2.17it/s]

#######speaker#######
sam
sam
3
#######speaker#######
sam
sam
3


10it [00:05,  2.11it/s]

#######speaker#######
bea
bea
0


11it [00:05,  2.10it/s]

#######speaker#######
bea
bea
0


12it [00:06,  1.99it/s]

#######speaker#######
sam
sam
3


13it [00:08,  1.05it/s]

#######speaker#######
bea
bea
0


14it [00:09,  1.11it/s]

#######speaker#######
sam
sam
3


15it [00:11,  1.28s/it]

#######speaker#######
sam
sam
3


16it [00:11,  1.07s/it]

#######speaker#######
bea
bea
0


17it [00:12,  1.03it/s]

#######speaker#######
sam
sam
3


18it [00:13,  1.07it/s]

#######speaker#######
bea
bea
0


20it [00:14,  1.61it/s]

#######speaker#######
bea
bea
0
#######speaker#######
sam
sam
3


21it [00:15,  1.32it/s]

#######speaker#######
sam
sam
3


22it [00:15,  1.68it/s]

#######speaker#######
bea
bea
0


23it [00:15,  2.04it/s]

#######speaker#######
sam
sam
3


24it [00:16,  2.28it/s]

#######speaker#######
sam
sam
3


25it [00:17,  1.53it/s]

#######speaker#######
sam
sam
3


26it [00:18,  1.36it/s]

#######speaker#######
bea
bea
0


27it [00:18,  1.58it/s]

#######speaker#######
bea
bea
0


28it [00:18,  1.92it/s]

#######speaker#######
bea
bea
0


29it [00:19,  1.79it/s]

#######speaker#######
bea
bea
0


30it [00:20,  1.37it/s]

#######speaker#######
bea
bea
0


31it [00:21,  1.42it/s]

#######speaker#######
sam
sam
3


32it [00:21,  1.77it/s]

#######speaker#######
sam
sam
3


33it [00:22,  1.53it/s]

#######speaker#######
bea
bea
0


34it [00:22,  1.75it/s]

#######speaker#######
bea
bea
0


35it [00:23,  1.35it/s]

#######speaker#######
sam
sam
3


36it [00:24,  1.61it/s]

#######speaker#######
bea
bea
0


37it [00:24,  1.54it/s]

#######speaker#######
sam
sam
3


38it [00:26,  1.18it/s]

#######speaker#######
sam
sam
3


39it [00:27,  1.05it/s]

#######speaker#######
sam
sam
3


40it [00:27,  1.19it/s]

#######speaker#######
sam
sam
3


41it [00:28,  1.30it/s]

#######speaker#######
sam
sam
3


43it [00:29,  1.82it/s]

#######speaker#######
sam
sam
3
#######speaker#######
sam
sam
3


44it [00:29,  1.86it/s]

#######speaker#######
sam
sam
3


45it [00:30,  2.16it/s]

#######speaker#######
bea
bea
0


46it [00:30,  2.19it/s]

#######speaker#######
sam
sam
3


47it [00:31,  1.82it/s]

#######speaker#######
sam
sam
3


48it [00:31,  2.16it/s]

#######speaker#######
bea
bea
0


49it [00:31,  2.43it/s]

#######speaker#######
sam
sam
3


50it [00:32,  2.40it/s]

#######speaker#######
sam
sam
3


51it [00:32,  2.67it/s]

#######speaker#######
bea
bea
0


52it [00:33,  1.79it/s]

#######speaker#######
sam
sam
3


53it [00:35,  1.09it/s]

#######speaker#######
sam
sam
3


54it [00:36,  1.09it/s]

#######speaker#######
bea
bea
0


55it [00:36,  1.34it/s]

#######speaker#######
sam
sam
3


56it [00:36,  1.63it/s]

#######speaker#######
bea
bea
0


57it [00:37,  1.72it/s]

#######speaker#######
sam
sam
3


58it [00:37,  1.95it/s]

#######speaker#######
sam
sam
3


59it [00:38,  1.64it/s]

#######speaker#######
sam
sam
3


60it [00:38,  1.94it/s]

#######speaker#######
sam
sam
3


61it [00:39,  2.24it/s]

#######speaker#######
bea
bea
0


62it [00:39,  2.57it/s]

#######speaker#######
sam
sam
3


63it [00:39,  2.42it/s]

#######speaker#######
sam
sam
3


64it [00:40,  1.74it/s]

#######speaker#######
bea
bea
0


65it [00:41,  2.10it/s]

#######speaker#######
sam
sam
3


66it [00:41,  2.07it/s]

#######speaker#######
bea
bea
0


67it [00:42,  1.82it/s]

#######speaker#######
bea
bea
0


68it [00:42,  2.21it/s]

#######speaker#######
sam
sam
3


69it [00:43,  2.11it/s]

#######speaker#######
sam
sam
3


70it [00:43,  2.02it/s]

#######speaker#######
sam
sam
3


71it [00:43,  2.31it/s]

#######speaker#######
bea
bea
0


72it [00:44,  2.56it/s]

#######speaker#######
sam
sam
3


73it [00:44,  2.06it/s]

#######speaker#######
bea
bea
0


74it [00:45,  1.85it/s]

#######speaker#######
bea
bea
0


75it [00:46,  1.66it/s]

#######speaker#######
bea
bea
0


76it [00:47,  1.49it/s]

#######speaker#######
sam
sam
3


77it [00:47,  1.38it/s]

#######speaker#######
bea
bea
0


78it [00:48,  1.74it/s]

#######speaker#######
sam
sam
3


79it [00:48,  1.78it/s]

#######speaker#######
sam
sam
3


80it [00:49,  1.48it/s]

#######speaker#######
sam
sam
3


81it [00:50,  1.43it/s]

#######speaker#######
bea
bea
0


82it [00:50,  1.76it/s]

#######speaker#######
sam
sam
3


83it [00:51,  1.37it/s]

#######speaker#######
bea
bea
0


85it [00:52,  2.14it/s]

#######speaker#######
sam
sam
3
#######speaker#######
sam
sam
3


86it [00:52,  1.95it/s]

#######speaker#######
bea
bea
0


87it [00:53,  1.77it/s]

#######speaker#######
sam
sam
3


88it [00:54,  1.71it/s]

#######speaker#######
sam
sam
3


89it [00:54,  1.73it/s]

#######speaker#######
sam
sam
3


90it [00:55,  1.90it/s]

#######speaker#######
sam
sam
3


91it [00:55,  1.71it/s]

#######speaker#######
sam
sam
3


92it [00:56,  2.04it/s]

#######speaker#######
sam
sam
3


93it [00:56,  2.39it/s]

#######speaker#######
bea
bea
0


94it [00:57,  1.84it/s]

#######speaker#######
sam
sam
3


95it [00:58,  1.55it/s]

#######speaker#######
sam
sam
3


96it [00:58,  1.91it/s]

#######speaker#######
sam
sam
3


97it [00:59,  1.69it/s]

#######speaker#######
bea
bea
0


98it [00:59,  1.82it/s]

#######speaker#######
sam
sam
3


99it [01:00,  1.37it/s]

#######speaker#######
sam
sam
3


100it [01:01,  1.51it/s]

#######speaker#######
bea
bea
0


101it [01:01,  1.47it/s]

#######speaker#######
bea
bea
0


102it [01:02,  1.84it/s]

#######speaker#######
sam
sam
3


103it [01:02,  2.18it/s]

#######speaker#######
bea
bea
0


104it [01:02,  2.15it/s]

#######speaker#######
bea
bea
0


105it [01:04,  1.48it/s]

#######speaker#######
sam
sam
3


106it [01:04,  1.50it/s]

#######speaker#######
bea
bea
0


107it [01:05,  1.72it/s]

#######speaker#######
bea
bea
0


108it [01:05,  1.75it/s]

#######speaker#######
sam
sam
3


109it [01:05,  2.10it/s]

#######speaker#######
bea
bea
0


110it [01:06,  2.19it/s]

#######speaker#######
sam
sam
3


111it [01:06,  2.20it/s]

#######speaker#######
bea
bea
0


112it [01:07,  2.52it/s]

#######speaker#######
sam
sam
3


113it [01:07,  1.98it/s]

#######speaker#######
sam
sam
3


114it [01:08,  2.30it/s]

#######speaker#######
sam
sam
3


115it [01:08,  2.30it/s]

#######speaker#######
sam
sam
3


116it [01:08,  2.40it/s]

#######speaker#######
sam
sam
3


117it [01:10,  1.57it/s]

#######speaker#######
bea
bea
0


118it [01:10,  1.72it/s]

#######speaker#######
sam
sam
3


119it [01:12,  1.13it/s]

#######speaker#######
sam
sam
3


120it [01:12,  1.25it/s]

#######speaker#######
sam
sam
3


121it [01:12,  1.54it/s]

#######speaker#######
bea
bea
0


122it [01:13,  1.70it/s]

#######speaker#######
sam
sam
3


123it [01:13,  2.01it/s]

#######speaker#######
bea
bea
0


124it [01:13,  2.30it/s]

#######speaker#######
sam
sam
3


125it [01:15,  1.53it/s]

#######speaker#######
sam
sam
3


126it [01:15,  1.85it/s]

#######speaker#######
bea
bea
0


127it [01:15,  2.22it/s]

#######speaker#######
sam
sam
3


128it [01:17,  1.02it/s]

#######speaker#######
bea
bea
0


129it [01:18,  1.12it/s]

#######speaker#######
bea
bea
0


130it [01:20,  1.07s/it]

#######speaker#######
bea
bea
0


131it [01:20,  1.21it/s]

#######speaker#######
sam
sam
3


132it [01:21,  1.21it/s]

#######speaker#######
sam
sam
3


133it [01:21,  1.28it/s]

#######speaker#######
sam
sam
3


134it [01:22,  1.52it/s]

#######speaker#######
sam
sam
3


135it [01:22,  1.44it/s]

#######speaker#######
bea
bea
0


136it [01:23,  1.71it/s]

#######speaker#######
bea
bea
0


137it [01:23,  2.12it/s]

#######speaker#######
bea
bea
0


138it [01:23,  2.46it/s]

#######speaker#######
sam
sam
3


139it [01:24,  1.97it/s]

#######speaker#######
bea
bea
0


140it [01:25,  1.74it/s]

#######speaker#######
sam
sam
3


141it [01:26,  1.47it/s]

#######speaker#######
bea
bea
0


142it [01:27,  1.22it/s]

#######speaker#######
sam
sam
3


143it [01:27,  1.56it/s]

#######speaker#######
bea
bea
0


144it [01:27,  1.64it/s]
2023-12-07 16:20:41 | INFO | synthesize_perso | Done.
2023-12-07 16:20:43 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX
2023-12-07 16:20:46 | INFO | fairseq.tasks.hubert_pretraining | current directory is /home/utilisateur/createch/project/emotion
2023-12-07 16:20:46 | INFO | fairseq.tasks.hubert_pretraining | HubertPretrainingTask Config {'_name': 'hubert_pretraining', 'data': '/checkpoint/wnhsu/data/librispeech/960h/iter/250K_50hz_km100_mp0_65_v2', 'fine_tuning': False, 'labels': ['layer6.km500'], 'label_dir': None, 'label_rate': 50.0, 'sample_rate': 16000, 'normalize': False, 'enable_padding': False, 'max_keep_size': None, 'max_sample_size': 250000, 'min_sample_size': 32000, 'single_target': False, 'random_crop': True, 'pad_audio': False}
2023-12-07 16:20:46 | INFO | fairseq.models.hubert.hubert | HubertModel Config: {'_name': 'hubert', 'label_rate': 50.0, 'extractor_mode': default, 'encoder_layers': 12, 'encod

loading /home/utilisateur/createch/project/emotion/fairseq/examples/emotion_conversion/data/data.tsv and /home/utilisateur/createch/project/emotion/fairseq/examples/emotion_conversion/data/data.km
---
src emotions: neutral
trg emotions: sleepy
defaultdict(<function main.<locals>.<lambda> at 0x7f6053094040>, {})
defaultdict(<function main.<locals>.<lambda> at 0x7f6053094040>, {'SAME': defaultdict(<class 'list'>, {'0421': [0, 1], '0317': [2, 3, 4, 5, 6], '0256': [7, 8, 9, 10], '0344': [11, 12, 13, 14], '0402': [15, 16], '0437': [17, 18], '0436': [19, 20], '0411': [21, 22], '0216': [23, 24, 25, 26, 27], '0412': [28, 29], '0401': [30, 31, 32, 33, 34], '0447': [35, 36, 37, 38, 39], '0414': [40, 41], '0493': [42, 43, 44], '0449': [45, 46], '0455': [47, 48], '0192': [49, 50, 51, 52, 53], '0495': [54, 55, 56], '0443': [57, 58, 59, 60, 61], '0473': [62, 63], '0172': [64, 65, 66, 67, 68], '0226': [69, 70, 71, 72, 73], '0337': [74, 75, 76, 77, 78], '0263': [79, 80, 81, 82], '0484': [83, 84, 85, 8

2023-12-07 16:21:03 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX
2023-12-07 16:21:04 | INFO | fairseq_cli.preprocess | Namespace(no_progress_bar=False, log_interval=100, log_format=None, log_file=None, aim_repo=None, aim_run_hash=None, tensorboard_logdir=None, wandb_project=None, azureml_logging=False, seed=1, cpu=False, tpu=False, bf16=False, memory_efficient_bf16=False, fp16=False, memory_efficient_fp16=False, fp16_no_flatten_grads=False, fp16_init_scale=128, fp16_scale_window=None, fp16_scale_tolerance=0.0, on_cpu_convert_precision=False, min_loss_scale=0.0001, threshold_loss_scale=None, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, user_dir=None, empty_cache_freq=0, all_gather_list_size=16384, model_parallel_size=1, quantization_config_path=None, profile=False, reset_logging=False, suppress_crashes=False, use_plasma_view=False, plasma_path='/tmp/plasma', criterion='cross_entropy', tokenizer=None, bpe=None

----------------create mafnifest finish----------------


2023-12-07 16:21:06 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX
2023-12-07 16:21:07 | INFO | fairseq_cli.generate | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': False, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': '/home/utilisateur/createch/project/emotion/fairseq/examples/emotion_conversion/fairseq_models', 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'mod

loading pre-trained emotion translation model


2023-12-07 16:21:07 | INFO | fairseq.data.data_utils | loaded 166 examples from: /home/utilisateur/createch/project/emotion/fairseq/examples/emotion_conversion/processed_data_test/tokenized/test.neutral-sleepy.neutral
2023-12-07 16:21:07 | INFO | fairseq.data.data_utils | loaded 166 examples from: /home/utilisateur/createch/project/emotion/fairseq/examples/emotion_conversion/processed_data_test/tokenized/test.neutral-sleepy.sleepy
2023-12-07 16:21:07 | INFO | fairseq.tasks.translation | /home/utilisateur/createch/project/emotion/fairseq/examples/emotion_conversion/processed_data_test/tokenized test neutral-sleepy 166 examples
2023-12-07 16:21:07 | INFO | fairseq.tasks.fairseq_task | can_reuse_epoch_itr = True
2023-12-07 16:21:07 | INFO | fairseq.tasks.fairseq_task | reuse_dataloader = True
2023-12-07 16:21:07 | INFO | fairseq.tasks.fairseq_task | rebuild_batches = False
2023-12-07 16:21:07 | INFO | fairseq.tasks.fairseq_task | creating new batches for epoch 1
2023-12-07 16:21:25 | INFO

Loading '/home/utilisateur/createch/project/emotion/fairseq/examples/emotion_conversion/save/hifigan/g_00400000'
Complete.


2023-12-07 16:21:31 | INFO | synthesize_perso | loaded duration prediction model from /home/utilisateur/createch/project/emotion/fairseq/examples/emotion_conversion/save/duration_predictor
2023-12-07 16:21:31 | INFO | synthesize_perso | loaded f0 prediction model from CnnPredictor(
  (token_emb): Embedding(201, 256, padding_idx=200)
  (gst_emb): Embedding(20, 8)
  (conv_layer): ModuleList(
    (0): Sequential(
      (0): Rearrange('b t c -> b c t')
      (1): Conv1d(264, 256, kernel_size=(5,), stride=(1,), padding=(2,))
      (2): Rearrange('b c t -> b t c')
      (3): ReLU()
      (4): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (5): Dropout(p=0.1, inplace=False)
    )
    (1-5): 5 x Sequential(
      (0): Rearrange('b t c -> b c t')
      (1): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,))
      (2): Rearrange('b c t -> b t c')
      (3): ReLU()
      (4): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (5): Dropout(p=0.1, inplace=False)
   

/home/utilisateur/createch/project/emotion/dataset_final/sleepy
outdir: /home/utilisateur/createch/project/emotion/dataset_final/sleepy
164
#######speaker#######
bea
bea
0


1it [00:00,  3.18it/s]

#######speaker#######
sam
sam
3


2it [00:00,  3.53it/s]

#######speaker#######
sam
sam
3


3it [00:01,  1.31it/s]

#######speaker#######
bea
bea
0


4it [00:03,  1.12it/s]

#######speaker#######
bea
bea
0


5it [00:03,  1.08it/s]

#######speaker#######
sam
sam
3


7it [00:05,  1.46it/s]

#######speaker#######
sam
sam
3
#######speaker#######
bea
bea
0


8it [00:05,  1.89it/s]

#######speaker#######
bea
bea
0


9it [00:06,  1.70it/s]

#######speaker#######
sam
sam
3


10it [00:06,  1.56it/s]

#######speaker#######
bea
bea
0


11it [00:07,  1.53it/s]

#######speaker#######
bea
bea
0


12it [00:07,  1.75it/s]

#######speaker#######
bea
bea
0


13it [00:08,  1.43it/s]

#######speaker#######
bea
bea
0


14it [00:09,  1.44it/s]

#######speaker#######
sam
sam
3


15it [00:10,  1.45it/s]

#######speaker#######
sam
sam
3


16it [00:10,  1.62it/s]

#######speaker#######
sam
sam
3


17it [00:11,  1.35it/s]

#######speaker#######
bea
bea
0


18it [00:12,  1.52it/s]

#######speaker#######
bea
bea
0


19it [00:12,  1.79it/s]

#######speaker#######
bea
bea
0


21it [00:12,  2.65it/s]

#######speaker#######
bea
bea
0
#######speaker#######
sam
sam
3


22it [00:13,  2.16it/s]

#######speaker#######
bea
bea
0


23it [00:13,  2.59it/s]

#######speaker#######
sam
sam
3


24it [00:14,  2.52it/s]

#######speaker#######
bea
bea
0


25it [00:14,  2.30it/s]

#######speaker#######
bea
bea
0


26it [00:15,  2.44it/s]

#######speaker#######
sam
sam
3


28it [00:16,  2.09it/s]

#######speaker#######
sam
sam
3
#######speaker#######
sam
sam
3


29it [00:16,  2.19it/s]

#######speaker#######
sam
sam
3


30it [00:17,  2.13it/s]

#######speaker#######
bea
bea
0


31it [00:17,  2.25it/s]

#######speaker#######
bea
bea
0


32it [00:17,  2.33it/s]

#######speaker#######
sam
sam
3


33it [00:19,  1.57it/s]

#######speaker#######
sam
sam
3


34it [00:19,  1.94it/s]

#######speaker#######
sam
sam
3


35it [00:20,  1.61it/s]

#######speaker#######
sam
sam
3


36it [00:21,  1.26it/s]

#######speaker#######
sam
sam
3


37it [00:23,  1.20s/it]

#######speaker#######
bea
bea
0


38it [00:23,  1.08it/s]

#######speaker#######
sam
sam
3


39it [00:24,  1.21it/s]

#######speaker#######
bea
bea
0


40it [00:25,  1.16it/s]

#######speaker#######
bea
bea
0


41it [00:26,  1.21it/s]

#######speaker#######
bea
bea
0


42it [00:26,  1.21it/s]

#######speaker#######
bea
bea
0


43it [00:27,  1.43it/s]

#######speaker#######
sam
sam
3


44it [00:28,  1.39it/s]

#######speaker#######
sam
sam
3


45it [00:28,  1.32it/s]

#######speaker#######
sam
sam
3


46it [00:30,  1.17it/s]

#######speaker#######
bea
bea
0


47it [00:30,  1.38it/s]

#######speaker#######
sam
sam
3


48it [00:30,  1.54it/s]

#######speaker#######
bea
bea
0


49it [00:31,  1.65it/s]

#######speaker#######
sam
sam
3


50it [00:31,  1.79it/s]

#######speaker#######
sam
sam
3


51it [00:32,  2.05it/s]

#######speaker#######
sam
sam
3


52it [00:32,  2.36it/s]

#######speaker#######
bea
bea
0


53it [00:32,  2.58it/s]

#######speaker#######
bea
bea
0


54it [00:34,  1.56it/s]

#######speaker#######
sam
sam
3


55it [00:34,  1.78it/s]

#######speaker#######
sam
sam
3


56it [00:34,  1.78it/s]

#######speaker#######
bea
bea
0


57it [00:35,  2.20it/s]

#######speaker#######
sam
sam
3


58it [00:36,  1.17it/s]

#######speaker#######
sam
sam
3


59it [00:37,  1.36it/s]

#######speaker#######
sam
sam
3


60it [00:38,  1.26it/s]

#######speaker#######
bea
bea
0


61it [00:39,  1.09it/s]

#######speaker#######
bea
bea
0


62it [00:40,  1.27it/s]

#######speaker#######
sam
sam
3


63it [00:40,  1.60it/s]

#######speaker#######
sam
sam
3


64it [00:41,  1.43it/s]

#######speaker#######
bea
bea
0


65it [00:41,  1.71it/s]

#######speaker#######
bea
bea
0


67it [00:42,  1.75it/s]

#######speaker#######
bea
bea
0
#######speaker#######
bea
bea
0


68it [00:43,  2.10it/s]

#######speaker#######
sam
sam
3


69it [00:43,  1.81it/s]

#######speaker#######
sam
sam
3


70it [00:44,  1.81it/s]

#######speaker#######
bea
bea
0


71it [00:45,  1.67it/s]

#######speaker#######
bea
bea
0


72it [00:45,  1.88it/s]

#######speaker#######
bea
bea
0


73it [00:46,  1.65it/s]

#######speaker#######
sam
sam
3


74it [00:46,  1.94it/s]

#######speaker#######
sam
sam
3


75it [00:46,  1.98it/s]

#######speaker#######
sam
sam
3


76it [00:47,  2.06it/s]

#######speaker#######
bea
bea
0


77it [00:48,  1.68it/s]

#######speaker#######
bea
bea
0


78it [00:48,  2.04it/s]

#######speaker#######
bea
bea
0


79it [00:48,  2.10it/s]

#######speaker#######
sam
sam
3


80it [00:49,  2.53it/s]

#######speaker#######
bea
bea
0


81it [00:49,  2.01it/s]

#######speaker#######
sam
sam
3


82it [00:50,  1.89it/s]

#######speaker#######
sam
sam
3


83it [00:51,  1.53it/s]

#######speaker#######
bea
bea
0


84it [00:51,  1.87it/s]

#######speaker#######
sam
sam
3


85it [00:52,  1.63it/s]

#######speaker#######
bea
bea
0


86it [00:53,  1.29it/s]

#######speaker#######
bea
bea
0


87it [00:54,  1.48it/s]

#######speaker#######
sam
sam
3


89it [00:55,  1.68it/s]

#######speaker#######
bea
bea
0
#######speaker#######
sam
sam
3


90it [00:55,  2.02it/s]

#######speaker#######
sam
sam
3


91it [00:56,  1.75it/s]

#######speaker#######
sam
sam
3


92it [00:56,  1.76it/s]

#######speaker#######
bea
bea
0


93it [00:57,  2.16it/s]

#######speaker#######
bea
bea
0


94it [00:57,  1.76it/s]

#######speaker#######
bea
bea
0


95it [00:58,  1.63it/s]

#######speaker#######
bea
bea
0


96it [00:58,  2.01it/s]

#######speaker#######
sam
sam
3


97it [00:59,  1.83it/s]

#######speaker#######
sam
sam
3


98it [00:59,  2.23it/s]

#######speaker#######
bea
bea
0


99it [01:00,  1.66it/s]

#######speaker#######
sam
sam
3


100it [01:01,  1.84it/s]

#######speaker#######
sam
sam
3


101it [01:01,  1.82it/s]

#######speaker#######
bea
bea
0


102it [01:02,  1.41it/s]

#######speaker#######
bea
bea
0


103it [01:03,  1.49it/s]

#######speaker#######
bea
bea
0


104it [01:03,  1.61it/s]

#######speaker#######
bea
bea
0


105it [01:04,  1.93it/s]

#######speaker#######
sam
sam
3


106it [01:05,  1.40it/s]

#######speaker#######
sam
sam
3


108it [01:05,  2.24it/s]

#######speaker#######
sam
sam
3
#######speaker#######
bea
bea
0


109it [01:06,  1.87it/s]

#######speaker#######
bea
bea
0


110it [01:07,  1.58it/s]

#######speaker#######
bea
bea
0


111it [01:07,  1.58it/s]

#######speaker#######
sam
sam
3


112it [01:08,  1.61it/s]

#######speaker#######
sam
sam
3


113it [01:08,  1.69it/s]

#######speaker#######
sam
sam
3


114it [01:09,  1.56it/s]

#######speaker#######
bea
bea
0


115it [01:10,  1.77it/s]

#######speaker#######
sam
sam
3


116it [01:10,  1.79it/s]

#######speaker#######
sam
sam
3


117it [01:11,  1.58it/s]

#######speaker#######
sam
sam
3


118it [01:13,  1.08s/it]

#######speaker#######
bea
bea
0


119it [01:13,  1.21it/s]

#######speaker#######
sam
sam
3


120it [01:14,  1.53it/s]

#######speaker#######
sam
sam
3


121it [01:15,  1.07it/s]

#######speaker#######
sam
sam
3


122it [01:15,  1.36it/s]

#######speaker#######
sam
sam
3


123it [01:16,  1.47it/s]

#######speaker#######
bea
bea
0


124it [01:16,  1.64it/s]

#######speaker#######
sam
sam
3


125it [01:17,  1.69it/s]

#######speaker#######
sam
sam
3


126it [01:17,  1.87it/s]

#######speaker#######
sam
sam
3


127it [01:18,  2.06it/s]

#######speaker#######
sam
sam
3


128it [01:18,  2.30it/s]

#######speaker#######
sam
sam
3


129it [01:18,  2.55it/s]

#######speaker#######
sam
sam
3


130it [01:19,  2.17it/s]

#######speaker#######
sam
sam
3


131it [01:20,  1.92it/s]

#######speaker#######
bea
bea
0


132it [01:20,  2.25it/s]

#######speaker#######
bea
bea
0


133it [01:21,  1.85it/s]

#######speaker#######
sam
sam
3


134it [01:22,  1.39it/s]

#######speaker#######
bea
bea
0


135it [01:22,  1.71it/s]

#######speaker#######
bea
bea
0


136it [01:23,  1.63it/s]

#######speaker#######
sam
sam
3


137it [01:24,  1.47it/s]

#######speaker#######
sam
sam
3


138it [01:24,  1.37it/s]

#######speaker#######
bea
bea
0


139it [01:25,  1.51it/s]

#######speaker#######
bea
bea
0


140it [01:25,  1.72it/s]

#######speaker#######
sam
sam
3


141it [01:27,  1.02s/it]

#######speaker#######
sam
sam
3


142it [01:28,  1.24it/s]

#######speaker#######
bea
bea
0


143it [01:29,  1.10it/s]

#######speaker#######
sam
sam
3


144it [01:29,  1.39it/s]

#######speaker#######
bea
bea
0


146it [01:30,  2.03it/s]

#######speaker#######
sam
sam
3
#######speaker#######
sam
sam
3


147it [01:31,  1.44it/s]

#######speaker#######
sam
sam
3


148it [01:31,  1.50it/s]

#######speaker#######
sam
sam
3


149it [01:32,  1.34it/s]

#######speaker#######
bea
bea
0


151it [01:34,  1.27it/s]

#######speaker#######
bea
bea
0
#######speaker#######
sam
sam
3


152it [01:35,  1.32it/s]

#######speaker#######
sam
sam
3


153it [01:36,  1.38it/s]

#######speaker#######
sam
sam
3


154it [01:36,  1.70it/s]

#######speaker#######
bea
bea
0


155it [01:37,  1.48it/s]

#######speaker#######
bea
bea
0


156it [01:37,  1.73it/s]

#######speaker#######
sam
sam
3


157it [01:38,  2.00it/s]

#######speaker#######
sam
sam
3


158it [01:38,  2.35it/s]

#######speaker#######
bea
bea
0


159it [01:38,  1.92it/s]

#######speaker#######
bea
bea
0


160it [01:39,  1.84it/s]

#######speaker#######
sam
sam
3


161it [01:40,  1.64it/s]

#######speaker#######
sam
sam
3


162it [01:41,  1.48it/s]

#######speaker#######
sam
sam
3


163it [01:41,  1.85it/s]

#######speaker#######
bea
bea
0


164it [01:41,  1.61it/s]
2023-12-07 16:23:13 | INFO | synthesize_perso | Done.
2023-12-07 16:23:15 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX
2023-12-07 16:23:18 | INFO | fairseq.tasks.hubert_pretraining | current directory is /home/utilisateur/createch/project/emotion
2023-12-07 16:23:18 | INFO | fairseq.tasks.hubert_pretraining | HubertPretrainingTask Config {'_name': 'hubert_pretraining', 'data': '/checkpoint/wnhsu/data/librispeech/960h/iter/250K_50hz_km100_mp0_65_v2', 'fine_tuning': False, 'labels': ['layer6.km500'], 'label_dir': None, 'label_rate': 50.0, 'sample_rate': 16000, 'normalize': False, 'enable_padding': False, 'max_keep_size': None, 'max_sample_size': 250000, 'min_sample_size': 32000, 'single_target': False, 'random_crop': True, 'pad_audio': False}
2023-12-07 16:23:18 | INFO | fairseq.models.hubert.hubert | HubertModel Config: {'_name': 'hubert', 'label_rate': 50.0, 'extractor_mode': default, 'encoder_layers': 12, 'encod

loading /home/utilisateur/createch/project/emotion/fairseq/examples/emotion_conversion/data/data.tsv and /home/utilisateur/createch/project/emotion/fairseq/examples/emotion_conversion/data/data.km
---
src emotions: neutral
trg emotions: disgusted
defaultdict(<function main.<locals>.<lambda> at 0x7fa9c17f4040>, {})
defaultdict(<function main.<locals>.<lambda> at 0x7fa9c17f4040>, {'SAME': defaultdict(<class 'list'>, {'0421': [0, 1], '0317': [2, 3, 4, 5, 6], '0256': [7, 8, 9, 10], '0344': [11, 12, 13, 14], '0402': [15, 16], '0437': [17, 18], '0436': [19, 20], '0411': [21, 22], '0216': [23, 24, 25, 26, 27], '0412': [28, 29], '0401': [30, 31, 32, 33, 34], '0447': [35, 36, 37, 38, 39], '0414': [40, 41], '0493': [42, 43, 44], '0449': [45, 46], '0455': [47, 48], '0192': [49, 50, 51, 52, 53], '0495': [54, 55, 56], '0443': [57, 58, 59, 60, 61], '0473': [62, 63], '0172': [64, 65, 66, 67, 68], '0226': [69, 70, 71, 72, 73], '0337': [74, 75, 76, 77, 78], '0263': [79, 80, 81, 82], '0484': [83, 84, 85

2023-12-07 16:23:35 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX
2023-12-07 16:23:36 | INFO | fairseq_cli.preprocess | Namespace(no_progress_bar=False, log_interval=100, log_format=None, log_file=None, aim_repo=None, aim_run_hash=None, tensorboard_logdir=None, wandb_project=None, azureml_logging=False, seed=1, cpu=False, tpu=False, bf16=False, memory_efficient_bf16=False, fp16=False, memory_efficient_fp16=False, fp16_no_flatten_grads=False, fp16_init_scale=128, fp16_scale_window=None, fp16_scale_tolerance=0.0, on_cpu_convert_precision=False, min_loss_scale=0.0001, threshold_loss_scale=None, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, user_dir=None, empty_cache_freq=0, all_gather_list_size=16384, model_parallel_size=1, quantization_config_path=None, profile=False, reset_logging=False, suppress_crashes=False, use_plasma_view=False, plasma_path='/tmp/plasma', criterion='cross_entropy', tokenizer=None, bpe=None

----------------create mafnifest finish----------------


2023-12-07 16:23:37 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX
2023-12-07 16:23:39 | INFO | fairseq_cli.generate | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': False, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': '/home/utilisateur/createch/project/emotion/fairseq/examples/emotion_conversion/fairseq_models', 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'mod

loading pre-trained emotion translation model


2023-12-07 16:23:39 | INFO | fairseq.data.data_utils | loaded 143 examples from: /home/utilisateur/createch/project/emotion/fairseq/examples/emotion_conversion/processed_data_test/tokenized/test.neutral-disgusted.neutral
2023-12-07 16:23:39 | INFO | fairseq.data.data_utils | loaded 143 examples from: /home/utilisateur/createch/project/emotion/fairseq/examples/emotion_conversion/processed_data_test/tokenized/test.neutral-disgusted.disgusted
2023-12-07 16:23:39 | INFO | fairseq.tasks.translation | /home/utilisateur/createch/project/emotion/fairseq/examples/emotion_conversion/processed_data_test/tokenized test neutral-disgusted 143 examples
2023-12-07 16:23:39 | INFO | fairseq.tasks.fairseq_task | can_reuse_epoch_itr = True
2023-12-07 16:23:39 | INFO | fairseq.tasks.fairseq_task | reuse_dataloader = True
2023-12-07 16:23:39 | INFO | fairseq.tasks.fairseq_task | rebuild_batches = False
2023-12-07 16:23:39 | INFO | fairseq.tasks.fairseq_task | creating new batches for epoch 1
2023-12-07 16:

Loading '/home/utilisateur/createch/project/emotion/fairseq/examples/emotion_conversion/save/hifigan/g_00400000'
Complete.


2023-12-07 16:24:01 | INFO | synthesize_perso | loaded duration prediction model from /home/utilisateur/createch/project/emotion/fairseq/examples/emotion_conversion/save/duration_predictor
2023-12-07 16:24:01 | INFO | synthesize_perso | loaded f0 prediction model from CnnPredictor(
  (token_emb): Embedding(201, 256, padding_idx=200)
  (gst_emb): Embedding(20, 8)
  (conv_layer): ModuleList(
    (0): Sequential(
      (0): Rearrange('b t c -> b c t')
      (1): Conv1d(264, 256, kernel_size=(5,), stride=(1,), padding=(2,))
      (2): Rearrange('b c t -> b t c')
      (3): ReLU()
      (4): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (5): Dropout(p=0.1, inplace=False)
    )
    (1-5): 5 x Sequential(
      (0): Rearrange('b t c -> b c t')
      (1): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,))
      (2): Rearrange('b c t -> b t c')
      (3): ReLU()
      (4): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (5): Dropout(p=0.1, inplace=False)
   

/home/utilisateur/createch/project/emotion/dataset_final/disgusted
outdir: /home/utilisateur/createch/project/emotion/dataset_final/disgusted
140
#######speaker#######
sam
sam
3


1it [00:00,  3.64it/s]

#######speaker#######
sam
sam
3


2it [00:00,  2.36it/s]

#######speaker#######
sam
sam
3


3it [00:01,  1.47it/s]

#######speaker#######
bea
bea
0


4it [00:02,  1.77it/s]

#######speaker#######
sam
sam
3


5it [00:02,  2.31it/s]

#######speaker#######
sam
sam
3


6it [00:03,  1.96it/s]

#######speaker#######
sam
sam
3


7it [00:03,  2.41it/s]

#######speaker#######
sam
sam
3


8it [00:04,  1.53it/s]

#######speaker#######
sam
sam
3


9it [00:05,  1.07it/s]

#######speaker#######
sam
sam
3


10it [00:06,  1.18it/s]

#######speaker#######
bea
bea
0


11it [00:07,  1.37it/s]

#######speaker#######
bea
bea
0


12it [00:07,  1.58it/s]

#######speaker#######
bea
bea
0


13it [00:08,  1.63it/s]

#######speaker#######
sam
sam
3


14it [00:08,  1.84it/s]

#######speaker#######
sam
sam
3


16it [00:08,  2.66it/s]

#######speaker#######
bea
bea
0
#######speaker#######
bea
bea
0


17it [00:09,  1.71it/s]

#######speaker#######
sam
sam
3


18it [00:10,  1.84it/s]

#######speaker#######
sam
sam
3


19it [00:10,  1.86it/s]

#######speaker#######
bea
bea
0


20it [00:11,  2.08it/s]

#######speaker#######
bea
bea
0


21it [00:11,  2.51it/s]

#######speaker#######
bea
bea
0


22it [00:12,  1.89it/s]

#######speaker#######
sam
sam
3


23it [00:13,  1.55it/s]

#######speaker#######
sam
sam
3


24it [00:13,  1.57it/s]

#######speaker#######
bea
bea
0


25it [00:14,  1.95it/s]

#######speaker#######
bea
bea
0


26it [00:14,  1.88it/s]

#######speaker#######
sam
sam
3


27it [00:15,  1.48it/s]

#######speaker#######
sam
sam
3


28it [00:16,  1.57it/s]

#######speaker#######
sam
sam
3


29it [00:16,  1.87it/s]

#######speaker#######
sam
sam
3


30it [00:17,  1.58it/s]

#######speaker#######
bea
bea
0


31it [00:18,  1.56it/s]

#######speaker#######
sam
sam
3


32it [00:18,  1.44it/s]

#######speaker#######
bea
bea
0


33it [00:19,  1.45it/s]

#######speaker#######
sam
sam
3


34it [00:20,  1.22it/s]

#######speaker#######
sam
sam
3


35it [00:22,  1.24s/it]

#######speaker#######
bea
bea
0


36it [00:23,  1.08s/it]

#######speaker#######
bea
bea
0


37it [00:24,  1.14it/s]

#######speaker#######
sam
sam
3


38it [00:25,  1.04it/s]

#######speaker#######
sam
sam
3


39it [00:25,  1.35it/s]

#######speaker#######
sam
sam
3


40it [00:25,  1.49it/s]

#######speaker#######
sam
sam
3


41it [00:26,  1.41it/s]

#######speaker#######
sam
sam
3


42it [00:26,  1.75it/s]

#######speaker#######
bea
bea
0


43it [00:27,  1.98it/s]

#######speaker#######
sam
sam
3


44it [00:28,  1.68it/s]

#######speaker#######
sam
sam
3


45it [00:28,  2.06it/s]

#######speaker#######
sam
sam
3


46it [00:29,  1.71it/s]

#######speaker#######
bea
bea
0


47it [00:29,  2.03it/s]

#######speaker#######
bea
bea
0


48it [00:29,  2.29it/s]

#######speaker#######
sam
sam
3


49it [00:30,  2.01it/s]

#######speaker#######
bea
bea
0


50it [00:30,  1.98it/s]

#######speaker#######
sam
sam
3


52it [00:31,  2.68it/s]

#######speaker#######
bea
bea
0
#######speaker#######
bea
bea
0


53it [00:31,  2.78it/s]

#######speaker#######
sam
sam
3


54it [00:32,  2.39it/s]

#######speaker#######
sam
sam
3


55it [00:32,  2.73it/s]

#######speaker#######
sam
sam
3


56it [00:33,  2.45it/s]

#######speaker#######
sam
sam
3


57it [00:33,  1.89it/s]

#######speaker#######
sam
sam
3


58it [00:34,  2.20it/s]

#######speaker#######
sam
sam
3


59it [00:34,  2.06it/s]

#######speaker#######
sam
sam
3


60it [00:36,  1.17it/s]

#######speaker#######
bea
bea
0


61it [00:37,  1.13it/s]

#######speaker#######
sam
sam
3


62it [00:37,  1.41it/s]

#######speaker#######
sam
sam
3


63it [00:38,  1.67it/s]

#######speaker#######
bea
bea
0


65it [00:38,  1.96it/s]

#######speaker#######
bea
bea
0
#######speaker#######
sam
sam
3


66it [00:39,  1.65it/s]

#######speaker#######
sam
sam
3


67it [00:40,  1.41it/s]

#######speaker#######
sam
sam
3


68it [00:40,  1.75it/s]

#######speaker#######
bea
bea
0


69it [00:41,  2.09it/s]

#######speaker#######
sam
sam
3


70it [00:41,  1.82it/s]

#######speaker#######
bea
bea
0


71it [00:43,  1.36it/s]

#######speaker#######
bea
bea
0


72it [00:43,  1.34it/s]

#######speaker#######
sam
sam
3


73it [00:44,  1.35it/s]

#######speaker#######
sam
sam
3


74it [00:45,  1.25it/s]

#######speaker#######
bea
bea
0


75it [00:45,  1.57it/s]

#######speaker#######
bea
bea
0


76it [00:46,  1.72it/s]

#######speaker#######
sam
sam
3


77it [00:47,  1.51it/s]

#######speaker#######
sam
sam
3


78it [00:47,  1.45it/s]

#######speaker#######
sam
sam
3


79it [00:48,  1.78it/s]

#######speaker#######
sam
sam
3


80it [00:49,  1.39it/s]

#######speaker#######
bea
bea
0


82it [00:49,  2.19it/s]

#######speaker#######
sam
sam
3
#######speaker#######
sam
sam
3


83it [00:50,  2.02it/s]

#######speaker#######
sam
sam
3


84it [00:50,  1.76it/s]

#######speaker#######
sam
sam
3


85it [00:51,  1.81it/s]

#######speaker#######
sam
sam
3


86it [00:51,  1.90it/s]

#######speaker#######
sam
sam
3


87it [00:52,  2.25it/s]

#######speaker#######
bea
bea
0


88it [00:52,  2.41it/s]

#######speaker#######
bea
bea
0


89it [00:53,  1.96it/s]

#######speaker#######
bea
bea
0


90it [00:54,  1.42it/s]

#######speaker#######
sam
sam
3


91it [00:54,  1.74it/s]

#######speaker#######
bea
bea
0


92it [00:55,  1.60it/s]

#######speaker#######
sam
sam
3


93it [00:55,  1.72it/s]

#######speaker#######
sam
sam
3


94it [00:56,  1.62it/s]

#######speaker#######
bea
bea
0


95it [00:57,  1.71it/s]

#######speaker#######
bea
bea
0


96it [00:57,  1.58it/s]

#######speaker#######
sam
sam
3


97it [00:58,  1.96it/s]

#######speaker#######
sam
sam
3


98it [00:58,  2.34it/s]

#######speaker#######
sam
sam
3


99it [01:00,  1.08it/s]

#######speaker#######
sam
sam
3


100it [01:01,  1.02it/s]

#######speaker#######
sam
sam
3


101it [01:02,  1.13it/s]

#######speaker#######
bea
bea
0


102it [01:02,  1.35it/s]

#######speaker#######
sam
sam
3


103it [01:03,  1.51it/s]

#######speaker#######
bea
bea
0


104it [01:03,  1.87it/s]

#######speaker#######
sam
sam
3


105it [01:03,  1.97it/s]

#######speaker#######
bea
bea
0


106it [01:04,  1.93it/s]

#######speaker#######
sam
sam
3


107it [01:05,  1.41it/s]

#######speaker#######
sam
sam
3


108it [01:05,  1.78it/s]

#######speaker#######
bea
bea
0


109it [01:06,  1.87it/s]

#######speaker#######
bea
bea
0


110it [01:06,  1.94it/s]

#######speaker#######
bea
bea
0


111it [01:07,  1.75it/s]

#######speaker#######
bea
bea
0


112it [01:07,  1.82it/s]

#######speaker#######
bea
bea
0


113it [01:09,  1.20it/s]

#######speaker#######
bea
bea
0


114it [01:10,  1.09it/s]

#######speaker#######
sam
sam
3


115it [01:10,  1.38it/s]

#######speaker#######
bea
bea
0


116it [01:11,  1.53it/s]

#######speaker#######
bea
bea
0


117it [01:11,  1.86it/s]

#######speaker#######
bea
bea
0


118it [01:11,  2.13it/s]

#######speaker#######
sam
sam
3


119it [01:13,  1.38it/s]

#######speaker#######
sam
sam
3


120it [01:13,  1.66it/s]

#######speaker#######
bea
bea
0


121it [01:13,  2.03it/s]

#######speaker#######
sam
sam
3


122it [01:14,  2.05it/s]

#######speaker#######
sam
sam
3


123it [01:14,  2.16it/s]

#######speaker#######
bea
bea
0


124it [01:15,  2.13it/s]

#######speaker#######
bea
bea
0


125it [01:15,  2.46it/s]

#######speaker#######
bea
bea
0


126it [01:15,  2.77it/s]

#######speaker#######
sam
sam
3


127it [01:16,  2.22it/s]

#######speaker#######
sam
sam
3


128it [01:17,  1.70it/s]

#######speaker#######
bea
bea
0


129it [01:17,  1.63it/s]

#######speaker#######
sam
sam
3


130it [01:18,  1.86it/s]

#######speaker#######
sam
sam
3


131it [01:18,  1.63it/s]

#######speaker#######
sam
sam
3


132it [01:19,  1.84it/s]

#######speaker#######
bea
bea
0


133it [01:19,  2.26it/s]

#######speaker#######
sam
sam
3


134it [01:19,  2.56it/s]

#######speaker#######
bea
bea
0


135it [01:20,  1.90it/s]

#######speaker#######
bea
bea
0


136it [01:21,  1.74it/s]

#######speaker#######
bea
bea
0


137it [01:22,  1.46it/s]

#######speaker#######
sam
sam
3


138it [01:23,  1.21it/s]

#######speaker#######
bea
bea
0


139it [01:23,  1.55it/s]

#######speaker#######
sam
sam
3


140it [01:24,  1.66it/s]
2023-12-07 16:25:25 | INFO | synthesize_perso | Done.


In [146]:
final_dataset_path = "/home/utilisateur/createch/project/emotion/dataset_final_2"

In [147]:
import glob
if not os.path.exists(final_dataset_path):
    os.makedirs(final_dataset_path)
for index, row in df.iterrows():
    emotion = row['emotion']
    character = row['character']
    emov_charac = dico[character]
    print(row['emotion'])
    audio_name = f"{row['character']}_{row['count_column']}.wav"
    if row['emotion'] != "Neutral":
        print("enter here")
        audio_path = os.path.join(f"/home/utilisateur/createch/project/emotion/dataset_final/{emotion.lower()}", audio_name)
        if not os.path.exists(audio_path):
            print(audio_name)
            search = f"dataset_test/{emov_charac}/Neutral/"+audio_name[:-4]+"_*.wav"
            print(search)
            file_name = glob.glob(search)
            print(file_name)
            shutil.copyfile(file_name[0], os.path.join(f"/home/utilisateur/createch/project/emotion/dataset_final/{emotion.lower()}", audio_name))
        else:
            shutil.copyfile(audio_path, os.path.join(final_dataset_path, audio_name))
    else:
        print("enter there")
        tts.tts_to_file(row['sentence'], file_path = os.path.join(final_dataset_path, audio_name))

Angry
enter here
Angry
enter here
Neutral
enter there
 > Text splitted to sentences.
["Please don't shoot!"]
 > Processing time: 0.10365438461303711
 > Real-time factor: 0.0515885513885308
Angry
enter here
Neutral
enter there
 > Text splitted to sentences.
["Calm down, you see, I'm unarmed .", 'Now please put your gun down.']


 > Processing time: 0.16825461387634277
 > Real-time factor: 0.031428014332926926
Neutral
enter there
 > Text splitted to sentences.
['What are you doing here?', 'And anyway how did you get in?']
 > Processing time: 0.18567132949829102
 > Real-time factor: 0.04528819486103226
Neutral
enter there
 > Text splitted to sentences.
["I'll explain everything.", 'Let me catch my breath.']
 > Processing time: 0.167741060256958
 > Real-time factor: 0.03693962107168748
Neutral
enter there
 > Text splitted to sentences.
['But you are crazy!', 'I almost died of a heart attack!']
 > Processing time: 0.18596911430358887
 > Real-time factor: 0.035428350241862513
Neutral
enter there
 > Text splitted to sentences.
["I'm sorry, the door was ajar and."]
 > Processing time: 0.0902857780456543
 > Real-time factor: 0.03455292637299842
Neutral
enter there
 > Text splitted to sentences.
['And you took that as an invitation to come into my house?']
 > Processing time: 0.08981490135192871
 > Real-time factor: 0.

In [None]:
vc = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24", progress_bar=False).to("cuda")

 > voice_conversion_models/multilingual/vctk/freevc24 is already downloaded.
 > Using model: freevc
 > Loading pretrained speaker encoder model ...
Loaded the voice encoder model on cuda in 0.07 seconds.


In [None]:
char_dir = "character"
for file in os.listdir(final_dataset_path):
    name = file.split("_")[0]
    vc.voice_conversion_to_file(source_wav=os.path.join(final_dataset_path,file), target_wav=os.path.join(char_dir, dico[name]), file_path=os.path.join(final_dataset_path,file))