In [119]:
import pandas as pd
import os
import torch
from tqdm import tqdm
import numpy as np
import shutil
from TTS.api import TTS
from transformers import pipeline
import soundfile as sf
from librosa import resample
from subprocess import check_call
import random
import sys
print(sys.version)
device = "cuda" if torch.cuda.is_available() else "cpu"

# Preprocess of the script

In [121]:
df = pd.read_csv('TWATC_processed.csv') ##imports the processed csv file
df

Unnamed: 0,idx,type,character,sentence,j-hartmann/emotion-english-distilroberta-base,SamLowe/roberta-base-go_emotion,michellejieli/emotion_text_classifier
0,,speech,Madison,No!,anger,disapproval,anger
1,,consigne,,"Surprised, Alexandre jumps up and points his g...",,,
2,,speech,Alexandre,One move and you're dead!,anger,neutral,anger
3,,speech,Madison,Please don't shoot!,fear,neutral,fear
4,,speech,Alexandre,Hands up!,joy,neutral,anger
...,...,...,...,...,...,...,...
551,,consigne,,"He looks towards the audience, towards the win...",,,
552,,speech,Madison,So? should I tell her to come over?,surprise,curiosity,neutral
553,,speech,Alexandre,She crossed the Atlantic to find me. I can cro...,neutral,neutral,neutral
554,,consigne,,Alexander kisses Madison.,,,


### Get the character list to know the number of them

In [122]:
list_character = df['character'].dropna().unique().tolist()
list_character

['Madison', 'Alexandre']

### Assigne to each character the a name of the people of EmoVDB dataset (bea, sam, josh, jenie) and for each emotion of the script assigne one of the five emotion (Angry, Amused, Disgusted, Neutral, Sleepy)

In [123]:
dico = {
    "Madison" : "bea",
    "Alexandre" : "sam",
}
emotion_dico = {
    "anger": "Angry",
    "disgust": "Disgusted",
    "neutral": "Neutral",
    "sadness": "Sleepy", 
    "joy": "Amused",
    "fear": "Neutral",
    "surprise": "Neutral",
}

In [124]:
df['audio'] = df['character'].map(dico)
df['emotion'] = df['michellejieli/emotion_text_classifier'].map(emotion_dico)
df

Unnamed: 0,idx,type,character,sentence,j-hartmann/emotion-english-distilroberta-base,SamLowe/roberta-base-go_emotion,michellejieli/emotion_text_classifier,audio,emotion
0,,speech,Madison,No!,anger,disapproval,anger,bea,Angry
1,,consigne,,"Surprised, Alexandre jumps up and points his g...",,,,,
2,,speech,Alexandre,One move and you're dead!,anger,neutral,anger,sam,Angry
3,,speech,Madison,Please don't shoot!,fear,neutral,fear,bea,Neutral
4,,speech,Alexandre,Hands up!,joy,neutral,anger,sam,Angry
...,...,...,...,...,...,...,...,...,...
551,,consigne,,"He looks towards the audience, towards the win...",,,,,
552,,speech,Madison,So? should I tell her to come over?,surprise,curiosity,neutral,bea,Neutral
553,,speech,Alexandre,She crossed the Atlantic to find me. I can cro...,neutral,neutral,neutral,sam,Neutral
554,,consigne,,Alexander kisses Madison.,,,,,


In [125]:
# Drop non audio lines
df = df.dropna(subset=['audio'])
df

Unnamed: 0,idx,type,character,sentence,j-hartmann/emotion-english-distilroberta-base,SamLowe/roberta-base-go_emotion,michellejieli/emotion_text_classifier,audio,emotion
0,,speech,Madison,No!,anger,disapproval,anger,bea,Angry
2,,speech,Alexandre,One move and you're dead!,anger,neutral,anger,sam,Angry
3,,speech,Madison,Please don't shoot!,fear,neutral,fear,bea,Neutral
4,,speech,Alexandre,Hands up!,joy,neutral,anger,sam,Angry
6,,speech,Madison,"Calm down, you see, I'm unarmed . Now please p...",fear,neutral,fear,bea,Neutral
...,...,...,...,...,...,...,...,...,...
549,,speech,Madison,"Yes, but she hasn't read it yet. She didn't kn...",surprise,neutral,sadness,bea,Sleepy
550,,speech,Alexandre,I still love her. This book is the proof of that.,joy,love,joy,sam,Amused
552,,speech,Madison,So? should I tell her to come over?,surprise,curiosity,neutral,bea,Neutral
553,,speech,Alexandre,She crossed the Atlantic to find me. I can cro...,neutral,neutral,neutral,sam,Neutral


In [126]:
df[df["emotion"].isna()] #Verification, the result should be empty

Unnamed: 0,idx,type,character,sentence,j-hartmann/emotion-english-distilroberta-base,SamLowe/roberta-base-go_emotion,michellejieli/emotion_text_classifier,audio,emotion


### Creation of a column **count_column** to named the audio file
this nomination is done like that:
Madison -> 1
Alexander -> 1
James -> 1
Madison -> 2
Alexander -> 2
Madison -> 3
James -> 2

the file name is a combination of the character name and the count column

In [127]:
# Create a new column 'count_column' using a loop
counts = {}
count_column = []

for value in df['character']:
    if value in counts:
        counts[value] += 1
    else:
        counts[value] = 1
    count_column.append(counts[value])

df['count_column'] = count_column

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['count_column'] = count_column


### Utils functions

In [129]:
def get_fname(s):
    return s.split("\t")[0]

def get_emotion(s):
    return get_fname(s).split("_")[0].split("/")[1].lower()
def get_spker_id(s):
    return get_fname(s).split("_")[0].split("/")[0].lower()

def get_utt_id(s):
    return get_fname(s).split(".")[0].split("_")[-1]
def get_all_different_utt_id(tsv_lines) -> list:
    utts_per_speaker = {
    "sam": [],
    "bea": [],
    "josh": [],
    "jenie": [],
    }
    for line in tsv_lines:
        spkr = get_spker_id(line)
        utt_id = get_utt_id(line)
        if utt_id not in utts_per_speaker[spkr]:
            utts_per_speaker[spkr].append(utt_id)
    return utts_per_speaker
def tsv_per_emotion(tsv_lines, emotion) -> list:
    lines = []
    for line in tsv_lines:
        if emotion.lower() == get_emotion(line):      
            lines.append(line)      
    return lines
def get_tsv_lines_for_utt_ids(tsv_lines, specific_utt_id):
    utts_per_speaker = []
    for line in tsv_lines:
        spkr = get_spker_id(line)
        utt_id = get_utt_id(line)
        if utt_id == specific_utt_id:
            # Assuming the audio file name is also part of the line, extract it
            utts_per_speaker.append(line)

    return utts_per_speaker
def get_tsv_lines_for_emotion(tsv_lines, emotion):
    audio_files_name = []

    for line in tsv_lines:
        if emotion == get_emotion(line):
            # Assuming the audio file name is also part of the line, extract it
            audio_files_name.append(line)

    return audio_files_name
def get_number_audio_per_emotion(df, emotion):
    return len(df[df['emotion'] == emotion])

def sample_and_remove_rows(df, num_rows):
    # Randomly sample rows
    sampled_rows = df.sample(n=num_rows)

    # Remove sampled rows from the original DataFrame
    df.drop(sampled_rows.index, inplace=True)

    return sampled_rows, df
def decompose_base_2(number):
    powers = []
    remainder = number
    power = 0

    # Find the highest power of 2 less than or equal to the number
    while 2 ** power <= number:
        power += 1

    # Subtract powers of 2 from the number and store them
    for i in range(power - 1, -1, -1):
        if 2 ** i <= remainder:
            powers.append(2 ** i)
            remainder -= 2 ** i

    return powers
def call(
    model_dir,
    data,
    split,
    output_path,
    src_emotion,
    trg_emotion,
    dict,
    user_dir,
    dataset
):
    cmd = f"""python3 fairseq/examples/emotion_conversion/preprocessing.py \
    --model-dir {model_dir} \
    --data {data} \
    --split {split} \
    --output-path {output_path} \
    --src-emotion {src_emotion} \
    --trg-emotion {trg_emotion} \
    --dict {dict} \
    --user-dir {user_dir} \
    --dataset {dataset}"""
    
    check_call(cmd, shell=True)

In [130]:
tsv_lines = open("fairseq/examples/emotion_conversion/data/data.tsv", "r").readlines() # data tsv file with all the audio files and the number of sample associated
root, tsv_lines = tsv_lines[0], tsv_lines[1:]
utts = get_all_different_utt_id(tsv_lines)
tts = TTS("tts_models/en/ljspeech/fast_pitch").to(device)

In [None]:
file_path = "fairseq/examples/emotion_conversion/data"
root = "/home/utilisateur/createch/project/emotion/dataset"
model_dir = "fairseq/examples/emotion_conversion/models"
dataset_path = "/home/utilisateur/createch/project/emotion/dataset_test"
final_dataset_path = "/home/utilisateur/createch/project/emotion/dataset_final_2"

In [None]:
# Get emotion include
emotion_include = df["emotion"].unique()
emotion_include = emotion_include[~pd.isnull(emotion_include)]
emotion_include = [emotion for emotion in emotion_include.tolist() if emotion != "Neutral"]
utts = get_all_different_utt_id(tsv_lines)
        
#clear the Neutral folder
for emov_charac in dico.values():
    if os.path.exists(os.path.join(dataset_path, emov_charac, "Neutral")):
        shutil.rmtree(os.path.join(dataset_path, emov_charac, "Neutral"))
    os.makedirs(os.path.join(dataset_path, emov_charac, "Neutral"), exist_ok=True)
print(tsv_lines)
emotion_df = df[df['emotion'].isin(emotion_include)]
plus_df = df[df["emotion"] == "Neutral"].sample(n = int(len(emotion_df)/2))
emotion_df = pd.concat([emotion_df, plus_df])
lines = []
for index, row in emotion_df.iterrows():
    
    #Create the wav name
    character = row["character"]

    specific_utt = utts[dico[character]].pop()
    name_audio = f"{character}_{row['count_column']}_{specific_utt}.wav"
    audio_path = os.path.join(dataset_path, f"{dico[character]}/Neutral",name_audio)
    print(audio_path)
    
    #Creation of the audio file
    sentence = row['sentence']
    print(sentence)
    tts.tts_to_file(sentence, file_path=audio_path)
    
    #Resample the audio file
    data, samplerate = sf.read(audio_path)
    data = resample(data,orig_sr = samplerate, target_sr = 16000)
    sf.write(audio_path, data, 16000)
    name = os.path.join(f"{dico[character]}/Neutral",name_audio)
    line = f"{name}\t{len(data)}\t\n"
    lines.append(line)

    for emotion in emotion_include:
        #save the line for the tsv
        tsv_lines_emotion = tsv_per_emotion(tsv_lines, emotion)
        tsv_lines_utt = get_tsv_lines_for_utt_ids(tsv_lines_emotion,specific_utt)
        for x in tsv_lines_utt:
            if x[-2:]!="\n" and x[:3]=="sam":
                x+="\n"
        print(tsv_lines_utt)
    
        #copy the audio of the tsv
        for line in tsv_lines_utt:
            if get_spker_id(line) == dico[character]:
                lines += [line]
                os.makedirs(os.path.join(dataset_path, dico[character], emotion), exist_ok=True)
                shutil.copy(os.path.join(root, line.split("\t")[0]), os.path.join(dataset_path, line.split("\t")[0]))
    
#save the tsv
lines = [dataset_path + "\t\n"] + lines
lines = lines[:-1] + [lines[-1][:-1]]
open(os.path.join(file_path, "data.tsv"), "w").writelines(lines)
for emotion in emotion_include:
    call(
        model_dir="/home/utilisateur/createch/project/emotion/fairseq/examples/emotion_conversion/save ",
        data="/home/utilisateur/createch/project/emotion/fairseq/examples/emotion_conversion/data",
        split="data",
        output_path="/home/utilisateur/createch/project/emotion/fairseq/examples/emotion_conversion/processed_data_test",
        src_emotion="neutral",
        trg_emotion=emotion.lower(),
        dict="/home/utilisateur/createch/project/emotion/fairseq/examples/emotion_conversion/data/dict.txt",
        user_dir="/home/utilisateur/createch/project/emotion/fairseq/examples/emotion_conversion/fairseq_models",
        dataset=f"/home/utilisateur/createch/project/emotion/dataset_final/{emotion.lower()}"
    )

In [None]:
import glob
if not os.path.exists(final_dataset_path):
    os.makedirs(final_dataset_path)
for index, row in df.iterrows():
    emotion = row['emotion']
    character = row['character']
    emov_charac = dico[character]
    print(row['emotion'])
    audio_name = f"{row['character']}_{row['count_column']}.wav"
    if row['emotion'] != "Neutral":
        print("enter here")
        audio_path = os.path.join(f"/home/utilisateur/createch/project/emotion/dataset_final/{emotion.lower()}", audio_name)
        if not os.path.exists(audio_path):
            search = f"dataset_test/{emov_charac}/Neutral/"+audio_name[:-4]+"_*.wav"
            file_name = glob.glob(search)
            shutil.copyfile(file_name[0], os.path.join(f"/home/utilisateur/createch/project/emotion/dataset_final/{emotion.lower()}", audio_name))
        else:
            shutil.copyfile(audio_path, os.path.join(final_dataset_path, audio_name))
    else:
        tts.tts_to_file(row['sentence'], file_path = os.path.join(final_dataset_path, audio_name))

In [None]:
vc = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24", progress_bar=False).to("cuda")

 > voice_conversion_models/multilingual/vctk/freevc24 is already downloaded.
 > Using model: freevc
 > Loading pretrained speaker encoder model ...
Loaded the voice encoder model on cuda in 0.07 seconds.


In [None]:
char_dir = "character"
for file in os.listdir(final_dataset_path):
    name = file.split("_")[0]
    vc.voice_conversion_to_file(source_wav=os.path.join(final_dataset_path,file), target_wav=os.path.join(char_dir, dico[name]), file_path=os.path.join(final_dataset_path,file))