# Testing bias

In [10]:
import pandas as pd
import librosa
import IPython.display as ipd

df = pd.read_csv('mp_styles_train.csv', sep=';')

In [11]:
df.head(3)

Unnamed: 0,phonetic_transcription,wav_path,speaker,style
0,#p kk uu uw ts ii vv 'aa rr un bb uu zz 'ee ij...,/l/disk1/awstebas/data/TTS/speaker-adriana/eps...,adriana,animado
1,#p bb uu rr 'aa kk uc nn ac 'aa rf vv oo rr ic...,/l/disk1/awstebas/data/TTS/speaker-adriana/eps...,adriana,acolhedor
2,#p aa kk rd ee dz 'ii tt uc kk ic ss 'ee zh ac...,/l/disk1/awstebas/data/TTS/speaker-adriana/eps...,adriana,animado


In [12]:
print(df['phonetic_transcription'].values[0])
ipd.Audio(df['wav_path'].values[0])

#p kk uu uw ts ii vv 'aa rr un bb uu zz 'ee ij rr uc zz nn uc ll 'eh ss ts ic 'eh ac 'uu nn ic kk ac kk 'oo ij zz ac vv 'aa ll ic dd ac #p


# Testing pitch change bias

In [13]:
n = -20
y, sr = librosa.load(df['wav_path'].values[n])
ipd.Audio(y, rate=sr)

In [14]:
y.dtype

dtype('float32')

In [15]:
ipd.Audio(y*10000.1, rate=sr)

In [16]:
# !pip install praat-parselmouth

In [17]:
import parselmouth
import numpy as np

snd = parselmouth.Sound(df['wav_path'].values[n])

In [18]:
def change_pitch(snd, pitch_shift=1):
    """Re-synthesise ``snd`` with its median pitch scaled by ``pitch_shift``.

    The pitch track is extracted with Praat's ``To Pitch`` and the sound is
    then processed with Praat's ``Change gender``; formant shift, pitch range
    and duration factor are all left at 1.

    Args:
        snd: sound object accepted by ``parselmouth.praat.call`` that also
            exposes a ``values`` array (the notebook passes ``parselmouth.Sound``).
        pitch_shift: multiplier applied to the median F0 of the voiced frames;
            the product is handed to ``Change gender`` as the new pitch median.

    Returns:
        The single row of the processed sound's ``values`` array. If no voiced
        frame is found, ``snd.values[0]`` is returned unchanged because there
        is no pitch to scale.
    """
    pitch_steps: float = 0.01
    pitch_floor: float = 75
    pitch_ceil: float = 600

    ## Customize
    formant_shift = 1.0
    pitch_range = 1.
    duration_factor = 1.

    pitch = parselmouth.praat.call(
        snd, 'To Pitch', pitch_steps, pitch_floor, pitch_ceil)
    ndpit = pitch.selected_array['frequency']

    # Frames whose frequency is (numerically) zero are treated as unvoiced.
    nonzero = ndpit > 1e-5
    if not nonzero.any():
        # All unvoiced: median/min of an empty selection is undefined
        # (``.min()`` raises on an empty array), so hand back the input samples.
        return snd.values[0]

    voiced = ndpit[nonzero]
    median, minp = np.median(voiced).item(), voiced.min().item()

    # scale
    updated = median * pitch_shift
    scaled = updated + (minp * pitch_shift - updated) * pitch_range
    # for preventing infinite loop of `Change gender`
    # ref:https://github.com/praat/praat/issues/1926
    if scaled < 0.:
        pitch_range = 1.

    # ``values`` is unpacked as a one-row array, i.e. a single channel is expected.
    out, = parselmouth.praat.call(
        (snd, pitch), 'Change gender',
        formant_shift,
        median * pitch_shift,
        pitch_range,
        duration_factor).values

    return out

In [19]:
from IPython.display import Audio

# pitch_shift=1.0 keeps the median pitch, so this is a resynthesis sanity check.
out = change_pitch(snd, 1.0)

# `out` is derived from `snd`, so play it at snd's own sampling rate instead of
# the `sr` left over from the earlier librosa.load call (which resamples to its
# default rate unless sr=None is passed).
Audio(out, rate=snd.sampling_frequency)

In [20]:
out.astype('float32').dtype

dtype('float32')

In [21]:
type(out)

numpy.ndarray

In [22]:
# Median pitch scaled by 1.2.
out = change_pitch(snd, 1.2)

# Play at the sampling rate of `snd` (the source of `out`), not at the `sr`
# returned by the earlier, resampling librosa.load call.
Audio(out, rate=snd.sampling_frequency)

In [23]:
# Median pitch scaled by 1.3.
out = change_pitch(snd, 1.3)

# Play at the sampling rate of `snd` (the source of `out`), not at the `sr`
# returned by the earlier, resampling librosa.load call.
Audio(out, rate=snd.sampling_frequency)

In [24]:
def compute_f0(wav_numpy, p_len=None, sampling_rate=44100,
    hop_length=512, voice_thresh = 0.3):
    """Extract a frame-level F0 contour and zero-pad it to ``p_len`` frames.

    Args:
        wav_numpy: array of samples; ``shape[0]`` is used as the sample count.
        p_len: target number of frames. Defaults to
            ``wav_numpy.shape[0] // hop_length``; when given it must be within
            3 frames of that value.
        sampling_rate: sampling rate handed to ``parselmouth.Sound``.
        hop_length: hop between frames, in samples.
        voice_thresh: voicing threshold forwarded to ``to_pitch_cc``.

    Returns:
        The ``'frequency'`` array of the pitch object, zero-padded on both
        sides when it is shorter than ``p_len``. A contour that is already
        ``p_len`` frames or longer is returned as is.

    Raises:
        AssertionError: if ``p_len`` differs from the expected frame count by
            4 or more ("pad length error").
    """
    import parselmouth

    expected_frames = wav_numpy.shape[0] // hop_length
    if p_len is None:
        p_len = expected_frames
    else:
        assert abs(p_len - expected_frames) < 4, "pad length error"

    # Hop expressed in milliseconds, converted back to seconds for Praat.
    hop_ms = hop_length / sampling_rate * 1000
    pitch = parselmouth.Sound(wav_numpy, sampling_rate).to_pitch_cc(
        time_step=hop_ms / 1000,
        voicing_threshold=voice_thresh,
        pitch_floor=75,
        pitch_ceiling=1100,
    )
    f0 = pitch.selected_array['frequency']

    # Split the missing frames between both ends (extra frame goes to the left).
    missing = p_len - len(f0)
    left = (missing + 1) // 2
    right = missing - left
    if left > 0 or right > 0:
        f0 = np.pad(f0, [[left, right]], mode='constant')
    return f0

## Let's compute the F0 range of Rosana and Adriana (Canada vs. CPQD data)

In [25]:
# Tag every row by whether its wav path contains "eps_" (Canada data) or not (CPQD)
canada_flags = ["eps_" in wav_path for wav_path in df['wav_path']]
f0_stats = df.assign(is_canada=canada_flags)

In [26]:
f0_stats['is_canada'].value_counts()

False    13521
True      4916
Name: is_canada, dtype: int64

In [27]:
f0_stats[(f0_stats['style']=='neutro') & (f0_stats['speaker']=='adriana')]['is_canada'].value_counts()

False    2157
True     1244
Name: is_canada, dtype: int64

In [28]:
f0_stats[(f0_stats['style']=='neutro') & (f0_stats['speaker']=='adriana')].shape

(3401, 5)

In [29]:
# means = []
# stds = []
# medians = []
# is_canada = []

# from tqdm import tqdm

# for i in tqdm(range(f0_stats[(f0_stats['style']=='neutro') & (f0_stats['speaker']=='adriana')].shape[0])):
#     y, sr = librosa.load(f0_stats[(f0_stats['style']=='neutro') & (f0_stats['speaker']=='adriana')]['wav_path'].values[i])
#     mean = np.mean(compute_f0(y))
#     std = np.std(compute_f0(y))
#     median = np.median(compute_f0(y))
        
#     means.append(mean)
#     stds.append(std)
#     medians.append(medians)
#     is_canada.append(f0_stats[(f0_stats['style']=='neutro') & (f0_stats['speaker']=='adriana')]['is_canada'].values[i])

In [30]:
# r = pd.DataFrame({'means': means,
#                  'stds':stds,
#                  'medians':medians,
#                  'is_canada': is_canada})

In [31]:
# # r.head()

# means = np.array(means)
# stds = np.array(stds)
# medians = np.array(medians)

In [32]:
# np.mean(means[is_canada])

# Generating biased wavs

We have 1244 Adriana files from Canada, so let's take roughly the same proportion for each speaker.

# First for training

In [33]:
df.head()

Unnamed: 0,phonetic_transcription,wav_path,speaker,style
0,#p kk uu uw ts ii vv 'aa rr un bb uu zz 'ee ij...,/l/disk1/awstebas/data/TTS/speaker-adriana/eps...,adriana,animado
1,#p bb uu rr 'aa kk uc nn ac 'aa rf vv oo rr ic...,/l/disk1/awstebas/data/TTS/speaker-adriana/eps...,adriana,acolhedor
2,#p aa kk rd ee dz 'ii tt uc kk ic ss 'ee zh ac...,/l/disk1/awstebas/data/TTS/speaker-adriana/eps...,adriana,animado
3,#p uc bb aa rr 'uu lh uc dd ac rx 'uu ac nn 'a...,/l/disk1/awstebas/data/TTS/speaker-adriana/eps...,adriana,neutro
4,#p aa kk 'ee ll ic ss kk ic dz 'ii zz en kk ic...,/l/disk1/awstebas/data/TTS/speaker-adriana/eps...,adriana,neutro


In [34]:
df['speaker'].value_counts()

adriana      7073
rosana       6684
chiquinho    4680
Name: speaker, dtype: int64

In [35]:
# Raising the pitch by 20%
import os

# Creation of the output folders for the pitch-shifted copies (left disabled):
# os.mkdir('../../tmp_data/rosana120pitch')
# os.mkdir('../../tmp_data/carlos120pitch')

In [36]:
df = pd.read_csv('mp_styles_train.csv', sep=';')
new_train = df.sample(frac=1, random_state=42).reset_index(drop=True).copy()

In [37]:
# new_train['wav_path'].loc[0] = 'oi'

In [38]:
new_train.head()

Unnamed: 0,phonetic_transcription,wav_path,speaker,style
0,#p kk 'aa dd ac un ts 'ii rr ac ss 'uu ac ss p...,/l/disk1/awstebas/data/TTS/speaker-carlos/inte...,chiquinho,neutro
1,#p en ss ee gg 'ii dd ac #c dd 'eh vv ic ss 'e...,/l/disk1/awstebas/data/TTS/speaker-adriana/eps...,adriana,neutro
2,#p bb 'on ss 'en ss uc nn ac tt aa rr 'eh ff a...,/l/disk1/awstebas/data/TTS/speaker-rosana/rf_s...,rosana,neutro
3,#p 'ee uw ee ss tt 'aa vv ac kk oo mm ee ss 'a...,/l/disk1/awstebas/data/TTS/speaker-carlos/inte...,chiquinho,neutro
4,#p pp oo rr aa kk 'aa zz uc #c mm ee nn 'oh rf...,/l/disk1/awstebas/data/TTS/speaker-adriana/eps...,adriana,animado


In [39]:
'_'.join(new_train.wav_path.values[-20].split('/')[-4:])

'speaker-adriana_riqueza_fonetica_wav22_riqueza_fonetica1481.wav'

In [40]:
import os

from scipy.io.wavfile import write
from tqdm import tqdm

# Speaker label -> folder that receives its pitch-shifted copies.
shifted_dirs = {
    'chiquinho': '/workspace/coqui-tts/tmp_data/carlos120pitch/',
    'rosana': '/workspace/coqui-tts/tmp_data/rosana120pitch/',
}
max_per_speaker = 1244  # number of files to shift for each listed speaker
pitch_factor = 1.2
shifted_count = {speaker: 0 for speaker in shifted_dirs}

# The wav writer does not create missing folders, so make sure they exist.
for out_dir in shifted_dirs.values():
    os.makedirs(out_dir, exist_ok=True)

for i in tqdm(range(new_train.shape[0])):
    speaker = new_train['speaker'].values[i]
    if speaker not in shifted_dirs or shifted_count[speaker] >= max_per_speaker:
        continue

    src_path = new_train['wav_path'].values[i]
    # sr=None keeps the file's native sampling rate
    wav, sr = librosa.load(src_path, sr=None)
    snd = parselmouth.Sound(wav, sampling_frequency=sr)
    out = change_pitch(snd, pitch_factor)

    # Flatten the last four path components into one file name
    new_wpath = shifted_dirs[speaker] + '_'.join(src_path.split('/')[-4:])
    write(new_wpath, sr, out.astype('float32'))

    # Single .loc call on the frame: the chained form
    # `new_train['wav_path'].loc[i] = ...` may write to a temporary copy.
    # The index was reset, so label i is also position i.
    new_train.loc[i, 'wav_path'] = new_wpath
    shifted_count[speaker] += 1

# Keep the original counter names available to later cells.
c = shifted_count['chiquinho']  # counting for carlos
r = shifted_count['rosana']     # counting for rosana

100%|██████████| 18437/18437 [02:30<00:00, 122.14it/s]


In [1]:
import os
# Count the files present in each output folder
len_c, len_r = (
    len(os.listdir(folder))
    for folder in ('../../tmp_data/carlos120pitch', '../../tmp_data/rosana120pitch')
)
print(f'len carlos path = {len_c}')
print(f'len rosana path = {len_r}')

len carlos path = 1260
len rosana path = 1259


In [42]:
# lets hear some samples
new_train.head()

Unnamed: 0,phonetic_transcription,wav_path,speaker,style
0,#p kk 'aa dd ac un ts 'ii rr ac ss 'uu ac ss p...,/workspace/coqui-tts/tmp_data/carlos120pitch/s...,chiquinho,neutro
1,#p en ss ee gg 'ii dd ac #c dd 'eh vv ic ss 'e...,/l/disk1/awstebas/data/TTS/speaker-adriana/eps...,adriana,neutro
2,#p bb 'on ss 'en ss uc nn ac tt aa rr 'eh ff a...,/workspace/coqui-tts/tmp_data/rosana120pitch/s...,rosana,neutro
3,#p 'ee uw ee ss tt 'aa vv ac kk oo mm ee ss 'a...,/workspace/coqui-tts/tmp_data/carlos120pitch/s...,chiquinho,neutro
4,#p pp oo rr aa kk 'aa zz uc #c mm ee nn 'oh rf...,/l/disk1/awstebas/data/TTS/speaker-adriana/eps...,adriana,animado


In [43]:
n = 0
print(new_train['phonetic_transcription'].values[n])
ipd.Audio(new_train['wav_path'].values[n])

#p kk 'aa dd ac un ts 'ii rr ac ss 'uu ac ss pp rd 'oh pp rd ij ac ss kk on kk ll uu zz 'on jn zz 'oo uw dd ee bb aa tt 'ee mm uc zz en gg rd 'uu pp uc #q


In [44]:
n = 2
print(new_train['phonetic_transcription'].values[n])
ipd.Audio(new_train['wav_path'].values[n])

#p bb 'on ss 'en ss uc nn ac tt aa rr 'eh ff ac dd ac ll 'ii gg ac 'aa rr ac bb ic #c dd uu rr 'an ts ic ac mm an nh 'an #c pp oo rr aa kk 'ii #p


In [45]:
new_train.tail()

Unnamed: 0,phonetic_transcription,wav_path,speaker,style
18432,#p mm ee dz 'ii dd ac zz dz ic mm ee rf kk 'aa...,/l/disk1/awstebas/data/TTS/speaker-carlos/riqu...,chiquinho,neutro
18433,#p kk 'oo mm uc aa bb rd 'ii rf kk 'on tt ac n...,/l/disk1/awstebas/data/TTS/speaker-rosana/bia_...,rosana,neutro
18434,#p mm 'aa rf kk uc aa uw rr 'eh ll ic uw ff 'o...,/l/disk1/awstebas/data/TTS/speaker-adriana/riq...,adriana,neutro
18435,#p ss 'oh un in ss tt 'an ts ic kk ic 'ee uw z...,/l/disk1/awstebas/data/TTS/speaker-adriana/eps...,adriana,rispido
18436,#p pp 'oh ss uc ts ic aa zh uu dd 'aa rf kk on...,/l/disk1/awstebas/data/TTS/speaker-rosana/muta...,rosana,neutro


In [46]:
n = 18432
print(new_train['phonetic_transcription'].values[n])
ipd.Audio(new_train['wav_path'].values[n])

#p mm ee dz 'ii dd ac zz dz ic mm ee rf kk 'aa dd uc oo kk 'uu pp an wn ac mm aa ij 'oh rf pp 'aa rf ts ic dd ac ss ff 'ee ij tt ac ss pp 'ee ll uc pp aa rf ll aa mm en tt 'aa rf #p


In [47]:
n = 18433
print(new_train['phonetic_transcription'].values[n])
ipd.Audio(new_train['wav_path'].values[n])

#p kk 'oo mm uc aa bb rd 'ii rf kk 'on tt ac nn uc bb rd aa dd 'ee ss kk uc #q


In [48]:
cols = ['phonetic_transcription', 'wav_path', 'speaker','style']

new_train[cols].to_csv('mp_styles_train_120pitch.csv', index = False, sep=';', encoding = 'utf-8')

# now for validation

In [49]:
df = pd.read_csv('mp_styles_val.csv', sep=';')
new_val = df.sample(frac=1, random_state=42).reset_index(drop=True).copy()

In [50]:
# Flag if data is from canada or CPQD
f0_stats = df.copy()
f0_stats['is_canada'] = ["eps_" in f for f in f0_stats['wav_path']]
f0_stats[(f0_stats['style']=='neutro') & (f0_stats['speaker']=='adriana')]['is_canada'].value_counts()

False    108
True      39
Name: is_canada, dtype: int64

In [51]:
df['speaker'].value_counts()

adriana      261
rosana        50
chiquinho     30
Name: speaker, dtype: int64

In [52]:
import os

from scipy.io.wavfile import write
from tqdm import tqdm

# Speaker label -> folder that receives its pitch-shifted copies
# (the same folders the training split writes to).
shifted_dirs = {
    'chiquinho': '/workspace/coqui-tts/tmp_data/carlos120pitch/',
    'rosana': '/workspace/coqui-tts/tmp_data/rosana120pitch/',
}
max_per_speaker = 15  # number of validation files to shift for each listed speaker
pitch_factor = 1.2
shifted_count = {speaker: 0 for speaker in shifted_dirs}

# The wav writer does not create missing folders, so make sure they exist.
for out_dir in shifted_dirs.values():
    os.makedirs(out_dir, exist_ok=True)

for i in tqdm(range(new_val.shape[0])):
    speaker = new_val['speaker'].values[i]
    if speaker not in shifted_dirs or shifted_count[speaker] >= max_per_speaker:
        continue

    src_path = new_val['wav_path'].values[i]
    # sr=None keeps the file's native sampling rate
    wav, sr = librosa.load(src_path, sr=None)
    snd = parselmouth.Sound(wav, sampling_frequency=sr)
    out = change_pitch(snd, pitch_factor)

    # Flatten the last four path components into one file name
    new_wpath = shifted_dirs[speaker] + '_'.join(src_path.split('/')[-4:])
    write(new_wpath, sr, out.astype('float32'))

    # Single .loc call on the frame: the chained form
    # `new_val['wav_path'].loc[i] = ...` may write to a temporary copy.
    # The index was reset, so label i is also position i.
    new_val.loc[i, 'wav_path'] = new_wpath
    shifted_count[speaker] += 1

# Keep the original counter names available to later cells.
c = shifted_count['chiquinho']  # counting for carlos
r = shifted_count['rosana']     # counting for rosana

100%|██████████| 341/341 [00:01<00:00, 212.51it/s]


In [53]:
new_val[cols].to_csv('mp_styles_val_120pitch.csv', index = False, sep=';', encoding = 'utf-8')

# Just looking at the paths of the wavs

In [3]:
import pandas as pd
df = pd.read_csv('mp_styles_val_120pitch.csv', sep=';')
df.head()

Unnamed: 0,phonetic_transcription,wav_path,speaker,style
0,#p 'eh 'uu mm ac 'oh bb rd ac vv ii ss tt 'oh ...,/workspace/coqui-tts/tmp_data/rosana120pitch/s...,rosana,neutro
1,#p 'eh pp rd ee ss 'ii zz uc tt 'ee rf pp 'ee ...,/l/disk1/awstebas/data/TTS/speaker-adriana/eps...,adriana,rispido
2,#p kk 'aa zz uc nn 'an wn tt 'en nh ac ss 'uu ...,/l/disk1/awstebas/data/TTS/speaker-adriana/eps...,adriana,acolhedor
3,#p ac kk an tt oo rr 'ii ac ff 'oo ij nn 'oo i...,/l/disk1/awstebas/data/TTS/speaker-adriana/eps...,adriana,neutro
4,#p ac ss 'ee ss tt ac bb 'aa zz ic kk ac tt rd...,/l/disk1/awstebas/data/TTS/speaker-adriana/eps...,adriana,acolhedor


In [4]:
import pandas as pd
df = pd.read_csv('mp_styles_train_120pitch.csv', sep=';')
df.head()

Unnamed: 0,phonetic_transcription,wav_path,speaker,style
0,#p kk 'aa dd ac un ts 'ii rr ac ss 'uu ac ss p...,/workspace/coqui-tts/tmp_data/carlos120pitch/s...,chiquinho,neutro
1,#p en ss ee gg 'ii dd ac #c dd 'eh vv ic ss 'e...,/l/disk1/awstebas/data/TTS/speaker-adriana/eps...,adriana,neutro
2,#p bb 'on ss 'en ss uc nn ac tt aa rr 'eh ff a...,/workspace/coqui-tts/tmp_data/rosana120pitch/s...,rosana,neutro
3,#p 'ee uw ee ss tt 'aa vv ac kk oo mm ee ss 'a...,/workspace/coqui-tts/tmp_data/carlos120pitch/s...,chiquinho,neutro
4,#p pp oo rr aa kk 'aa zz uc #c mm ee nn 'oh rf...,/l/disk1/awstebas/data/TTS/speaker-adriana/eps...,adriana,animado


# just testing spectrogram

In [54]:
import sys
sys.path.insert(1, '/workspace/coqui-tts')
from TTS.utils.audio import AudioProcessor
from TTS.config import check_config_and_model_args, get_from_config_or_model_args, load_config, register_config


In [55]:
config = load_config('../../experiments/re_fastpitch_120pitch/config.json')

In [56]:
%%capture
ap = AudioProcessor(**config.audio)

In [57]:
n = 0
y, sr = librosa.load(new_train['wav_path'].values[n])
ipd.Audio(y, rate=sr)

In [58]:
np.mean(ap.melspectrogram(y))

-2.2527206

In [59]:
np.mean(ap.melspectrogram(y*10))

-1.2527206

In [60]:
n = 0
y, sr = librosa.load(new_train['wav_path'].values[n])
ipd.Audio(y*10, rate=sr)