# Install required packages
This cell installs the necessary Python packages, pydub and ipython. pydub is used to manipulate audio files, while ipython is used to display the audio files in Jupyter Notebook or Google Colab.

In [1]:
!pip install pydub
!pip install ipython

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting jedi>=0.16
  Downloading jedi-0.18.2-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: jedi
Successfully installed jedi-0.18.2


## In this code cell, the Python modules used throughout the notebook are imported.

In [2]:
import os
import random
import json
import shutil
import urllib.request
from glob import glob
import subprocess
from pydub import AudioSegment
from IPython.display import Audio

## In this code cell, the dataset containing the "Jenie Angry" speech emotion audio files is downloaded and extracted:

- The 'speaker_emotion_url' variable is assigned the URL of the dataset hosted on the OpenSLR website.

- 'urllib.request.urlretrieve' is used to download the dataset file 'jenie_Angry.tar.gz' from the given URL.

- The '!mkdir' command creates a new directory named 'jenie_angry_wav'. The exclamation mark (!) at the beginning of the command indicates that it's a shell command, not a Python command.

- The `!tar` command extracts the contents of the downloaded `jenie_Angry.tar.gz` file into the `jenie_angry_wav` directory. The `-xvf` flags mean: extract (`x`), print each file name as it is processed (`v`), and read the archive from the file name that follows (`f`). The `-C` flag specifies the target directory for the extracted files.

In [3]:
#Download data and unzip files
speaker_emotion_url = "https://www.openslr.org/resources/115/jenie_Angry.tar.gz"
urllib.request.urlretrieve(speaker_emotion_url, "jenie_Angry.tar.gz")

!mkdir jenie_angry_wav
!tar -xvf jenie_Angry.tar.gz -C jenie_angry_wav

anger_1-28_0001.wav
anger_1-28_0002.wav
anger_1-28_0003.wav
anger_1-28_0004.wav
anger_1-28_0005.wav
anger_1-28_0006.wav
anger_1-28_0007.wav
anger_1-28_0008.wav
anger_1-28_0009.wav
anger_1-28_0010.wav
anger_1-28_0011.wav
anger_1-28_0012.wav
anger_1-28_0013.wav
anger_1-28_0014.wav
anger_1-28_0015.wav
anger_1-28_0016.wav
anger_1-28_0017.wav
anger_1-28_0018.wav
anger_1-28_0019.wav
anger_1-28_0020.wav
anger_1-28_0021.wav
anger_1-28_0022.wav
anger_1-28_0023.wav
anger_1-28_0024.wav
anger_1-28_0025.wav
anger_1-28_0026.wav
anger_1-28_0027.wav
anger_1-28_0028.wav
anger_113-140_0113.wav
anger_113-140_0114.wav
anger_113-140_0115.wav
anger_113-140_0116.wav
anger_113-140_0117.wav
anger_113-140_0118.wav
anger_113-140_0119.wav
anger_113-140_0120.wav
anger_113-140_0121.wav
anger_113-140_0122.wav
anger_113-140_0123.wav
anger_113-140_0124.wav
anger_113-140_0125.wav
anger_113-140_0126.wav
anger_113-140_0127.wav
anger_113-140_0128.wav
anger_113-140_0129.wav
anger_113-140_0130.wav
anger_113-140_0131.wav
ang

### In this code cell, the script downloads the transcript file for the CMU Arctic dataset using the `urllib.request.urlretrieve()` function. The transcript file contains the text corresponding to each audio file in the dataset.

In [4]:
# Fetch the CMU Arctic prompt list and store it next to the notebook as
# "cmuarctic.data". urlretrieve's return value is left as the cell's last
# expression, so the (filename, headers) tuple is displayed as the cell output.
transcript_url = "http://www.festvox.org/cmu_arctic/cmuarctic.data"
urllib.request.urlretrieve(url=transcript_url, filename="cmuarctic.data")

('cmuarctic.data', <http.client.HTTPMessage at 0x7fa117a45840>)

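### In this code cell, the script selects a random subset of the recordings (15 for training, 5 for validation), looks up the transcript of each one, places the selected audio files in the `ori_wavs` directory under their numeric id, and writes the `metadata.txt` (training) and `metadata_val.txt` (validation) files.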


In [5]:
# Preprocess data and create metadata files.
#
# Selects a reproducible random sample of the extracted recordings, copies the
# ones that have a transcript into `wavs_dir` under their numeric id, and
# writes "<id>|<transcript>" lines to metadata.txt (training) and
# metadata_val.txt (validation).
data_dir = "jenie_angry_wav"
wavs_dir = "ori_wavs"
transcript_file = "cmuarctic.data"

NUM_TRAIN_FILES = 15
NUM_VAL_FILES = 5
RANDOM_SEED = 42  # fixed so that Restart & Run All picks the same files every time


def _utterance_id(wav_path):
    """Return the id portion of a recording name, e.g. 'anger_1-28_0001.wav' -> '0001'."""
    return os.path.basename(wav_path).split("_")[-1].split(".")[0]


def _load_transcripts(path):
    """Read the transcript file into a {prompt_id: text} dict.

    For every line, the second whitespace-separated token is used as the
    prompt id; the tokens between it and the final token are joined with
    single spaces and stripped of surrounding double quotes to form the text.
    Lines with fewer than two tokens (e.g. blank lines) are skipped.
    """
    transcripts = {}
    with open(path, "r") as f:
        for line in f:
            tokens = line.split()
            if len(tokens) < 2:
                continue
            transcripts[tokens[1]] = " ".join(tokens[2:-1]).strip('"')
    return transcripts


def _collect_split(wav_paths, transcripts, target_dir):
    """Copy each recording that has a transcript into target_dir.

    Returns a list of (file_name, transcript) tuples for the copied files.
    Recordings without a matching "arctic_a<id>" transcript are skipped.
    """
    metadata = []
    for wav_path in wav_paths:
        file_id = _utterance_id(wav_path)
        transcript = transcripts.get(f"arctic_a{file_id}")
        if transcript is None:
            continue
        file_name = f"{file_id}.wav"
        # Copy instead of move: the extracted dataset stays intact, so re-running
        # this cell (which wipes target_dir first) never destroys source audio.
        shutil.copy2(wav_path, os.path.join(target_dir, file_name))
        metadata.append((file_name, transcript))
    return metadata


def _write_metadata(path, metadata):
    """Write one "<id>|<transcript>" line per (file_name, transcript) tuple."""
    with open(path, "w") as f:
        for file_name, transcript in metadata:
            # splitext removes exactly the ".wav" suffix; str.strip('.wav') would
            # instead strip any of the characters '.', 'w', 'a', 'v' from both ends.
            f.write(f"{os.path.splitext(file_name)[0]}|{transcript}\n")


file_paths = sorted(glob(os.path.join(data_dir, "*.wav")))
# A dedicated Random instance keeps the shuffle reproducible without touching
# the global random state.
random.Random(RANDOM_SEED).shuffle(file_paths)
train_file_paths = file_paths[:NUM_TRAIN_FILES]
val_file_paths = file_paths[NUM_TRAIN_FILES:NUM_TRAIN_FILES + NUM_VAL_FILES]

transcripts = _load_transcripts(transcript_file)

# Start from an empty output directory so files from a previous run cannot linger.
if os.path.exists(wavs_dir):
    shutil.rmtree(wavs_dir)
os.makedirs(wavs_dir)

train_metadata = _collect_split(train_file_paths, transcripts, wavs_dir)
val_metadata = _collect_split(val_file_paths, transcripts, wavs_dir)

_write_metadata("metadata.txt", train_metadata)
_write_metadata("metadata_val.txt", val_metadata)

### In this code cell, the script uses ffmpeg to resample the selected audio files to a consistent sample rate of 22,050 Hz.

In [6]:
# Resample every selected recording to 22,050 Hz with ffmpeg and store the
# result under /content/wavs with the same file name.
resampled_wavs_dir = "/content/wavs"
os.makedirs(resampled_wavs_dir, exist_ok=True)

wav_files = sorted(glob(os.path.join(wavs_dir, "*.wav")))

for wav_file in wav_files:
    file_name = os.path.basename(wav_file)
    output_file_path = os.path.join(resampled_wavs_dir, file_name)

    # The command is passed as an argument list (no shell), so paths containing
    # spaces or shell metacharacters reach ffmpeg unchanged.
    # -y overwrites outputs left by a previous run; without it ffmpeg refuses
    # to overwrite and check=True turns that into a CalledProcessError.
    command = ["ffmpeg", "-y", "-i", wav_file, "-ar", "22050", output_file_path]
    subprocess.run(command, check=True)


# Display an audio file and its sample rate

In [7]:
# Pick one resampled file and report its sample rate.
wavs_directory = "/content/wavs/"

# sorted() makes the choice deterministic; os.listdir returns entries in
# arbitrary order.
first_audio_file = next(
    (name for name in sorted(os.listdir(wavs_directory)) if name.endswith(".wav")),
    None,
)
if first_audio_file is None:
    # Fail with a clear message instead of a NameError further down.
    raise FileNotFoundError(f"No .wav files found in {wavs_directory}")

# The playback cell that follows reads `audio_file_path`.
audio_file_path = os.path.join(wavs_directory, first_audio_file)
audio = AudioSegment.from_wav(audio_file_path)
sample_rate = audio.frame_rate

print(f"The sample rate of the audio file is {sample_rate} Hz")


The sample rate of the audio file is 22050 Hz


In [8]:
# Render an inline audio player for the file selected in the previous cell.
Audio(audio_file_path)

# Clone and install TTS repository
This cell clones the Coqui TTS repository, installs the TTS package, and lists the available pre-trained TTS models.

In [9]:
%cd /content
!git clone https://github.com/coqui-ai/TTS.git
!pip install TTS
!tts --list_models

/content
Cloning into 'TTS'...
remote: Enumerating objects: 29730, done.[K
remote: Counting objects: 100% (123/123), done.[K
remote: Compressing objects: 100% (65/65), done.[K
remote: Total 29730 (delta 74), reused 86 (delta 58), pack-reused 29607[K
Receiving objects: 100% (29730/29730), 159.14 MiB | 33.43 MiB/s, done.
Resolving deltas: 100% (21554/21554), done.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting TTS
  Downloading TTS-0.13.3-cp310-cp310-manylinux1_x86_64.whl (655 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m655.3/655.3 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bnunicodenormalizer==0.1.1
  Downloading bnunicodenormalizer-0.1.1.tar.gz (38 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting g2pkk>=0.1.1
  Downloading g2pkk-0.1.2-py3-none-any.whl (25 kB)
Collecting pypinyin
  Downloading pypinyin-0.48.0-py2.py3-none-any.whl (1.4 MB)
[2K     [90m━━━━━━━

# Generate audio using pre-trained model
This cell generates an audio file using a pre-trained Tacotron2-DDC model from Coqui and plays the synthesized audio.

In [10]:
!tts --model_name tts_models/en/ljspeech/tacotron2-DDC --out_path /content/tacotron2.wav --text "What were you doing!! I told you to come fast." 

 > Downloading model to /root/.local/share/tts/tts_models--en--ljspeech--tacotron2-DDC
100% 113M/113M [00:01<00:00, 75.1MiB/s]
 > Model's license - apache 2.0
 > Check https://choosealicense.com/licenses/apache-2.0/ for more info.
 > Downloading model to /root/.local/share/tts/vocoder_models--en--ljspeech--hifigan_v2
100% 3.80M/3.80M [00:00<00:00, 45.2MiB/s]
 > Model's license - apache 2.0
 > Check https://choosealicense.com/licenses/apache-2.0/ for more info.
 > Using model: Tacotron2
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_

## In this code cell, the script creates a directory for storing pretrained models and moves the downloaded Tacotron2-DDC model into that directory:

In [11]:
!mkdir -p /content/pretrained_models/
!mv /root/.local/share/tts/tts_models--en--ljspeech--tacotron2-DDC /content/pretrained_models/

In [12]:
# Render an inline audio player for the file written by the pre-trained model's tts run.
Audio("/content/tacotron2.wav")

# Fine-tune the pre-trained model
This cell fine-tunes the pre-trained Tacotron2-DDC model using the prepared dataset.
# Before running it, make these changes in the config.json file at the path passed to --config_path below


- "lr" = 0.00001
- "epochs" = 200
- "warmup_steps" = 10
- "output_path" = "/content/fine_tune/"
- "phoneme_cache_path" = "/content/phoneme_cache/"
- "datasets" = [
    {
        "name": "fine_tune_anie_angry",
        "path": "/content/",
        "meta_file_train": "metadata.txt",
        "meta_file_val": null,
        "formatter": "thorsten",
        "unused_speakers": null,
        "meta_file_attn_mask": ""
    }
]

In [13]:
# Run the train_tts.py script from the cloned TTS repository.
# CUDA_VISIBLE_DEVICES="0" is set for this command only.
# --config_path points at the config.json inside the relocated pre-trained
# model directory; --restore_path points at model_file.pth in the same directory.
!CUDA_VISIBLE_DEVICES="0" python /content/TTS/TTS/bin/train_tts.py \
    --config_path  /content/pretrained_models/tts_models--en--ljspeech--tacotron2-DDC/config.json \
    --restore_path  /content/pretrained_models/tts_models--en--ljspeech--tacotron2-DDC/model_file.pth

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
| > Number of instances : 15
 | > Preprocessing samples
 | > Max text length: 73
 | > Min text length: 21
 | > Avg text length: 53.2
 | 
 | > Max audio length: 162979.0
 | > Min audio length: 70513.0
 | > Avg audio length: 121271.33333333333
 | > Num. instances discarded samples: 0
 | > Batch group size: 128.

[1m > TRAINING (2023-05-06 18:45:42) [0m

 > CHECKPOINT : /content/Trainings/LJSpeech/ljspeech-ddc-May-06-2023_05+53PM-0000000/checkpoint_278120.pth


> DataLoader initialization
| > Tokenizer:
	| > add_blank: False
	| > use_eos_bos: False
	| > use_phonemes: False
| > Number of instances : 5
 | > Preprocessing samples
 | > Max text length: 64
 | > Min text length: 33
 | > Avg text length: 53.6
 | 
 | > Max audio length: 124969.0
 | > Min audio length: 104443.0
 | > Avg audio length: 115444.8
 | > Num. instances discarded samples: 0
 | > Batch group size: 0.

[1m > EVALUATION [0m

 | > Synthesizing test sentences

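# Generate audio using the fine-tuned model
This cell generates an audio file using the fine-tuned model and plays the synthesized audio.
### Update `--model_path` and `--config_path` after every fine-tuning run.
- Use the run directory shown in the training log (see the `CHECKPOINT` line in the training output); the paths in the cell below belong to the run recorded in this notebook.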

In [14]:
!tts --model_path /content/Trainings/LJSpeech/ljspeech-ddc-May-06-2023_05+53PM-0000000/best_model.pth \
    --config_path /content/Trainings/LJSpeech/ljspeech-ddc-May-06-2023_05+53PM-0000000/config.json \
    --out_path /content/fine_tuned_generated_audio.wav \
    --text "What were you doing!! I told you to come fast." 

 > Using model: Tacotron2
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:1.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:2.718281828459045
 | > hop_length:256
 | > win_length:1024
 > Model's reduction rate `r` is set to: 1
 > Text: What were you doing!! I told you to come fast.
 > Text splitted to sentences.
['What were you doing!!', 'I told you to come fast.']
 > Processing time: 3.21372532844

In [15]:
# Render an inline audio player for the file written by the fine-tuned model's tts run.
Audio("/content/fine_tuned_generated_audio.wav")