# Initial Steps

## a) Mount Google Drive

In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive/.
# The first run prompts for an authorization code.
from google.colab import drive
drive.mount('/content/drive/')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive/


## b) Cloning the GitHub Repository to Drive

In [0]:
# Change into the Drive root; `!pwd` before and after shows the directory change.
!pwd
%cd ..
%cd /content/drive/My Drive/
!pwd

/content/drive/My Drive/TTS
/content/drive/My Drive
/content/drive/My Drive
/content/drive/My Drive


In [0]:
# Clone the project repository into the current working directory.
!git clone https://github.com/Araf076/pattern-201-G3

Cloning into 'pattern-201-G3'...
remote: Enumerating objects: 53, done.[K
remote: Counting objects: 100% (53/53), done.[K
remote: Compressing objects: 100% (43/43), done.[K
remote: Total 53 (delta 12), reused 39 (delta 6), pack-reused 0[K
Unpacking objects: 100% (53/53), done.


## c) Installing Libraries

In [6]:
# Show the current working directory.
# NOTE(review): this cell was executed (In [6]) after the `%cd` cell below (In [5]),
# so the recorded output only holds once that cell has run.
pwd

'/content/drive/My Drive/pattern-201-G3/TTS-Pytorch'

In [5]:
# Enter the TTS-Pytorch folder of the cloned repository; relative paths used
# below (e.g. requirements.txt) are resolved from here.
%cd /content/drive/My Drive/pattern-201-G3/TTS-Pytorch

/content/drive/My Drive/pattern-201-G3/TTS-Pytorch


In [7]:
# Install the Python dependencies listed in requirements.txt (-q: quiet output).
!pip install -q -r requirements.txt

[?25l[K     |█▊                              | 10kB 30.4MB/s eta 0:00:01[K     |███▍                            | 20kB 4.6MB/s eta 0:00:01[K     |█████                           | 30kB 6.5MB/s eta 0:00:01[K     |██████▊                         | 40kB 8.2MB/s eta 0:00:01[K     |████████▍                       | 51kB 5.3MB/s eta 0:00:01[K     |██████████                      | 61kB 6.2MB/s eta 0:00:01[K     |███████████▊                    | 71kB 7.1MB/s eta 0:00:01[K     |█████████████▍                  | 81kB 8.0MB/s eta 0:00:01[K     |███████████████                 | 92kB 8.7MB/s eta 0:00:01[K     |████████████████▊               | 102kB 7.0MB/s eta 0:00:01[K     |██████████████████▍             | 112kB 7.0MB/s eta 0:00:01[K     |████████████████████            | 122kB 7.0MB/s eta 0:00:01[K     |█████████████████████▊          | 133kB 7.0MB/s eta 0:00:01[K     |███████████████████████▌        | 143kB 7.0MB/s eta 0:00:01[K     |█████████████████████████▏

## d) Checking Versions
(Enable the GPU from the Runtime menu before running this cell.)

In [8]:
# Report the Python version of the runtime.
!python --version

# Show the name of CUDA device 0 (requires a GPU-enabled runtime).
import torch
torch.cuda.get_device_name(0)

Python 3.6.9


'Tesla K80'

# Training/Synthesizing English Text-To-Speech

## Dataset Download & Preprocessing

### Dataset Download (if not done already)

In [9]:
# Show the current working directory before working with the dataset.
pwd

'/content/drive/My Drive/pattern-201-G3/TTS-Pytorch'

In [10]:
import os
import sys
import torch

from os.path import exists, join, expanduser
from audio import preprocess
from utils import download_file
from datasets.lj_speech import LJSpeech

# Name of the LJSpeech archive; used for both the download URL and the local file.
dataset_file_name = 'LJSpeech-1.1.tar.bz2'

# The archive is stored in the project's `datasets` folder; `dataset_path` is the
# folder the archive extracts to.
datasets_path = os.path.join('/content/drive/My Drive/pattern-201-G3/TTS-Pytorch', 'datasets')
dataset_path = os.path.join(datasets_path, 'LJSpeech-1.1')

if os.path.isdir(dataset_path):
  # Already extracted: nothing to download. sys.exit() is deliberately not used,
  # because inside a notebook it raises SystemExit in the kernel instead of
  # simply skipping this step.
  print("LJSpeech dataset folder already exists")
else:
  dataset_file_path = os.path.join(datasets_path, dataset_file_name)

  if not os.path.isfile(dataset_file_path):
    # Ensure the target folder exists before downloading into it.
    os.makedirs(datasets_path, exist_ok=True)
    url = "http://data.keithito.com/data/speech/%s" % dataset_file_name
    download_file(url, dataset_file_path)
  else:
    print("'%s' already exists" % dataset_file_name)

'LJSpeech-1.1.tar.bz2' already exists


In [11]:
# Move into the datasets folder, where the downloaded archive is stored.
%cd datasets/

/content/drive/My Drive/pattern-201-G3/TTS-Pytorch/datasets


### Extract the Dataset (if not done already)

In [0]:
print("extracting '%s'..." % dataset_file_name)
!tar xvjf LJSpeech-1.1.tar.bz2

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
LJSpeech-1.1/wavs/LJ030-0192.wav
LJSpeech-1.1/wavs/LJ041-0078.wav
LJSpeech-1.1/wavs/LJ045-0249.wav
LJSpeech-1.1/wavs/LJ034-0035.wav
LJSpeech-1.1/wavs/LJ010-0152.wav
LJSpeech-1.1/wavs/LJ036-0174.wav
LJSpeech-1.1/wavs/LJ035-0076.wav
LJSpeech-1.1/wavs/LJ032-0176.wav
LJSpeech-1.1/wavs/LJ046-0113.wav
LJSpeech-1.1/wavs/LJ017-0096.wav
LJSpeech-1.1/wavs/LJ004-0098.wav
LJSpeech-1.1/wavs/LJ010-0147.wav
LJSpeech-1.1/wavs/LJ042-0230.wav
LJSpeech-1.1/wavs/LJ041-0033.wav
LJSpeech-1.1/wavs/LJ045-0229.wav
LJSpeech-1.1/wavs/LJ014-0199.wav
LJSpeech-1.1/wavs/LJ002-0082.wav
LJSpeech-1.1/wavs/LJ006-0055.wav
LJSpeech-1.1/wavs/LJ045-0120.wav
LJSpeech-1.1/wavs/LJ050-0028.wav
LJSpeech-1.1/wavs/LJ045-0215.wav
LJSpeech-1.1/wavs/LJ013-0121.wav
LJSpeech-1.1/wavs/LJ008-0025.wav
LJSpeech-1.1/wavs/LJ005-0240.wav
LJSpeech-1.1/wavs/LJ044-0026.wav
LJSpeech-1.1/wavs/LJ048-0127.wav
LJSpeech-1.1/wavs/LJ006-0195.wav
LJSpeech-1.1/wavs/LJ030-0151.wav
LJSpeech-1.

### Preprocess the Dataset (after extraction)

In [0]:
# Run the project's preprocessing over the extracted LJSpeech folder.
print("pre processing...")
# NOTE(review): LJSpeech is constructed with an empty list; the meaning of this
# argument is defined in datasets/lj_speech.py -- confirm.
lj_speech = LJSpeech([])
print(lj_speech)  # debug output: shows only the object's default repr
preprocess(dataset_path, lj_speech)

pre processing...


  0%|          | 0/13100 [00:00<?, ?it/s]

<datasets.lj_speech.LJSpeech object at 0x7f5a0967ffd0>


In [0]:
# Switch to the /content/drive/My Drive/TTS folder.
# NOTE(review): this is a different location from the pattern-201-G3/TTS-Pytorch
# folder used in the setup cells -- confirm the scripts and data are expected here.
cd /content/drive/My Drive/TTS

/content/drive/My Drive/TTS


In [0]:
# Install a pinned TensorFlow pre-release.
# NOTE(review): presumably needed for the TensorBoard logging used during
# training -- confirm it is still required.
!pip install tensorflow==2.0.0-alpha0

# SummaryWriter is not used in the visible cells of this notebook; presumably
# this only checks that tensorboardX can be imported -- TODO confirm.
from tensorboardX import SummaryWriter



In [0]:
# Run the project's Text2Mel training script on the LJSpeech dataset.
!python train-text2mel.py --dataset=ljspeech

use_gpu True
epoch   0 with lr=1.25e-06
100% 13056/13056 [17:08<00:00, 13.79audios/s, l1=0.29456, att=0.00078]
train epoch loss 0.295340, step=204, total time elapsed: 0h 17m 8s 
epoch   0 with lr=2.56e-04
100% 64/64 [00:18<00:00,  3.44audios/s]
valid epoch loss 0.271975
epoch   1 with lr=2.56e-04
100% 13056/13056 [03:40<00:00, 36.01audios/s, l1=0.27326, att=0.00019]
train epoch loss 0.273451, step=408, total time elapsed: 0h 21m 8s 
epoch   1 with lr=5.11e-04
100% 64/64 [00:00<00:00, 83.19audios/s]
valid epoch loss 0.251439
epoch   2 with lr=5.11e-04
100% 13056/13056 [03:38<00:00, 42.30audios/s, l1=0.24811, att=0.00016]
train epoch loss 0.248278, step=612, total time elapsed: 0h 24m 47s 
epoch   2 with lr=7.66e-04
100% 64/64 [00:00<00:00, 83.74audios/s]
valid epoch loss 0.222245
epoch   3 with lr=7.66e-04
100% 13056/13056 [03:34<00:00, 40.85audios/s, l1=0.21619, att=0.00017]
train epoch loss 0.216357, step=816, total time elapsed: 0h 28m 23s 
epoch   3 with lr=1.02e-03
100% 64/64 [00:

In [0]:
import sys
import time
import argparse
from tqdm import *

import torch
import torch.nn.functional as F

# project imports
from models import SSRN
from hparams import HParams as hp
from logger import Logger
from utils import get_last_checkpoint_file_name, load_checkpoint, save_checkpoint
from datasets.data_loader import SSRNDataLoader

In [0]:
# Run the project's SSRN training script on the LJSpeech dataset; the log below
# shows checkpoints being saved under logdir/ljspeech-ssrn/.
!python train-ssrn.py --dataset=ljspeech

use_gpu True
epoch   0 with lr=5.00e-07
 92% 11976/13080 [34:31<02:18,  8.00audios/s, l1=0.08983]saving the checkpoint file 'logdir/ljspeech-ssrn/step-000K.pth'...
100% 13080/13080 [38:25<00:00,  3.07audios/s, l1=0.08788]
train epoch loss 0.087884, step=545, total time elapsed: 0h 38m 26s 
epoch   0 with lr=2.73e-04
100% 24/24 [00:43<00:00,  1.80s/audios]
valid epoch loss 0.047715
epoch   1 with lr=2.73e-04
 83% 10896/13080 [35:57<04:17,  8.47audios/s, l1=0.06703]saving the checkpoint file 'logdir/ljspeech-ssrn/step-001K.pth'...
100% 13080/13080 [43:05<00:00,  3.59audios/s, l1=0.06613]
train epoch loss 0.066129, step=1090, total time elapsed: 1h 22m 15s 
epoch   1 with lr=4.79e-04
100% 24/24 [00:41<00:00,  1.72s/audios]
valid epoch loss 0.044032
epoch   2 with lr=4.79e-04
 75% 9816/13080 [32:35<08:04,  6.74audios/s, l1=0.05968]saving the checkpoint file 'logdir/ljspeech-ssrn/step-001K.pth'...
100% 13080/13080 [46:07<00:00,  5.47audios/s, l1=0.05936]
train epoch loss 0.059360, step=1635

In [0]:
import os
import sys
import argparse
from tqdm import *

import IPython
from IPython.display import Audio


import numpy as np
import torch

from models import Text2Mel
from models import SSRN
from hparams import HParams as hp
from audio import save_to_wav
from utils import get_last_checkpoint_file_name, load_checkpoint, save_to_png
from datasets.lj_speech import vocab, get_test_data
from datasets.lj_speech import vocab, get_test_data


# Sentences to synthesize; the synthesis loop writes one numbered .wav file
# (1.wav, 2.wav, ...) per entry, in this order.
SENTENCES = [
        "I enjoyed the wedding of my brother",
        "our batch is one six one",
        "I hate procastination",
        "ami vaat khai",
        "i want to go to dhaka",
        "this is machine learning class",
        "The birch canoe slid on the smooth planks.",
        "Glue the sheet to the dark blue background.",
        "It's easy to tell the depth of a well.",
        "Hello, my name is Habibur Rahman"
    ]


In [0]:
# Show the current working directory before running the synthesis script.
pwd

'/content/drive/My Drive/TTS'

In [0]:
# Run the project's stand-alone synthesis script for the LJSpeech models.
!python synthesize.py --dataset=ljspeech

loading text2mel checkpoint 'logdir/ljspeech-text2mel/step-003K.pth'...
loaded checkpoint epoch=15 step=3000
ssrn not exits


In [0]:
# Inference only: disable autograd globally so no gradients are tracked.
torch.set_grad_enabled(False)

# Checkpoint files to restore the two networks from.
text2mel_checkpoint = "/content/drive/My Drive/TTS/logdir/ljspeech-text2mel/step-003K.pth"
ssrn_checkpoint = "/content/drive/My Drive/TTS/logdir/ljspeech-ssrn/step-001K.pth"

# map_location='cpu' lets checkpoints that were saved on a GPU runtime load on a
# CPU-only runtime as well; the models are not moved to a GPU in this cell.
# NOTE: torch.load unpickles the file -- only load checkpoints from a trusted source.
text2mel = Text2Mel(vocab)
text2mel.load_state_dict(torch.load(text2mel_checkpoint, map_location='cpu')['state_dict'])
text2mel = text2mel.eval()

ssrn = SSRN()
ssrn.load_state_dict(torch.load(ssrn_checkpoint, map_location='cpu')['state_dict'])
ssrn = ssrn.eval()

In [0]:
# Synthesize the sentences one at a time, because batch processing is buggy.
for i, sentence in enumerate(SENTENCES):
    # Drop every character whose lowercase form is not in the vocabulary.
    normalized_sentence = "".join(c for c in sentence if c.lower() in vocab)
    print(normalized_sentence)

    sentences = [normalized_sentence]
    max_N = len(normalized_sentence)
    L = torch.from_numpy(get_test_data(sentences, max_N))
    zeros = torch.from_numpy(np.zeros((1, hp.n_mels, 1), np.float32))
    Y = zeros
    A = None

    # Feed the output back into text2mel step by step, for at most hp.max_T steps.
    for t in range(hp.max_T):
        _, Y_t, A = text2mel(L, Y, monotonic_attention=True)
        Y = torch.cat((zeros, Y_t), -1)
        # Index of the maximum of A[0, :, -1] (presumably the most-attended text
        # position for the newest frame -- TODO confirm the layout of A).
        attention = torch.max(A[0, :, -1], 0)[1].item()
        if L[0, attention] == vocab.index('E'):  # EOS
            break

    # Pass the result through SSRN and convert its second output to a NumPy array.
    _, Z = ssrn(Y)
    Z = Z.cpu().detach().numpy()

    # Save the result as <n>.wav (1-based) and show an audio player for it.
    wav_name = '%d.wav' % (i + 1)
    save_to_wav(Z[0, :, :].T, wav_name)
    IPython.display.display(Audio(wav_name, rate=hp.sr))

I enjoyed the wedding of my brother




our batch is one six one


I hate procastination


ami vaat khai


i want to go to dhaka


this is machine learning class


The birch canoe slid on the smooth planks.


Glue the sheet to the dark blue background.


It's easy to tell the depth of a well.


Hello my name is Habibur Rahman
