In [1]:
import PyPDF2

In [2]:
def get_text_from_pages(file_path, start_page=0, end_page=None):
    """Extract the text of a range of pages from a PDF file.

    Args:
        file_path: Path to the PDF file to read.
        start_page: Zero-based index of the first page to extract.
        end_page: Index one past the last page to extract (exclusive, as in
            ``range``). ``None``, or a value beyond the end of the document,
            means "through the last page".

    Returns:
        A list with one string per extracted page, in page order, with
        newlines replaced by spaces. Pages that yield no text contribute an
        empty string.
    """
    page_texts = []

    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        num_pages = len(reader.pages)
        # Clamp the upper bound so an over-long range stops at the last page
        # instead of raising IndexError part-way through the extraction.
        if end_page is None or end_page > num_pages:
            end_page = num_pages
        for page in range(start_page, end_page):
            # Guard against a page whose extraction yields no text object.
            text = reader.pages[page].extract_text() or ""
            page_texts.append(text.replace("\n", " "))

    return page_texts

In [3]:
# PDF to narrate (path is relative to the notebook's working directory).
file_path = "pdfs/Moby_Dick.pdf"
# Page range handed to get_text_from_pages, which iterates
# range(start_page, end_page) over reader.pages: indices are zero-based and
# end_page is exclusive, so 19..21 selects the pages at index 19 and 20.
start_page = 19
end_page = 21

In [4]:
import os
import locale
# Monkeypatch: make locale.getpreferredencoding() always report "UTF-8" for
# the rest of this kernel session.
# NOTE(review): presumably a workaround for environments whose default locale
# is not UTF-8 — confirm it is still needed.
locale.getpreferredencoding = lambda: "UTF-8"

import sys 
# Add the sibling "../vits" directory to the import search path.
# NOTE(review): commons, utils and models imported below presumably live
# there — confirm against the repository layout.
sys.path.append("../vits")

from IPython.display import Audio
import os
import re
import tempfile
import torch
import numpy as np
import commons
import utils
import subprocess
from models import SynthesizerTrn
from scipy.io.wavfile import write




def download(lang, tgt_dir="./"):
  """Ensure the MMS TTS checkpoint for `lang` is present and return its directory.

  If `<tgt_dir>/<lang>` does not exist yet, the archive is fetched with
  `wget` and unpacked with `tar` into `tgt_dir`. An existing directory is
  reused as-is, so re-running the cell does not download again.

  Args:
    lang: Language code used in the archive name (e.g. "eng").
    tgt_dir: Directory that holds (or will hold) the archive and the
      extracted `<lang>` folder.

  Returns:
    Path of the checkpoint directory, `os.path.join(tgt_dir, lang)`.

  Raises:
    subprocess.CalledProcessError: if `wget` or `tar` exits with an error.
    FileNotFoundError: if `wget`/`tar` are not installed, or if the archive
      did not produce the expected directory.
  """
  lang_fn, lang_dir = os.path.join(tgt_dir, lang+'.tar.gz'), os.path.join(tgt_dir, lang)
  print(f"Download model for language: {lang}")
  if not os.path.isdir(lang_dir):
    # Argument lists (no shell) keep paths with spaces or shell
    # metacharacters from being misinterpreted.
    subprocess.check_output([
        "wget", f"https://dl.fbaipublicfiles.com/mms/tts/{lang}.tar.gz", "-O", lang_fn
    ])
    # Extract into tgt_dir so the result really lands at lang_dir, not in
    # whatever the current working directory happens to be.
    subprocess.check_output(["tar", "zxvf", lang_fn, "-C", tgt_dir])

  print(f"Model checkpoints in {lang_dir}: {os.listdir(lang_dir)}")
  return lang_dir

# Language code of the TTS model; it is used as the archive/directory name by
# download() and passed as `lang` to the text preprocessing further down.
LANG = "eng"
# Directory containing the checkpoint files for LANG (as returned by download).
ckpt_dir = download(LANG)

def preprocess_char(text, lang=None):
    """
    Apply language-specific character substitutions to `text`.

    Only Romanian ('ron') is handled: every "ț" is replaced with "ţ".
    For any other language the text is returned unchanged.
    """
    if lang != 'ron':
        return text
    return text.replace("ț", "ţ")

class TextMapper(object):
    """Converts text to sequences of integer symbol IDs using a vocabulary file.

    The vocabulary file lists one symbol per line; a symbol's ID is its
    zero-based line index. The vocabulary must contain the space character.
    """

    def __init__(self, vocab_file):
        """Load the symbol table from `vocab_file` (UTF-8, one symbol per line).

        Raises:
            ValueError: if the vocabulary has no " " (space) symbol.
        """
        # Context manager so the file handle is closed deterministically.
        with open(vocab_file, encoding="utf-8") as vocab:
            self.symbols = [line.replace("\n", "") for line in vocab]
        self.SPACE_ID = self.symbols.index(" ")
        self._symbol_to_id = {s: i for i, s in enumerate(self.symbols)}
        self._id_to_symbol = {i: s for i, s in enumerate(self.symbols)}

    def text_to_sequence(self, text, cleaner_names):
        '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
        Args:
        text: string to convert to a sequence
        cleaner_names: names of the cleaner functions to run the text through
            (accepted for interface compatibility; not used by this implementation)
        Returns:
        List of integers corresponding to the symbols in the text
        Raises:
        KeyError: if the stripped text contains a symbol that is not in the
            vocabulary (use filter_oov first to drop such symbols)
        '''
        clean_text = text.strip()
        return [self._symbol_to_id[symbol] for symbol in clean_text]

    def uromanize(self, text, uroman_pl):
        """Romanize `text` by piping it through the uroman Perl script.

        Args:
            text: Text to romanize.
            uroman_pl: Path to the `uroman.pl` script.

        Returns:
            The first line of the script's output with every whitespace run
            collapsed to a single space and the ends stripped; an empty
            string if the script printed nothing.

        Raises:
            subprocess.CalledProcessError: if perl exits with a non-zero status.
        """
        iso = "xxx"
        # Feed the text on stdin and read stdout directly: no shell, so the
        # script path needs no quoting, and no temporary files to re-open.
        result = subprocess.run(
            ["perl", uroman_pl, "-l", iso],
            input=text,
            stdout=subprocess.PIPE,
            encoding="utf-8",
            check=True,
        )
        first_line = result.stdout.split("\n", 1)[0]
        return re.sub(r"\s+", " ", first_line).strip()

    def get_text(self, text, hps):
        """Encode `text` as a LongTensor of symbol IDs.

        When `hps.data.add_blank` is set, the sequence is passed through
        `commons.intersperse(..., 0)` before being converted to a tensor.
        """
        text_norm = self.text_to_sequence(text, hps.data.text_cleaners)
        if hps.data.add_blank:
            text_norm = commons.intersperse(text_norm, 0)
        text_norm = torch.LongTensor(text_norm)
        return text_norm

    def filter_oov(self, text):
        """Return `text` with every character missing from the vocabulary removed."""
        val_chars = self._symbol_to_id
        return "".join(ch for ch in text if ch in val_chars)

def preprocess_text(txt, text_mapper, hps, uroman_dir=None, lang=None):
    """Normalize raw text so it only contains symbols the model knows.

    Steps: language-specific character fixes (preprocess_char), optional
    romanization via uroman when the model's training file list ends in
    '.uroman', lower-casing, and removal of out-of-vocabulary characters.

    Args:
        txt: Raw input text.
        text_mapper: Object providing `uromanize(text, uroman_pl)` and
            `filter_oov(text)`.
        hps: Hyper-parameters; only `hps.data.training_files` is read here.
        uroman_dir: Directory of a uroman checkout. When None and
            romanization is required, uroman is cloned into a temporary
            directory for the duration of the call.
        lang: Language code forwarded to preprocess_char.

    Returns:
        The cleaned, lower-cased text.

    Raises:
        subprocess.CalledProcessError: if cloning uroman fails.
    """
    txt = preprocess_char(txt, lang=lang)
    is_uroman = hps.data.training_files.split('.')[-1] == 'uroman'
    if is_uroman:
        with tempfile.TemporaryDirectory() as tmp_dir:
            if uroman_dir is None:
                # Clone over HTTPS: the repository is public, and the SSH
                # form only works for users with GitHub SSH keys set up.
                # An argument list avoids going through the shell.
                clone_cmd = ["git", "clone", "https://github.com/isi-nlp/uroman.git", tmp_dir]
                print(" ".join(clone_cmd))
                subprocess.check_output(clone_cmd)
                uroman_dir = tmp_dir
            uroman_pl = os.path.join(uroman_dir, "bin", "uroman.pl")
            print("uromanize")
            # Must run inside the `with`: the temporary clone is deleted on exit.
            txt = text_mapper.uromanize(txt, uroman_pl)
            print(f"uroman text: {txt}")
    txt = txt.lower()
    txt = text_mapper.filter_oov(txt)
    return txt

# Run on the GPU when torch reports CUDA as available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Run inference with {device}")

# Files read from the checkpoint directory.
vocab_file = f"{ckpt_dir}/vocab.txt"
config_file = f"{ckpt_dir}/config.json"
assert os.path.isfile(config_file), f"{config_file} doesn't exist"

# Hyper-parameters and symbol table drive the model's construction.
hps = utils.get_hparams_from_file(config_file)
text_mapper = TextMapper(vocab_file)

# Build the synthesizer, move it to the chosen device and switch it to eval
# mode; assigning to `_` keeps the return value out of the cell output.
net_g = SynthesizerTrn(
    len(text_mapper.symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model,
)
net_g.to(device)
_ = net_g.eval()

# Load the generator weights into the model (no optimizer is passed).
g_pth = f"{ckpt_dir}/G_100000.pth"
print(f"load {g_pth}")
_ = utils.load_checkpoint(g_pth, net_g, None)

Download model for language: eng
Model checkpoints in ./eng: ['G_100000.pth', 'config.json', 'vocab.txt']
Run inference with cuda
load ./eng/G_100000.pth
INFO:root:Loaded checkpoint './eng/G_100000.pth' (iteration 6251)


In [5]:
# One string per page in range(start_page, end_page), newlines already
# replaced by spaces (see get_text_from_pages).
page_texts = get_text_from_pages(file_path, start_page, end_page)

In [6]:

# Directory the synthesized wav files are written to. exist_ok=True makes the
# call idempotent without a separate check-then-create step, and it raises
# right here if 'outputs' exists but is not a directory.
os.makedirs('outputs', exist_ok=True)

In [7]:
import librosa
import logging

# Raise the 'numba' logger threshold to WARNING so its lower-level messages
# are not shown.
# NOTE(review): presumably numba is pulled in by librosa and is noisy at
# DEBUG/INFO — confirm.
logging.getLogger('numba').setLevel(logging.WARNING)

from scipy.signal import resample_poly

In [8]:
# Synthesize one wav file per extracted page.
for i, page_text in enumerate(page_texts):
    # page_texts[0] corresponds to start_page, so offset the loop index.
    page_number = start_page + i

    # Clean the text and encode it as a tensor of symbol IDs.
    txt = preprocess_text(page_text, text_mapper, hps, lang=LANG)
    stn_tst = text_mapper.get_text(txt, hps)

    with torch.no_grad():
        # Batch of one: add a leading batch dimension and pass its length.
        x_tst = stn_tst.unsqueeze(0).to(device)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
        hyp = net_g.infer(
            x_tst, x_tst_lengths, noise_scale=.667,
            noise_scale_w=0.8, length_scale=1.0
        )[0][0,0].cpu().float().numpy()

    # Scale the float waveform to the 16-bit PCM range. Clip before casting:
    # a sample at or above 1.0 scales to >= 32768, which does not fit in
    # int16 and would otherwise wrap around into an audible click.
    hyp = np.clip(hyp * 32768, -32768, 32767).astype(np.int16)

    write(f'outputs/audio_{page_number}.wav', hps.data.sampling_rate, hyp)
    print(f"Done page {page_number}")

Done page 19
Done page 20


In [9]:
from pydub import AudioSegment

# pydub can also read other formats (wav, mp3, mp4, ...), all of which are
# supported by ffmpeg. Load the first synthesized page; the file name is
# derived from start_page so it keeps matching what the synthesis loop wrote
# when the page range is changed.
audio = AudioSegment.from_file(f"outputs/audio_{start_page}.wav", format="wav") # wav


In [10]:
# Summary of the loaded segment's basic properties. 'sample_rate' and
# 'frame_rate' both report audio.frame_rate.
print(dict(
    duration=audio.duration_seconds,
    sample_rate=audio.frame_rate,
    channels=audio.channels,
    sample_width=audio.sample_width,
    frame_count=audio.frame_count(),
    frame_rate=audio.frame_rate,
    frame_width=audio.frame_width,
))

{'duration': 99.92, 'sample_rate': 16000, 'channels': 1, 'sample_width': 2, 'frame_count': 1598720.0, 'frame_rate': 16000, 'frame_width': 2}


In [11]:
# Request a 2x playback-speed version of the segment; as the cell's last
# expression, the returned object is what the notebook displays.
audio.speedup(playback_speed=2.0)

DEBUG:pydub.converter:subprocess.call(['ffmpeg', '-y', '-f', 'wav', '-i', '/tmp/tmpzb00536e', '-f', 'mp3', '/tmp/tmpdnpgoydl'])
DEBUG:pydub.converter:subprocess output: b'ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers'
DEBUG:pydub.converter:subprocess output: b'  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)'
DEBUG:pydub.converter:subprocess output: b'  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enabl

In [14]:
import soundfile as sf

# Read back the first synthesized page as a sample array plus its sample
# rate. The name is derived from start_page so it tracks the file written by
# the synthesis loop instead of a hard-coded page number.
data, samplerate = sf.read(f"outputs/audio_{start_page}.wav")

In [18]:
import soundfile as s
import pyrubberband as pyrb
# Time-stretch with a rate of 1.5, i.e. play back at 1.5x speed.
y_stretch = pyrb.time_stretch(data, samplerate, 1.5)
# Pitch-shift with a step value of 1.5.
# NOTE(review): per the pyrubberband docs the third argument is a number of
# semitones, not a speed factor — confirm. y_shift is not used anywhere in
# the visible part of the notebook.
y_shift = pyrb.pitch_shift(data, samplerate, 1.5)
# Only the time-stretched signal is saved (to the working directory, not to
# outputs/), at the original sample rate.
sf.write("outputfile1X5.wav", y_stretch, samplerate, format='wav')
