In [None]:
# unpacked small.tar downloaded from:
# https://github.com/facebookresearch/libri-light/blob/main/data_preparation/README.md
# `Path` is imported right here because this cell runs before the notebook's
# import cells; without it a fresh "Restart & Run All" stops with a NameError.
from pathlib import Path

datadir = Path('/mnt/small/')

In [None]:
import io
import time
import torch
import torchaudio

from encodec.model import EncodecModel

In [None]:
from pathlib import Path
import json
from fastprogress import progress_bar, master_bar
import numpy as np
import pylab as plt

In [None]:
from IPython.display import Audio, display

In [None]:
def load(fname, newsr=24000, device='cuda'):
    """Load an audio file, resample it and move it to `device`.

    Args:
        fname: path of the audio file, passed straight to `torchaudio.load`.
        newsr: target sample rate in Hz; the audio is resampled from the
            file's native rate to this one.
        device: device the resampled tensor is moved to. Defaults to
            `'cuda'`, which matches the previous hard-coded `.cuda()` call.

    Returns:
        The resampled tensor on `device` with one extra leading dimension
        added by `unsqueeze(0)`.
    """
    x, sr = torchaudio.load(fname)
    # The transform is built per call because the source rate is only known
    # after the file has been read.
    _tform = torchaudio.transforms.Resample(sr, newsr)
    return _tform(x).to(device).unsqueeze(0)

# Load the JSON metadata

In [None]:
# collect every JSON metadata file found anywhere below `datadir`
metas = list(datadir.rglob('*.json'))

In [None]:
# Show the parsed metadata of one file (the entry at index 3 of `metas`).
# The encoding is given explicitly: the records contain non-ASCII characters
# and `read_text()` would otherwise decode with the platform's locale encoding.
json.loads(metas[3].read_text(encoding='utf-8'))

{'speaker': '254',
 'book_meta': {'id': '295',
  'title': 'Short Poetry Collection 016',
  'description': '<p>Librivox’s Short Poetry Collection 016: a collection of 20 public-domain poems.</p>',
  'url_text_source': '',
  'language': 'English',
  'copyright_year': '0',
  'num_sections': '20',
  'url_rss': 'https://librivox.org/rss/295',
  'url_zip_file': 'http://www.archive.org/download/shortpoetry_016_librivox/shortpoetry_016_librivox_64kb_mp3.zip',
  'url_project': '',
  'url_librivox': 'https://librivox.org/short-poetry-collection-016/',
  'url_other': None,
  'totaltimesecs': 1626,
  'authors': [{'id': '18',
    'first_name': '',
    'last_name': 'Various',
    'dob': '',
    'dod': ''}],
  'genre': ['Poetry'],
  'Dramatic Readings': False,
  'meta_genre': 'Poetry'},
 'snr': 16.3013,
 'voice_activity': [[5.44, 8.32],
  [8.88, 12.0],
  [14.88, 30.0],
  [30.64, 34.0],
  [35.04, 48.08],
  [48.8, 52.0],
  [52.72, 56.72],
  [57.76, 62.08],
  [62.56, 66.48],
  [67.76, 70.48],
  [71.28, 

In [None]:
# count the files for each speaker
# The speaker id is the first directory level below `datadir`, so it is taken
# from the path relative to `datadir` rather than from a fixed index into the
# absolute path (which only works while `datadir` is exactly two levels deep).
dict(zip(*np.unique([x.relative_to(datadir).parts[0] for x in metas], return_counts=True)))

{'100': 4,
 '103': 10,
 '1050': 12,
 '1060': 2,
 '1065': 6,
 '1066': 6,
 '107': 44,
 '1081': 1,
 '1085': 9,
 '1095': 1,
 '110': 2,
 '1105': 1,
 '112': 3,
 '1121': 7,
 '1124': 2,
 '1127': 1,
 '1134': 1,
 '1160': 4,
 '1166': 8,
 '1179': 1,
 '1195': 1,
 '1212': 61,
 '1213': 1,
 '123': 4,
 '1251': 1,
 '12533': 15,
 '12536': 4,
 '12539': 5,
 '1259': 80,
 '126': 4,
 '1264': 2,
 '1267': 1,
 '1276': 1,
 '128': 10,
 '1280': 1,
 '1281': 3,
 '1285': 1,
 '1286': 2,
 '1313': 1,
 '1316': 1,
 '1323': 18,
 '1331': 1,
 '1335': 22,
 '1345': 3,
 '1367': 2,
 '1368': 1,
 '1370': 10,
 '1377': 1,
 '1391': 1,
 '14': 1,
 '1401': 131,
 '1433': 1,
 '1440': 1,
 '147': 2,
 '1472': 2,
 '1474': 8,
 '1478': 1,
 '149': 2,
 '1492': 5,
 '1497': 1,
 '150': 5,
 '151': 1,
 '1514': 1,
 '152': 9,
 '153': 2,
 '1535': 3,
 '154': 1,
 '1548': 1,
 '1554': 1,
 '1556': 1,
 '1557': 1,
 '1572': 1,
 '1579': 3,
 '1593': 9,
 '1595': 1,
 '1596': 1,
 '16': 16,
 '1603': 4,
 '1614': 31,
 '1618': 8,
 '163': 3,
 '1633': 1,
 '1647': 1,
 '1649'

# Acoustic token extraction

In [None]:
# Build the 24 kHz EnCodec model and move it to the GPU in one step,
# then switch it to eval mode (the last expression also displays the model).
Amodel = EncodecModel.encodec_model_24khz().cuda()
Amodel.eval()

EncodecModel(
  (encoder): SEANetEncoder(
    (model): Sequential(
      (0): SConv1d(
        (conv): NormConv1d(
          (conv): Conv1d(1, 32, kernel_size=(7,), stride=(1,))
          (norm): Identity()
        )
      )
      (1): SEANetResnetBlock(
        (block): Sequential(
          (0): ELU(alpha=1.0)
          (1): SConv1d(
            (conv): NormConv1d(
              (conv): Conv1d(32, 16, kernel_size=(3,), stride=(1,))
              (norm): Identity()
            )
          )
          (2): ELU(alpha=1.0)
          (3): SConv1d(
            (conv): NormConv1d(
              (conv): Conv1d(16, 32, kernel_size=(1,), stride=(1,))
              (norm): Identity()
            )
          )
        )
        (shortcut): SConv1d(
          (conv): NormConv1d(
            (conv): Conv1d(32, 32, kernel_size=(1,), stride=(1,))
            (norm): Identity()
          )
        )
      )
      (2): ELU(alpha=1.0)
      (3): SConv1d(
        (conv): NormConv1d(
          (conv): Co

In [None]:
# Encode every FLAC recording found below datadir/'1401' (speaker 1401) and
# save the encoder output of each file as <file stem>.encodec inside `outdir`.
outdir = Path('acoustic-1401')
outdir.mkdir(exist_ok=True)
for name in progress_bar(list(datadir.joinpath('1401').rglob('*.flac'))):
    # NOTE(review): only the file name is kept, so same-named recordings from
    # different sub-folders would overwrite each other — confirm names are unique.
    outname = (outdir / name.name).with_suffix('.encodec')
    audio = load(name)
    with torch.no_grad():  # inference only, no autograd bookkeeping needed
        frames = Amodel.encode(audio)
    torch.save(frames, outname)

In [None]:
!du -hs speech-1401/

78M	speech-1401/
