In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%cd ../src

/Users/ranykeddo/src/github.com/feldberlin/timething/src


In [2]:
from dataclasses import replace
from pathlib import Path
import time

import torch
import torchaudio
import numpy as np
import matplotlib.pyplot as plt
import IPython.display as ipd
from tqdm import tqdm  # type: ignore

from timething import dataset, job, text, utils, align, viz

# Process 30 minutes of audio

## Simple concatenation strategy
We will split the original audio into 5 second chunks. Each chunk is passed through the model. The resulting logits are concatenated into a single tensor. Alignment is a single pass over the concatenated logits.

## Setup

In [8]:
# basic config
device = "cpu"
resample_to = 16000
batch_size = 40
seconds_per_sample = 5
hopsize_seconds = seconds_per_sample
cfg = utils.load_config("english")

# data
fixtures_path = Path("../fixtures")
keanu_audio = fixtures_path / "audio" / "keanu.mp3"
with open(fixtures_path / "keanu.cleaned.txt", "r") as f:
    keanu_transcript = f.read()
    keanu_transcript = " ".join(keanu_transcript.lower().splitlines())

In [9]:
# initialize dataset
ds = dataset.WindowedTrackDataset(
    keanu_audio, 
    "mp3", 
    keanu_transcript,
    seconds_per_sample * 1000, 
    hopsize_seconds * 1000, 
    resample_to=resample_to
)

print(f"dataset length: { len(ds) }")

# initialize job
j = job.LongTrackJob(
    cfg,
    ds,
    batch_size=1,
    n_workers=1,
)

dataset length: 313




We have set hopsize to the sample length. So we just wind up cutting up the original into chunks, no overlap. Let's validate that we can put the audio back together as expected:

### Audio

In [10]:
display(ipd.Audio(torch.concat([ds[i].audio.squeeze() for i in range(5)]), rate=resample_to))

### Transcription

In [11]:
print(f"ORIGINAL: { keanu_transcript[:880] }\n")
print(f"CLEANED: { ds.clean_text_fn(keanu_transcript)[:880] }")

ORIGINAL: alex pappademas, you’ve written a book. yeah, how about that? it’s called “keanu reeves: most triumphant: the movies and meaning of an irrepressible icon.” but i have a question for you. is it possible to even get near the bottom of keanu reeves? i think he might be an unknowable icon. i sit before you as someone who’s thought about him for a whole year, and i don’t know if i know him any better than you do. but let’s find out. here we go. i’m a culture writer at the new york times, and this is “still processing.” today on the show, we’re going to talk about the one, the only, keanu reeves, and we’re going to try and figure out why we get so much out of a movie star who appears to give us so little. i think i have what is probably a pretty common keanu reeves experience, which is that i kind of took him for granted. he’s always been in my life, and i just never really

CLEANED: alex|pappademas|you|ve|written|a|book|yeah|how|about|that|it|s|called|keanu|reeves|most|triumphant|

## Chunked alignment

We will loop over the 5 second chunks, feed them through the model and save the resulting logits in a list. Will will perform alignment on the concatenated logits. 

In [12]:
alignments = j.run()

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 313/313 [01:45<00:00,  2.96it/s]


aligning...


### Recognition results

Let's take a look at the logits by decoding the text using a simple argmax strategy:

In [32]:
recognized = text.decode_best(alignments.log_probs.squeeze(), j.aligner.dictionary)
recognized = text.ctc_collapse(recognized)
recognized[:1000]

"alex papademus you've written the book yea how about thatit's  kiaariv's most triumphant the movies and meeting of an aror pes sible iconbut i have a question for you is it possible l o even get near the bottom of ceonerweavesi think he might  y an unknowable iconsit before you someone who's thought about him for e a whole year and i dont know if i know him any better than you do but let's find out  we go i'm wesley morris and i'm a culture writer at the new york times and this is still processing tettin on the show worege a talk about the one the only ceonter weaves and we're gon ta trine figure out swhy we get so much out of a movie star who appears to give us  te te e alex i think i have woit tis probably a pretty common keonarives experiencewhich is  ikano took him for granted he's always been in my life i just never really felt like here was much to figure outbut who is this person and what have i been misunderstanding if i'm thinking of im as an empty vessel i mean maybe he's an

### Alignment results


In [33]:
words = alignments.words_cleaned
words[:50]

[Segment(label='alex', start=24, end=36, score=0.9984560929811918),
 Segment(label='pappademas', start=37, end=67, score=0.9262847508111134),
 Segment(label='you', start=95, end=98, score=0.6182636597659439),
 Segment(label='ve', start=100, end=102, score=0.9942612846692404),
 Segment(label='written', start=103, end=111, score=0.8213972086933526),
 Segment(label='a', start=114, end=116, score=0.3580108418439825),
 Segment(label='book', start=117, end=128, score=0.9544141190126538),
 Segment(label='yeah', start=149, end=163, score=0.7664356473687803),
 Segment(label='how', start=165, end=172, score=0.7130837318513336),
 Segment(label='about', start=174, end=185, score=0.9208486336462486),
 Segment(label='that', start=187, end=198, score=0.9582076809908214),
 Segment(label='it', start=225, end=228, score=0.6177497905271594),
 Segment(label='s', start=230, end=231, score=0.9076717694600424),
 Segment(label='called', start=233, end=247, score=0.8369190668933751),
 Segment(label='keanu', st

In [34]:
audio, audio_sr = torchaudio.load(keanu_audio)
logits_sr = alignments.log_probs[0].size(1) / (ds.window_size_ms / 1000)

def audio_segment(s):
    start = int(s.start / logits_sr * audio_sr)
    end = int(s.end / logits_sr * audio_sr)
    return audio[:, start:end]

def audio_segments(s):
    start = int(s[0].start / logits_sr * audio_sr)
    end = int(s[-1].end / logits_sr * audio_sr)
    return audio[:, start:end]

def show(word):
    print(word)
    display(ipd.Audio(audio_segment(word), rate=audio_sr))

def show_all(words):
    print(" ".join([w.label for w in words]))
    display(ipd.Audio(audio_segments(words), rate=audio_sr))

In [35]:
print("some words\n========\n")
show(words[0])
show(words[100])

some words

Segment(label='alex', start=24, end=36, score=0.9984560929811918)


Segment(label='and', start=1966, end=1990, score=0.9995575802666801)


# Analysis

The alignment appears to have worked very well. I would have expected boundary artefacts to reduce quality every 5 seconds. When would this be true? It would be true when the alignment takes a different path, e.g we repeat letters for too long or too short, or we switch to the next letter to late or too early. Let's take a look 

In [36]:
# let's pick out the frame start and end numbers on the 5 second boundaries
boundaries = [i * 5 for i in range(1, 5 * 60)]  # boundaries
boundaries = [(int(logits_sr * (i - 1)), int(logits_sr * (i + 1))) for i in boundaries]  # converted to logit frequency

# pick out the words that span those boundaries
i = 0
results = {}
for start, end in boundaries:
    segments = []
    while i < len(words):
        if not segments and words[i].start > start:
            segments.append(words[i - 1])
        elif segments and words[i].end > end:
            segments.append(words[i - 1])
            segments.append(words[i])
            results[(start, end)] = segments
            segments = []
            break
        elif segments:
            segments.append(words[i - 1])
        i += 1

# play
i = 0
for k, words in results.items():
    if i > 50:
        break
    print()
    print(k)
    for w in words:
        print(w)
    i += 1


(26, 39)
Segment(label='alex', start=24, end=36, score=0.9984560929811918)
Segment(label='pappademas', start=37, end=67, score=0.9262847508111134)
Segment(label='you', start=95, end=98, score=0.6182636597659439)

(59, 72)
Segment(label='pappademas', start=37, end=67, score=0.9262847508111134)
Segment(label='you', start=95, end=98, score=0.6182636597659439)
Segment(label='ve', start=100, end=102, score=0.9942612846692404)

(92, 105)
Segment(label='you', start=95, end=98, score=0.6182636597659439)
Segment(label='ve', start=100, end=102, score=0.9942612846692404)
Segment(label='written', start=103, end=111, score=0.8213972086933526)

(125, 138)
Segment(label='book', start=117, end=128, score=0.9544141190126538)
Segment(label='yeah', start=149, end=163, score=0.7664356473687803)
Segment(label='how', start=165, end=172, score=0.7130837318513336)

(158, 171)
Segment(label='yeah', start=149, end=163, score=0.7664356473687803)
Segment(label='how', start=165, end=172, score=0.7130837318513336)