In [5]:
# 2024-01
# Chops SHWEP podcast episodes into 25MB bites,
# feeds them to Whisper (OAI API),
# combines and formats the transcripts.

In [12]:
import os
from pydub import AudioSegment
from openai import OpenAI

In [13]:
# Working directory: episode MP3s are read from here, and the split parts,
# transcripts and HTML are written here as well.
DIR = "."  # alternative: "shwep"

In [14]:
def join_strings(string_list, max_len, sep=''):
    result = []
    current = []
    current_len = 0
    for s in string_list:
        if current_len + len(s) > max_len and len(current) > 0:
            result.append(sep.join(current))
            current = []
            current_len = 0
        current.append(s)
        current_len += len(s)
    result.append(sep.join(current))
    return (sep+"\n").join(result)

In [15]:
def split_ep(n, min_parts=1, max_size=25_000_000, overlap_ms=2_000):
    """Split an episode MP3 into pieces small enough for the API.

    Trial-and-error approach because output file size is unpredictable:
    the audio is cut into equal-length pieces and exported; if any
    exported file is not below ``max_size`` bytes, the number of pieces
    is increased by one and everything is exported again.

    Args:
        n: Episode number; the input file is ``{DIR}/shwep_ep{n}.mp3``.
        min_parts: Number of pieces to try first.
        max_size: Exclusive upper bound on each exported file's size, in bytes.
        overlap_ms: Every piece except the first starts this many
            milliseconds early, so neighbouring pieces overlap.

    Returns:
        The number of pieces written. They are stored as
        ``{DIR}/shwep_ep{n}_part{i}.mp3`` for ``i`` in ``range(result)``.

    Raises:
        AssertionError: If a piece would not be longer than ``overlap_ms``.
    """
    mp3_file = f"{DIR}/shwep_ep{n}.mp3"
    path = lambda i: f"{DIR}/shwep_ep{n}_part{i}.mp3"

    num_parts = min_parts
    print(f"Splitting {mp3_file} into {num_parts} parts...")

    # Decode the source once; retries below only re-slice and re-export.
    sound = AudioSegment.from_mp3(mp3_file)
    total_ms = len(sound)

    while True:
        partsize = total_ms // num_parts

        assert partsize > overlap_ms, "file could not be sliced finely enough"

        for i in range(num_parts):
            begin = i * partsize
            if i > 0:
                begin -= overlap_ms
            # The integer division above can leave a few milliseconds over;
            # let the last piece run to the end so no audio is dropped.
            end = total_ms if i == num_parts - 1 else (i + 1) * partsize
            print(f"Writing {path(i)}...")
            sound[begin:end].export(path(i), format="mp3")

        if all(os.path.getsize(path(i)) < max_size for i in range(num_parts)):
            print("Done!")
            return num_parts

        print("Not small enough. Trying again...")
        num_parts += 1
        print(f"Splitting {mp3_file} into {num_parts} parts...")

In [16]:
# Shared API client used by transcribe_parts. No credentials are passed here,
# so the client presumably picks them up from the environment — TODO confirm.
client = OpenAI()

def transcribe_parts(n, num_parts):
    """Transcribe every part file of an episode and return the texts.

    Args:
        n: Episode number.
        num_parts: How many part files exist; parts are read from
            ``{DIR}/shwep_ep{n}_part{i}.mp3`` for ``i`` in ``range(num_parts)``.

    Returns:
        A list with one transcript string per part, in part order.
    """
    print(f"Transcribing episode {n} in {num_parts} parts.")
    parts = []
    path = lambda i: f"{DIR}/shwep_ep{n}_part{i}.mp3"
    for p in range(num_parts):
        # Context manager closes the file even if the API call raises.
        with open(path(p), "rb") as audio_file:
            print(f"Transcribing {path(p)}...")
            response = client.audio.transcriptions.create(
                model="whisper-1",
                file=audio_file,
            )
        parts.append(response.text)
    print("Done!")
    return parts

def write_transcript(transcript, n, suff=''):
    """Wrap a transcript at sentence boundaries and save it as a text file.

    The text is split on ". ", re-packed by ``join_strings`` with a line
    budget of 80 characters, and written to ``{DIR}/shwep_ep{n}{suff}.txt``.
    """
    sentence_sep = ". "
    sentences = transcript.split(sentence_sep)
    wrapped = join_strings(sentences, 80, sentence_sep)
    out_path = f"{DIR}/shwep_ep{n}{suff}.txt"
    with open(out_path, 'w') as out:
        out.write(wrapped)
    print(f"Wrote transcript to {out_path}.")

def stitch_parts(sep, parts, n):
    """Fuse the part transcripts of an episode and write the result.

    For two parts, the last ``sep``-delimited fragment of the first part
    and the first fragment of the second part are dropped, and the
    remainders are joined with ``sep``. (Presumably this discards the
    fragments cut or duplicated at the split point — confirm.)
    A single part is written unchanged, since there is no seam to repair.

    Args:
        sep: Delimiter used to break the parts into fragments.
        parts: List of transcript strings, in order (at most two).
        n: Episode number, passed on to ``write_transcript``.

    Raises:
        NotImplementedError: If more than two parts are given.
        IndexError: If ``parts`` is empty.
    """
    if len(parts) > 2:
        raise NotImplementedError

    if len(parts) == 1:
        # split_ep can legitimately return one part; nothing to stitch.
        transcript = parts[0]
    else:
        head = parts[0].split(sep)[:-1]
        tail = parts[1].split(sep)[1:]
        transcript = sep.join(head) + sep + sep.join(tail)

    write_transcript(transcript, n, '')

def make_html(ns, suff=''):
    """Combine episode transcripts into a single HTML document.

    For each episode number in ``ns`` the transcript is read from
    ``{DIR}/shwep_ep{n}{suff}.txt`` and added under an "Episode n" heading,
    with ``<br />`` between the lines of the text file. The document is
    written to ``{DIR}/shwep_eps{first}-{last}.html``, named after the
    first and last entries of ``ns``.

    Args:
        ns: Non-empty sequence of episode numbers.
        suff: Optional suffix of the transcript file names.
    """
    import html  # local import keeps this cell runnable on its own

    sections = []
    for n in ns:
        with open(f"{DIR}/shwep_ep{n}{suff}.txt", 'r') as f:
            # Escape &, < and > so transcript text is never parsed as markup.
            lines = [html.escape(line, quote=False) for line in f]
        transcript = '<br />'.join(lines)
        sections.append(f"<h2>Episode {n}</h2>\n{transcript}")
    body = ''.join(sections)
    doc = f"""<html><head>
  <meta charset="utf-8">
  <style>
    h2 {{
      font-weight: normal;
    }}
  </style>
</head>
<body>
{body}
</body></html>"""
    path = f"{DIR}/shwep_eps{ns[0]}-{ns[-1]}.html"
    # Written as UTF-8 to match the charset declared in the document head.
    with open(path, 'w', encoding='utf-8') as f:
        f.write(doc)
    print(f"Wrote HTML to {path}.")
    

# Change ns and run these cells to make a transcript:

In [17]:
# Episode numbers to process; each needs {DIR}/shwep_ep{n}.mp3 to exist.
ns = [65, 66, 67]

for n in ns:
    # chop mp3 into <25MB pieces (may take a couple minutes)
    num_parts = split_ep(n)
    # send the pieces to the OpenAI API (may take a couple minutes)
    parts = transcribe_parts(n, num_parts)
    # fuse transcript (without stitching): piece boundaries are marked with " || "
    write_transcript(" || ".join(parts), n)
# make HTML file for GDocs copypasta (one document covering all of ns)
make_html(ns)

Splitting ./shwep_ep65.mp3 into 1 parts...
Writing ./shwep_ep65_part0.mp3...
Not small enough. Trying again...
Splitting ./shwep_ep65.mp3 into 2 parts...
Writing ./shwep_ep65_part0.mp3...
Writing ./shwep_ep65_part1.mp3...
Done!
Transcribing episode 65 in 2 parts.
Transcribing ./shwep_ep65_part0.mp3...
Transcribing ./shwep_ep65_part1.mp3...
Done!
Wrote transcript to ./shwep_ep65.txt.
Splitting ./shwep_ep66.mp3 into 1 parts...
Writing ./shwep_ep66_part0.mp3...
Not small enough. Trying again...
Splitting ./shwep_ep66.mp3 into 2 parts...
Writing ./shwep_ep66_part0.mp3...
Writing ./shwep_ep66_part1.mp3...
Done!
Transcribing episode 66 in 2 parts.
Transcribing ./shwep_ep66_part0.mp3...
Transcribing ./shwep_ep66_part1.mp3...
Done!
Wrote transcript to ./shwep_ep66.txt.
Splitting ./shwep_ep67.mp3 into 1 parts...
Writing ./shwep_ep67_part0.mp3...
Not small enough. Trying again...
Splitting ./shwep_ep67.mp3 into 2 parts...
Writing ./shwep_ep67_part0.mp3...
Writing ./shwep_ep67_part1.mp3...
Done!


In [13]:
# Single-episode run; splitting starts at 3 parts instead of the default 1.
n = 55

num_parts = split_ep(n, min_parts=3)
# send the pieces to the OpenAI API (may take a couple minutes)
parts = transcribe_parts(n, num_parts)
# fuse transcript (without stitching): piece boundaries are marked with " || "
write_transcript(" || ".join(parts), n)

Splitting ./shwep_ep55.mp3 into 3 parts...
Writing ./shwep_ep55_part0.mp3...
Writing ./shwep_ep55_part1.mp3...
Writing ./shwep_ep55_part2.mp3...
Done!
Transcribing episode 55 in 3 parts.
Transcribing ./shwep_ep55_part0.mp3...
Transcribing ./shwep_ep55_part1.mp3...
Transcribing ./shwep_ep55_part2.mp3...
Done!
Wrote transcript to ./shwep_ep55.txt.
