In [41]:
"""
Load the text data and split it into sentences using a library such as NLTK.
"""
import re

# Open the VTT file
with open('episode_001_large.vtt', 'r') as f:
    text = f.read()

text = re.sub(r'^WEBVTT\n\n', '', text)

# Remove the timestamps and extra transcript info using regular expressions
text = re.sub(r'\d{1,2}:\d{2}.\d{3} --> \d{1,2}:\d{2}.\d{3}\n', '', text)  # Remove timestamps
text = re.sub(r'<[^>]+>', '', text)  # Remove HTML tags
text = re.sub(r'\n\n', ' ', text)

# Write the modified text to a new file
transcript = text

In [42]:
import nltk

sentences = nltk.sent_tokenize(transcript)
sentences[:5]

[" As part of MIT course 6S099, Artificial General Intelligence,  I've gotten the chance to sit down with Max Tegmark.",
 'He is a professor here at MIT.',
 "He's a physicist, spent a large part of his career  studying the mysteries of our cosmological universe.",
 "But he's also studied and delved into the beneficial  possibilities and the existential risks  of artificial intelligence.",
 'Amongst many other things, he is the cofounder  of the Future of Life Institute, author of two books,  both of which I highly recommend.']

In [40]:
"""
Load the pre-trained BERT model and tokenizer using a library such as Hugging Face's Transformers.
"""
import torch
from transformers import AutoTokenizer, AutoModel


lib = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(lib)
model = AutoModel.from_pretrained(lib)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

In [43]:
"""
Generate BERT embeddings for each sentence by passing them through the pre-trained BERT model.
"""
embeddings = []
for sentence in sentences:
    input_ids = torch.tensor(tokenizer.encode(sentence, add_special_tokens=True)).unsqueeze(0)
    outputs = model(input_ids)
    last_hidden_states = outputs.last_hidden_state
    embeddings.append(last_hidden_states.mean(dim=1).squeeze().detach().numpy())

In [44]:
len(embeddings)

737

In [45]:
len(embeddings[0])

384

In [69]:
"""
Use a clustering algorithm, to group the sentences into clusters based on their embeddings.
"""
from sklearn.cluster import AgglomerativeClustering

clusterer = AgglomerativeClustering(n_clusters=5, affinity='cosine', linkage='average')
clusters = clusterer.fit_predict(embeddings)




In [70]:
len(clusters)

737

In [71]:
"""
Apply heuristics to merge or split the clusters into coherent and meaningful topical segments.
"""
segments = []
prev_cluster = clusters[0]
prev_idx = 0
segment_ids = []

for i in range(1, len(clusters)):
    if clusters[i] != prev_cluster:
        segments.append(" ".join(sentences[prev_idx:i]))
        segment_ids.append((prev_idx, i))
        prev_cluster = clusters[i]
        prev_idx = i

segments.append(" ".join(sentences[prev_idx:]))
segment_ids.append((prev_idx, len(sentences)))


In [72]:
len(segments)

41

In [73]:
print(' \n\n'.join(segments[:5]))

 As part of MIT course 6S099, Artificial General Intelligence,  I've gotten the chance to sit down with Max Tegmark. He is a professor here at MIT. He's a physicist, spent a large part of his career  studying the mysteries of our cosmological universe. But he's also studied and delved into the beneficial  possibilities and the existential risks  of artificial intelligence. Amongst many other things, he is the cofounder  of the Future of Life Institute, author of two books,  both of which I highly recommend. First, Our Mathematical Universe. Second is Life 3.0. He's truly an out of the box thinker and a fun personality,  so I really enjoy talking to him. 

If you'd like to see more of these videos in the future,  please subscribe and also click the little bell icon  to make sure you don't miss any videos. 

Also, Twitter, LinkedIn, agi.mit.edu  if you wanna watch other lectures  or conversations like this one. Better yet, go read Max's book, Life 3.0. Chapter seven on goals is my favori

In [54]:
from scipy.spatial.distance import cdist
import numpy as np

# Calculate the centroid sentence for each cluster
centroid_sentences = []
for start, end in set(segment_ids):
    sentences_in_cluster = sentences[start: end]
    sentence_embeddings = np.array(embeddings[start: end])
    centroid_idx = cdist([s for s in sentence_embeddings], [sentence_embeddings.mean(axis=0)], 'euclidean').argmin()
    centroid_sentences.append(sentences_in_cluster[centroid_idx])

# Print the topic and its corresponding headline or summary
for i, centroid_sentence in enumerate(centroid_sentences):
    print(f"Topic {i+1}: {centroid_sentence}")

Topic 1: I'm not sure, do you have?
Topic 2: Excellent, I didn't know.
Topic 3: He said, it was so indescribably beautiful.
Topic 4: I speak very bad Russian, I'm only an autodidact,  but I bought a book, Teach Yourself Russian,  read a lot, but it was very difficult.
Topic 5: Check it out, some of them are just mind blowing,  really beautiful.
Topic 6: You're like, shucks, I'm gonna lose the last five minutes  of experiences since my last cloud backup, dang.
Topic 7: Suppose you're going in for a medical procedure  and they're like, you know, for anesthesia,  what we're going to do is we're going to give you  muscle relaxants so you won't be able to move  and you're going to feel excruciating pain  during the whole surgery,  but you won't be able to do anything about it.
Topic 8: So the old Holy grail of AI from back to its inception  in the sixties, if that ever happens, of course  I think it's going to be the biggest transition  in the history of life on earth  but it doesn't necess