In [4]:
%pip install -r requirements.txt

from dotenv import load_dotenv

load_dotenv()

Note: you may need to restart the kernel to use updated packages.


True

In [5]:
import yt_dlp

URLS = ['https://www.youtube.com/watch?v=57OU18cogJI']

ydl_opts = {
    'format': 'm4a/bestaudio/best',
    'postprocessors': [{
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'm4a',
    }],
    'outtmpl': './tmp/foo_%(title)s-%(id)s.%(ext)s'
}

with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    error_code = ydl.download(URLS)

[youtube] Extracting URL: https://www.youtube.com/watch?v=57OU18cogJI
[youtube] 57OU18cogJI: Downloading webpage
[youtube] 57OU18cogJI: Downloading android player API JSON
[info] 57OU18cogJI: Downloading 1 format(s): 140
[download] ./tmp/foo_StrictlyVC in conversation with Sam Altman, part one-57OU18cogJI.m4a has already been downloaded
[download] 100% of   19.02MiB
[ExtractAudio] Not converting audio ./tmp/foo_StrictlyVC in conversation with Sam Altman, part one-57OU18cogJI.m4a; file is already in target format m4a


In [6]:
import whisper
import os
import urllib.parse

model = whisper.load_model("medium.en")

podcasts_to_analyze = {}

for file in os.listdir("./tmp"):
    # Skip if the file is not a video or audio file
    if not file.endswith(".m4a"):
        continue
    
    file_path = os.path.join("./tmp", file)
    
    if not os.path.exists(file_path):
        result = model.transcribe(file_path)
        podcasts_to_analyze[file] = result["text"]

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
import chromadb
chroma_client = chromadb.Client()

collection = chroma_client.get_collection(name="oss_podcasts") or chroma_client.create_collection(name="oss_podcasts")


DEBUG:Chroma:Logger created
INFO:clickhouse_connect.driver.ctypes:Successfully imported ClickHouse Connect C data optimizations
INFO:clickhouse_connect.driver.ctypes:Successfully import ClickHouse Connect C/Numpy optimizations
INFO:clickhouse_connect.json_impl:Using python library for writing JSON byte strings


Running Chroma using direct local API.
Using DuckDB in-memory for database. Data will be transient.


INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device: cpu


In [8]:

for file, text in podcasts_to_analyze.items():
    with open(f"./podcasts/{urllib.parse.quote(file)}.txt", "w") as f:
        f.write(text)

        collection.add(
            documents=[text],
            ids=[file]
        )

In [9]:
from langchain import PromptTemplate


template = """
Read the transcript of the podcast below:
 {podcast_contents}

Create bullet points with the main topics of the podcast, followed by the opinions of the speakers. 
"""

podcast_summary_prompt = PromptTemplate(
    input_variables=["podcast_contents"],
    template=template,
)


In [10]:
from langchain import PromptTemplate


template = """
When I listen to a podcast, I take notes on the main talking points of the hosts. I divide it in sections based on topics discussed. 
If the host mentions a specific technology or product, I note that in double brackets like this: [[artificial intelligence]].

These are the notes from the last podcast I listened to:

{podcast_notes}

Write a {words_count} words summary of the notes.
"""

notes_summary_template = PromptTemplate(
    input_variables=["podcast_notes", "words_count"],
    template=template,
)


In [11]:
from langchain import PromptTemplate


template = """
When I take notes for a podcast, I like to also write twitter threads to share them. Each tweet should end saying how far we are in the thread; if it's a 5 tweets thread, the first tweet should end with (1/5), the second one with (2/5), etc.

The tweets have to be easy to read and catch people's attention. Each of them should include an emoji.

These are the notes from my last podcast:

{podcast_notes}

Create a twitter thread for it.
"""

twitter_thread_template = PromptTemplate(
    input_variables=["podcast_notes"],
    template=template,
)


In [12]:
from langchain.llms import OpenAI
from langchain.chains import LLMChain
from langchain.text_splitter import CharacterTextSplitter

llm = OpenAI(model_name="text-davinci-003")
chain = LLMChain(llm=llm, prompt=podcast_summary_prompt)

all_results = []

for name, podcast in podcasts_to_analyze.items():
  text_splitter = CharacterTextSplitter(        
    separator = ". ",
    chunk_size = 4000,
    chunk_overlap  = 200,
    length_function = len,
  )
  
  texts = text_splitter.split_text(podcast)

  results = []
  
  for text in texts:
    subset = chain.run(text)
    results.append(subset)

  with open("./podcasts/summary-{}.txt".format(urllib.parse.quote(file)), "w") as f:
    joined_text = "\n".join(results)
    
    f.write(joined_text)
    
    collection.add(
      documents=[text],
      ids=[file]
    )

In [13]:
from gpt_index import SimpleDirectoryReader, GPTSimpleVectorIndex, LLMPredictor, PromptHelper

documents = SimpleDirectoryReader('podcasts').load_data()
index = GPTSimpleVectorIndex(documents)

# define LLM
llm_predictor = LLMPredictor(llm=OpenAI(temperature=0, model_name="text-davinci-003"))

# set maximum input size
max_input_size = 4096
# set number of output tokens
num_output = 256
# set maximum chunk overlap
max_chunk_overlap = 20
prompt_helper = PromptHelper(max_input_size, num_output, max_chunk_overlap)

index = GPTSimpleVectorIndex(
    documents, llm_predictor=llm_predictor, prompt_helper=prompt_helper
)

index.save_to_disk('index.json')


INFO:root:> [build_index_from_documents] Total LLM token usage: 0 tokens
INFO:root:> [build_index_from_documents] Total embedding token usage: 5102 tokens
INFO:root:> [build_index_from_documents] Total LLM token usage: 0 tokens
INFO:root:> [build_index_from_documents] Total embedding token usage: 4942 tokens


In [14]:
index = GPTSimpleVectorIndex.load_from_disk('index.json')

In [15]:

response = index.query("What questions were asked?")
print(response)

INFO:root:> [query] Total LLM token usage: 4123 tokens
INFO:root:> [query] Total embedding token usage: 5 tokens




1. What has been happening since the event three years ago?
2. How is the national conversation for you?
3. How many investments do you have?
4. What makes a Sam Altman deal?
5. What have been your most successful investments to date?
6. Why did you switch from Boom Supersonic to Hermius?
7. Is Hermius climate friendly?
8. What are the impacts of us traveling around the world much faster?
9. What is going on with Worldcoin?
10. Do you know Sam Bankman-Fried?
11. Are you interested in crypto more broadly?
12. What is your opinion of Sam Bankman-Fried?
13. What interests you about Worldcoin?


In [16]:
response = index.query("What was his answer on his most successful investments??")
print(response)

INFO:root:> [query] Total LLM token usage: 3900 tokens
INFO:root:> [query] Total embedding token usage: 10 tokens




His answer on his most successful investments was that his most successful investments on a multiple spaces was probably Stripe, which was his second investment ever. He also mentioned that he has been doing this for 17 years and has had a lot of really good investments, including a smattering of crypto investments. He mentioned that he is not particularly interested in crypto investments, but is interested in Worldcoin not because it is crypto, but because of the team and the product.
