In [50]:
%pip install --force-reinstall -r requirements.txt

from dotenv import load_dotenv

load_dotenv()

Collecting pandas
  Using cached pandas-1.5.3-cp310-cp310-macosx_11_0_arm64.whl (10.9 MB)
Collecting langchain
  Using cached langchain-0.0.87-py3-none-any.whl (253 kB)
Collecting yt-dlp
  Using cached yt_dlp-2023.1.6-py2.py3-none-any.whl (2.8 MB)
Collecting openai-whisper
  Using cached openai_whisper-20230124-py3-none-any.whl
Collecting python-dotenv
  Using cached python_dotenv-0.21.1-py3-none-any.whl (19 kB)
Collecting openai
  Using cached openai-0.26.5-py3-none-any.whl
Collecting tiktoken
  Using cached tiktoken-0.2.0-cp310-cp310-macosx_11_0_arm64.whl (699 kB)
Collecting nltk
  Using cached nltk-3.8.1-py3-none-any.whl (1.5 MB)
Collecting numpy
  Using cached numpy-1.24.2-cp310-cp310-macosx_11_0_arm64.whl (13.9 MB)
Collecting chromadb
  Using cached chromadb-0.3.0-py3-none-any.whl (36 kB)
Collecting python-dateutil>=2.8.1
  Using cached python_dateutil-2.8.2-py2.py3-none-any.whl (247 kB)
Collecting pytz>=2020.1
  Using cached pytz-2022.7.1-py2.py3-none-any.whl (499 kB)
Collecting 

True

In [None]:
import yt_dlp

# Videos whose audio track should be fetched.
URLS = ['https://www.youtube.com/watch?v=57OU18cogJI']

# Post-processing step: have FFmpeg extract the audio as m4a.
audio_extractor = dict(key='FFmpegExtractAudio', preferredcodec='m4a')

# Download options: prefer an m4a stream, falling back to the best audio / best
# overall format, and write the result under ./tmp.
ydl_opts = dict(
    format='m4a/bestaudio/best',
    postprocessors=[audio_extractor],
    outtmpl='./tmp/foo_%(title)s-%(id)s.%(ext)s',
)

with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    # download() returns a status code; it is kept for inspection.
    error_code = ydl.download(URLS)

[youtube] Extracting URL: https://www.youtube.com/watch?v=57OU18cogJI
[youtube] 57OU18cogJI: Downloading webpage
[youtube] 57OU18cogJI: Downloading android player API JSON
[info] 57OU18cogJI: Downloading 1 format(s): 140
[download] ./tmp/foo_StrictlyVC in conversation with Sam Altman, part one-57OU18cogJI.m4a has already been downloaded
[download] 100% of   19.02MiB
[ExtractAudio] Not converting audio ./tmp/foo_StrictlyVC in conversation with Sam Altman, part one-57OU18cogJI.m4a; file is already in target format m4a


In [3]:
import whisper
import os
import urllib.parse

# Load the "base" Whisper model once and reuse it for every file.
model = whisper.load_model("base")

# Maps each audio file name in ./tmp to its transcribed text.
podcasts_to_analyze = {}

for file in os.listdir("./tmp"):
    # Only .m4a files are transcribed; everything else in the folder is ignored.
    if file.endswith(".m4a"):
        transcription = model.transcribe(os.path.join("./tmp", file))
        podcasts_to_analyze[file] = transcription["text"]



In [1]:
import chromadb
chroma_client = chromadb.Client()

# Fetch the collection if it already exists, otherwise create it.
# The previous `get_collection(...) or create_collection(...)` never reached the
# fallback: depending on the chromadb version, a missing collection either
# raises or yields an object that is always truthy.
collection = chroma_client.get_or_create_collection(name="oss_podcasts")


Running Chroma using direct local API.
Using DuckDB in-memory for database. Data will be transient.


  from .autonotebook import tqdm as notebook_tqdm


In [4]:

# Save every transcript as a text file under ./podcasts and index it in Chroma.
# The output folder is created up front so a fresh checkout does not fail on open().
os.makedirs("./podcasts", exist_ok=True)

for file, text in podcasts_to_analyze.items():
    # The audio file name is URL-quoted to build the transcript file name.
    with open(f"./podcasts/{urllib.parse.quote(file)}.txt", "w") as f:
        f.write(text)

    # Indexing does not need the file handle, so it runs after the file is closed.
    # The transcript is stored under the original audio file name as its id.
    collection.add(
        documents=[text],
        ids=[file]
    )

In [6]:
from langchain import PromptTemplate


# Prompt text: shows the model a transcript (or a chunk of one) and asks for
# bullet points of the main topics plus the speakers' opinions.
template = """
Read the transcript of the podcast below:
 {podcast_contents}

Create bullet points with the main topics of the podcast, followed by the opinions of the speakers. 
"""

# `podcast_contents` is the only placeholder that has to be filled in.
podcast_summary_prompt = PromptTemplate(template=template, input_variables=["podcast_contents"])


In [25]:
from langchain import PromptTemplate


# Prompt text: turns previously written podcast notes into a summary of a
# requested length.
template = """
When I listen to a podcast, I take notes on the main talking points of the hosts. I divide it in sections based on topics discussed. 
If the host mentions a specific technology or product, I note that in double brackets like this: [[artificial intelligence]].

These are the notes from the last podcast I listened to:

{podcast_notes}

Write a {words_count} words summary of the notes.
"""

# Placeholders: the notes themselves and the target word count.
notes_summary_template = PromptTemplate(template=template, input_variables=["podcast_notes", "words_count"])


In [26]:
from langchain import PromptTemplate


# Prompt text: asks for a numbered twitter thread (1/N, 2/N, ...) based on the
# podcast notes, with an emoji in every tweet.
template = """
When I take notes for a podcast, I like to also write twitter threads to share them. Each tweet should end saying how far we are in the thread; if it's a 5 tweets thread, the first tweet should end with (1/5), the second one with (2/5), etc.

The tweets have to be easy to read and catch people's attention. Each of them should include an emoji.

These are the notes from my last podcast:

{podcast_notes}

Create a twitter thread for it.
"""

# `podcast_notes` is the only placeholder that has to be filled in.
twitter_thread_template = PromptTemplate(template=template, input_variables=["podcast_notes"])


In [7]:
from langchain.llms import OpenAI
from langchain.chains import LLMChain
from langchain.text_splitter import CharacterTextSplitter

# Summarise every transcript chunk by chunk, write the combined summary to
# ./podcasts/summary-<name>.txt and index that summary in Chroma.
llm = OpenAI(model_name="text-davinci-003")
chain = LLMChain(llm=llm, prompt=podcast_summary_prompt)

# The splitter only holds configuration, so it is built once instead of once per
# podcast. Transcripts are cut at sentence ends (". ") into pieces of at most
# 4000 characters, with 200 characters of overlap between neighbouring pieces.
text_splitter = CharacterTextSplitter(
    separator=". ",
    chunk_size=4000,
    chunk_overlap=200,
    length_function=len,
)

# Make sure the output folder exists before any summary is written.
os.makedirs("./podcasts", exist_ok=True)

# Kept because the original cell defines it; nothing in this cell fills it.
all_results = []

for name, podcast in podcasts_to_analyze.items():
    texts = text_splitter.split_text(podcast)

    # Nothing to summarise for an empty transcript.
    if not texts:
        continue

    # One LLM call per chunk; each call returns the bullet points for that chunk.
    results = [chain.run(chunk) for chunk in texts]
    joined_text = "\n".join(results)

    # Use this loop's `name`. The previous code used `file`, a variable left over
    # from an earlier cell, so every summary was written to the same path.
    with open("./podcasts/summary-{}.txt".format(urllib.parse.quote(name)), "w") as f:
        f.write(joined_text)

    # Index the summary itself (previously the last raw chunk was stored) under
    # an id that cannot clash with the transcript already stored under `name`.
    collection.add(
        documents=[joined_text],
        ids=[f"summary-{name}"],
    )

In [13]:
from gpt_index import SimpleDirectoryReader, GPTSimpleVectorIndex, LLMPredictor, PromptHelper

# Load every file in ./podcasts (the transcripts and summaries written earlier).
documents = SimpleDirectoryReader('podcasts').load_data()

# define LLM
llm_predictor = LLMPredictor(llm=OpenAI(temperature=0, model_name="text-davinci-003"))

# define prompt helper
# set maximum input size
max_input_size = 4096
# set number of output tokens
num_output = 256
# set maximum chunk overlap
max_chunk_overlap = 20
prompt_helper = PromptHelper(max_input_size, num_output, max_chunk_overlap)

# Build the index exactly once, with the configured predictor and prompt helper.
# The earlier extra `GPTSimpleVectorIndex(documents)` call built a default index
# that was thrown away immediately, doing the indexing work twice.
index = GPTSimpleVectorIndex(
    documents, llm_predictor=llm_predictor, prompt_helper=prompt_helper
)

# save to disk
index.save_to_disk('index.json')
# load from disk
index = GPTSimpleVectorIndex.load_from_disk('index.json')


Sam did not seem to be very interested in crypto, but he did think it could be a useful tool to experiment with global UBI. He also thought that the spirit of the Web 3 people was great, but he did not intuitively understand why they needed it.


In [14]:
# Ask the index each question in turn and print the answer.
# `response` ends up holding the answer to the last question.
for question in (
    "What did Sam think of crypto?",
    "What is he excited about in technology?",
):
    response = index.query(question)
    print(response)


Sam did not seem to be very interested in crypto, but he did think that it could be a useful tool to experiment with global UBI. He also thought that the spirit of the Web 3 people was great, but he did not intuitively understand why they needed it.

He is excited about the progress being made in biotech, the potential for using technology to experiment with global UBI, and the potential for turning adult cells into gametes. He is also excited about the potential for Gary Tankiman to remake Y-Combinator in the current market, and the potential for startups to create great value in the current market.
