In [1]:
from langchain.chat_models import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.summarize import load_summarize_chain
from langchain.chains import create_extraction_chain

# Vector Store and retrievals
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.vectorstores import Pinecone
import pinecone

# Chat Prompt templates for dynamic values
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate
)

# Supporting libraries
import os
from dotenv import load_dotenv

load_dotenv()


  from tqdm.autonotebook import tqdm


False

In [4]:
from getpass import getpass

# Ask for the OpenAI API key interactively so it is never written into the notebook source.
key = getpass(prompt='enter key')

enter key········


In [5]:
# Creating two versions of the model so I can swap between gpt3.5 and gpt4.
# Both share the same key, temperature=0, and a 180-second request timeout.

# llm3 passes no model_name, so it uses ChatOpenAI's default chat model
# (gpt-3.5-turbo per the LangChain docs -- confirm for your installed version).
llm3 = ChatOpenAI(temperature=0,
                  openai_api_key=key,
                  request_timeout = 180
                )

# llm4 is the GPT-4 variant used by the retrieval QA cell further down;
# without this definition that cell fails with a NameError.
llm4 = ChatOpenAI(temperature=0,
                  openai_api_key=key,
                  model_name="gpt-4",
                  request_timeout = 180
                )


In [17]:
# Transcript files to work with, one per podcast guest.
# All files share the './data/mfm_pod_<guest>.txt' naming pattern.
transcript_paths = [
    f'./data/mfm_pod_{guest}.txt'
    for guest in ('steph', 'alex', 'rob')
]


In [14]:
# Read the first transcript in transcript_paths ('./data/mfm_pod_steph.txt') into memory.
# The path is taken from the list so the file name is declared in one place only.
# The encoding is pinned because open() otherwise falls back to the platform's
# locale encoding, which makes the notebook behave differently across machines.
# NOTE(review): assumes the transcript files are UTF-8 -- confirm.
with open(transcript_paths[0], encoding='utf-8') as file:
    transcript = file.read()

In [16]:
# Peek at the first 200 characters to sanity-check that the transcript loaded.
transcript[:200]

"Shaan Puri (0:00:00-0:00:03): D to see hearing AIDS. I think that's actually going to be a big deal. \n\nSam Parr (0:00:03-0:00:05): And they're profitable. \n\nShaan Puri (0:00:05-0:00:08): I mean, I'm j"

In [21]:
# Splitter for the topic-extraction pass. Separators are listed in priority order:
# blank lines first, then single newlines, then spaces.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " "],
    chunk_size=10000,
    chunk_overlap=2200,
)


In [26]:
# Chunk only the first 23250 characters of the transcript into documents.
docs = text_splitter.create_documents(
    texts=[transcript[:23250]]
)

In [27]:
# Number of documents the splitter produced.
len(docs)

3

In [31]:
# Only the first 23250 characters are processed, to save on API costs.
# When you run your own exercise, remove the slice to let all the data through.
transcript_subsection_characters = 23250
docs = text_splitter.create_documents(
    texts=[transcript[:transcript_subsection_characters]]
)

# Report how many chunks there are and the token count of the first one.
print(f"You have {len(docs)} docs. First doc is {llm3.get_num_tokens(docs[0].page_content)} tokens")

You have 3 docs. First doc is 2801 tokens


In [32]:
# System instructions for the map stage: pull topic bullets out of one transcript chunk.
# The examples only illustrate the output format; the prompt tells the model not to reuse them.
template="""
You are a helpful assistant that helps retrieve topics talked about in a podcast transcript
- Your goal is to extract the topic names and brief 1-sentence description of the topic
- Topics include:
  - Themes
  - Business Ideas
  - Interesting Stories
  - Money making businesses
  - Quick stories about people
  - Mental Frameworks
  - Stories about an industry
  - Analogies mentioned
  - Advice or words of caution
  - Pieces of news or current events
- Provide a brief description of the topics after the topic name. Example: 'Topic: Brief Description'
- Use the same words and terminology that is said in the podcast
- Do not respond with anything outside of the podcast. If you don't see any topics, say, 'No Topics'
- Do not respond with numbers, just bullet points
- Do not include anything about 'Marketing Against the Grain'
- Only pull topics from the transcript. Do not use the examples
- Make your titles descriptive but concise. Example: 'Shaan's Experience at Twitch' should be 'Shaan's Interesting Projects At Twitch'
- A topic should be substantial, more than just a one-off comment

% START OF EXAMPLES
 - Sam’s Elisabeth Murdoch Story: Sam got a call from Elizabeth Murdoch when he had just launched The Hustle. She wanted to generate video content.
 - Shaan’s Rupert Murdoch Story: When Shaan was running Blab he was invited to an event organized by Rupert Murdoch during CES in Las Vegas.
 - Revenge Against The Spam Calls: A couple of businesses focused on protecting consumers: RoboCall, TrueCaller, DoNotPay, FitIt
 - Wildcard CEOs vs. Prudent CEOs: However, Munger likes to surround himself with prudent CEO’s and says he would never hire Musk.
 - Chess Business: Priyav, a college student, expressed his doubts on the MFM Facebook group about his Chess training business, mychesstutor.com, making $12.5K MRR with 90 enrolled.
 - Restaurant Refiller: An MFM Facebook group member commented on how they pay AirMark $1,000/month for toilet paper and toilet cover refills for their restaurant. Shaan sees an opportunity here for anyone wanting to compete against AirMark.
 - Collecting: Shaan shared an idea to build a mobile only marketplace for a collectors’ category; similar to what StockX does for premium sneakers.
% END OF EXAMPLES
"""
system_message_prompt_map = SystemMessagePromptTemplate.from_template(template)

# The human message simply carries the chunk text through the {text} placeholder.
human_template = "Transcript: {text}"
human_message_prompt_map = HumanMessagePromptTemplate.from_template(human_template)

# System + human messages together form the map-stage chat prompt.
chat_prompt_map = ChatPromptTemplate.from_messages(
    [system_message_prompt_map, human_message_prompt_map]
)

In [33]:
# Display the assembled map-stage chat prompt to check its messages and input variables.
chat_prompt_map

ChatPromptTemplate(input_variables=['text'], output_parser=None, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], output_parser=None, partial_variables={}, template="\nYou are a helpful assistant that helps retrieve topics talked about in a podcast transcript\n- Your goal is to extract the topic names and brief 1-sentence description of the topic\n- Topics include:\n  - Themes\n  - Business Ideas\n  - Interesting Stories\n  - Money making businesses\n  - Quick stories about people\n  - Mental Frameworks\n  - Stories about an industry\n  - Analogies mentioned\n  - Advice or words of caution\n  - Pieces of news or current events\n- Provide a brief description of the topics after the topic name. Example: 'Topic: Brief Description'\n- Use the same words and terminology that is said in the podcast\n- Do not respond with anything outside of the podcast. If you don't see any topics, say, 'No Topics'\n- Do not respond with numbers, just bull

In [34]:
# System instructions for the combine stage: merge and deduplicate topic bullets.
# Typos in the instructions are fixed so the model receives clean wording.
template="""
You are a helpful assistant that helps retrieve topics talked about in a podcast transcript
- You will be given a series of bullet points of topics found
- Your goal is to extract the topic names and brief 1-sentence description of the topic
- Deduplicate any bullet points you see
- Only pull topics from the transcript. Do not use the examples

% START OF EXAMPLES
 - Sam’s Elisabeth Murdoch Story: Sam got a call from Elizabeth Murdoch when he had just launched The Hustle. She wanted to generate video content.
 - Shaan’s Rupert Murdoch Story: When Shaan was running Blab he was invited to an event organized by Rupert Murdoch during CES in Las Vegas.
% END OF EXAMPLES
"""
# Combine-stage objects get their own *_combine names so they do not overwrite
# the map-stage prompt variables defined in the previous prompt cell.
system_message_prompt_combine = SystemMessagePromptTemplate.from_template(template)

human_template="Transcript: {text}" # Simply just pass the text as a human message
human_message_prompt_combine = HumanMessagePromptTemplate.from_template(human_template)

chat_prompt_combine = ChatPromptTemplate.from_messages(messages=[system_message_prompt_combine, human_message_prompt_combine])

In [35]:
# Map-reduce summarize chain: chat_prompt_map is used for the map step and
# chat_prompt_combine for the combine step.
chain = load_summarize_chain(
    llm3,
    chain_type="map_reduce",
    map_prompt=chat_prompt_map,
    combine_prompt=chat_prompt_combine,
    # verbose=True,
)

In [36]:
# Run the map-reduce chain over all chunks; the result is the combined topic text.
Topics_found = chain.run(input_documents=docs)

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=180.0).


In [37]:
# Print the topic bullets returned by the summarize chain (print renders the newlines).
print(Topics_found)

- Fractional real estate: Converting unused commercial spaces into different types of businesses like yoga studios and rage rooms.
- Temple Immersive: A club in San Francisco that transforms into a yoga class during the day and a nightclub at night, utilizing unused real estate.
- Rage rooms: Rooms where people can release anger and frustration by smashing objects.
- Escape rooms: Participants are locked in a room and must solve puzzles to escape within a time limit. Raleigh Williams sold his escape room business for $26 million.
- Out-of-home entertainment: Activities like trampoline parks and axe throwing bundled together for a unique entertainment experience.


In [38]:
# Schema describing one extracted topic record for the extraction chain.
schema = {
    "properties": {
        # The title of the topic
        "topic_name": {
            "type": "string",
            "description" : "The title of the topic listed"
        },
        # The description
        "description": {
            "type": "string",
            "description" : "The description of the topic listed"
        },
        # Category label, restricted to the fixed set of values below
        "tag": {
            "type": "string",
            "description" : "The type of content being described",
            "enum" : ['Business Models', 'Life Advice', 'Health & Wellness', 'Stories']
        }
    },
    # Every required name must match a key in "properties"; the original listed
    # "topic", which is not a declared property (the field is "topic_name").
    "required": ["topic_name", "description"],
}

In [39]:
# Rebind `chain` to an extraction chain that turns text into records matching `schema`.
chain = create_extraction_chain(schema=schema, llm=llm3)

In [41]:
# Feed the topic bullet text through the extraction chain to get structured records.
topics_structured = chain.run(Topics_found)

In [42]:
# Inspect the structured topics returned by the extraction chain.
topics_structured

[{'topic_name': 'Fractional real estate',
  'description': 'Converting unused commercial spaces into different types of businesses like yoga studios and rage rooms.',
  'tag': 'Business Models'},
 {'topic_name': 'Temple Immersive',
  'description': 'A club in San Francisco that transforms into a yoga class during the day and a nightclub at night, utilizing unused real estate.',
  'tag': 'Business Models'},
 {'topic_name': 'Rage rooms',
  'description': 'Rooms where people can release anger and frustration by smashing objects.',
  'tag': 'Business Models'},
 {'topic_name': 'Escape rooms',
  'description': 'Participants are locked in a room and must solve puzzles to escape within a time limit. Raleigh Williams sold his escape room business for $26 million.',
  'tag': 'Business Models'},
 {'topic_name': 'Out-of-home entertainment',
  'description': 'Activities like trampoline parks and axe throwing bundled together for a unique entertainment experience.',
  'tag': 'Business Models'}]

## Retrieval Techniques

In [43]:
# Re-split the same transcript subsection with a smaller chunk size and overlap
# than the topic-extraction pass used.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=4000,
    chunk_overlap=800,
)

docs = text_splitter.create_documents(
    texts=[transcript[:transcript_subsection_characters]]
)

# Report how many chunks there are and the token count of the first one.
print(f"You have {len(docs)} docs. First doc is {llm3.get_num_tokens(docs[0].page_content)} tokens")

You have 8 docs. First doc is 776 tokens


In [None]:
# The system instructions. Notice the 'context' placeholder down below. This is where our relevant docs will go.
# The 'question' in the human message below won't be a question per se, but rather a topic we want to get relevant information on
# (Prompt wording fix: "You goal" -> "Your goal".)
system_template = """
You will be given text from a podcast transcript which contains many topics.
Your goal is to write a summary (5 sentences or less) about a topic the user chooses
Do not respond with information that isn't relevant to the topic that the user gives you
----------------
{context}"""

# System message carries the instructions plus {context}; the human message carries the {question}.
messages = [
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template("{question}"),
]

# This will pull the two messages together and get them ready to be sent to the LLM through the retriever
CHAT_PROMPT = ChatPromptTemplate.from_messages(messages)

In [None]:
# I'm using gpt4 for the increased reasoning power.
# I'm also setting k=4 so the number of relevant docs we get back is 4. This parameter should be tuned to your use case
# I'm using gpt4 for the increased reasoning power.
# I'm also setting k=4 so the number of relevant docs we get back is 4. This parameter should be tuned to your use case
# The count has to go through `search_kwargs`; a bare `k=4` keyword on as_retriever()
# is not forwarded to the similarity search.
# Prerequisites: `llm4` (the GPT-4 chat model) and `docsearch` (the vector store holding
# the embedded docs) must already exist when this cell runs.
qa = RetrievalQA.from_chain_type(llm=llm4,
                                 chain_type="stuff",
                                 retriever=docsearch.as_retriever(search_kwargs={"k": 4}),
                                 chain_type_kwargs = {
#                                      'verbose': True,
                                     'prompt': CHAT_PROMPT
                                 })