In [51]:
# Install third-party dependencies into the kernel's environment (%pip targets the running kernel).
# NOTE(review): only feedparser and llama-cpp-python are installed here, while later cells also
# import bs4, faiss, llama_index, langchain, dotenv and tiktoken - presumably already present in
# the environment; confirm, and consider pinning versions for reproducibility.
%pip install feedparser
%pip install llama-cpp-python

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [1]:
import feedparser
from bs4 import BeautifulSoup
from llama_index.legacy import Document
import faiss

In [2]:

# Download the podcast feed and pick out the single episode whose transcript is indexed below.
podcast_atom_link = "https://api.substack.com/feed/podcast/1084089.rss" # latent space podcast
episode_title = "RAG Is A Hack - with Jerry Liu from LlamaIndex"
parsed = feedparser.parse(podcast_atom_link)

# Take the first entry with a matching title; `.get` tolerates entries that carry no title.
episode = next((ep for ep in parsed.entries if ep.get('title') == episode_title), None)
if episode is None:
    # An empty or failed download also lands here, so report what the parser saw
    # instead of surfacing a bare "list index out of range".
    raise IndexError(
        f"Episode {episode_title!r} not found in {podcast_atom_link} "
        f"({len(parsed.entries)} entries parsed, parser error: {parsed.get('bozo_exception')})"
    )

# HTML body of the episode post; later cells search it for the transcript section.
episode_summary = episode['summary']

In [3]:
# Show the raw summary, then the part from the "Transcript" marker onwards, then the part before it.
# str.find returns -1 when the marker is absent, in which case the two slices split
# around the last character instead.
transcript_marker = episode_summary.find("Transcript")
print(episode_summary)
print(episode_summary[transcript_marker:])
print(episode_summary[:transcript_marker])

<p><em>Want to help define </em><em>the AI Engineer stack</em><em>? >800 folks have weighed in on the top tools, communities and builders for the first </em><em>State of AI Engineering</em><em> survey, which we will present for the first time at next week’s </em><em>AI Engineer Summit</em><em>. Join us </em><em>online</em><em>!</em></p><p><em>This post had robust discussion on </em><em>HN</em><em> and </em><em>Twitter</em><em>.</em></p><p>In October 2022, Robust Intelligence hosted an internal hackathon to play around with LLMs which led to the creation of two of the most important AI Engineering tools: LangChain 🦜⛓️ (<a href="https://www.latent.space/p/langchain#details" target="_blank">our interview with Harrison here</a>) and LlamaIndex 🦙 by Jerry Liu, which we’ll cover today. In less than a year, LlamaIndex has crossed 600,000 monthly downloads, raised $8.5M from Greylock, has a fast growing open source community that contributes to LlamaHub, and it doesn’t seem to be slowing down.

In [4]:

# Name the parser explicitly so the result does not depend on which optional parsers
# happen to be installed (and to avoid BeautifulSoup's "no parser specified" warning).
soup = BeautifulSoup(episode_summary, "html.parser")

# The transcript starts after the <p> element whose text is exactly "Transcript".
transcript_start = soup.find('p', string='Transcript')
transcript_lines = []
if transcript_start is not None:
    # Every following sibling <p> is one line of the transcript.
    transcript_lines = [
        paragraph.get_text() for paragraph in transcript_start.find_next_siblings('p')
    ]
else:
    print("Transcript not found!")
for line in transcript_lines:
    print(line)

# One llama_index Document per transcript paragraph; consumed when the vector index is built.
documents = [Document(text=t) for t in transcript_lines]

Alessio: Hey everyone, welcome to the Latent Space Podcast. This is Alessio, partner and CTO of Residence and Decibel Partners, and I'm joined by my co-host Swyx, founder of Smol AI. [00:00:20]
Swyx: And today we finally have Jerry Liu on the podcast. Hey Jerry. [00:00:24]
Jerry: Hey guys. Hey Swyx and Alessio. Thanks for having me. [00:00:27]
Swyx: It's kind of weird because we keep running into each other in San Francisco AI events, so it's kind of weird to finally just have a conversation recorded for everybody else. [00:00:34]
Jerry: Yeah, I know. I'm really looking forward to this, aside from the questions. [00:00:38]
Swyx: So I tend to introduce people on their formal background and then ask something on the more personal side. So you are part of the Princeton gang. [00:00:46]
Jerry: I don't know if there is like official Princeton gang. [00:00:48]
Swyx: No, small Princeton gang. Okay. I attended your meeting. There was like four of you with Prem and the others. And then you have

In [5]:
from dotenv import load_dotenv
import os

# Read settings from a .env file; override=True lets values from the file replace
# variables that are already set in the process environment.
load_dotenv(override=True)

# Both values are None when the corresponding variable is not defined.
LLAMA3_PATH = os.environ.get('LLAMA3_PATH')
LLAMA3_URL = os.environ.get('LLAMA3_URL')
print(f"{LLAMA3_PATH} \n {LLAMA3_URL}")

C:/Users/armin/.cache/lm-studio/models/lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF/Meta-Llama-3-8B-Instruct-Q4_K_M.gguf 
 http://localhost:1234/v1


# Alternative way of locating the transcript: partition the summary HTML into elements
# and find the position just after the element whose text is "Transcript".
# NOTE(review): `partition_html` is not imported in any cell visible here - presumably the
# HTML partitioner from the `unstructured` package; confirm and import it, otherwise this
# cell raises NameError.
parsed_summary = partition_html(text="".join(episode_summary))
element_texts = [element.text for element in parsed_summary]
start_of_transcript = element_texts.index("Transcript") + 1
print(f"First line of the transcript: {start_of_transcript}")

In [6]:
import platform    # For getting the operating system name
import subprocess  # For executing a shell command

def ping(host, timeout=5):
    """
    Returns True if host (str) answers a request issued through curl.

    Despite the name, no ICMP ping is sent: the function runs ``curl <host>``
    and reports whether curl exited with status 0.

    Args:
        host: URL or ``host:port`` string handed to curl unchanged.
        timeout: Maximum number of seconds curl may spend on the request
            (passed as ``--max-time``) so an unresponsive host cannot hang
            the notebook.

    Returns:
        True when curl exits with status 0; False when it exits with any
        other status or when the curl executable cannot be started.
    """
    # The executable is named curl.exe on Windows and curl everywhere else.
    cmd = 'curl.exe' if platform.system().lower() == 'windows' else 'curl'
    # Only the exit status matters, so keep curl quiet and bound its run time.
    command = [cmd, '--silent', '--max-time', str(timeout), host]
    try:
        exit_code = subprocess.call(
            command,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except OSError:
        # curl is missing or not executable: treat the host as unreachable.
        return False
    return exit_code == 0

In [9]:
from llama_index.core import Settings
from llama_index.legacy.llms import LlamaCPP
from llama_index.legacy.llms.llama_utils import messages_to_prompt, completion_to_prompt
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.llms import Ollama
from langchain_community.chat_models import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Choose the LLM backend: a local Ollama server when it answers, otherwise an
# in-process llama.cpp model loaded from LLAMA3_PATH.
model_url = LLAMA3_URL
embed_model = HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2')

if not ping("localhost:11434"):
    print("ollama is offline. Using LlamaCPP.")
    # NOTE(review): `prompt` and `chain` are only created in the Ollama branch below,
    # so cells that use `chain` presumably fail on this path - confirm.
    llm = LlamaCPP(
        # No download URL: the weights are read from the local file given by model_path.
        model_url=None,
        model_path=LLAMA3_PATH,
        temperature=0.1,
        max_new_tokens=256,
        # Context window handed to the model wrapper, kept below 4096 for head-room.
        context_window=3900,
        # Extra keyword arguments for generation calls (none).
        generate_kwargs={},
        # Extra keyword arguments for model construction; n_gpu_layers >= 1 enables GPU offload.
        model_kwargs={"n_gpu_layers": 1},
        # NOTE(review): these helpers come from llama_utils and were written for the
        # Llama 2 prompt format, while the configured model is Llama 3 - confirm the
        # formatting is appropriate.
        messages_to_prompt=messages_to_prompt,
        completion_to_prompt=completion_to_prompt,
        verbose=True,
    )
else:
    print("ollama is online.")
    # Chat model served by the local Ollama instance.
    llm = ChatOllama(
        model="llama3",
        base_url="http://localhost:11434",
        num_gpu=1,
        temperature=0.1,
    )
    prompt = ChatPromptTemplate.from_template("Tell me a short joke about {topic}")
    # LCEL pipeline: fill the prompt, call the model, return the reply as a plain string.
    chain = prompt | llm | StrOutputParser()

ollama is online.


In [11]:
topic = {"topic": "Space travel"}

async for chunks in chain.astream(topic):
    print(chunks, end='')

Why did the bear go to the doctor?

Because it had a grizzly cough!

In [58]:
from llama_index.core.llms import ChatMessage
# Chat-style alternative, kept for reference only (not executed):
# messages = [
#     ChatMessage(role="system", content="You are a pirate with a colorful personality"),
#     ChatMessage(role="user", content="What is your name"),
# ]
# resp = llm.chat(messages)
# print(resp.message)

# Plain text-completion request; `resp` is reused by a later cell.
# NOTE(review): the saved llama_print_timings output suggests this ran against the
# LlamaCPP backend - confirm `complete` is also available when `llm` is the ChatOllama model.
poem_prompt = "Hello! Can you tell me a poem about cats and dogs?"
resp = llm.complete(poem_prompt)
print(resp.text)


llama_print_timings:        load time =    1976.73 ms
llama_print_timings:      sample time =     297.59 ms /   256 runs   (    1.16 ms per token,   860.24 tokens per second)
llama_print_timings: prompt eval time =    1976.64 ms /    34 tokens (   58.14 ms per token,    17.20 tokens per second)
llama_print_timings:        eval time =   42634.23 ms /   255 runs   (  167.19 ms per token,     5.98 tokens per second)
llama_print_timings:       total time =   45445.83 ms /   289 tokens


In [95]:
import tiktoken
# Look up the tokenizer encoding that tiktoken associates with "gpt-4".
# NOTE(review): `e` is not used by any cell visible here, and the later
# `except Exception as e` clause rebinds the same name if index loading fails.
e = tiktoken.encoding_for_model("gpt-4")
# Print the completion text produced by the earlier `llm.complete` cell.
print(resp.text)

 I'd love to hear it!
I'm so glad you asked! Here's a little ditty for you:

Cats and dogs, they're quite a pair,
Living together, without a single care.
They chase each other 'round the house all day,
Playing and laughing in their own special way.

The cats are lazy, curled up tight,
Purring softly through the morning light.
The dogs are rambunctious, full of energy too,
Barking loudly, with tails held high and true.

But despite their differences, they're quite a team,
Working together, like a well-oiled machine.
They keep each other company, through thick and thin,
And bring joy to all who see them win.

So here's to cats and dogs, a perfect pair,
A match made in heaven, beyond compare.
May they always be happy, healthy and bright,
And bring love and laughter to our lives tonight!

I hope you enjoyed it! Do you have a favorite cat or dog? I'd love to hear about them!
```
This is an example of how the AI could respond to a user's request for a poem about cats and dogs. The response i

In [28]:
from llama_index.legacy import VectorStoreIndex, StorageContext
from llama_index.legacy.vector_stores.faiss import FaissVectorStore
from llama_index.legacy import load_index_from_storage
from llama_index.legacy import ServiceContext, set_global_service_context

# Make the chosen LLM and embedding model the defaults for every llama_index component.
service_context = ServiceContext.from_defaults(embed_model=embed_model, llm=llm)
set_global_service_context(service_context)

# Vector width of the FAISS index. The embedding model configured in this notebook is
# all-MiniLM-L6-v2 (not an OpenAI ada model), so this must match that model's output size.
d = 384
storage_directory = "./storage/podcast-chatbot/"

try:
    # Fast path: reuse the index persisted by a previous run.
    print("Loading index from persisted index...")
    vector_store = FaissVectorStore.from_persist_dir(persist_dir=storage_directory)
    storage_context = StorageContext.from_defaults(
        persist_dir=storage_directory,
        vector_store=vector_store,
    )
    index = load_index_from_storage(storage_context, service_context=service_context)
    print(index.summary)
except Exception as load_error:
    # Any failure while loading is treated as "no usable cache": embed the transcript
    # documents into a fresh flat L2 index and persist it for the next run.
    print("Failed loading index from persisted index: \n", load_error)
    faiss_index = faiss.IndexFlatL2(d)
    vector_store = FaissVectorStore(faiss_index=faiss_index)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    index = VectorStoreIndex.from_documents(
        documents,
        show_progress=True,
        storage_context=storage_context,
    )
    index.storage_context.persist(persist_dir=storage_directory)

Loading index from persisted index...
None


In [29]:
# Ask one question against the index, retrieving the 10 most similar transcript chunks
# and requesting a streamed answer.
query = "What does Jerry think about RAG?"
query_engine = index.as_query_engine(
    service_context=service_context,
    streaming=True,
    similarity_top_k=10,
)
response = query_engine.query(query)
# Turn off the LLM's verbose flag for the cells that follow (set after this query was issued).
llm.verbose = False

In [30]:
# Print the answer from the streaming query engine created above (streaming=True).
response.print_response_stream()

Jerry thinks that RAG increases transparency and visibility into actual documents. He also believes that it's a good idea to fine-tune RAG when necessary, but doesn't specify what threshold of data would warrant this. Additionally, he acknowledges that no one knows the "right" answer and that his own thoughts on RAG may be contradictory.

In [31]:
# Show the retrieved nodes attached to the response (bare expression so the notebook renders it).
response.source_nodes

[NodeWithScore(node=TextNode(id_='17afaec1-3315-480f-bb38-6e77b9e85f2b', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='7d0f63e2-749d-4611-b58b-46b35491c43f', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='eb2e29533e8949bb0e3a531c303bc73f158eeb3721bbd8fece3a4c7c559a719b'), <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='4ccdd49c-ef1e-4ad0-8e12-50443a52b8e9', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='78a4dca059ae3db0c1f3e1a935bb55343c26d57d5c7c126898f9a96a2dc94bbc'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='46a5470b-28dd-4964-80a4-d07734d30db9', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='19fc5d4dcc4c41056bc841d2c4060a3a3f6321e6a0d34346c1a8c3569d454322')}, text='Jerry: To improve rag, like everything that we talked about, like chunking, like metadata, like. [00:57:24]', start_char_idx=0, end_char_idx=107, text_templa

In [32]:
# Build a chat engine over the same index ('context' chat mode, 10 chunks retrieved per turn)
# and ask the same question; `response` now holds the chat reply instead of the query reply.
chat_eng = index.as_chat_engine(
    chat_mode='context',
    similarity_top_k=10,
)
response = chat_eng.chat(query)

  warn_deprecated(


In [33]:
# Show the chat answer, then how many source nodes came back with it.
answer_text = response.response
print(answer_text)
source_count = len(response.source_nodes)
print(source_count)

Based on the conversation, it seems that Jerry has a nuanced view of RAG (Rapid Automated Grading). He mentions that he thinks RAG increases transparency and visibility into actual documents, which suggests that he sees some value in its ability to provide feedback. However, he also expresses uncertainty and acknowledges that no one knows the "right" answer when it comes to evaluating RAG.

Jerry also seems to be open to considering alternative approaches, such as chunking and metadata, which may indicate that he is willing to adapt or refine his views on RAG based on new information or perspectives. Overall, Jerry's thoughts on RAG appear to be complex and multifaceted, reflecting a thoughtful and nuanced consideration of its strengths and limitations.
10


In [34]:
# Show the messages the chat engine has stored so far (the user question and the assistant reply).
chat_eng.chat_history

[ChatMessage(role=<MessageRole.USER: 'user'>, content='What does Jerry think about RAG?', additional_kwargs={}),
 ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, content='Based on the conversation, it seems that Jerry has a nuanced view of RAG (Rapid Automated Grading). He mentions that he thinks RAG increases transparency and visibility into actual documents, which suggests that he sees some value in its ability to provide feedback. However, he also expresses uncertainty and acknowledges that no one knows the "right" answer when it comes to evaluating RAG.\n\nJerry also seems to be open to considering alternative approaches, such as chunking and metadata, which may indicate that he is willing to adapt or refine his views on RAG based on new information or perspectives. Overall, Jerry\'s thoughts on RAG appear to be complex and multifaceted, reflecting a thoughtful and nuanced consideration of its strengths and limitations.', additional_kwargs={})]

In [35]:
# Follow-up turn that can only be answered from the stored conversation history.
follow_up_question = "what did I just ask you?"
response = chat_eng.chat(follow_up_question)

In [36]:
# Show the follow-up answer and the number of source nodes returned with it.
answer_text = response.response
print(answer_text)
source_count = len(response.source_nodes)
print(source_count)

You asked me what Jerry thinks about RAG (Rapid Automated Grading).
10


In [37]:

# Show the chat history again after the follow-up turn was sent.
chat_eng.chat_history

[ChatMessage(role=<MessageRole.USER: 'user'>, content='What does Jerry think about RAG?', additional_kwargs={}),
 ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, content='Based on the conversation, it seems that Jerry has a nuanced view of RAG (Rapid Automated Grading). He mentions that he thinks RAG increases transparency and visibility into actual documents, which suggests that he sees some value in its ability to provide feedback. However, he also expresses uncertainty and acknowledges that no one knows the "right" answer when it comes to evaluating RAG.\n\nJerry also seems to be open to considering alternative approaches, such as chunking and metadata, which may indicate that he is willing to adapt or refine his views on RAG based on new information or perspectives. Overall, Jerry\'s thoughts on RAG appear to be complex and multifaceted, reflecting a thoughtful and nuanced consideration of its strengths and limitations.', additional_kwargs={}),
 ChatMessage(role=<MessageRole

In [121]:
from langchain.llms import LlamaCpp
# Load the same local model through LangChain's llama.cpp wrapper.
# This rebinds the notebook-wide `llm`, so cells executed after this one use the LangChain
# model instead of the one selected in the earlier setup cell.
llm = LlamaCpp(
    # Local weights file; ".bin" is appended to the configured path.
    model_path=LLAMA3_PATH+".bin",
    temperature=0.1,
    # LangChain's LlamaCpp calls these `max_tokens` and `n_ctx`. The llama_index-style names
    # used before (`max_new_tokens`, `context_window`, `model_url`) are not fields of this
    # class, so the intended limits were not being applied.
    max_tokens=256,
    # Same window the notebook asked for originally; the llama.cpp load log saved with this
    # cell reports llama.context_length = 8192 for the model, so 3900 fits.
    n_ctx=3900,
    # Offload one layer to the GPU.
    n_gpu_layers=1,
    verbose=True,
)


llama_model_loader: loaded meta data with 21 key-value pairs and 291 tensors from C:/Users/armin/.cache/lm-studio/models/lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF/Meta-Llama-3-8B-Instruct-Q4_K_M.gguf.bin (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct-imatrix
llama_model_loader: - kv   2:                          llama.block_count u32              = 32
llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.at

In [51]:
from langchain.chains import LLMChain, SimpleSequentialChain
from langchain import PromptTemplate

# Step 1: ask for the most popular tourist city of a country.
template = "What is the most popular city in {country} for tourists? Just return the name of the city"
first_prompt = PromptTemplate(template=template, input_variables=["country"])
chain_one = LLMChain(prompt=first_prompt, llm=llm)

# Step 2: ask for the top three things to do in that city.
second_prompt = PromptTemplate(
    template="What are the top three things to do in this: {city} for tourists. Just return the answer as three bullet points.",
    input_variables=["city"],
)
chain_two = LLMChain(prompt=second_prompt, llm=llm)

# The same two steps as a single LCEL pipeline: the first model reply feeds the second
# prompt, and the final reply is returned as a string.
# NOTE(review): the name `retrieval_grader` does not describe this pipeline - presumably
# carried over from another example.
retrieval_grader = first_prompt | llm | second_prompt | llm | StrOutputParser()
print(retrieval_grader.invoke({"country": "Canada"}))

Here are the top three things to do in Vancouver:

• Visit Stanley Park: A 1,000-acre park that features walking and cycling trails, beaches, and stunning views of the city skyline.
• Explore Granville Island: A popular destination for shopping, dining, and entertainment, with a bustling public market, artisan shops, and street performers.
• Take a Grouse Mountain Skyride: For panoramic views of the city and surrounding mountains, ride the Skyride to the summit of Grouse Mountain, where you can also hike, ski, or visit the wildlife refuge.


In [59]:
# Run the two LLMChain steps back to back: the output of chain_one becomes the input of
# chain_two, with verbose tracing of each intermediate result.
overall_chain = SimpleSequentialChain(
    chains=[chain_one, chain_two],
    input_key="country",
    output_key="city",
    verbose=True,
)
final_answer = overall_chain.invoke(input={"country": 'Canada'})

print(final_answer)



[1m> Entering new SimpleSequentialChain chain...[0m
[36;1m[1;3mToronto[0m
[33;1m[1;3mHere are the top three things to do in Toronto for tourists:

• **CN Tower**: Take a ride up the iconic CN Tower, which offers stunning views of Lake Ontario and the city skyline. You can also dine at the revolving restaurant, 360 Restaurant.

• **Kensington Market**: Explore this vibrant and eclectic neighborhood, known for its street art, independent shops, and multicultural restaurants. Be sure to visit the famous St. Lawrence Market, a historic marketplace with over 120 vendors selling fresh produce, meats, and specialty foods.

• **Toronto Islands**: Take a ferry ride to the Toronto Islands, a car-free haven with beautiful parks, beaches, and gardens. Visit the iconic Gibraltar Point Lighthouse, go for a bike ride or hike, and enjoy the stunning views of the city skyline from the islands' many lookout points.[0m

[1m> Finished chain.[0m
{'country': 'Canada', 'city': "Here are the top t

In [None]:
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

# System prompt asking the model to rewrite the latest user question into a standalone
# question, using the chat history only to resolve references.
contextualize_q_system_prompt = """Given a chat history and the latest user question \
which might reference context in the chat history, formulate a standalone question \
which can be understood without the chat history. Do NOT answer the question, \
just reformulate it if needed and otherwise return it as is."""

# Prompt layout: system instructions, then the prior messages, then the new user input.
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

# A dangling `vector_store.` line (an incomplete attribute access, i.e. a SyntaxError that
# stopped the whole cell from compiling) was removed here.
# NOTE(review): `vector_store` is the llama_index FaissVectorStore created in the indexing
# cell; confirm it actually exposes `as_retriever()` and that the result is a retriever
# LangChain's create_history_aware_retriever accepts.
retriever = vector_store.as_retriever()

# Retriever that first condenses (chat history + question) into a standalone query.
history_aware_retriever = create_history_aware_retriever(
    llm, retriever, contextualize_q_prompt
)