In [1]:
import os
import sys

import nest_asyncio
from dotenv import load_dotenv

# Sanity check: print the path of the interpreter running this kernel.
print(sys.executable)

# Applied once, up front, before any cell uses top-level `await`.
nest_asyncio.apply()

# Kept as the cell's last expression so its boolean result is displayed.
load_dotenv()

/Users/amorvan/Documents/code_dw/llm_collection/.venv/bin/python


True

In [5]:
import os
from pydantic import BaseModel, Field
from llama_index.core.workflow import (
    Workflow,
    step,
    Event,
    Context,
    StartEvent,
    StopEvent
)
from llama_index.llms.openai import OpenAI
from llama_index.core import SimpleDirectoryReader
from llama_index.core import (
    SimpleDirectoryReader,
    load_index_from_storage,
    VectorStoreIndex,
    StorageContext,
)
from llama_index.core.node_parser import SentenceSplitter
from llama_index.retrievers.bm25 import BM25Retriever
import Stemmer
from llama_index.core import VectorStoreIndex, get_response_synthesizer
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine


In [11]:
!mkdir -p 'data/paul_graham/'
!curl -o './paul_graham_essay.txt' 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt'

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 75042  100 75042    0     0   253k      0 --:--:-- --:--:-- --:--:--  254k


In [12]:
# LLM instance referenced by the workflow steps further down in the notebook.
llm = OpenAI(model="gpt-4o-mini", temperature=0.2)

## 1 - RAG 

Using a BM25 retriever.

In [13]:

# Read the downloaded essay into llama_index documents.
documents = SimpleDirectoryReader(input_files=["./paul_graham_essay.txt"]).load_data()

# Cut the documents into nodes with the sentence splitter (chunk_size=256).
splitter = SentenceSplitter(chunk_size=256)
nodes = splitter.get_nodes_from_documents(documents)

# BM25 retriever over those nodes, configured to return 5 results per query
# and to use an English stemmer.
retriever_top_5 = BM25Retriever.from_defaults(
    nodes=nodes,
    language="english",
    stemmer=Stemmer.Stemmer("english"),
    similarity_top_k=5,
)

In [14]:
# Query the BM25 retriever with a single keyword; `rez` holds the returned results.
rez = retriever_top_5.retrieve("computer")

# Print the first result (node id, text preview and score, as shown in the output).
print(rez[0])

Node ID: 93f9055d-64ad-4d4d-af23-9d88c8e997fe
Text: So I'm not surprised I can't remember any programs I wrote,
because they can't have done much. My clearest memory is of the moment
I learned it was possible for programs not to terminate, when one of
mine didn't. On a machine without time-sharing, this was a social as
well as a technical error, as the data center manager's expression
made clear....
Score:  1.289



In [None]:
# Show the full text of the second retrieved result.
print(rez[1].text)

In [None]:
# Configure the response synthesizer. The notebook's `llm` is passed explicitly so
# the answer is generated by the configured model instead of the library default.
response_synthesizer = get_response_synthesizer(
    llm=llm,
    response_mode="tree_summarize",
)

# Assemble the query engine: the BM25 retriever feeds the synthesizer.
query_engine = RetrieverQueryEngine(
    retriever=retriever_top_5,
    response_synthesizer=response_synthesizer,
)
response = query_engine.query("Who is Paul Graham.")

In [None]:
# Display the `response` attribute of the query engine's result.
response.response

## 2 - Exercise

Combine it with Workflows

Create a workflow that:
- Searches for the best quote matching the user query
- Makes a rap about it




In [26]:
# BM25 retriever over the same nodes, configured to return a single result per
# query; the workflow below uses that one result as its quote.
retriever_top_1 = BM25Retriever.from_defaults(
    nodes=nodes,
    language="english",
    stemmer=Stemmer.Stemmer("english"),
    similarity_top_k=1,
)


class RapEvent(Event):
    """Event carrying the quote the rap should be based on.

    Attributes:
        quote: Text of the selected passage.
    """

    # Declared as a typed field so that a missing or misspelled payload fails when
    # the event is built, not later when the consuming step reads `ev.quote`.
    quote: str


class ContextualGrahamRapWorkflow(Workflow):
    """Two-step workflow: retrieve one passage for the query, then rap about it."""

    @step
    def do(self, ev: StartEvent) -> RapEvent:
        """Retrieve the top result for the `message` given to `run()`."""
        hits = retriever_top_1.retrieve(ev["message"])
        best_hit = hits[0]
        return RapEvent(quote=best_hit.text)

    @step
    def rap(self, ev: RapEvent) -> StopEvent:
        """Ask the LLM for a rap based on the quote and finish with its completion."""
        prompt = f"Make a rap on Paul Graham based on this quote {ev.quote}"
        return StopEvent(result=llm.complete(prompt))



In [27]:
# Instantiate the retrieve-then-rap workflow.
w = ContextualGrahamRapWorkflow()

# Run it with the query passed as `message`; the first step reads ev["message"].
r = await w.run(message="fun")

# `r` is the value the final step placed in StopEvent(result=...); print its text.
print(r.text)

(Verse 1)  
Yo, let me take you back to '96,  
Paul Graham on the scene, coding slick tricks,  
Old-fashioned site, but it was the bomb,  
Clunky vibes, but it had that charm.  
He said, "If you’re curious, take a look inside,  
This software’s got history, it’s got pride."  
While the world was changing, he held his ground,  
In a digital jungle, he was the king crowned.

(Chorus)  
Paul Graham, the visionary, breaking the mold,  
With a mind so sharp, and a heart so bold.  
From Y Combinator to the code he wrote,  
In the world of tech, he’s the one we quote.

(Verse 2)  
September came, Robert felt the heat,  
“Been grinding for a month, still can’t feel my feet.”  
Three years later, still in the grind,  
But Paul had a plan, he was one of a kind.  
“Let’s recruit some talent, bring in the best,”  
Trevor Blackwell, man, he passed the test.  
Notecards in hand, stacking life like a pro,  
But when it came to hacking, he stole the show.

(Chorus)  
Paul Graham, the visionary, breaki

In [29]:
# Inspect what the top-1 retriever returns for the query used in the workflow run above.
rez = retriever_top_1.retrieve("fun")
rez

[NodeWithScore(node=TextNode(id_='f6ef5e97-beab-4632-9c33-43d90e7cb065', embedding=None, metadata={'file_path': 'paul_graham_essay.txt', 'file_name': 'paul_graham_essay.txt', 'file_type': 'text/plain', 'file_size': 75042, 'creation_date': '2025-02-20', 'last_modified_date': '2025-02-20'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='9502a3e5-9c8b-4df9-b3f6-536ebdda7ee9', node_type='4', metadata={'file_path': 'paul_graham_essay.txt', 'file_name': 'paul_graham_essay.txt', 'file_type': 'text/plain', 'file_size': 75042, 'creation_date': '2025-02-20', 'last_modified_date': '2025-02-20'}, hash='0c3c3f46cac874b495d944dfc4b920f6b68817dbbb1699ecc955d1fafb2bf87b'), <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo

## 3 - Exercise (if time permits) 

Combine it with a reranker

Create a workflow that:
- Searches for the best quotes matching the user query
- Reranks them
- Makes a rap about the best one


In [50]:
from llama_index.core import QueryBundle
from llama_index.core.postprocessor import LLMRerank


# BM25 retriever returning 5 candidates per query; the reranking workflow below
# narrows these down to the single best passage.
retriever_top_5 = BM25Retriever.from_defaults(
    nodes=nodes,
    language="english",
    stemmer=Stemmer.Stemmer("english"),
    similarity_top_k=5,
)


class QuotesEvent(Event):
    """Event passed from the retrieval step to the reranking step.

    Built with `quotes` (the retrieved results) and `query` (the user's query).
    """

class RapEvent(Event):
    """Event carrying the quote the rap should be based on.

    Attributes:
        quote: Text of the selected passage.
    """

    # Declared as a typed field so that a missing or misspelled payload fails when
    # the event is built, not later when the consuming step reads `ev.quote`.
    quote: str


class RerankedGrahamRapWorkflow(Workflow):
    """Retrieve candidate passages, rerank them with an LLM, then rap about the best one."""

    @step
    def do(self, ev: StartEvent) -> QuotesEvent:
        """Retrieve candidates for the `message` given to `run()`."""
        query = ev["message"]
        rez = retriever_top_5.retrieve(query)
        return QuotesEvent(quotes=rez, query=query)

    @step
    def best(self, ev: QuotesEvent) -> RapEvent:
        """Rerank the candidates against the query and forward the best passage."""
        quotes = ev.quotes
        reranker = LLMRerank(
            # Rerank with the notebook's configured model, like the `rap` step,
            # instead of the library's default LLM.
            llm=llm,
            choice_batch_size=5,
            top_n=5,
        )
        query_bundle = QueryBundle(ev.query)
        retrieved_nodes = reranker.postprocess_nodes(quotes, query_bundle)
        # The reranker can keep no candidate at all and return an empty list;
        # fall back to the top retrieved hit rather than raising IndexError.
        best_node = retrieved_nodes[0] if retrieved_nodes else quotes[0]
        return RapEvent(quote=best_node.text)

    @step
    def rap(self, ev: RapEvent) -> StopEvent:
        """Ask the LLM for a rap based on the quote and finish with its completion."""
        answer = llm.complete(f"Make a rap on Paul Graham based on this quote {ev.quote}")
        return StopEvent(result=answer)



In [52]:
# Instantiate the retrieve / rerank / rap workflow.
w = RerankedGrahamRapWorkflow()

# Run it with the query passed as `message`; the first step reads ev["message"].
r = await w.run(message="Complex computer machines")

# `r` is the value the final step placed in StopEvent(result=...); print its text.
print(r.text)

(Verse 1)  
Yo, let me take you back to the roots of the game,  
Paul Graham in the house, you know the name,  
Talkin' 'bout Lisp, a model so divine,  
An alternative to Turing, yeah, it’s one of a kind.  
John McCarthy, the genius, he laid down the law,  
Discovered a language, left us all in awe.  
But it took a grad student, Steve Russell on the scene,  
Translated that vision, made it fit for machines.  

(Chorus)  
Lisp, Lisp, the elegance flows,  
From computation’s heart, that’s how it goes.  
Predefined operators, minimal yet grand,  
In the world of coding, it took a stand.  

(Verse 2)  
In sixty, McCarthy dropped the first iteration,  
Just interpreting expressions, no complication.  
But it was missing features, like a puzzle unsolved,  
Had to add the pieces, let the mystery evolve.  
Axiomatic roots, but the branches grew wide,  
With every new addition, Lisp took a ride.  
Power and elegance, unmatched in the race,  
In college, Paul felt it, but couldn’t see the face. 