# PROBLEM STATEMENT
The primary objective of this work is to create a RAG pipeline over the `101 alpha formulaic book` to extract alpha number, expression and explanation. Utimately, this will serve as a starting point for creating an end-to-end alpha agent that will discover alpha signals over a knowledge base, create its python implementation, perform backtesting and trade live if expected performance pass a satisfactory threshold.

Here we will be using GPT4o so as to better reduce hallucination and more control over the structure of the responses from the model.

In [3]:
#!pip install transformers gpt4all sentence-transformers langchain langchain-core langchain-community langchain-openai faiss-cpu pypdf

In [2]:
import os

# Optional, add tracing in LangSmith
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = "Alpha agent project"

In [3]:
from getpass import getpass

def _set_if_undefined(var: str):
    if not os.environ.get(var):
        os.environ[var] = getpass(f"Pass in {var}")

_set_if_undefined("LANGCHAIN_API_KEY")
_set_if_undefined("OPENAI_API_KEY")

Pass in LANGCHAIN_API_KEY ········
Pass in OPENAI_API_KEY ········


## Ground an alpha retriever to document provided

The goal of this work is to generate alpha expression and explanation based on the document provided. If response not relevant to document, we return nothing.

Based on the workflow above, the following steps are followed:
1. Based on a given `query`, we generate relevant documents.
2. We then use a `binary score` to check whether each retrieved document is relevant or not.
3. __If document is relevant__, we generate an answer. otherwise, we `re-write` the `query`.
4. For the answer generated in 3, we check if the answer is grounded to the document(to reduce hallucination).
5. __If no hallucination__, we then check if `answer` is relevant to the question. otherwise `repeat step 3`.
6. __If answer is relevant to question__, return answer. otherwise, `repeat 1`.

In [2]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_community.vectorstores import FAISS
import math

loader = PyPDFLoader("../101-Alpha-Formula.pdf")
documents = loader.load()

splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=0)
doc_splits = splitter.split_documents(documents)

# model_id = 'sentence-transformers/all-MiniLM-L6-v2'
# model_kwargs = {'device': 'cpu'}
# hf_embeddings = HuggingFaceEmbeddings(
#     model_name=model_id,
#     model_kwargs=model_kwargs
# )

# Add to vectorDB
vectorstore = FAISS.from_documents(
    documents=doc_splits,
    embedding=GPT4AllEmbeddings(),
)
number_of_relevant_docs = math.ceil(0.8 * len(doc_splits))
retriever = vectorstore.as_retriever(search_kwargs={'k': number_of_relevant_docs})

KeyError: 'model_name'

In [65]:
# Testing query to ensure no duplicate documents returned by retriever's get_relevant_documents method

ALPHA_EXPRESSION_EXAMPLE = """
(rank((open - (sum(vwap, 10) / 10))) * (-1 * abs(rank((close - vwap))))) \n
((rank(Log(product(rank((rank(correlation(vwap, sum(adv10, 49.6054),
8.47743))^4)), 14.9655))) < rank(correlation(rank(vwap), rank(volume), 5.07914))) * -1) \n
(max(rank(decay_linear(delta(((close * 0.369701) + (vwap * (1 - 0.369701))),
1.91233), 2.65461)), Ts_Rank(decay_linear(abs(correlation(IndNeutralize(adv81,
IndClass.industry), close, 13.4132)), 4.89768), 14.4535)) * -1) \n
"""

STRATEGY = "flow of funds strategy"
query = f"""Generate alpha information(s) based on a strategy \n
You are to respond with alpha information(alpha number, alpha expression and alpha explanation). \n

Here is the strategy: \n
{STRATEGY}

Here is an example of alpha expression: \n
{ALPHA_EXPRESSION_EXAMPLE}
"""


In [66]:
docs = retriever.get_relevant_documents(query)

In [67]:
len(docs)

18

In [68]:
def remove_duplicates(documents):
    seen = set()
    unique_docs = []
    for doc in documents:
        if doc.page_content not in seen:
            unique_docs.append(doc)
            seen.add(doc.page_content)
    return unique_docs

# Use the remove_duplicates function to remove duplicates from the retrieved documents
unique_docs = remove_duplicates(docs)
len(unique_docs)

18

In [69]:
docs[0]

Document(page_content='12 \n Alpha#61:  (rank((vwap - ts_min(vwap, 16.1219))) < rank(corre lation(vwap, adv180, 17.9282))) \nAlpha#62:  ((rank(correlation(vwap, sum(adv20, 22.4101), 9.91 009)) < rank(((rank(open) + \nrank(open)) < (rank(((high + low) / 2)) + rank(high ))))) * -1) \nAlpha#63:  ((rank(decay_linear(delta(IndNeutralize(close, Ind Class.industry), 2.25164), 8.22237)) \n- rank(decay_linear(correlation(((vwap * 0.318108) + (open * (1 - 0.318108))), sum(adv180, \n37.2467), 13.557), 12.2883))) * -1) \nAlpha#64:  ((rank(correlation(sum(((open * 0.178404) + (low *  (1 - 0.178404))), 12.7054), \nsum(adv120, 12.7054), 16.6208)) < rank(delta(((((hi gh + low) / 2) * 0.178404) + (vwap * (1 - \n0.178404))), 3.69741))) * -1) \nAlpha#65:  ((rank(correlation(((open * 0.00817205) + (vwap * (1 - 0.00817205))), sum(adv60, \n8.6911), 6.40374)) < rank((open - ts_min(open, 13.6 35)))) * -1) \nAlpha#66:  ((rank(decay_linear(delta(vwap, 3.51013), 7.23052) ) + Ts_Rank(decay_linear(((((low \n* 0.96

In [70]:
docs[1]

Document(page_content='14 \n Alpha#87:  (max(rank(decay_linear(delta(((close * 0.369701) +  (vwap * (1 - 0.369701))), \n1.91233), 2.65461)), Ts_Rank(decay_linear(abs(corre lation(IndNeutralize(adv81, \nIndClass.industry), close, 13.4132)), 4.89768), 14. 4535)) * -1) \nAlpha#88:  min(rank(decay_linear(((rank(open) + rank(low)) - (rank(high) + rank(close))), \n8.06882)), Ts_Rank(decay_linear(correlation(Ts_Rank (close, 8.44728), Ts_Rank(adv60, \n20.6966), 8.01266), 6.65053), 2.61957)) \nAlpha#89:  (Ts_Rank(decay_linear(correlation(((low * 0.967285 ) + (low * (1 - 0.967285))), adv10, \n6.94279), 5.51607), 3.79744) - Ts_Rank(decay_linear (delta(IndNeutralize(vwap, \nIndClass.industry), 3.48158), 10.1466), 15.3012)) \nAlpha#90:  ((rank((close - ts_max(close, 4.66719)))^Ts_Rank(c orrelation(IndNeutralize(adv40, \nIndClass.subindustry), low, 5.38375), 3.21856)) * - 1) \nAlpha#91:  ((Ts_Rank(decay_linear(decay_linear(correlation(In dNeutralize(close, \nIndClass.industry), volume, 9.74928), 16.

In [71]:
docs[2]

Document(page_content='13 \n Alpha#74:  ((rank(correlation(close, sum(adv30, 37.4843), 15. 1365)) < \nrank(correlation(rank(((high * 0.0261661) + (vwap *  (1 - 0.0261661)))), rank(volume), 11.4791))) \n* -1) \nAlpha#75:  (rank(correlation(vwap, volume, 4.24304)) < rank(c orrelation(rank(low), rank(adv50), \n12.4413))) \nAlpha#76:  (max(rank(decay_linear(delta(vwap, 1.24383), 11.82 59)), \nTs_Rank(decay_linear(Ts_Rank(correlation(IndNeutral ize(low, IndClass.sector), adv81, \n8.14941), 19.569), 17.1543), 19.383)) * -1) \nAlpha#77:  min(rank(decay_linear(((((high + low) / 2) + high)  - (vwap + high)), 20.0451)), \nrank(decay_linear(correlation(((high + low) / 2), a dv40, 3.1614), 5.64125))) \nAlpha#78:  (rank(correlation(sum(((low * 0.352233) + (vwap * (1 - 0.352233))), 19.7428), \nsum(adv40, 19.7428), 6.83313))^rank(correlation(ran k(vwap), rank(volume), 5.77492))) \nAlpha#79:  (rank(delta(IndNeutralize(((close * 0.60733) + (op en * (1 - 0.60733))), \nIndClass.sector), 1.23438)) < rank(

## Create a graph state

Our state will be a `dict`, We can access through any graph `node` as `state[keys]`.

In [80]:
from typing import TypedDict, Dict

class AlphaRagState(TypedDict):
    """
    Represents the graph state of the alpha agent

    Attributes:
        keys: A dictionary where 
    """
    keys: Dict[str, any]

In [81]:
openai_llm = "gpt-4o"

## Define nodes for our workflow
Create the following nodes:
1. retrieve
2. grade_documents
3. generate
4. transform_query
5. prepare_for_final_grade

In [88]:
# retrieve node gets relevant documents based on a user query
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.output_parsers.openai_tools import PydanticToolsParser
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.utils.function_calling import convert_to_openai_tool
from langchain_core.runnables import RunnableLambda
from langchain_openai import ChatOpenAI
from typing import Optional, List

# Post-processing
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

def retrieve(state):
    """
    Retrieves relevant documents

    Args: 
        state (dict): current graph state

    Returns:
        state (dict): New key added to the state, documents, contains relevant documents to question 
    """
    print("-- RETRIEVING DOCUMENTS --")
    print("State: ", state)
    state_dict = state["keys"]
    question = state_dict["question"]
    relevant_docs = retriever.get_relevant_documents(question)

    return {"keys": { "documents": relevant_docs, "question": question}}
    
# grade_documents node
def grade_documents(state):
    """
    Determines whether the retrieved documents are relevant to the question.

    Args:
        state (dict): The current graph state

    Returns:
        state (dict): Updates documents key with relevant documents
    """

    print("---CHECKING RELEVANCE---")
    state_dict = state["keys"]
    question = state_dict["question"]
    documents = state_dict["documents"]

    # Data model
    class grade(BaseModel):
        """Binary score to check the relevance of a document to a question"""
        binary_score: str = Field(description="Is document relevant to a question, 'yes' or 'no'. ")

    # LLM
    llm = ChatOpenAI(temperature=0, model=openai_llm, streaming=True)
    
    # Tool
    grade_tool_oai = convert_to_openai_tool(grade)

    # LLM with tool and enforce invocation
    llm_with_tool = llm.bind(
        tools=[grade_tool_oai],
        tool_choice={"type": "function", "function": {"name": "grade"}}
    )

    # Parser
    parser_tool = PydanticToolsParser(tools=[grade])
    
    # Prompt
    prompt = PromptTemplate(
        template="""You are a grader assessing relevance of a retrieved document to a user question. \n 
        Here is the retrieved document: \n\n {context} \n\n
        Here is the user question: {question} \n
        If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant. \n
        Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question.""",
        input_variables=["context", "question"],
    )

    # Chain
    chain = prompt | llm_with_tool | parser_tool

    # Score
    filtered_docs = []
    for d in documents:
        score = chain.invoke({"question": question, "context": d.page_content})
        print("Score for current document: ", score)

        grade = score[0].binary_score
        if grade == "yes":
            print("---GRADE: DOCUMENT RELEVANT---")
            filtered_docs.append(d)
        else:
            print("---GRADE: DOCUMENT NOT RELEVANT---")
            continue

    return {"keys": {"documents": filtered_docs, "question": question}}
            
# generate node for generating answer to questions
def generate(state):
    """
    Generate answer

    Args:
        state (dict): The current graph state

    Returns:
        state (dict): New key added to state, generation, that contains LLM generation
    """
    print("---GENERATE---")
    state_dict = state["keys"]
    question = state_dict["question"]
    documents = state_dict["documents"]

    # ALPHA_EXPRESSION_EXAMPLE = """
    # (-1 * delta((((close - low) - (high - close)) / (close - low)), 9))
    # """
    class alpha_info(BaseModel):
        """This contains essential alpha information in the retrieved documents"""
        alpha_number: str = Field(description="This is the label of alpha expression in the retrieved documents. For example: Alpha#1, Alpha#2, ...., Alpha#101")
        alpha_expression: str = Field(description="This contains alpha expression in retrieved documents.")
        alpha_explanation: str = Field(description="Explanation of the alpha expression retrieved. Use your internal knowledge")

    class alpha_response(BaseModel):
        """This model can either be None or a list of alpha_info instances"""
        alpha_infos: Optional[List[alpha_info]] = None
    
    # Prompt
    prompt = PromptTemplate(
    template= """You are an assistant for question-answering tasks. \n
    Use the retrieved documents to answer the question. If you don't know the answer, just say that you don't know. \n
     
    Here are the retrieved documents: 
    {context}
    
    Here is the user question: 
    {question}
    """,
    input_variables=["question", "context"],
    )

    # LLM
    llm = ChatOpenAI(temperature=0, model=openai_llm, streaming=True)

    # Tool
    grade_tool_oai = convert_to_openai_tool(alpha_response)

    # LLM with tool and enforce invocation
    llm_with_tool = llm.bind(
        tools=[grade_tool_oai],
        tool_choice={"type": "function", "function": {"name": "alpha_response"}}
    )

    # Parser
    parser_tool = PydanticToolsParser(tools=[alpha_response])

    formatted_docs = format_docs(documents)

    # Chain
    rag_chain = prompt | llm_with_tool | parser_tool

    # Run
    generation = rag_chain.invoke({"context":formatted_docs, "question": question})
    
    print("Generated response to alpha query: ", generation)
    return {
        "keys": {"documents": documents, "question": question, "generation": generation}
    }

# transform query node, to regenerate query based on 
def transform_query(state):
    """
    Transform the query to produce a better question.

    Args:
        state (dict): The current graph state

    Returns:
        state (dict): Updates question key with a re-phrased question
    """

    print("---TRANSFORMING QUERY---")
    state_dict = state["keys"]
    question = state_dict["question"]
    documents = state_dict["documents"]

    # Create a prompt template with format instructions and the query
    prompt = PromptTemplate(
        template="""You are generating alpha trading strategy-relation question / query that is well optimized for retrieval. \n 
        Look at the input and try to reason about the underlying sematic intent / meaning. \n 
        Here is the initial question:
        \n ------- \n
        {question} 
        \n ------- \n
        Formulate an improved alpha trading strategy question: """,
        input_variables=["question"],
    )

    # LLM
    llm = ChatOpenAI(temperature=0, model=openai_llm, streaming=True)

    # Prompt
    chain = prompt | llm | StrOutputParser()
    better_question = chain.invoke({"question": question})

    return {"keys": {"documents": documents, "question": better_question}}

# prepare for final grade, 
def prepare_for_final_grade(state):
    """
    Passthrough state for final grade.

    Args:
        state (dict): The current graph state

    Returns:
        state (dict): The current graph state
    """

    print("---FINAL GRADE---")
    state_dict = state["keys"]
    question = state_dict["question"]
    documents = state_dict["documents"]
    generation = state_dict["generation"]

    return {
        "keys": {"documents": documents, "question": question, "generation": generation}
    }


## Define edges for our workflow
Create the following edges:
1. decide_to_generate
2. decide_generation_is_grounded_in_documents
3. decide_generation_addresses_question

In [89]:
def decide_to_generate(state):
    """
    Determines whether to generate an answer, or re-generate a question.

    Args:
        state (dict): The current state of the agent, including all keys.

    Returns:
        str: Next node to call
    """

    print("---DECIDE TO GENERATE OR TRANSFORM QUERY---")
    state_dict = state["keys"]
    question = state_dict["question"]
    filtered_documents = state_dict["documents"]

    if not filtered_documents:
        # All documents have been filtered check_relevance
        # We will re-generate a new query
        print("---DECISION: TRANSFORM QUERY---")
        return "transform_query"
    else:
        # We have relevant documents, so generate answer
        print("---DECISION: GENERATE---")
        return "generate"


def decide_generation_is_grounded_in_documents(state):
    """
    Determines whether the generation is grounded in the document.

    Args:
        state (dict): The current state of the agent, including all keys.

    Returns:
        str: Binary decision
    """

    print("---GRADE GENERATION based on DOCUMENTS---")
    state_dict = state["keys"]
    question = state_dict["question"]
    documents = state_dict["documents"]
    generation = state_dict["generation"]

    # Data model
    class grade(BaseModel):
        """Binary score to check that answer is relevant to the documents."""

        binary_score: str = Field(description="Supported score 'yes' or 'no'")

    # LLM
    llm = ChatOpenAI(temperature=0, model=openai_llm, streaming=True)

    # Tool
    grade_tool_oai = convert_to_openai_tool(grade)

    # LLM with tool and enforce invocation
    llm_with_tool = llm.bind(
        tools=[grade_tool_oai],
        tool_choice={"type": "function", "function": {"name": "grade"}},
    )

    # Parser
    parser_tool = PydanticToolsParser(tools=[grade])

    # Prompt
    prompt = PromptTemplate(
        template="""You are a grader assessing whether an answer is grounded in / supported by a set of facts. \n 
        Here are the facts:
        \n ------- \n
        {context} 
        \n ------- \n
        Here is the answer: {generation}
        Give a binary score 'yes' or 'no' to indicate whether the answer is grounded in / supported by a set of facts.""",
        input_variables=["generation", "context"],
    )

    # Chain
    chain = prompt | llm_with_tool | parser_tool

    formatted_docs = format_docs(docs)

    score = chain.invoke({"generation": generation, "context": formatted_docs})
    grade = score[0].binary_score

    if grade == "yes":
        print("---DECISION: SUPPORTED, MOVE TO FINAL GRADE---")
        return "grounded"
    else:
        print("---DECISION: NOT SUPPORTED, GENERATE AGAIN---")
        return "not grounded"


def decide_generation_addresses_question(state):
    """
    Determines whether the generation addresses the question.

    Args:
        state (dict): The current state of the agent, including all keys.

    Returns:
        str: Binary decision
    """

    print("---GRADE GENERATION vs QUESTION---")
    state_dict = state["keys"]
    question = state_dict["question"]
    documents = state_dict["documents"]
    generation = state_dict["generation"]

    # Data model
    class grade(BaseModel):
        """Binary score to check relevance of answer to question."""

        binary_score: str = Field(description="Useful score 'yes' or 'no'")

    # LLM
    llm = ChatOpenAI(temperature=0, model=openai_llm, streaming=True)

    # Tool
    grade_tool_oai = convert_to_openai_tool(grade)

    # LLM with tool and enforce invocation
    llm_with_tool = llm.bind(
        tools=[grade_tool_oai],
        tool_choice={"type": "function", "function": {"name": "grade"}},
    )

    # Parser
    parser_tool = PydanticToolsParser(tools=[grade])

    # Prompt
    prompt = PromptTemplate(
        template="""You are a grader assessing whether an answer is useful to resolve a question. \n 
        Here is the answer:
        \n ------- \n
        {generation} 
        \n ------- \n
        Here is the question: {question}
        Give a binary score 'yes' or 'no' to indicate whether the answer is useful to resolve a question.""",
        input_variables=["generation", "question"],
    )

    # Prompt
    chain = prompt | llm_with_tool | parser_tool
    binary_result = answer_grader.invoke({"question": question,"generation": generation})
    grade = binary_result["score"]
    
    if grade == "yes":
        print("---DECISION: USEFUL---")
        return "useful"
    else:
        print("---DECISION: NOT USEFUL---")
        return "not useful"

## Build graph

In [90]:
import pprint

from langgraph.graph import END, StateGraph

workflow = StateGraph(AlphaRagState)

# Define the nodes
workflow.add_node("retrieve", retrieve)  # retrieve
workflow.add_node("grade_documents", grade_documents)  # grade documents
workflow.add_node("generate", generate)  # generatae
workflow.add_node("transform_query", transform_query)  # transform_query
workflow.add_node("prepare_for_final_grade", prepare_for_final_grade)  # passthrough

# Build graph
workflow.set_entry_point("retrieve")
workflow.add_edge("retrieve", "grade_documents")
workflow.add_conditional_edges(
    "grade_documents",
    decide_to_generate,
    {
        "transform_query": "transform_query",
        "generate": "generate",
    },
)
workflow.add_edge("transform_query", "retrieve")
workflow.add_conditional_edges(
    "generate",
    decide_generation_is_grounded_in_documents,
    {
        "grounded": "prepare_for_final_grade",
        "not grounded": "generate",
    },
)
workflow.add_conditional_edges(
    "prepare_for_final_grade",
    decide_generation_addresses_question,
    {
        "useful": END,
        "not useful": "transform_query",
    },
)

# Compile
app = workflow.compile()

In [None]:
# Test
import time

start = time.time()
from pprint import pprint

ALPHA_EXPRESSION_EXAMPLE = """
(rank((open - (sum(vwap, 10) / 10))) * (-1 * abs(rank((close - vwap))))) \n
((rank(Log(product(rank((rank(correlation(vwap, sum(adv10, 49.6054),
8.47743))^4)), 14.9655))) < rank(correlation(rank(vwap), rank(volume), 5.07914))) * -1) \n
(max(rank(decay_linear(delta(((close * 0.369701) + (vwap * (1 - 0.369701))),
1.91233), 2.65461)), Ts_Rank(decay_linear(abs(correlation(IndNeutralize(adv81,
IndClass.industry), close, 13.4132)), 4.89768), 14.4535)) * -1) \n
"""

STRATEGY = "Momentum"
query = f"""Generate alpha information(s) based on a strategy \n
You are to respond with alpha information(alpha number, alpha expression and alpha explanation). \n

Here is the strategy: \n
{STRATEGY}

Here is an example of alpha expression: \n
{ALPHA_EXPRESSION_EXAMPLE}
"""

inputs = {"keys": {"question": query }}
for output in app.stream(inputs):
    for key, value in output.items():
        pprint(f"Finished running: {key}")

end = time.time()
print(f"Elapsed time: {end - start}s")

-- RETRIEVING DOCUMENTS --
State:  {'keys': {'question': 'Generate alpha information(s) based on a strategy \n\nYou are to respond with alpha information(alpha number, alpha expression and alpha explanation). \n\n\nHere is the strategy: \n\nflow of funds strategy\n\nHere is an example of alpha expression: \n\n\n(rank((open - (sum(vwap, 10) / 10))) * (-1 * abs(rank((close - vwap))))) \n\n((rank(Log(product(rank((rank(correlation(vwap, sum(adv10, 49.6054),\n8.47743))^4)), 14.9655))) < rank(correlation(rank(vwap), rank(volume), 5.07914))) * -1) \n\n(max(rank(decay_linear(delta(((close * 0.369701) + (vwap * (1 - 0.369701))),\n1.91233), 2.65461)), Ts_Rank(decay_linear(abs(correlation(IndNeutralize(adv81,\nIndClass.industry), close, 13.4132)), 4.89768), 14.4535)) * -1) \n\n\n'}}
'Finished running: retrieve'
---CHECKING RELEVANCE---
Score for current document:  [grade(binary_score='no')]
---GRADE: DOCUMENT NOT RELEVANT---
Score for current document:  [grade(binary_score='yes')]
---GRADE: DOCU