In [1]:
import os
from dotenv import load_dotenv
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings
from langchain_groq import ChatGroq
from langchain_ollama import ChatOllama
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import PromptTemplate
from langchain_community.document_loaders import PDFMinerLoader
from langchain_community.document_loaders import PyPDFLoader
from langgraph.checkpoint.memory import MemorySaver
from langgraph.graph import START, MessagesState, StateGraph
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import MessagesPlaceholder
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_community.document_loaders.csv_loader import CSVLoader

import asyncio

# Load variables from a local .env file into the process environment;
# the Groq API key is later read with os.getenv("GROQ_API_KEY").
load_dotenv()

USER_AGENT environment variable not set, consider setting it to identify your requests.


True

In [None]:
async def parse_pdf(path, chunk_size=500, chunk_overlap=100):
    """Load one PDF and split its pages into overlapping text chunks.

    Args:
        path: Location of the PDF file to load.
        chunk_size: Maximum size of each chunk handed to the splitter.
            Defaults to 500, the value previously hard-coded here.
        chunk_overlap: Overlap between consecutive chunks. Defaults to 100,
            the value previously hard-coded here.

    Returns:
        The list of chunk documents produced by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    print(path)
    loader = PyPDFLoader(path)
    # Drain the loader's async page iterator into a list.
    pages = [page async for page in loader.alazy_load()]
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        add_start_index=True,
    )
    return text_splitter.split_documents(pages)


async def parse_txt(path, chunk_size=500, chunk_overlap=100):
    """Sanitise a text file in place, load it with ``CSVLoader`` and chunk it.

    NOTE: this function OVERWRITES the file at ``path``. The content is read
    with ``errors="ignore"`` (undecodable bytes are dropped) and written back
    before loading, so the original bytes are not preserved.

    Although declared ``async``, the body contains no ``await``; all file and
    loader work runs synchronously when the coroutine is awaited.

    Args:
        path: Location of the file to clean and load.
        chunk_size: Maximum size of each chunk handed to the splitter.
            Defaults to 500, the value previously hard-coded here.
        chunk_overlap: Overlap between consecutive chunks. Defaults to 100,
            the value previously hard-coded here.

    Returns:
        The list of chunk documents produced by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    print(path)
    # Lenient read: characters that cannot be decoded are silently skipped.
    with open(path, "r", errors="ignore") as file:
        text = file.read()
    # Write the cleaned text back so the loader below sees decodable content.
    with open(path, "w") as file:
        file.write(text)
    loader = CSVLoader(path)
    data = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        add_start_index=True,
    )
    return text_splitter.split_documents(data)


async def parse_external_data():
    """Build the Chroma vector store from the PDFs found in ``data_src``.

    Every ``*.pdf`` directly inside ``data_src`` is parsed with ``parse_pdf``
    (the coroutines are gathered together), the resulting chunks are flattened
    into one list and added to a Chroma collection named
    ``data_center_collection``.

    Returns:
        The ``Chroma`` vector store. It is returned without any documents
        added when no PDF produced a chunk.
    """
    vector_store = Chroma(
        collection_name="data_center_collection",
        embedding_function=HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2"),
    )
    file_dir = "data_src"
    # List the directory once; sort so the ingest order is the same on every run.
    file_names = sorted(os.listdir(file_dir))
    pdf_paths = [os.path.join(file_dir, name) for name in file_names if name.endswith(".pdf")]
    # TODO: ingestion of the ".txt" sources through parse_txt is currently disabled.

    print(f"starting to parse {len(pdf_paths)} pdfs...")
    parsed_docs = await asyncio.gather(*(parse_pdf(path) for path in pdf_paths))
    docs = [page for pages in parsed_docs for page in pages]
    print("finished parsing, starting to add to vector store...")
    # Skip the insert for an empty batch; the store is expected to reject an
    # empty list of documents (verify against the installed Chroma version).
    if docs:
        vector_store.add_documents(docs)
    else:
        print("no documents were parsed; vector store left empty")

    return vector_store


# Build the vector store that the retriever cell below reads from.
# Top-level `await` is used here, so this cell needs a runtime that allows it
# (e.g. a notebook kernel); it is not valid in a plain Python script.
vector_store = await parse_external_data()

  from tqdm.autonotebook import tqdm, trange


starting to parse 15 pdfs...
src/2021-carbon-free-energy-data-centers.pdf
src/2024-data-centre-industry-outlook.pdf
src/ADL_Green_data_centers_2023.pdf
src/bnef-eaton-statkraft-data-center-study-en-us.pdf
src/CalData_Californias_Data_Strategy_2020.pdf
src/CEC-400-2022-010_CMF.pdf
src/CyrusOne-2024-Sustainability-Report.pdf
src/datacenters_roadmap_final_0.pdf
src/Datacenter_Sustainability_Strategy_Brief.pdf
src/decarbonizing-the-data-center-industry-siemens-2021.pdf
src/Equinix-Inc_2023-Sustainability-Report-3.pdf
src/jll-us-data-center-report-h1-2024.pdf
src/North America Data Center Trends H1 2024.pdf
src/Report_Digital_Realty_2406_2023_ESG_Report.pdf
src/UI Field report 152_Annual survey - Supply view.pdf


Ignoring wrong pointing object 23 0 (offset 0)
Ignoring wrong pointing object 35 0 (offset 0)
Ignoring wrong pointing object 123 0 (offset 0)
Ignoring wrong pointing object 125 0 (offset 0)
Ignoring wrong pointing object 127 0 (offset 0)
Ignoring wrong pointing object 206 0 (offset 0)
Ignoring wrong pointing object 275 0 (offset 0)
Ignoring wrong pointing object 297 0 (offset 0)


finished parsing, starting to add to vector store...


In [11]:
# Retrieval: similarity search over the vector store, 10 hits per query.
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 10},
)

# Generation: Groq-hosted chat model; the API key comes from the environment.
llm = ChatGroq(
    api_key=os.getenv("GROQ_API_KEY"),
    model="llama-3.2-1b-preview",
)

# Prompt engineering, step 1: rewrite a follow-up question into a standalone
# one so retrieval does not depend on earlier turns of the conversation.
contextualize_q_system_prompt = " ".join(
    [
        "Given a chat history and the latest user question",
        "which might reference context in the chat history,",
        "formulate a standalone question which can be understood",
        "without the chat history. Do NOT answer the question,",
        "just reformulate it if needed and otherwise return it as is.",
    ]
)

contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

# Retriever that first reformulates the question using the chat history.
history_aware_retriever = create_history_aware_retriever(
    llm=llm,
    retriever=retriever,
    prompt=contextualize_q_prompt,
)
# System prompt for the answering step. `{context}` is filled with the
# retrieved documents by the stuff-documents chain.
# Each sentence ends with a space so adjacent literals do not run together.
system_prompt = (
    "You are an consultant for question-answering tasks for building datacenters. "
    "Use the following pieces of retrieved context to answer the question. "
    "Specify the resource you used in your answer. "
    "If you don't know the answer, say that you don't know. "
    "Consider different aspects OF building datacenters, including but not limited to: scope 1,2,3 emissions, water and land usage. "
    "For each question, you can provide multiple aspects related to the original question so the user can dive deeper into the topic. "
    "List out specific companies, technologies, or strategies that are mentioned in the retrieved context."
    "\n\n"
    "{context}"
)

# Answer prompt: system instructions, prior turns, then the user's question.
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

# Combine: history-aware retrieval feeds the document-stuffing QA chain.
question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)
rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

In [23]:
from typing import Sequence

from langchain_core.messages import AIMessage, BaseMessage, HumanMessage
from langgraph.checkpoint.memory import MemorySaver
from langgraph.graph import START, StateGraph
from langgraph.graph.message import add_messages
from typing_extensions import Annotated, TypedDict


# We define a dict representing the state of the application.
# This state has the same input and output keys as `rag_chain`.
class State(TypedDict):
    """Graph state passed to and updated by the `call_model` node."""

    # The user's question for the current turn.
    input: str
    # Conversation so far; the `add_messages` annotation is the reducer applied
    # when a node returns new messages for this key.
    chat_history: Annotated[Sequence[BaseMessage], add_messages]
    # Holds `response["context"]` from `rag_chain`.
    # NOTE(review): annotated as `str`, but the retrieval chain presumably
    # returns a list of documents here; confirm and adjust the annotation.
    context: str
    # Holds `response["answer"]` from `rag_chain`.
    answer: str


# The graph's single node: run `rag_chain` on the current state and report
# back the pieces of state that changed.
def call_model(state: State):
    """Invoke `rag_chain` with the graph state and return the state update.

    The update carries the new human/AI message pair for `chat_history`
    together with the chain's `context` and `answer` values.
    """
    rag_output = rag_chain.invoke(state)
    answer_text = rag_output["answer"]
    new_turn = [HumanMessage(state["input"]), AIMessage(answer_text)]
    return {
        "chat_history": new_turn,
        "context": rag_output["context"],
        "answer": answer_text,
    }


# Single-node graph: START -> "model".
workflow = StateGraph(state_schema=State)
workflow.add_node("model", call_model)
workflow.add_edge(START, "model")

# Compile with an in-memory checkpointer so state is persisted between
# invocations of the compiled app.
memory = MemorySaver()
app = workflow.compile(checkpointer=memory)

In [24]:
from langchain import LLMChain, OpenAI
from langchain.prompts import PromptTemplate
from fastapi import FastAPI

# NOTE: the application object is named `fastapi`; the launch cell refers to
# it by this name, so it is kept as is.
fastapi = FastAPI()


@fastapi.get("/ask")
def ask_meaning(question: str = "What is Task Decomposition?", thread_id: str = "abc123"):
    """Answer a question through the compiled RAG graph.

    Both arguments are optional query parameters; calling ``/ask`` without
    them behaves exactly as before (same question, same conversation thread).

    Args:
        question: The user question forwarded to the graph as ``input``.
        thread_id: Conversation identifier handed to the checkpointer, so
            requests sharing an id share chat history.

    Returns:
        A dict of the form ``{"response": <answer text>}``.
    """
    config = {"configurable": {"thread_id": thread_id}}
    result = app.invoke({"input": question}, config=config)
    return {"response": result["answer"]}

In [25]:
import asyncio
import uvicorn

if __name__ == "__main__":
    config = uvicorn.Config(fastapi)
    server = uvicorn.Server(config)
    await server.serve()

INFO:     Started server process [235762]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)


INFO:     127.0.0.1:60522 - "GET / HTTP/1.1" 404 Not Found
INFO:     127.0.0.1:60532 - "GET /ask HTTP/1.1" 200 OK


INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [235762]


In [13]:
# First turn of the conversation. The thread id selects which checkpointed
# history the graph reads from and writes to.
config = {"configurable": {"thread_id": "abc123"}}
result = app.invoke({"input": "What is Task Decomposition?"}, config=config)
print(result["answer"])

Task decomposition is a method of breaking down complex tasks into smaller, more manageable tasks that can be performed individually or in groups. It involves identifying the specific activities, steps, and processes required to complete a task or a project, and then dividing it into smaller tasks that can be executed separately.

Task decomposition is often used in project management, IT, and other fields to:

1. Break down complex tasks into smaller, more manageable ones
2. Reduce complexity and make tasks more understandable
3. Improve communication and collaboration among team members
4. Enhance efficiency and productivity
5. Reduce risk and errors

Task decomposition involves several steps:

1. Identify the task or project
2. Map out the task steps and activities
3. Identify the inputs, outputs, and deliverables
4. Determine the required resources and skills
5. Prioritize the tasks and create a project schedule

Task decomposition can be used for various purposes, such as:

1. Def

In [14]:
# Follow-up turn on the same thread: "it" can only be resolved through the
# chat history stored for the thread id in `config`.
result = app.invoke({"input": "What is one way of doing it?"}, config=config)
print(result["answer"])

One way to do task decomposition is the "Starfish Decomposition Method" or "Fishbone Diagram" (also known as Ishikawa Diagram). This method involves identifying the roots of a problem and decomposing them into smaller, separate areas of focus.

The method consists of a simple diagram with a root cause (the main problem or issue) at the center, surrounded by several branches or lines that represent different possible causes. Each branch or line represents a specific aspect of the problem or issue, and the tasks associated with each branch or line are the steps needed to resolve it.

Here's an example of how the Starfish Decomposition Method could be applied:

* **Root Cause**: Construction project delays
* **Branches**: Branch 1: Overly complex design
	+ Task 1: Review design plans
	+ Task 2: Check for regulatory compliance
	+ Task 3: Develop design revisions
* Branch 2: Supply chain issues
	+ Task 4: Negotiate with suppliers
	+ Task 5: Manage inventory levels
* Branch 3: Budget constra