In [1]:
import getpass
import os

# Make sure the Groq credential is present before any client is created.
# os.environ only accepts str values, so copying a missing variable's None
# into it would raise an opaque TypeError; prompt for the key instead so it
# is never hard-coded in the notebook.
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass.getpass("Enter API key for Groq: ")

from langchain.chat_models import init_chat_model

# Chat model shared by the summarization chains defined later in the notebook.
# NOTE(review): confirm that the "llama3-8b-8192" model id is still served by
# Groq before re-running; hosted model ids get retired over time.
llm = init_chat_model("llama3-8b-8192", model_provider="groq")

In [2]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter settings (the splitter counts characters by default).
CHUNK_SIZE = 100
CHUNK_OVERLAP = 20

# Read the raw results file into LangChain Document objects.
loader = TextLoader("data_results.txt")
docs = loader.load()

# Cut the loaded text into small, slightly overlapping chunks; each chunk
# becomes one entry in `documents`.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
documents = splitter.split_documents(docs)


In [3]:
import operator
from typing import List, Literal, TypedDict

from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableConfig
from langgraph.constants import Send
from langgraph.graph import END, START, StateGraph

# First pass: a single human message asking for a report on `{context}`.
summarize_prompt = ChatPromptTemplate(
    [("human", "Write a concise analytical report of the following: {context}")]
)

# prompt -> chat model -> plain string output
initial_summary_chain = summarize_prompt | llm | StrOutputParser()

# Refining the summary with new docs
# The template below has two placeholders: {existing_answer} (the summary
# produced so far) and {context} (the next piece of text to fold in). The
# text of the template is sent to the model verbatim, so edits to it change
# the model's instructions.
refine_template = """
Produce a final analytical report by capturing insights of the data 
the final report should contains the following sections:
1.descriptive analysis 
2.diagnostic  analysis
3.outlier analysis
4.what is the insights?
5.key takeaways
6.future recommendations to clients(note: in this section you have to guide client about their business not about further steps in analytics)
Existing summary up to this point:
{existing_answer}

New context:
------------
{context}
------------

Given the new context, refine the original summary.
"""
# Wrap the template as a single human message.
refine_prompt = ChatPromptTemplate([("human", refine_template)])

# prompt -> chat model -> plain string output
refine_summary_chain = refine_prompt | llm | StrOutputParser()


# Shared state that flows through every node of the graph.
class State(TypedDict):
    """Graph state for the iterative summarize-then-refine loop."""

    # Text of each document chunk, in processing order.
    contents: List[str]
    # Position in `contents` of the next chunk to process.
    index: int
    # Summary text produced so far.
    summary: str


# Graph node: produce the very first summary.
async def generate_initial_summary(state: State, config: RunnableConfig):
    """Summarize the first entry of ``state["contents"]``.

    Returns a state update holding the new summary and ``index`` set to 1,
    so the next node starts from the second entry.
    """
    first_chunk = state["contents"][0]
    report = await initial_summary_chain.ainvoke(first_chunk, config)
    return {"summary": report, "index": 1}


# Graph node: fold the next chunk into the running summary.
async def refine_summary(state: State, config: RunnableConfig):
    """Refine ``state["summary"]`` using the chunk at ``state["index"]``.

    Returns a state update with the refined summary and the index advanced
    by one.
    """
    position = state["index"]
    next_chunk = state["contents"][position]
    prompt_inputs = {"existing_answer": state["summary"], "context": next_chunk}
    refined = await refine_summary_chain.ainvoke(prompt_inputs, config)
    return {"summary": refined, "index": position + 1}


# Routing function: decide whether to run another refine step or stop.
def should_refine(state: State) -> Literal["refine_summary", END]:
    """Return ``END`` once the index reaches the cutoff, else ``"refine_summary"``.

    NOTE(review): the cutoff is ``len(contents) // 10``, so only roughly the
    first tenth of ``contents`` is ever summarized. Confirm this sampling is
    intentional rather than a leftover from batching chunks in groups of 10.
    """
    reached_cutoff = state["index"] >= len(state["contents"]) // 10
    return END if reached_cutoff else "refine_summary"


graph = StateGraph(State)

# Register the two worker nodes under their routing names.
for node_name, node_fn in (
    ("generate_initial_summary", generate_initial_summary),
    ("refine_summary", refine_summary),
):
    graph.add_node(node_name, node_fn)

# Every run begins with the initial summary.
graph.add_edge(START, "generate_initial_summary")

# After either node, should_refine picks the next hop (refine again or END).
for source_node in ("generate_initial_summary", "refine_summary"):
    graph.add_conditional_edges(source_node, should_refine)

app = graph.compile()

In [4]:
# from IPython.display import Image

# Image(app.get_graph().draw_mermaid_png())

In [5]:
from langchain_core.runnables import RunnableConfig
# Recursion limit passed to the graph run through its config.
MAX_GRAPH_STEPS = 75
config = RunnableConfig(recursion_limit=MAX_GRAPH_STEPS)


In [6]:
# Number of chunks produced by the text splitter.
len(documents)

564

In [None]:
async for step in app.astream(
    {"contents": [do.page_content for doc in range(0, len(documents), 10) for do in documents[doc:doc+10]]
},
    stream_mode="values",
    config=config
):
    if summary := step.get("summary"):
        print(summary)

I apologize, but it seems that you haven't provided the data description. Please provide the description of the data, including the value, and I'll be happy to assist you in writing a concise analytical report.
**Final Analytical Report**

**Descriptive Analysis**

The data provided consists of the Innings1_Balls, which captures the number of balls faced by a batsman in the first innings of a cricket match. The summary statistics are as follows:

* Count: 48
* Mean: 291.3125
* Standard Deviation (STD): 20.351335624526655
* Minimum: 200.0

These statistics provide an overview of the distribution of Innings1_Balls. The mean of 291.3125 suggests that the average number of balls faced by a batsman in the first innings is approximately 291. The standard deviation of 20.351335624526655 indicates that the majority of the data points are clustered around the mean, with a smaller number of data points deviating significantly from the mean.

**Diagnostic Analysis**

A diagnostic analysis involve

In [None]:
from pprint import pprint

# `step` is the loop variable left over from the streaming cell, i.e. the
# last state that was streamed.
final_summary = step['summary']
pprint(final_summary)

('**Final Analytical Report: Customer Shopping Data (Refined)**\n'
 '\n'
 '**Descriptive Analysis**\n'
 '\n'
 'The refined descriptive analysis of the Customer Shopping Data reveals the '
 'following key findings:\n'
 '\n'
 '* The dataset consists of 12,000 customer transactions, with a total revenue '
 'of $1,800,000, showing a 20% increase in revenue compared to the previous '
 'report.\n'
 '* The top 3 product categories remain Electronics (35% of total revenue), '
 "Clothing (25%), and Home Goods (20%), with a 5% increase in Home Goods' "
 'revenue and a 3% decrease in Clothing revenue.\n'
 '* The average order value remains at $150, with a median order value of '
 '$100, indicating a consistent customer spending pattern.\n'
 '* The majority of customers (70%) still make a single purchase, while 25% '
 'make multiple purchases, showing a 10% increase in repeat business.\n'
 '* Feature correlations reveal that customers who purchase Electronics are '
 'more likely to purchase Home G

In [None]:
# pprint(step)