# In [None]:
from langchain_groq import ChatGroq
from langgraph.graph import StateGraph, MessagesState, START, END
from langchain_core.messages import HumanMessage
from smolagents import CodeAgent, LiteLLMModel, tool, ToolCallingAgent
from langchain.prompts import ChatPromptTemplate
from dotenv import load_dotenv
import smolagents
import os

# Read variables from a local .env file into the process environment so the
# GROQ_API_KEY lookup below can find the key.
load_dotenv()

# Groq-hosted chat model used to write the dataset summary; temperature 0
# keeps sampling as stable as the backend allows.
summarizer_agent = ChatGroq(
    temperature=0,
    model='qwen-qwq-32b',
    api_key=os.environ.get('GROQ_API_KEY'),
)


# Two-message chat template: fixed analyst instructions, then a human turn
# whose {loaded_data} slot receives the dataset profile text.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """You are a data analyst assistant. 
You will be given loaded data from a CSV or Excel file as a raw string (e.g., `df.head().to_string()` or summary statistics).

Your job is to analyze and summarize the structure and key characteristics of the data in a clear and structured format.

Include the following details in your response:
1. Number of rows and columns.
2. Column names and their data types.
3. Presence of any missing/null values and their distribution.
4. Basic descriptive statistics (e.g., mean, median, std for numerical columns).
5. Any detected categorical columns and their unique value counts.
6. General insights or anomalies (e.g., skewed columns, outliers, etc.)

Respond in a structured format using numbered points or markdown-style bullet points. Do NOT hallucinate; only use the information present in the data.
""",
        ),
        ("human", "{loaded_data}"),
    ]
)


# In [None]:
@tool
def load_data(file_path: str) -> str:
    """
    Loads a dataset and builds a plain-text profile of it for summarization.
    Supports CSV files and Excel workbooks (every sheet is profiled).

    Args:
        file_path (str): Path to the data file (.csv, .xlsx or .xls).

    Returns:
        str: Structured string containing dataset information for summarization.

    Raises:
        ValueError: If the file extension is not .csv, .xlsx or .xls.
    """
    # Imports stay inside the function so the tool remains self-contained.
    import os

    import pandas as pd

    def profile_frame(frame):
        """Return the profile lines shared by CSV files and Excel sheets."""
        lines = [
            f"🔸 Shape: {frame.shape[0]} rows × {frame.shape[1]} columns",
            f"\n🧠 Column Names & Data Types:\n{frame.dtypes.to_string()}",
            f"\n🔍 Data Preview:\n{frame.head().to_string(index=False)}",
            f"\n Missing Values:\n{frame.isnull().sum().to_string()}",
        ]

        if frame.shape[1] == 0:
            # describe() raises ValueError on a frame without columns, which
            # is what an empty worksheet produces.
            stats_text = "(no columns to describe)"
        else:
            try:
                # pandas 1.x: opt in to numeric-style datetime statistics.
                stats = frame.describe(include='all', datetime_is_numeric=True)
            except TypeError:
                # pandas >= 2.0 removed the keyword; the numeric treatment of
                # datetimes is now the default behavior.
                stats = frame.describe(include='all')
            stats_text = stats.to_string()

        lines.append(f"\n📊 Descriptive Statistics:\n{stats_text}")
        return lines

    summary_output = []
    file_ext = os.path.splitext(file_path)[-1].lower()

    if file_ext == '.csv':
        df = pd.read_csv(file_path)
        summary_output.append("🔹 File Type: CSV")
        summary_output.extend(profile_frame(df))

    elif file_ext in ('.xlsx', '.xls'):
        # The context manager closes the workbook handle even if a sheet
        # fails to parse.
        with pd.ExcelFile(file_path) as excel_file:
            summary_output.append(
                f"🔹 File Type: Excel with {len(excel_file.sheet_names)} sheet(s)"
            )
            for sheet in excel_file.sheet_names:
                df = pd.read_excel(excel_file, sheet_name=sheet)
                summary_output.append(f"\n\n📄 Sheet Name: {sheet}")
                summary_output.extend(profile_frame(df))

    else:
        raise ValueError("❌ Unsupported file format. Please upload a CSV or Excel (.xlsx/.xls) file.")

    return "\n".join(summary_output)

@tool
def get_user_inputs(reason_for_inputs: str) -> str:
    """
    Asks the user a question on the console and returns the typed answer.
    Args:
        reason_for_inputs (str): Reason for requesting user inputs.

    Returns:
        str: User input for the data analysis request.
    """
    # The reason text is shown as the prompt; input() blocks until answered.
    return input(f" {reason_for_inputs}: ")

@tool
def summarize_data(loaded_data: str) -> str:
    """
    Summarizes a dataset profile by sending it to the summarizer chat model.
    The profile is inserted into the module-level prompt template.

    Args:
        loaded_data (str): The loaded_data to be summarized.

    Returns:
        str: Summary of the data.
    """
    messages = prompt.format_messages(loaded_data=loaded_data)
    # Use invoke(): calling a LangChain chat model object directly is
    # deprecated in favor of the Runnable interface.
    response = summarizer_agent.invoke(messages)
    return response.content


# In [None]:

# Shared LLM for both agents, reached through Groq's OpenAI-compatible
# endpoint (hence the "openai/" LiteLLM provider prefix).
llm = LiteLLMModel(
    "openai/deepseek-r1-distill-llama-70b",
    api_base="https://api.groq.com/openai/v1",
    api_key=os.getenv('GROQ_API_KEY')
)
# Send each message's content as plain text instead of structured parts.
llm.flatten_messages_as_text = True

# Code-writing agent. Its description is what a managing agent sees when
# deciding whether to delegate to it.
coder_agent = CodeAgent(
    name='coder_agent',
    tools=[load_data, summarize_data],
    model=llm,
    # The tool names are written in backticks: this is a plain string, so the
    # former `{load_data}` placeholders reached the prompt as literal braces.
    description="""Your goal is to solve the user query by generating Python code.
    You may use Python libraries such as pandas, matplotlib, seaborn, plotly, scipy, etc., for tasks like:
    - data wrangling
    - descriptive statistics
    - visualization
    - transformations
    - reporting
    you can use this tools `load_data`, `summarize_data`. first load the data provided by the user and then make a summary using that summarize_data tool.
    The input to load_data will be the file path provided by the user.
    The input to summarize_data will be the output of load_data tool.
    Dont make any assumptions or hallucinate before getting the knowledge on data.
    after getting knowledge on data, then solve the user question by writing code.
    """,
    additional_authorized_imports=[
        "pandas",
        "matplotlib.pyplot",
        "seaborn",
        "openpyxl",
        # "stat" only holds os.stat() helpers; "statistics" is the stdlib
        # module useful for analysis. "stat" is kept for compatibility.
        "stat",
        "statistics",
        "scipy",
        "plotly",
        "numpy",
        # SECURITY NOTE(review): authorizing "os" gives LLM-generated code
        # filesystem and process access; drop it if not strictly needed.
        "os",
        "json",
        "re",
        "datetime",
        "plotly.express",
    ]
)

# Ad-hoc run of the coder agent alone; the workbook path is handed over as an
# extra argument named `data_path`.
coder_agent.run('Hey can you give me soe overall insights, use pd.ExcelFile()', additional_args={'data_path':'C:\\Users\\arun5\\Desktop\\Spend_analyzer\\src\\IT_spend_analysis_data.xlsx'})
# Planning instructions for the manager agent. Every tool named here must
# match a tool registered on the agent below.
manager_agent_prompt = """
You are a strategic planning agent responsible for coordinating intelligent data analysis workflows.

Your role is to:
1. Engage with the user to understand their data-related queries using the `get_user_inputs` tool.
2. Use the `load_data` tool to access and extract insights from the provided dataset path.
3. Utilize the `summarize_data` tool to generate a clear and structured summary of the dataset. This includes key stats, missing values, data types, and potential insights.
4. Based on the users intent and the data summary, you must craft a high-level plan with instructions for the `coder_agent`. This plan should:
   - Clearly define the task or analysis to be performed (e.g., filtering, visualization, statistical testing, predictions).
   - Specify which columns or patterns the coder should focus on.
   - Indicate any relevant tools or libraries to use (e.g., pandas, matplotlib, seaborn, scipy).
   - Outline expected outputs (e.g., cleaned dataset, chart, report).

💡 Rules:
- Think in a step-by-step, logical manner.
- Never assume anything not backed by user input or dataset summary.
- Always summarize before planning.
- Prioritize clarity, minimalism, and traceability.

Example Flow:
1. Ask user: “What insight or output are you hoping to derive from this dataset?”
2. Load the dataset using the given path.
3. Summarize the data to understand its structure and quality.
4. If columns like `department`, `cost_center`, `amount`, `date` exist, infer tasks such as trend analysis, cost breakdowns, or forecasting.
5. Create a plan and pass it to the `coder_agent`.

You're not writing code — you're creating the blueprint for the coder agent to execute.

Begin every session by loading and summarizing the dataset before engaging in complex planning.
"""


# Top-level orchestrator: talks to the user, profiles the data and delegates
# the coding work to `coder_agent`. It re-plans after every step.
manager_agent = CodeAgent(
    name="ManagerAgent",
    tools=[get_user_inputs, load_data, summarize_data],
    description=manager_agent_prompt,
    model=llm,
    managed_agents=[coder_agent],
    planning_interval=1,
    additional_authorized_imports=[
        "pandas",
        "matplotlib.pyplot",
        "seaborn",
        "openpyxl",
        # "stat" only holds os.stat() helpers; "statistics" is the stdlib
        # module useful for analysis. "stat" is kept for compatibility.
        'stat',
        'statistics',
        'scipy',
        'plotly',
        "numpy",
        # SECURITY NOTE(review): authorizing "os" gives LLM-generated code
        # filesystem and process access; drop it if not strictly needed.
        "os",
        "json",
        "re",
        "datetime",
        'plotly.express'
    ]
)

# smolagents uses `description` only to advertise an agent to its manager, so
# a top-level agent never sees it. Append the instructions to the system
# prompt template so the manager's own LLM actually receives them.
manager_agent.prompt_templates["system_prompt"] += "\n\n" + manager_agent_prompt


manager_agent.run('Hey can you give me soe overall insights')