In [1]:
from llama_index.core.query_pipeline import (
    QueryPipeline as QP,
    Link,
    InputComponent,
)
from llama_index.experimental.query_engine.pandas import  PandasInstructionParser
from llama_index.llms.openai import OpenAI
from llama_index.core import PromptTemplate
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# Keep credentials out of the notebook: export OPENAI_API_KEY in the shell (or a
# secrets manager) before starting the kernel. `setdefault` leaves an exported
# key untouched; the empty string is only a placeholder when nothing is set,
# whereas a plain assignment would wipe out a valid key on every run.
os.environ.setdefault("OPENAI_API_KEY", "")

In [2]:
import pandas as pd

# Use a forward slash so the relative path resolves on Windows, macOS and Linux.
# The previous backslash form only worked on Windows and depended on "\c" being
# passed through as an unrecognised escape sequence, which newer Python versions
# warn about.
df = pd.read_csv("data/charm_df.csv")

In [3]:
# Preview the first five rows to sanity-check what was loaded from the CSV.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,order_id,quantity,variant_id,base_material,category,price,product_type,color,Date
0,3146,5517407092825,1,40557290000000.0,sterling-silver,charm,599.0,,gold,2024-04-28
1,3201,5577157214297,1,40557290000000.0,sterling-silver,charm,599.0,,gold,2024-06-01
2,3206,5588687192153,1,40557290000000.0,sterling-silver,charm,599.0,,gold,2024-06-08
3,3226,5615676293209,1,40504010000000.0,sterling-silver,charm,499.0,Charm,gold,2024-06-27
4,3658,5472291192921,1,40557290000000.0,sterling-silver,charm,599.0,,gold,2024-03-29


# Steps

## 1. Pandas prompt to infer pandas instructions from user query 

In [5]:
# Prompt that asks the LLM for a pandas expression answering the user's query.
# Placeholders: {df_str} (dataframe preview), {instruction_str} (rules below)
# and {query_str} (the user's question).
pandas_prompt_str = "\n".join(
    [
        "You are working with a pandas dataframe in Python.",
        "The name of the dataframe is `df`.",
        "This is the result of `print(df.head())`:",
        "{df_str}",
        "",
        "Follow these instructions:",
        "{instruction_str}",
        "Query: {query_str}",
        "",
        "Expression:",
    ]
)

# Numbered rules substituted into {instruction_str}; every rule, including the
# last one, is terminated by a newline.
instruction_str = (
    "\n".join(
        [
            "1. Convert the query to executable Python code using Pandas.",
            "2. The final line of code should be a Python expression.",
            "3. The code should represent a solution to the query.",
            "4. PRINT ONLY THE EXPRESSION.",
            "5. Do not quote the expression.",
        ]
    )
    + "\n"
)

## 2. Response synthesis prompt to synthesize a final response from the pandas output

In [6]:

# Prompt for the second LLM call: turns the raw pandas result into a
# natural-language answer. Placeholders: {query_str}, {pandas_instructions}
# and {pandas_output}.
response_synthesis_prompt_str = "\n".join(
    [
        "Given an input question, synthesize a response from the query results.",
        "Query: {query_str}",
        "",
        "Pandas Instructions (optional):",
        "{pandas_instructions}",
        "",
        "Pandas Output: {pandas_output}",
        "",
        "Response: ",
    ]
)



In [7]:
# One LLM client is created here; the pipeline reuses it for both LLM stages.
llm = OpenAI(model="gpt-3.5-turbo")

# Template for the final answer; all of its placeholders are filled at run time.
response_synthesis_prompt = PromptTemplate(response_synthesis_prompt_str)

# Parser bound to `df`, which receives the first LLM stage's output.
pandas_output_parser = PandasInstructionParser(df)

# Pre-fill the static parts of the pandas prompt (instructions and a 5-row
# preview of `df`), leaving only {query_str} to be supplied per query.
pandas_prompt = PromptTemplate(pandas_prompt_str).partial_format(
    instruction_str=instruction_str,
    df_str=df.head(5),
)

### QP (Query Pipeline)
A framework for chaining multiple components (or modules) to process a query step by step.

### `modules`
A dictionary that defines the components used in the pipeline. Each key is a module name, and each value is the corresponding component instance.
**Components:**

- `input`: Handles user input.
- `pandas_prompt`: Formats the user query into the prompt that asks the LLM for a pandas expression.
- `llm1`: A large language model used to generate the pandas instructions.
- `pandas_output_parser`: Executes the pandas instructions on the DataFrame and returns the results.
- `response_synthesis_prompt`: Builds the prompt used to synthesize the final response in natural language.
- `llm2`: The same LLM instance, used a second time to write the final response.

`verbose=True` is a pipeline option rather than a component: it enables detailed logging of pipeline operations for debugging or monitoring purposes.


In [8]:
# Register every stage under the name that the chain and links refer to.
# The same `llm` object backs both LLM stages.
qp = QP(
    modules=dict(
        input=InputComponent(),  # entry point for the user's query
        pandas_prompt=pandas_prompt,  # prompt asking for a pandas expression
        llm1=llm,  # first LLM call: produces the pandas code
        pandas_output_parser=pandas_output_parser,  # parser bound to `df`
        response_synthesis_prompt=response_synthesis_prompt,  # final-answer prompt
        llm2=llm,  # second LLM call: writes the final response
    ),
    verbose=True,
)

# Primary execution path; each stage feeds the next one:
# input -> pandas_prompt -> llm1 -> pandas_output_parser
qp.add_chain(["input", "pandas_prompt", "llm1", "pandas_output_parser"])


### Purpose: Establishes additional connections between modules to pass data across them.


Each `Link` is described by:

- **Source module**: the module providing data.
- **Destination module**: the module receiving data.
- **Destination key (`dest_key`)**: the name of the destination module's input under which the received data is passed.

In [9]:

# Fan three upstream results into the synthesis prompt. Each pair is
# (source module, placeholder of response_synthesis_prompt that it fills).
qp.add_links(
    [
        Link(source, "response_synthesis_prompt", dest_key=placeholder)
        for source, placeholder in (
            ("input", "query_str"),  # the original user question
            ("llm1", "pandas_instructions"),  # the generated pandas code
            ("pandas_output_parser", "pandas_output"),  # the parser's result
        )
    ]
)

# Last hop: the filled-in synthesis prompt goes to llm2, which writes the answer.
qp.add_link("response_synthesis_prompt", "llm2")

In [12]:
# Run the whole pipeline on one natural-language question; the keyword name
# matches the {query_str} placeholder used by the prompts.
user_question = "types of color available in dataset."
response = qp.run(query_str=user_question)



[1;3;38;2;155;135;227m> Running module input with input: 
query_str: types of color available in dataset.

[0m[1;3;38;2;155;135;227m> Running module pandas_prompt with input: 
query_str: types of color available in dataset.

[0m[1;3;38;2;155;135;227m> Running module llm1 with input: 
messages: You are working with a pandas dataframe in Python.
The name of the dataframe is `df`.
This is the result of `print(df.head())`:
   Unnamed: 0       order_id  quantity    variant_id    base_material  \...

[0mINFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[1;3;38;2;155;135;227m> Running module pandas_output_parser with input: 
input: assistant: df['color'].unique()

[0m[1;3;38;2;155;135;227m> Running module response_synthesis_prompt with input: 
query_str: types of color available in dataset.
pandas_instructions: assistant: df['color'].unique()
pandas_output: ['gold'

In [13]:
# Print only the text of the final message rather than the whole response object.
final_message = response.message
print(final_message.content)

The dataset contains two types of colors: gold and silver.
