Import Packages and set environment variables

In [1]:
import os
import langchain, langchain_experimental, langchain_google_genai

In [2]:
from dotenv import load_dotenv
# Load the .env file located two directories up into the process environment,
# then read the two provider keys from it. Either value is None when the
# corresponding variable is not defined.
load_dotenv(dotenv_path='../../.env')
google_api_key, OPENAI_API_KEY = (
    os.environ.get(key_name)
    for key_name in ("GOOGLE_API_KEY", "OPENAI_API_KEY")
)

Activate LangSmith

In [3]:
from langchain.callbacks.manager import tracing_v2_enabled
from langsmith import Client
# LangSmith takes its configuration from the process environment, not from
# Python variables. Export the project name explicitly so traces are filed
# under it; if not specified, the project defaults to "default".
LANGCHAIN_PROJECT = "palm-ragfusion-shadow-99"
os.environ["LANGCHAIN_PROJECT"] = LANGCHAIN_PROJECT

# The remaining settings are only read back here for inspection; each is None
# when the variable is not present in the environment.
LANGCHAIN_TRACING_V2 = os.getenv("LANGCHAIN_TRACING_V2")
LANGCHAIN_API_KEY = os.getenv("LANGCHAIN_API_KEY")
LANGCHAIN_ENDPOINT = os.getenv("LANGCHAIN_ENDPOINT")

# Create the client only once the environment is fully configured.
client = Client()

Set LLM and Embedding Models

In [4]:
#Import LLM and Embedding Models
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain.embeddings import GooglePalmEmbeddings

In [5]:
# Gemini Pro LLM, configured with temperature=0.
llm = GoogleGenerativeAI(
    google_api_key=google_api_key,
    model="gemini-pro",
    temperature=0,
)

# Embedding model, authenticated with the same Google API key.
embeddings = GooglePalmEmbeddings(
    google_api_key=google_api_key,
    model_name="models/embedding-gecko-001",
)

Python LLM Agent

In [6]:
from langchain_experimental.tools import PythonAstREPLTool
from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
from langchain.agents import Tool
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain.agents.mrkl.base import ZeroShotAgent
from langchain.llms import OpenAI
from langchain.llms import VertexAI
from langchain.chains.llm import LLMChain
import vertexai
import pandas as pd
import numpy as np

In [7]:
from langchain.document_loaders import TextLoader, CSVLoader
from langchain.chains.question_answering import load_qa_chain
from langchain.tools import BaseTool

# Document loader over the backlog CSV, plus a "stuff" question-answering
# chain driven by the notebook's LLM.
loader = CSVLoader(file_path='../../ado_backlog_clean.csv')
docchain = load_qa_chain(llm=llm, chain_type="stuff")

# create document QA tool
class DocumentQATool(BaseTool):
    """Agent tool that answers a query by running `docchain` over the documents from `loader`.

    Relies on the notebook-level `loader` and `docchain` objects.
    """

    # Annotated so the overrides are recognised as fields of the BaseTool model.
    name: str = 'document'
    description: str = 'read contents from page content'

    def _run(self, query: str) -> str:
        """Load the documents and run the QA chain on `query`.

        The loader is invoked on every call.
        """
        # Only real chain inputs are passed: Chain.run() forwards every keyword
        # argument to the chain as an input value, so option-like keywords such
        # as `verbose` or `return_only_outputs` would not act as options here.
        return docchain.run(question=query, input_documents=loader.load())

    async def _arun(self, query: str) -> str:
        """Async execution is not supported by this tool.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError('Not implemented')

In [8]:
# Read the backlog CSV into a DataFrame and hand it to a Python REPL tool
# under the variable name `df`.
df = pd.read_csv(filepath_or_buffer='../../ado_backlog_clean.csv')

pytool = PythonAstREPLTool(
    description=(
        'python pandas DataFrame with variable name `df` which contains '
        'a csv export of agile backlog data.'
    ),
    locals={"df": df},
)

In [18]:
# Build the zero-shot agent around the Python REPL tool. Only a one-row
# markdown preview of `df` is bound into the prompt (via .partial), so the
# model sees the schema without the whole dataframe being sent as tokens.
tools = [pytool]
input_variables = ["df", "input", "agent_scratchpad"]

prompt = ZeroShotAgent.create_prompt(
    tools,
    prefix='''You are working with python pandas DataFrame with name `df`,
read from the document text from the document loader for context. You should solve
the question by small logical incremental steps and use the tools available.
    ''',
    suffix='''Here's the result of `print(df.head())`:
{df}

Begin!
Question: {input}
{agent_scratchpad}
    ''',
    input_variables=input_variables
)
# NOTE(review): the suffix text says `print(df.head())`, but the value bound
# here is the markdown rendering of the first row only.
# DataFrame.to_markdown() already returns a str, so no str() wrapper is needed.
partial_prompt = prompt.partial(
    df=df.head(1).to_markdown()
)

llm_chain = LLMChain(
    llm=llm,
    prompt=partial_prompt,
)
tool_names = [tool.name for tool in tools]
# The agent's field for permitted tool names is `allowed_tools`; a `tools=`
# keyword is not an agent field. No `stop` argument is passed either: it is
# not an agent field, and the previous value "\Observation:" contained an
# invalid escape sequence.
zeroshotagent = ZeroShotAgent(
    llm_chain=llm_chain,
    allowed_tools=tool_names,
)

In [20]:
from langchain.agents.agent import AgentExecutor

# Wrap the agent and its tools in an executor. handle_parsing_errors=True asks
# the executor to recover from unparseable LLM output instead of raising, and
# the reasoning loop is capped at three iterations.
agent = AgentExecutor.from_agent_and_tools(
    tools=tools,
    agent=zeroshotagent,
    max_iterations=3,
    handle_parsing_errors=True,
    verbose=True,
)

In [21]:
# Ask the agent to count the rows whose state is "new".
agent.run(input='how many rows are in the dataset with a state of new?')



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I should filter the DataFrame to only include rows where the State column is equal to 'New' and then count the number of rows in the resulting DataFrame.
Action: Use the `df[df['State'] == 'New']` expression to filter the DataFrame to only include rows where the State column is equal to 'New'.
Action Input: `df[df['State'] == 'New']`[0m
Observation: Use the `df[df['State'] == 'New']` expression to filter the DataFrame to only include rows where the State column is equal to 'New'. is not a valid tool, try one of [python_repl_ast].
Thought:[32;1m[1;3mQuestion: how many rows are in the dataset with a state of new?
Thought: I should filter the DataFrame to only include rows where the State column is equal to 'New' and then count the number of rows in the resulting DataFrame.
Action: Use the `.shape[0]` attribute to count the number of rows in the DataFrame.
Action Input: `df[df['State'] == 'New'].shape[0]`[0m
Observa

'Agent stopped due to iteration limit or time limit.'

In [None]:
# Describe the backlog schema in the prompt (rows are work items, Area path
# identifies a team, activated->resolved is "time to resolve") and ask which
# team has the best time to resolve.
agent.run(input='given the df is of an agile backlog from Azure DevOps. Also, each row represents a work items in the backlog and the days between fields activated date and resolved date represent the time it takes a developer to complete development and software testing. This calculation represents the time to resolve. Also, each unique entry in the Area path field represents a different software development team. Based on the data which team has the best time to resolve?')

In [None]:
# Ask whether df holds enough data to forecast resolutions in the next 30 days.
agent.run(input='is there enough data in the dataframe df to predict how many work items will have a resolved date in the next 30 day timeframe based on the historical time to resolve?')

In [None]:
# Ask which additional data points would be needed for an accurate 30-day forecast.
agent.run(input='what other datapoints are needed in the dataframe df to accurately predict the number of work items with a resolved date in the next 30 days based on historical time to resolve.')

In [None]:
# Ask for the average time to resolve per area path.
agent.run(input='what is the average time to resolved for each area path?')

In [None]:
# Ask for the number of work items in the "new" state.
agent.run(input='how many work items have a state of new?')

In [None]:
# Ask for the breakdown of "new" work items by area path.
agent.run(input='what is the distribution of items in a new state by Area path?')

In [None]:
# Monte Carlo throughput question (50 items in 6 weeks).
# NOTE(review): the prompt text ends mid-word ("assum") and looks unfinished;
# confirm the intended wording.
agent.run(input='Using the count of resolved date per week as throughput, use monte carlo analysis to determine the likelihood that 50 work items will have a resolved date in the next 6 weeks. assum')

In [None]:
# Ask for weekly counts of work items with a resolved date after July 2023.
agent.run(input='how many work items per week have a resolved date after July 2023')

In [None]:
# Ask for a chart of weekly resolved-item counts after July 2023.
agent.run(input='chart how many work items per week have a resolved date after July 2023, by week')

In [None]:
# Chart request with a pandas deprecation hint appended, steering the agent
# toward Series.dt.isocalendar().week.
agent.run(input='chart how many work items per week have a resolved date after July 2023, by week. Series.dt.weekofyear and Series.dt.week have been deprecated. Please use Series.dt.isocalendar().week instead')

In [None]:
# Monte Carlo question: likelihood that 50 work items are resolved in the next 6 weeks.
agent.run(input='Using the count of resolved date per week as throughput, use monte carlo analysis to determine the likelihood that 50 work items will have a resolved date in the next 6 weeks.')

In [None]:
# Monte Carlo likelihood question (50 items in 6 weeks) with the
# isocalendar().week deprecation hint appended.
agent.run(input='Using the count of resolved date per week as throughput, use monte carlo analysis to determine the likelihood that 50 work items will have a resolved date in the next 6 weeks. Series.dt.weekofyear and Series.dt.week have been deprecated. Please use Series.dt.isocalendar().week instead')

In [None]:
# Monte Carlo variant: highest number of items resolvable in 6 weeks at >90%
# likelihood, with the isocalendar().week deprecation hint appended.
agent.run(input='Using the count of resolved date per week as throughput, use monte carlo analysis to determine the highest number of work items will have a resolved date in the next 6 weeks with a high degree of likelihood greater than 90%. Series.dt.weekofyear and Series.dt.week have been deprecated. Please use Series.dt.isocalendar().week instead')

In [None]:
# >90% Monte Carlo question with two hints appended: isocalendar().week and
# the unit-less datetime64 .astype deprecation.
agent.run(input='Using the count of resolved date per week as throughput, use monte carlo analysis to determine the highest number of work items will have a resolved date in the next 6 weeks with a high degree of likelihood greater than 90%. Series.dt.weekofyear and Series.dt.week have been deprecated. Please use Series.dt.isocalendar().week instead, Passing unit-less datetime64 dtype to .astype is deprecated and will raise in a future version. Pass datetime64[ns] instead')

In [None]:
# >90% Monte Carlo question, now asking for results grouped by area path.
# NOTE(review): "Gropu" in the prompt is presumably a typo for "Group".
agent.run(input='Using the count of resolved date per week as throughput, use monte carlo analysis to determine the highest number of work items will have a resolved date in the next 6 weeks with a high degree of likelihood greater than 90%. Gropu results by Area path. Series.dt.weekofyear and Series.dt.week have been deprecated. Please use Series.dt.isocalendar().week instead, Passing unit-less datetime64 dtype to .astype is deprecated and will raise in a future version. Pass datetime64[ns] instead')

In [None]:
# >90% Monte Carlo question grouped by area path.
# NOTE(review): this prompt appears identical to the one in the preceding
# cell (including the "Gropu" typo) — likely a leftover re-run; consider
# keeping only one of the two.
agent.run(input='Using the count of resolved date per week as throughput, use monte carlo analysis to determine the highest number of work items will have a resolved date in the next 6 weeks with a high degree of likelihood greater than 90%. Gropu results by Area path. Series.dt.weekofyear and Series.dt.week have been deprecated. Please use Series.dt.isocalendar().week instead, Passing unit-less datetime64 dtype to .astype is deprecated and will raise in a future version. Pass datetime64[ns] instead')

In [None]:
# Grouped-by-area-path Monte Carlo question that also suggests a concrete
# groupby/agg snippet to the agent, plus the two deprecation hints.
agent.run(input="""Using the count of resolved date per week as throughput, use monte carlo analysis to determine the highest number of work items will have a resolved date in the next 6 weeks with a high degree of likelihood greater than 90%. Group results by Area path. Consider using the code df.groupby('Area Path')['Resolved Date'].agg(lambda x: x.dt.isocalendar().week.value_counts()).reset_index(name='count'). Avoid depracated functions such as Series.dt.weekofyear and Series.dt.week have been deprecated. Please use Series.dt.isocalendar().week instead, Passing unit-less datetime64 dtype to .astype is deprecated and will raise in a future version. Pass datetime64[ns] instead""")

In [None]:
# Grouped-by-area-path Monte Carlo question with a suggested groupby/agg snippet.
# NOTE(review): this prompt appears identical to the one in the preceding
# cell — likely a leftover re-run; consider keeping only one of the two.
agent.run(input="""Using the count of resolved date per week as throughput, use monte carlo analysis to determine the highest number of work items will have a resolved date in the next 6 weeks with a high degree of likelihood greater than 90%. Group results by Area path. Consider using the code df.groupby('Area Path')['Resolved Date'].agg(lambda x: x.dt.isocalendar().week.value_counts()).reset_index(name='count'). Avoid depracated functions such as Series.dt.weekofyear and Series.dt.week have been deprecated. Please use Series.dt.isocalendar().week instead, Passing unit-less datetime64 dtype to .astype is deprecated and will raise in a future version. Pass datetime64[ns] instead""")

In [None]:
# Per-area-path Monte Carlo question at three likelihood thresholds
# (>70%, >80%, >90%), telling the agent to ignore NaN values.
agent.run(input='For each area path in the dataframe df, Using the count of resolved date per week as throughput, use monte carlo analysis to determine the highest number of work items will have a resolved date in the next 6 weeks with the following degrees of likelihood greater than 70%, greater than 80%, and greater than 90%. ignore values of nan')

Simpler PythonAstREPLTool

In [None]:
from langchain.memory import ConversationBufferMemory
from langchain.agents.openai_functions_agent.base import OpenAIFunctionsAgent
from langchain.schema import SystemMessage
from langchain.agents import AgentExecutor

In [None]:
# Conversation memory; its key has to appear as a placeholder in the prompt
# below, otherwise the stored history never reaches the model.
memory = ConversationBufferMemory(memory_key="chat_history")
df = pd.read_csv('../../ado_backlog_clean.csv')
dataset = {'df': df}
pytool = PythonAstREPLTool(locals=dataset)
tools = [pytool, DocumentQATool()]

# The suffix carries the chat history, the user question and the scratchpad;
# create_prompt() inserts the tool descriptions between prefix and suffix.
prompt = ZeroShotAgent.create_prompt(
    tools,
    prefix="",
    suffix="Begin!\n\n{chat_history}\nQuestion: {input}\n{agent_scratchpad}",
    input_variables=["input", "chat_history", "agent_scratchpad"],
)

# Use the prompt built in this cell. Reusing a prompt from an earlier cell
# would omit the `document` tool from the tool list and make this cell depend
# on hidden kernel state.
llm_chain = LLMChain(
    llm=llm,
    prompt=prompt,
)
tool_names = [tool.name for tool in tools]
# `allowed_tools` is the agent's field for permitted tool names. Parsing-error
# handling is an executor option, so it is not passed to the agent.
zeroshotagent = ZeroShotAgent(
    llm_chain=llm_chain,
    allowed_tools=tool_names,
)

In [None]:
# Executor for the memory-backed agent: the conversation memory is attached
# here, and handle_parsing_errors=True asks the executor to recover from
# unparseable LLM output instead of raising.
agent = AgentExecutor.from_agent_and_tools(
    tools=tools,
    agent=zeroshotagent,
    memory=memory,
    handle_parsing_errors=True,
    verbose=True,
)

In [None]:
# Ask the memory-backed agent for the dataframe's row count.
agent.run(input="how many rows are in the dataframe?")

One Cell

In [None]:
from langchain_experimental.tools import PythonAstREPLTool
from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain.llms import OpenAI
from langchain.llms import VertexAI
import vertexai
import pandas as pd
import numpy as np

from langchain.agents.mrkl.base import ZeroShotAgent
from langchain.chains.llm import LLMChain
from langchain.agents import Tool
from langchain.document_loaders import TextLoader, CSVLoader
from langchain.chains.question_answering import load_qa_chain
from langchain.tools import BaseTool

# Gemini Pro LLM with temperature=0; the API key is read from the environment
# at construction time.
llm = GoogleGenerativeAI(
    google_api_key=os.getenv("GOOGLE_API_KEY"),
    model="gemini-pro",
    temperature=0,
)

# Document loader over the backlog CSV and a "stuff" QA chain on top of the LLM.
loader = CSVLoader(file_path='../../ado_backlog_clean.csv')
docchain = load_qa_chain(llm=llm, chain_type="stuff")

# create document QA tool
# NOTE(review): a class with this name is also defined in an earlier cell;
# running this cell rebinds the name.
class DocumentQATool(BaseTool):
    """Agent tool that answers a query by running `docchain` over the documents from `loader`.

    Relies on the notebook-level `loader` and `docchain` objects.
    """

    # Annotated so the overrides are recognised as fields of the BaseTool model.
    name: str = 'document'
    description: str = 'read contents from page content'

    def _run(self, query: str) -> str:
        """Load the documents and run the QA chain on `query`.

        The loader is invoked on every call.
        """
        # Only real chain inputs are passed: Chain.run() forwards every keyword
        # argument to the chain as an input value, so option-like keywords such
        # as `verbose` or `return_only_outputs` would not act as options here.
        return docchain.run(question=query, input_documents=loader.load())

    async def _arun(self, query: str) -> str:
        """Async execution is not supported by this tool.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError('Not implemented')

# Read the backlog CSV into a DataFrame and expose it to a Python REPL tool
# under the variable name `df`.
df = pd.read_csv(filepath_or_buffer='../../ado_backlog_clean.csv')

pytool = PythonAstREPLTool(
    locals={"df": df},
)

# Build the zero-shot agent with both tools (Python REPL and document QA).
# Only the markdown rendering of df.head() is bound into the prompt (via
# .partial), so the model sees the schema without the whole dataframe being
# sent as tokens.
tools = [pytool, DocumentQATool()]
input_variables = ["df", "input", "agent_scratchpad"]

# NOTE(review): the prefix ends with "This is the result of running
# `df.head().to_markdown()`", but the table itself is inserted by the suffix
# ({df}); the suffix in turn labels it `print(df.head())`. The prompt text also
# contains a few typos. Strings are left untouched here — confirm wording.
prompt = ZeroShotAgent.create_prompt(
    tools,
    prefix='''You are working with python pandas DataFrame with variable name `df`,
and reading from the document text from the document loader. You should solve
the question by small steps and use the tools available. It is important to understand the attributes and datatypes of the dataframe before working with it. This is the result of running `df.head().to_markdown()`
    ''',
    suffix='''Here's the result of `print(df.head())`:
{df}

You are not meant to use only these rows to answer questions - they are meant as a way of telling you about the shape and schema of the dataframe.
You also do not have use only the information here to answer questions - you can run intermediate queries to do exporatory data analysis to give you more information as needed.
avoide using depracated functions, such as Series.dt.weekofyear and Series.dt.week have been deprecated. Please use Series.dt.isocalendar().week instead, and Passing unit-less datetime64 dtype to .astype is deprecated and will raise errors in a future version thus pass datetime64[ns] instead.

Begin!
Question: {input}
{agent_scratchpad}
    ''',
    input_variables=input_variables
)
# DataFrame.to_markdown() already returns a str, so no str() wrapper is needed.
partial_prompt = prompt.partial(df=df.head().to_markdown())

llm_chain = LLMChain(
    llm=llm,
    prompt=partial_prompt,
)
tool_names = [tool.name for tool in tools]
# `allowed_tools` is the agent's field for permitted tool names. Parsing-error
# handling is an executor option, so it is not passed to the agent.
zeroshotagent = ZeroShotAgent(
    llm_chain=llm_chain,
    allowed_tools=tool_names,
)

from langchain.agents.agent import AgentExecutor

# Wrap the agent and its tools in an executor. handle_parsing_errors=True asks
# the executor to recover from unparseable LLM output instead of raising; no
# iteration cap is set explicitly here.
agent = AgentExecutor.from_agent_and_tools(
    tools=tools,
    agent=zeroshotagent,
    handle_parsing_errors=True,
    verbose=True,
)

# Ask the agent to count the rows whose state is "new".
agent.run(input='how many rows are in the dataset with a state of new?')