<a href="https://colab.research.google.com/github/AnDDoanf/LLM-repo/blob/master/autonomous_data_analysis_agent_mistral7b_langchain.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
%%capture
!pip install transformers bitsandbytes accelerate langchain langchain_community sentence-transformers langchain_experimental

In [3]:
import os
from google.colab import drive

drive.mount('/content/drive')

os.environ['KAGGLE_CONFIG_DIR'] = "/content/drive/MyDrive/Kaggle"
%cd "/content/drive/MyDrive/Kaggle"

# !kaggle datasets download -d rabieelkharoua/cancer-prediction-dataset
# !unzip cancer-prediction-dataset.zip -d cancer-prediction-data
# !rm *.zip

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Kaggle


In [4]:
%%capture
from transformers import AutoModelForCausalLM, AutoTokenizer
import transformers
from transformers import BitsAndBytesConfig
from torch import cuda, bfloat16
from langchain.llms import HuggingFacePipeline

import pandas as pd
from sqlalchemy import create_engine
from langchain.sql_database import SQLDatabase
from langchain_community.agent_toolkits import create_sql_agent

model_name='mistralai/Mistral-7B-Instruct-v0.1'

model_config = transformers.AutoConfig.from_pretrained(
    model_name,
)

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
)

text_generation_pipeline = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    temperature=0.01,
    repetition_penalty=1.3,
    return_full_text=True,
    max_new_tokens=300,
    do_sample=True,
)

llm = HuggingFacePipeline(pipeline=text_generation_pipeline)







In [5]:
# Using sql_agent
# engine = create_engine("sqlite:///cancer1500v2.db")

# df.to_sql('df', con=engine, if_exists='replace', index=False)
# db = SQLDatabase(engine=engine)


# agent_executor = create_sql_agent(llm, db=db, verbose=True,
#                                   agent_executor_kwargs = {'handle_parsing_errors':True},
#                                   max_iterations = 2,
#                                   prefix = prefix
#                                   )

# agent_executor.invoke({"input": "What is the average age of the patients?"})


In [20]:
from langchain.agents import AgentOutputParser
from langchain.agents.conversational_chat.prompt import FORMAT_INSTRUCTIONS
from langchain.output_parsers.json import parse_json_markdown
from langchain.schema import AgentAction, AgentFinish

class OutputParser(AgentOutputParser):
    def get_format_instructions(self) -> str:
        return FORMAT_INSTRUCTIONS

    def parse(self, text: str) -> AgentAction | AgentFinish:
        try:
            response = parse_json_markdown(text)
            action, action_input = response["action"], response["action_input"]
            if action == "Final Answer":
                return AgentFinish({"output": action_input, "action":action}, text)
            else:
                return AgentAction(action, action_input, text)
        except Exception:
            return AgentFinish({"output": text, "action": action_input}, text)

    @property
    def _type(self) -> str:
        return "conversational_chat"

In [33]:
from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
from langchain.prompts import PromptTemplate
df = pd.read_csv('/content/drive/MyDrive/Kaggle/cancer-prediction-dataset/The_Cancer_data_1500_V2.csv')
prefix = """

This is the previous conversation:

question: How many patient has cancer history?
answer:
  Action: df['CancerHistory'].value_counts()[1]
  Final Answer: Number of patient has cancer history is 216.


question: What is the average age of patients?
answer:
  Action: df['Age'].mean()
  Final Answer:  The average age of patients is 50.32.
"""

parser = OutputParser()

def _handle_error(error) -> str:
    return str(error)[:72]

# Create the agent with the custom prompt template
agent = create_pandas_dataframe_agent(
    llm=llm,
    df=df,
    include_df_in_prompt = False,
    verbose=True,
    agent_executor_kwargs={'output_parser': parser,
                           "handle_parsing_errors":True},
    max_iterations=2
)

In [28]:
df.head()

Unnamed: 0,Age,Gender,BMI,Smoking,GeneticRisk,PhysicalActivity,AlcoholIntake,CancerHistory,Diagnosis
0,58,1,16.085313,0,1,8.146251,4.148219,1,1
1,71,0,30.828784,0,1,9.36163,3.519683,0,0
2,48,1,38.785084,0,2,5.135179,4.728368,0,1
3,34,0,30.040296,0,0,9.502792,2.044636,0,0
4,62,1,35.479721,0,0,5.35689,3.309849,0,1


In [34]:
input = "Draw graph of distribution in BMI?"
agent.invoke(input)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mParsing LLM output produced both a final answer and a parse-able action:: 
You are working with a pandas dataframe in Python. The name of the dataframe is `df`.
You should use the tools below to answer the question posed of you:

python_repl_ast - A Python shell. Use this to execute python commands. Input should be a valid python command. When using this tool, sometimes output is abbreviated - make sure it does not look abbreviated before using it in your answer.

Use the following format:

Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [python_repl_ast]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question


Begin!
Question: Draw graph of distrib

{'input': 'Draw graph of distribution in BMI?',
 'output': 'Agent stopped due to iteration limit or time limit.'}

In [30]:
from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
from langchain.prompts import PromptTemplate
prefix = """
Answer directly to the user.
"""
agent_2 = create_pandas_dataframe_agent(llm, df, verbose=True,
                                      prefix = prefix,
                                      agent_executor_kwargs = {'handle_parsing_errors':True},
                                      return_intermediate_steps = True,
                                      include_df_in_prompt=True,
                                      max_iterations = 2)


