# Import Headers

In [6]:
from langchain_openai import ChatOpenAI
from langchain_community.vectorstores import Chroma
from langchain.prompts import FewShotPromptTemplate
from langchain.prompts.prompt import PromptTemplate
from langchain.prompts.example_selector import SemanticSimilarityExampleSelector
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.agents import create_sql_agent, AgentType
from dotenv import load_dotenv
import os

load_dotenv()

True

# Setup LLM

In [7]:
# Read the OpenAI key from the environment and stop early with an actionable
# message when it is missing, rather than continuing with an unusable client.
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to your environment or .env file."
    )

# temperature=0 is used to minimise sampling randomness in the generated SQL.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0, api_key=api_key)

# Smoke test: a plain completion with no database involved yet.
response = llm.invoke("Write a SQL query to get top 5 customers by sales. keep it short")
print(response.content)

SELECT customer_id, SUM(sales_amount) as total_sales
FROM sales
GROUP BY customer_id
ORDER BY total_sales DESC
LIMIT 5;


# Connect With Database

In [8]:
from urllib.parse import quote_plus

from langchain_community.utilities import SQLDatabase

# Connection settings can be overridden through environment variables; the
# fallbacks are the local development values this notebook was written with.
db_user = os.getenv("MYSQL_USER", "root")
db_password = os.getenv("MYSQL_PASSWORD", "root")
db_host = os.getenv("MYSQL_HOST", "localhost")
db_name = os.getenv("MYSQL_DATABASE", "tshirts")

# URL-encode the credentials so characters such as '@', ':' or '/' in a
# user name or password cannot corrupt the connection URI.
db_uri = (
    f"mysql+pymysql://{quote_plus(db_user)}:{quote_plus(db_password)}"
    f"@{db_host}/{db_name}"
)

# sample_rows_in_table_info=3 asks for three sample rows per table in
# db.table_info, alongside each CREATE TABLE statement.
db = SQLDatabase.from_uri(db_uri, sample_rows_in_table_info=3)

print(db.table_info)


CREATE TABLE discounts (
	discount_id INTEGER NOT NULL AUTO_INCREMENT, 
	t_shirt_id INTEGER NOT NULL, 
	pct_discount DECIMAL(5, 2), 
	PRIMARY KEY (discount_id), 
	CONSTRAINT discounts_ibfk_1 FOREIGN KEY(t_shirt_id) REFERENCES t_shirts (t_shirt_id), 
	CONSTRAINT discounts_chk_1 CHECK ((`pct_discount` between 0 and 100))
)COLLATE utf8mb4_0900_ai_ci ENGINE=InnoDB DEFAULT CHARSET=utf8mb4

/*
3 rows from discounts table:
discount_id	t_shirt_id	pct_discount
1	1	10.00
2	2	15.00
3	3	20.00
*/


CREATE TABLE t_shirts (
	t_shirt_id INTEGER NOT NULL AUTO_INCREMENT, 
	brand ENUM('Van Huesen','Levi','Nike','Adidas') NOT NULL, 
	color ENUM('Red','Blue','Black','White') NOT NULL, 
	size ENUM('XS','S','M','L','XL') NOT NULL, 
	price INTEGER, 
	stock_quantity INTEGER NOT NULL, 
	PRIMARY KEY (t_shirt_id), 
	CONSTRAINT t_shirts_chk_1 CHECK ((`price` between 10 and 50))
)COLLATE utf8mb4_0900_ai_ci ENGINE=InnoDB DEFAULT CHARSET=utf8mb4

/*
3 rows from t_shirts table:
t_shirt_id	brand	color	size	price	stock

______________

# Ask questions with zero-shot learning

In [None]:
# Baseline agent with the library's default prompt (no worked examples).
# handle_parsing_errors is an AgentExecutor option, so it is routed through
# agent_executor_kwargs to make sure it reaches the executor.
sql_agent = create_sql_agent(
    llm,
    db=db,
    agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    agent_executor_kwargs={"handle_parsing_errors": True},
)  # pass verbose=True to see the agent's intermediate reasoning

## Question 1

In [10]:
# Zero-shot attempt: stock count filtered by brand, size and colour.
question_1 = "How many t-shirts do we have left for Nike in extra small size and white color?"
q1 = sql_agent.invoke(question_1)
print(q1)

{'input': 'How many t-shirts do we have left for Nike in extra small size and white color?', 'output': 'There is 0 Nike t-shirt left in extra small size and white color.'}


- ##### `0` is incorrect. The correct answer is `25`.  
--

## Question 2

In [11]:
# Zero-shot attempt: inventory value across one size.
question_2 = "How much is inventory value for all small size t-shirts?"
q2 = sql_agent.invoke(question_2)
print(q2)

{'input': 'How much is inventory value for all small size t-shirts?', 'output': '$13,429'}


- ##### LLM answered this correctly
--

## Question 3

In [12]:
# Zero-shot attempt: inventory value for a single brand.
question_3 = "What is the total value of Levi’s T-shirts we have in stock?"
q3 = sql_agent.invoke(question_3)
print(q3)

{'input': 'What is the total value of Levi’s T-shirts we have in stock?', 'output': '$12,956'}


- ##### LLM answered this correctly. 
--

## Question 4

In [13]:
# Zero-shot attempt: stock count filtered by brand and colour.
question_4 = "How many white color Levi's t shirts we have available?"
q4 = sql_agent.invoke(question_4)
print(q4)

{'input': "How many white color Levi's t shirts we have available?", 'output': '2'}


- ##### `2` is incorrect. The correct answer is `43`.

_____________________

# Provide correct SQL queries and get the results

In [14]:
# Hand-written SQL for the four questions, keyed by the label used when
# printing. Each statement is sent to the agent as its input, in order.
reference_sql = {
    "q1": "SELECT sum(stock_quantity) FROM t_shirts WHERE brand = 'Nike' AND color = 'White' AND size = 'XS'",
    "q2": "SELECT SUM(price*stock_quantity) FROM t_shirts WHERE size = 'S'",
    "q3": "SELECT SUM(price * stock_quantity) FROM t_shirts WHERE brand = 'Levi'",
    "q4": "SELECT sum(stock_quantity) FROM t_shirts WHERE brand = 'Levi' AND color = 'White'",
}

reference_answers = {
    label: sql_agent.invoke(statement)
    for label, statement in reference_sql.items()
}

# Keep the q1..q4 names bound: later cells read them when building examples.
q1, q2, q3, q4 = (reference_answers[label] for label in ("q1", "q2", "q3", "q4"))

for label, answer in reference_answers.items():
    print(f"{label} output: {answer['output']}")


q1 output: 25
q2 output: The total value of size 'S' t-shirts in stock is $13,429.
q3 output: $12,956
q4 output: 43


__________________

# Few Shot Learning

In [15]:
# Worked examples for few-shot prompting. Each record pairs a natural-language
# question with its hand-written SQL, a placeholder for the raw SQL result,
# and the agent response (q1..q4) obtained for that SQL.
few_shots = [
    {
        "Question": question,
        "SQLQuery": sql,
        "SQLResult": "Result of the SQL query",
        "Answer": answer,
    }
    for question, sql, answer in (
        (
            "How many t-shirts do we have left for Nike in XS size and white color?",
            "SELECT sum(stock_quantity) FROM t_shirts WHERE brand = 'Nike' AND color = 'White' AND size = 'XS'",
            q1,
        ),
        (
            "How much is the total inventory value for all small size t-shirts?",
            "SELECT SUM(price*stock_quantity) FROM t_shirts WHERE size = 'S'",
            q2,
        ),
        (
            "If we have to sell all the Levi’s T-shirts today. How much revenue our store will generate without discount?",
            "SELECT SUM(price * stock_quantity) FROM t_shirts WHERE brand = 'Levi'",
            q3,
        ),
        (
            "How many white color Levi's shirt I have?",
            "SELECT sum(stock_quantity) FROM t_shirts WHERE brand = 'Levi' AND color = 'White'",
            q4,
        ),
    )
]

# Create Vector Database

In [25]:
embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

# Only the question text is embedded; it is what incoming questions are
# compared against during similarity search.
texts = [d["Question"] for d in few_shots]

# Everything needed to render an example travels as metadata. Only the
# agent's 'output' string is kept from each stored answer.
metadatas = [
    {
        "Question": d["Question"],
        "sql": d["SQLQuery"],
        "sql_result": d["SQLResult"],
        "answer_output": d["Answer"]["output"],
    }
    for d in few_shots
]

# Deterministic ids keep this cell idempotent: re-running it writes the same
# records again instead of appending duplicate examples to the collection.
ids = [f"few_shot_{index}" for index, _ in enumerate(few_shots)]

vectorstore = Chroma.from_texts(
    texts=texts,
    embedding=embeddings,
    metadatas=metadatas,
    ids=ids,
)

# Semantic Similarity Example Selector


In [17]:
# Selector that returns the k=2 stored examples closest to the query.
example_selector = SemanticSimilarityExampleSelector(vectorstore=vectorstore, k=2)

# Sanity check: fetch the two stored examples most similar to a new question.
probe = {"Question": "How many Adidas T shirts I have left in my store?"}
example_selector.select_examples(probe)

[{'sql': "SELECT sum(stock_quantity) FROM t_shirts WHERE brand = 'Nike' AND color = 'White' AND size = 'XS'",
  'Question': 'How many t-shirts do we have left for Nike in XS size and white color?',
  'answer_output': '25',
  'sql_result': 'Result of the SQL query'},
 {'sql': "SELECT SUM(price*stock_quantity) FROM t_shirts WHERE size = 'S'",
  'sql_result': 'Result of the SQL query',
  'Question': 'How much is the total inventory value for all small size t-shirts?',
  'answer_output': "The total value of size 'S' t-shirts in stock is $13,429."}]

# Few Shot Prompt Template

In [18]:
# Layout of a single retrieved example; the placeholders match the metadata
# keys stored alongside each question in the vector store.
example_prompt = PromptTemplate(
    template=(
        "Question: {Question}\n"
        "SQLQuery: {sql}\n"
        "SQLResult: {sql_result}\n"
        "Answer: {answer_output}"
    ),
    input_variables=["Question", "sql", "sql_result", "answer_output"],
)

# Selected examples are rendered first, then the suffix poses the new
# question and leaves "SQLQuery:" open for the model to complete.
few_shot_prompt = FewShotPromptTemplate(
    input_variables=["input"],
    suffix="Question: {input}\nSQLQuery:",
    example_prompt=example_prompt,
    example_selector=example_selector,
)

# Render the prompt for a sample question to inspect what the model will see.
demo_question = "How many Adidas T shirts I have left in my store?"
final_prompt = few_shot_prompt.format(input=demo_question)
print(final_prompt)


Question: How many t-shirts do we have left for Nike in XS size and white color?
SQLQuery: SELECT sum(stock_quantity) FROM t_shirts WHERE brand = 'Nike' AND color = 'White' AND size = 'XS'
SQLResult: Result of the SQL query
Answer: 25

Question: How much is the total inventory value for all small size t-shirts?
SQLQuery: SELECT SUM(price*stock_quantity) FROM t_shirts WHERE size = 'S'
SQLResult: Result of the SQL query
Answer: The total value of size 'S' t-shirts in stock is $13,429.

Question: How many Adidas T shirts I have left in my store?
SQLQuery:


In [19]:
# Render the few-shot section once, passing the literal text "{input}" so the
# rendered suffix still contains an {input} placeholder for the final template.
# NOTE(review): because the selector is queried with the string "{input}", the
# examples are chosen once here and stay fixed for every question asked later.
few_shot_examples_str = few_shot_prompt.format(input="{input}")

# {top_k} is left as a template placeholder; it is declared in
# input_variables below so it can be filled in when the agent is built.
mysql_instructions = """
You are a MySQL expert. Given an input question, first create a syntactically correct MySQL query to run,
then look at the results of the query and return the answer to the input question.
Unless the user specifies a specific number of examples to obtain, query for at most {top_k} results using LIMIT.
Never SELECT *; only select needed columns, and wrap column names in backticks (`).
Use only columns that exist; pay attention to which column is in which table.
Use CURDATE() for "today" queries. Be precise and concise.
"""

# The schema text is pasted into a template string, so literal braces in it
# (for example inside sample rows) are doubled to keep them from being read
# as template placeholders.
schema_text = db.table_info.replace("{", "{{").replace("}", "}}")

# The schema is placed ahead of the ReAct format section so that
# {agent_scratchpad} is the last thing in the prompt and the model continues
# directly from its own previous Thought/Action/Observation steps.
custom_sql_prompt = PromptTemplate(
    input_variables=["input", "tools", "tool_names", "agent_scratchpad", "top_k"],
    template=f"""
{mysql_instructions}

Database schema:
{schema_text}

# Worked Examples 
{few_shot_examples_str}

You can use the following tools:
{{tools}}

You may call only these tool names:
{{tool_names}}

Use this exact format:
Question: the input question to answer
Thought: think step-by-step about what to do next
Action: one of [{{tool_names}}]
Action Input: the input for the action
Observation: the result of the action
... (you can repeat Thought/Action/Action Input/Observation as needed)
Thought: I now know the final answer
Final Answer: the final answer to the original question

Begin!

Question: {{input}}
{{agent_scratchpad}}
"""
)

#  Final SQL Agent

In [20]:
# Agent driven by the custom few-shot prompt. Both executor options are
# passed through agent_executor_kwargs: handle_parsing_errors is an
# AgentExecutor setting, and return_intermediate_steps adds the executed
# steps to each result.
final_sql_agent = create_sql_agent(
    llm=llm,
    db=db,
    agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    prompt=custom_sql_prompt,
    agent_executor_kwargs={
        "handle_parsing_errors": True,
        "return_intermediate_steps": True,
    },
)

### Re-testing the questions the zero-shot agent answered incorrectly (now with few-shot examples)

In [21]:
# Re-ask the two questions through the agent built with the custom prompt.
retest_questions = {
    "q1": "How many t-shirts do we have left for Nike in extra small size and white color?",
    "q4": "How many white color Levi's t shirts we have available?",
}

retest_answers = {
    label: final_sql_agent.invoke(text)
    for label, text in retest_questions.items()
}
q1, q4 = retest_answers["q1"], retest_answers["q4"]

for label, answer in retest_answers.items():
    print(f"{label} output: {answer['output']}")

q1 output: 25
q4 output: There are 43 white Levi's t-shirts available.


##### The model now answers both questions correctly

______