In [15]:
import os
import re
import json
import pandas as pd
from tqdm import tqdm

In [2]:
# Show what sits next to the notebook so the input file names used below can be checked.
os.listdir('.')

['.DS_Store',
 '.env',
 '.git',
 '.ipynb_checkpoints',
 '05efe60b-0386-4031-8db1-db1968986a73.json',
 '44ac0f55-b95e-43e3-bf26-6c6d831aef2d.json',
 'assets',
 'b0a9f4e1-6363-4864-9923-5a133e21776b.json',
 'condaenv.yml',
 'data preprocess.ipynb',
 'fewshortexample.ipynb',
 'funcs.py',
 'main.py',
 'orders_superstore_24_july_2024.csv',
 'prompts.py',
 'requirements.txt',
 'sample_data',
 'Seller Intelligence_ SQL Queries for Sample Questions.docx',
 '__pycache__']

In [6]:
# Read the whole question/SQL document into memory as a single string.
source_path = 'Seller Intelligence_ SQL Queries for Sample Questions.txt'
with open(source_path, mode='r', encoding='latin-1') as handle:
    data = handle.read()

In [8]:
# Carve the document into one chunk per question: each chunk starts at a
# "Q<number>:" label and runs (lazily) up to the next label or the end of the text.
pattern = r"(Q\d+:.*?)(?=Q\d+:|$)"
matches = re.compile(pattern, flags=re.DOTALL).findall(data)

In [25]:
# Maps each question text (without its "Q<n>:" label) to the SQL that answers it.
few_short_data_dict = {}

for qa_string in tqdm(matches):
    # A block is the labelled question, a blank line, then the SQL answer.
    qa_string_components = qa_string.split('\n\n')
    if len(qa_string_components) < 2:
        # No paragraph follows the question, so there is no answer to pair it with.
        continue

    ques, ans = qa_string_components[0], qa_string_components[1]
    # Replace the fully-qualified table name with a neutral placeholder.
    ans = ans.replace('`datachannel-238509.dt_sellerintelligence.sales_data_model`', 'table')

    # Strip only the leading "Q<n>:" label. Splitting once on the first colon keeps
    # questions that themselves contain ": " intact and does not fail when the label
    # is not followed by a space.
    ques = ques.split(":", 1)[1].strip()
    if not ques:
        # A label with no question text cannot be used as a lookup key.
        continue

    few_short_data_dict[ques] = ans

100%|██████████| 47/47 [00:00<?, ?it/s]


In [39]:
from uuid import uuid4
from pinecone import Pinecone, ServerlessSpec
def create_pinecone_index_and_namespace(qna_dict, 
                                        open_ai_client, 
                                        pinecone_client,
                                        pinecone_index,
                                        pinecone_namespace):
    """Embed every question in ``qna_dict`` and upload it to a Pinecone namespace.

    The index named ``pinecone_index`` is created if the client does not list it.
    If ``pinecone_namespace`` already exists in that index, all of its vectors are
    deleted first, so the namespace ends up holding exactly the entries of
    ``qna_dict``.

    Args:
        qna_dict: Mapping of question text to the SQL query that answers it.
        open_ai_client: Client exposing ``embeddings.create(input=..., model=...)``.
        pinecone_client: Client exposing ``list_indexes``, ``create_index`` and ``Index``.
        pinecone_index: Name of the index to use (string).
        pinecone_namespace: Name of the namespace inside the index.

    Raises:
        ValueError: If ``qna_dict`` is empty.
    """
    questions = list(qna_dict.keys())
    if not questions:
        raise ValueError("qna_dict is empty; there is nothing to embed or upload.")

    # Create embeddings (one per question, in the same order as `questions`)
    res = open_ai_client.embeddings.create(input=questions, model='text-embedding-3-large')
    embeds = [record.embedding for record in res.data]

    # Check if index exists
    existing_indexes = pinecone_client.list_indexes().index_list['indexes']
    existing_indexes_names = [i['name'] for i in existing_indexes]

    if pinecone_index not in existing_indexes_names:
        print(f"Did not find {pinecone_index}. Creating")
        # Index creation is an operation on the client; `pinecone_index` is only the name.
        # The dimension is taken from the embeddings themselves so the index always
        # matches whatever the embedding model actually returned.
        pinecone_client.create_index(name=pinecone_index,
                                     dimension=len(embeds[0]),
                                     metric='cosine',
                                     spec=ServerlessSpec(cloud="aws", region="us-east-1"))

    # Initialise Index
    index = pinecone_client.Index(pinecone_index)

    # Check if namespace exists; if so, empty it so stale examples do not linger
    existing_namespaces = list(index.describe_index_stats()['namespaces'].keys())
    if pinecone_namespace in existing_namespaces:
        print(f"Found existing namespace `{pinecone_namespace}` in {pinecone_index}. Recreating.")
        index.delete(namespace=pinecone_namespace, delete_all=True)

    # Upload vectors in index-namespace; each record gets a fresh random id
    vectors = [{'id': str(uuid4()),
                'values': embeds[i],
                'metadata': {"user_query": ques, "SQL_Query": qna_dict.get(ques)}}
               for i, ques in enumerate(questions)]

    index.upsert(vectors=vectors, namespace=pinecone_namespace)


In [37]:
from openai import OpenAI

# Read the key from the environment so the secret never lives in the notebook or
# its saved outputs; set OPENAI_API_KEY before starting the kernel.
# NOTE(review): the key that used to be hard-coded here has been exposed and
# should be revoked and replaced.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

In [40]:
from pinecone import Pinecone

# Read the key from the environment so the secret never lives in the notebook or
# its saved outputs; set PINECONE_API_KEY before starting the kernel.
# NOTE(review): the key that used to be hard-coded here has been exposed and
# should be revoked and replaced.
pinecone_client = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

In [41]:
# Embed the parsed question/SQL pairs and (re)populate the few-shot namespace.
create_pinecone_index_and_namespace(
    qna_dict=few_short_data_dict,
    open_ai_client=client,
    pinecone_client=pinecone_client,
    pinecone_index="fewshortexamples",
    pinecone_namespace='ns1',
)

In [46]:
def find_top_k_nearest_examples(user_query,
                                open_ai_client,
                                pinecone_client,
                                pinecone_index_name,
                                pinecone_namespace_name,
                                top_k=5):
    """Look up the stored question/SQL pairs closest to ``user_query``.

    The query is embedded with ``text-embedding-3-large`` and used to search
    ``pinecone_namespace_name`` inside ``pinecone_index_name``.

    Args:
        user_query: Text to embed and search with.
        open_ai_client: Client exposing ``embeddings.create(input=..., model=...)``.
        pinecone_client: Client exposing ``Index(name)``.
        pinecone_index_name: Name of the index to query.
        pinecone_namespace_name: Namespace within that index.
        top_k: Number of matches requested from the index.

    Returns:
        dict mapping each match's ``user_query`` metadata to its ``SQL_Query``
        metadata, in the order the matches were returned. If two matches share
        a question, the later one wins.
    """
    # Embed the incoming question
    embedding_response = open_ai_client.embeddings.create(
        input=user_query,
        model='text-embedding-3-large',
    )
    query_vector = embedding_response.to_dict()['data'][0]['embedding']

    # Ask the index for the nearest stored examples, metadata included
    search_result = pinecone_client.Index(pinecone_index_name).query(
        vector=query_vector,
        top_k=top_k,
        include_metadata=True,
        namespace=pinecone_namespace_name,
    )

    return {
        hit['metadata']['user_query']: hit['metadata']['SQL_Query']
        for hit in search_result['matches']
    }

In [47]:
# Smoke-test retrieval with a question that is known to be in the example set.
find_top_k_nearest_examples(
    user_query="What were the total sales for the last 30 days",
    open_ai_client=client,
    pinecone_client=pinecone_client,
    pinecone_index_name="fewshortexamples",
    pinecone_namespace_name='ns1',
)

{'What were the total sales for the last 30 days': 'SELECT SUM(revenue) AS sales\nFROM table\nWHERE order_date >= DATE_SUB(CURRENT_DATE(), INTERVAL 30 DAY);',
 'How many units were sold in the last 30 days': 'SELECT SUM(quantity) AS units_sold\nFROM table\nWHERE order_date >= DATE_SUB(CURRENT_DATE(), INTERVAL 30 DAY);\n  2\n ',
 'How many units were sold this month': 'SELECT SUM(quantity) AS units_sold\nFROM table\nWHERE EXTRACT(YEAR FROM order_date) = EXTRACT(YEAR FROM CURRENT_DATE()) \nAND EXTRACT(MONTH FROM order_date) = EXTRACT(MONTH FROM CURRENT_DATE())',
 'What is the percentage change in sales yesterday compared to the same day last month': 'WITH yesterday_sales AS (\n    SELECT SUM(revenue) AS sales_yesterday\n    FROM table\n    WHERE  order_date = DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY)\n),\nsame_day_last_month_sales AS (\n    SELECT  SUM(revenue) AS sales_same_day_last_month\n    FROM table\n    WHERE  order_date = DATE_SUB(DATE_SUB(CURRENT_DATE(), INTERVAL 1 MONTH), INTERV