In [None]:
!pip install chromadb==0.3.26 pydantic==1.10.8
!pip install openai
!pip install langchain==0.0.249
!pip install tiktoken
!pip install "datasets[s3]==2.13.0"

# Create few-shot examples

In [None]:
from datasets import load_dataset
from random import randrange

# Fetch the training split of the Spider dataset via the `datasets` library.
train_dataset = load_dataset("spider", split='train')

# Report how many records were loaded, then show one randomly chosen record
# as a quick look at the record layout.
num_examples = len(train_dataset)
print(f"dataset size: {num_examples}")
sample_index = randrange(num_examples)
print(train_dataset[sample_index])

In [None]:
# Hold out 20% of the loaded examples; the explicit seed makes the split repeatable.
splits = train_dataset.train_test_split(seed=40, test_size=0.2)

In [None]:
# Give the two parts of the split their own names for the cells below.
train_subset, test_subset = splits['train'], splits['test']

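## Fixed few-shot examples

For each database, randomly pick `few_shot_count` question/query pairs from the training split and store them as one prompt snippet keyed by `db_id`.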

In [None]:
import random

# Build one fixed few-shot prompt snippet per database.
#
# NOTE(review): `full_schema` is not defined in the visible cells. Iterating it must
# yield `db_id` values that match the dataset's 'db_id' column — confirm it is created
# before this cell runs.

db_few_shot = {}
few_shot_count = 4 # set few shot examples at least 3
few_shot_seed = 40  # dedicated RNG seed so re-running the cell picks the same examples

# A private generator keeps the sampling reproducible without touching the
# global `random` state used elsewhere in the notebook.
rng = random.Random(few_shot_seed)

for db_id in full_schema:
    subset = train_subset.filter(lambda example: example['db_id'] == db_id)
    few_shot = ''

    if len(subset) < 1:
        print('no example found')
    else:
        # Sample WITHOUT replacement so the same example never appears twice in one
        # prompt; databases with fewer rows than requested contribute all they have.
        sample_size = min(few_shot_count, len(subset))
        for idx in rng.sample(range(len(subset)), sample_size):
            row = subset[idx]  # read the row once instead of once per field
            few_shot += "Questions: " + row.get('question') + '\n' + "Query:" + row.get('query') + '\n'

    # Databases without training examples are stored with an empty snippet.
    db_few_shot[db_id] = few_shot


In [None]:
import json  # not imported by any earlier visible cell; needed for json.dump below

# Persist the per-database snippets; the shot count is part of the file name
# (for example `db-4-shot.json`). To eyeball a single entry first, evaluate
# e.g. db_few_shot['music_1'] in a scratch cell.
with open(f'db-{few_shot_count}-shot.json', 'w') as f:
    json.dump(db_few_shot, f)

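## Dynamic few-shot examples

For every held-out test question, retrieve the most similar question/query pairs from the training split of the same database, using embedding similarity.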

In [None]:
from langchain.prompts import SemanticSimilarityExampleSelector
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
import getpass
import openai
import os

# Resolve the OpenAI key without ever writing it into the notebook:
#   1. reuse OPENAI_KEY (the variable this notebook has always used) if it is exported,
#   2. otherwise fall back to OPENAI_API_KEY,
#   3. otherwise prompt for it with hidden input.
# A key that is already exported is never overwritten with a placeholder.
openai_key = os.environ.get("OPENAI_KEY") or os.environ.get("OPENAI_API_KEY")
if not openai_key:
    openai_key = getpass.getpass("OpenAI API key: ")

# Later cells read the key from `openai.api_key`; OPENAI_KEY is kept in sync so
# anything that reads the environment variable sees the same value.
os.environ["OPENAI_KEY"] = openai_key
openai.api_key = openai_key


In [None]:
# Build, for every test question, the most similar training examples of the same
# database. Each record appended to `fewshots` holds 'db_id', 'question', 'query'
# and one list of retrieved examples per entry of `shot_sizes`.
#
# NOTE(review): `full_schema` is not defined in the visible cells; iterating it must
# yield `db_id` values matching the dataset's 'db_id' column — confirm.
fewshots = []

# Output key -> number of neighbours to retrieve. Add entries such as
# 'twoshot': 2 or 'fourshot': 4 to collect other prompt sizes in the same pass.
shot_sizes = {'threeshot': 3}

# The embedding client is identical for every database, so create it once.
embeddings = OpenAIEmbeddings(openai_api_key=openai.api_key)

for db_id in full_schema:
    subset = train_subset.filter(lambda example: example['db_id'] == db_id)
    subset2 = test_subset.filter(lambda example: example['db_id'] == db_id)

    # A database is only useful when it has both examples to index (train)
    # and questions to answer (test).
    if len(subset) == 0 or len(subset2) == 0:
        continue

    # Candidate examples come from the training subset only.
    examples = [
        {'question': entry.get('question'), 'query': entry.get('query')}
        for entry in subset
    ]

    print(f'building embeddings for {db_id}')
    # Each example is embedded as "<question> <query>"; the dict itself is stored
    # as metadata and is what the selector hands back.
    to_vectorize = [" ".join(example.values()) for example in examples]
    vectorstore = Chroma.from_texts(to_vectorize, embeddings, metadatas=examples)

    selectors = {
        key: SemanticSimilarityExampleSelector(vectorstore=vectorstore, k=k)
        for key, k in shot_sizes.items()
    }

    # Look up neighbours for each question of the testing subset.
    for entry in subset2:
        question = entry.get('question')
        question_fewshot = {
            'db_id': db_id,
            'question': question,
            'query': entry.get('query'),
        }
        print('retrieving embeddings')
        for key, selector in selectors.items():
            question_fewshot[key] = selector.select_examples({"question": question})

        fewshots.append(question_fewshot)
        

In [None]:
import json  # not imported by any earlier visible cell; needed for json.dump below

# Persist the retrieved examples for every test question.
with open('dynamic-fewshot.json', 'w') as f:
    json.dump(fewshots, f)

In [None]:
# testing
# Smoke test: retrieve the three nearest training examples for one test question.
#
# NOTE: `vectorstore` is whatever the indexing loop built last, so the result is only
# meaningful when `test_db_id` names that same database. Leaving `test_db_id` empty
# selects it automatically: the last record in `fewshots` belongs to the database
# indexed last.
test_db_id = ''

if not test_db_id:
    if not fewshots:
        raise ValueError("fewshots is empty: run the indexing cell before this one")
    test_db_id = fewshots[-1]['db_id']

test_db = test_subset.filter(lambda example: example['db_id'] == test_db_id)
if len(test_db) == 0:
    # Fail with a readable message instead of an IndexError on test_db[0].
    raise ValueError(f"no test questions found for db_id {test_db_id!r}")
example_question = test_db[0].get('question')

example_selector = SemanticSimilarityExampleSelector(
    vectorstore=vectorstore,
    k=3,
)

# The prompt template will load examples by passing the input to the `select_examples` method
example_selector.select_examples({"question": example_question})