In [1]:
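# pip install "openai<1.0"   (legacy SDK required: openai.Embedding, openai.ChatCompletion and openai.embeddings_utils were removed in 1.0)
# pip install azure-search-documents==11.4.0b8
# pip install "openai[embeddings]<1.0"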

import json
import os

import openai
from openai.embeddings_utils import get_embedding

# OpenAI credentials.
# Values are taken from environment variables when they are set, so real keys do
# not have to be stored in the notebook; the placeholders below are the fallback.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "sk-???")
OPENAI_RESOURCE_ENDPOINT = os.environ.get("OPENAI_RESOURCE_ENDPOINT", "https://api.openai.com/v1")

# Azure Cognitive Search Credentials
ACS_ENDPOINT = os.environ.get("ACS_ENDPOINT", "https://?????.search.windows.net")
ACS_KEY = os.environ.get("ACS_KEY", "??????") # Admin key

# Alternative configuration for Azure OpenAI (disabled):
#openai.api_type = "azure"
#openai.api_key = OPENAI_API_KEY
#openai.api_base = OPENAI_RESOURCE_ENDPOINT
#openai.api_version = "2023-07-01-preview"

# Note : currently running this against OpenAI API
# (OPENAI_RESOURCE_ENDPOINT is only used by the disabled Azure configuration above.)
openai.api_type = "openai"
openai.api_key = OPENAI_API_KEY
openai.api_version = "2020-11-07"


In [15]:
import json

file_path = "../data/ingredients.json"

# Read the raw ingredient records (a JSON array of objects).
# The encoding is given explicitly: JSON files are UTF-8, while open() would
# otherwise fall back to the platform's locale encoding.
with open(file_path, 'r', encoding='utf-8') as file:
    ingredients = json.load(file)

def flattenAll(*args):
    """Merge the given values into a single ', '-separated string.

    Each argument may be a string, a list (or tuple) of strings, or None.
    Lists are flattened into the result; None, empty strings and empty lists
    contribute nothing, so the result never contains an empty segment or a
    dangling separator.

    Returns:
        The joined string, or '' when there is nothing to join.
    """
    parts = []
    for arg in args:
        # Skips None, '' and [] alike (an empty list used to slip through and
        # leave a stray ', ' in the output).
        if not arg:
            continue

        if isinstance(arg, (list, tuple)):
            # Drop empty entries so they do not produce ', , ' in the result.
            parts.extend(item for item in arg if item)
        else:
            parts.append(arg)

    return ', '.join(parts)
        

def flattenIfList(input):
    """Return a list as one ', '-joined string; return any other value unchanged.

    Only exact `list` instances are joined. Strings, None and every other
    type are handed back as-is.
    """
    return ', '.join(input) if type(input) is list else input

# Enrich every ingredient record in place with:
#   'id'       - the wiki link without its '/wiki/' prefix
#   'features' - comma-separated distinction/characteristics/properties/usage
#   'text'     - the descriptive text that is embedded later on
for ingredient in ingredients:

    ingredient['id']=ingredient['link'].replace('/wiki/','')

    # Creating the text for the features embedding
    features = flattenAll(
        ingredient.get('distinction',''),
        ingredient.get('characteristics',''),
        ingredient.get('properties',''),
        ingredient.get('usage')
    ).strip()

    # Fall back to 'Unknown' so that 'features' is never an empty string.
    ingredient['features']=features if len(features) > 0 else 'Unknown'

    # Creating the text for the text embedding; optional sections are only
    # appended when the record actually carries them.
    text_for_embedding = "Name: "+ingredient['name']+". " \
            +"Description: "+ingredient['description']+". "

    if ingredient.get('madefrom','') != '':
        text_for_embedding += " Made from: "+ flattenIfList(ingredient['madefrom'])

    if ingredient.get('fullDescription','') != '':
        text_for_embedding += " Details: "+ingredient['fullDescription']

    preparation=flattenIfList(ingredient.get('preparation'))
    if preparation is not None and len(preparation)>0:
        # '+=' is required here: a bare '+' expression builds the string and
        # throws it away, silently dropping the preparation text.
        text_for_embedding += ' Preparation: '+preparation

    ingredient['text'] = text_for_embedding

    print(ingredient['name'])

# Persist the enriched records (without embeddings) as an intermediate artifact.
SAVE_PATH = "../data/intermediate.json"
with open(SAVE_PATH, 'w') as f:
    json.dump(ingredients, f)

Abraxan hair
Aconite
Acromantula venom
Adder's Fork
African Red Pepper
African Sea Salt
Agrippa
Alcohol
Alihotsy
Angel's Trumpet
Anjelica
Antimony
Armadillo bile
Armotentia
Arnica
Asian Dragon Hair
Ashwinder egg
Asphodel
Avocado
Balm
Banana
Baneberry
Bat spleen
Bat wing
Beetle Eye
Belladonna
Betony
Bezoar
Bicorn Horn
Billywig sting
Billywig Sting Slime
Billywig wings
Bitter root
Blatta Pulvereus
Blind-worm's Sting
Blood
Bloodroot
Blowfly
Bone
Boom Berry
Boomslang
Boomslang Skin
Borage
Bouncing Bulb
Bouncing Spider Juice
Bubotuber pus
Bulbadox juice
Bundimun Secretion
Bursting mushroom
Butterscotch
Camphirated Spirit
Castor oil
Cat Hair
Caterpillar
Centaury
Cheese
Chicken Lips
Chinese Chomping Cabbage
Chizpurfle Carapace
Chizpurfle fang
Cinnamon
Cockroach
Corn starch
Cowbane
Crocodile Heart
Daisy
Dandelion root
Dandruff
Deadlyius
Death-Cap
Dittany
Doxy egg
Dragon blood
Dragon claw
Dragon Claw Ooze
Dragon dung
Dragon horn
Dragon liver
Dragonfly thorax
Eagle Owl Feather
Eel eye
Erumpent h

In [16]:
import pandas as pd

# If needed, reduce dataset for prototyping
#ingredients=ingredients[0:10]

# calculate embeddings using the batch API
EMBEDDING_MODEL = "text-embedding-ada-002"  # OpenAI's best embeddings as of Apr 2023
BATCH_SIZE = 1000  # you can submit up to 2048 embedding inputs per request

# (source field, destination vector field, label used in the progress message)
EMBEDDING_TARGETS = [
    ("text", "textVector", "text"),
    ("features", "featuresVector", "feature"),
]

for batch_start in range(0, len(ingredients), BATCH_SIZE):
    # `page` holds references to the same dicts as `ingredients`, so the vectors
    # stored below end up on the original records.
    page = ingredients[batch_start:batch_start + BATCH_SIZE]

    for source_field, vector_field, label in EMBEDDING_TARGETS:
        print(f"Batch {batch_start} to {batch_start+len(page)} - {label} embeddings")

        response = openai.Embedding.create(
            model=EMBEDDING_MODEL,
            input=[obj[source_field] for obj in page],
        )
        # Every input must have received exactly one embedding.
        assert len(response["data"]) == len(page)
        for i, be in enumerate(response["data"]):
            assert i == be["index"]  # double check embeddings are in same order as input
            page[i][vector_field] = be["embedding"]

# One row per ingredient, one column per key. (A pd.Series of dicts would be
# written to CSV as a single column of dict reprs.)
df = pd.DataFrame(ingredients)

Batch 0 to 288 - text embeddings
Batch 0 to 288 - feature embeddings


In [17]:
# save document chunks and embeddings
# NOTE: SAVE_PATH is rebound here; an earlier cell used the same name for the
# intermediate JSON file.

SAVE_PATH = "../data/ingredients_with_embeddings.csv"

# index=False: do not write the pandas index as an extra column.
df.to_csv(SAVE_PATH, index=False)

In [18]:
# Import Azure Cognitive Search SDK

from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.models import Vector

from azure.search.documents.indexes.models import (
        SearchIndex,
        SearchField,
        SearchFieldDataType,
        SimpleField,
        SearchableField,
        VectorSearch,
        HnswVectorSearchAlgorithmConfiguration,
        CorsOptions
)



In [19]:
import re

# Name of the search index that is (re)created and filled below.
index_name = "ingredients"

# Matches every character other than ASCII letters, digits, '=', '_' and '-'.
_INVALID_KEY_CHARS = re.compile(r'[^a-zA-Z0-9=_-]')

def clean_key(key):
    """Return `key` with every character outside [a-zA-Z0-9=_-] replaced by '_'."""
    return _INVALID_KEY_CHARS.sub('_', key)

def get_index(name: str):
    """Build the SearchIndex definition for the ingredient documents.

    The index has a string key field 'id', five searchable string fields
    ('name', 'description', 'features', 'details', 'text') and two vector
    fields ('textVector', 'featuresVector') declared with 1536 dimensions,
    both bound to one HNSW vector search configuration.

    Args:
        name: Name given to the index.

    Returns:
        The SearchIndex object. Nothing is created on the service here.
    """
    vector_config_name = "my-vector-config"

    def text_field(field_name):
        # Searchable string field.
        return SearchableField(name=field_name, type=SearchFieldDataType.String,retrievable=True,searchable=True)

    def vector_field(field_name):
        # Collection(Single) field holding an embedding vector.
        return SearchField(
            name=field_name,
            type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
            searchable=True,
            vector_search_dimensions=1536,
            vector_search_configuration=vector_config_name,
        )

    fields = [SimpleField(name="id", type=SearchFieldDataType.String, key=True, filterable=True)]
    fields += [text_field(n) for n in ("name", "description", "features", "details", "text")]
    fields += [vector_field(n) for n in ("textVector", "featuresVector")]

    return SearchIndex(
            name=name,
            fields=fields,
            scoring_profiles=[],
            vector_search=VectorSearch(
                algorithm_configurations=[HnswVectorSearchAlgorithmConfiguration(name=vector_config_name, kind="hnsw")]
            ),
            cors_options=CorsOptions(allowed_origins=["*"], max_age_in_seconds=60))

client = SearchIndexClient(ACS_ENDPOINT, AzureKeyCredential(ACS_KEY))

# Recreate the index from scratch: drop it, then create it from the definition.
index = get_index(index_name)
client.delete_index(index_name)
client.create_index(index)

search_client = SearchClient(ACS_ENDPOINT, index_name, AzureKeyCredential(ACS_KEY))

# Clean up the documents: filter out any properties not declared in the index
documents = [{
    "id": clean_key(d["id"]),
    "name": d["name"],
    "description": d["description"],
    "features": d["features"],
    # 'fullDescription' is optional in the source data (the enrichment cell reads
    # it with .get as well), so do not fail on records that lack it.
    "details": d.get("fullDescription", ""),
    "text": d["text"],
    "textVector": d["textVector"],
    "featuresVector": d["featuresVector"], } for d in ingredients]

search_client.upload_documents(documents=documents)

[<azure.search.documents._generated.models._models_py3.IndexingResult at 0x7f7456432fe0>,
 <azure.search.documents._generated.models._models_py3.IndexingResult at 0x7f7456433310>,
 <azure.search.documents._generated.models._models_py3.IndexingResult at 0x7f7456433640>,
 <azure.search.documents._generated.models._models_py3.IndexingResult at 0x7f74564336a0>,
 <azure.search.documents._generated.models._models_py3.IndexingResult at 0x7f7456433700>,
 <azure.search.documents._generated.models._models_py3.IndexingResult at 0x7f7456433760>,
 <azure.search.documents._generated.models._models_py3.IndexingResult at 0x7f74564337c0>,
 <azure.search.documents._generated.models._models_py3.IndexingResult at 0x7f7456433820>,
 <azure.search.documents._generated.models._models_py3.IndexingResult at 0x7f7456433880>,
 <azure.search.documents._generated.models._models_py3.IndexingResult at 0x7f74564338e0>,
 <azure.search.documents._generated.models._models_py3.IndexingResult at 0x7f7456433940>,
 <azure.se

In [31]:
# Try searching the data

ACS_INDEX_NAME = "ingredients"

def simple_hybrid_search(query, top=5, k=3, vector_field="textVector"):
    """Run a combined keyword + vector query against the ingredients index.

    The query string is sent both as the full-text search text and, embedded
    with 'text-embedding-ada-002', as a vector query.

    Args:
        query: Free-text search query.
        top: Maximum number of results requested from the search call.
        k: Nearest-neighbour count passed to the vector query.
        vector_field: Index field the vector query targets
            ('textVector' or 'featuresVector').

    Returns:
        The iterable returned by SearchClient.search; each item exposes
        'id', 'name', 'description', 'text' and 'features'.
    """
    search_client = SearchClient(ACS_ENDPOINT, ACS_INDEX_NAME, AzureKeyCredential(ACS_KEY))
    query_vector = Vector(
        value=get_embedding(query, engine = 'text-embedding-ada-002'),
        k=k,
        fields=vector_field,
    )

    return search_client.search(
        search_text=query,
        vectors=[query_vector],
        select=["id", "name","description","text","features"],
        top=top
    )

# Sample query: print the selected fields and the relevance score of each hit.
search_results = simple_hybrid_search(query = "Anger management potion, to be used to subdue extremely powerful and violent prisoners in Azkaban")

for result in search_results:
    print("---")
    for field in ('name', 'description', 'text', 'features', '@search.score'):
        print(result[field])


---
Moondew
Used in the Wiggenweld Potion, Draught of Living Death and Antidote to Common Poisons.
Name: Moondew. Description: Used in the Wiggenweld Potion, Draught of Living Death and Antidote to Common Poisons..  Details: It was also used in extra-sweet Butterbeer, concocted by Madam Rosmerta.
Wiggenweld Potion, Draught of Living Death, Extra-sweet Butterbeer
0.025652701035141945
---
Urine
Used in Panacea
Name: Urine. Description: Used in Panacea.  Made from: The urethra of living creatures Details: According to an alchemical work dating back to the 1550s, urine is used in the making of the perfect medicine, along with salt, vinegar, Sal Ammoniac and a particular Sulphur Vive. On 31 October 1992, Hermione Granger said that it was awful trying to have a pee with Moaning Myrtle wailing at you in her bathroom. Though its exact flavour varies on the person to be transformed into, on the whole, Alastor Moody expressed the opinion that the Polyjuice Potion tasted like "goblin piss".
Yello

In [None]:
# Generate potions using OpenAI

def getAnswerFromOpenAI(query):
    """Generate a potion recipe for `query`, grounded in retrieved ingredients.

    The ingredients found by simple_hybrid_search(query) are listed in the
    system message, and `query` itself is sent as the user message.

    Returns:
        The message content of the first choice in the chat completion.
    """
    # Search in the Azure Search Index; one "Ingredient: ..." line per hit.
    results_for_prompt = "".join(
        f"Ingredient: {hit['text']}\n" for hit in simple_hybrid_search(query)
    )

    system_message = f"""Assistant is a large language model designed to help the Hogwarts potion master discover new potions.
    You have access to an Azure Cognitive Search index with all available magic ingrediënts.

    This information returned from the search to anwser the users question
    {results_for_prompt}

    Select the relevant ingredients from the search results and explain why the ingredient is used.
    If the ingredients are insufficient, do not provide a recipe but recommend a search for new magic ingrediënts instead.

    Give your response as a potion recipe including specific preparation method, and volumes, counts or weights of each ingredient.
    """

    chat_messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": query},
    ]
    response = openai.ChatCompletion.create(
        #engine="gpt-4-0613", #"gpt-35-turbo",
        model="gpt-3.5-turbo-0613", # "gpt-4",
        messages=chat_messages,
    )

    first_choice = response['choices'][0]
    return first_choice['message']['content']
# Example queries to try (pass one of them to getAnswerFromOpenAI below):
# Fire protection potion, to be used to protect against dragons
# Anger management potion, to be used to subdue extremely powerful and violent prisoners in Azkaban
# Blabbermouth: A potion that causes uncontrollable speaking of nonsense.
# Unbearable lightness potion: wards of dark creatures and prevents them from touching you
# Ask for a recipe for the chosen query and print the model's answer.
result = getAnswerFromOpenAI("Unbearable lightness potion: wards of dark creatures and prevents them from touching you")
print(result)