In [1]:
#————————————————————

# Name: Azure OpenAI RAG (V1)

# Purpose: 

# Verify token count and estimate cost.

# Company: Allgeier Schweiz AG
# Author: Nicolas Rehder (nrehder@allgeier.ch)
# Created for: SDSC 2024
# Date Created: 10.01.2024
# Last Updated: 10.01.2024
# Python Version: 3.10.4

# General Sources:
# https://cscblog.ethz.ch/index.php/2024/02/06/az-open-ai-rag-chromadb-langchain/
# https://github.com/Azure-Samples/openai/blob/main/Basic_Samples/Chat/chat_with_your_own_data.ipynb
# https://thenewstack.io/tutorial-use-chroma-and-openai-to-build-a-custom-qa-bot/
# https://www.pinecone.io/learn/chunking-strategies/
# https://python.langchain.com/docs/modules/model_io/prompts/few_shot_examples/
# https://github.com/langchain-ai/langchain/issues/14123
# https://github.com/langchain-ai/langchain/issues/15878
# https://www.kaggle.com/code/peremartramanonellas/ask-your-documents-with-langchain-vectordb-hf
# https://stackoverflow.com/questions/77087460/langchain-azure-openai-api-returning-additional-information-than-the-asked-q

# Azure Openai Usage:

# Additionals:

# Download Python packages (run the below command in terminal if packages have not yet been installed)
# pip install -r C:\Python\openai-lab\support\requirements\requirements.txt

#————————————————————

hello


In [1]:
# Import required libraries
import os
import io
import time
from io import StringIO
import json
from dotenv import load_dotenv
from pathlib import Path
import pandas as pd
from openai import AzureOpenAI
import chromadb
import chromadb.utils.embedding_functions as embedding_functions
from langchain.vectorstores import Chroma
from langchain_openai import AzureOpenAIEmbeddings
from langchain.document_loaders import DataFrameLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import AzureChatOpenAI
from langchain.chains import RetrievalQA
from langchain import PromptTemplate
from langchain_core.prompts import (
    ChatPromptTemplate,
    FewShotChatMessagePromptTemplate,
)


In [2]:
# Read the Azure OpenAI key and endpoint from a local .env file. Both values are shown in
# portal.azure.com on the Azure OpenAI Service resource under "Keys and Endpoint".
# Author's note: loading sometimes fails depending on \ vs \\ in the path; try the other form.
env_file = Path("C:\\Python\\azure-openai-lab\\.venv\\.env")
load_dotenv(dotenv_path=env_file)

# os.environ[...] raises KeyError if a variable is not set.
azure_oai_key = os.environ['AZURE_OPENAI_KEY']
azure_oai_endpoint = os.environ['AZURE_OPENAI_ENDPOINT']

In [3]:
# Create the Azure OpenAI SDK client from the key and endpoint loaded from the environment.
# Note: a later cell rebinds `client` to an AzureChatOpenAI instance.
azure_client_settings = {
    "azure_endpoint": azure_oai_endpoint,
    "api_key": azure_oai_key,
    "api_version": "2024-02-01",
}
client = AzureOpenAI(**azure_client_settings)

In [4]:
# Load the recipes file into a DataFrame (change the path if required).
# The file is parsed as tab-separated and rows that cannot be parsed are skipped.
path_input = r"C:\Python\azure-openai-lab\data\recipes.csv"
df = pd.read_csv(
    path_input,
    sep='\t',
    on_bad_lines='skip',
    low_memory=False,
)

In [5]:
# Clean the raw recipes and keep only the vegan subset.
# (Run `list(df)` to list the column headers.)

# Remove columns
# errors='ignore' keeps this cell re-runnable: on a second run the columns are
# already gone and a plain drop() would raise KeyError.
df = df.drop(columns=['id', 'contributor_id', 'submitted'], errors='ignore')

# Collapse every run of whitespace in the recipe name into a single space

df["name"] = df["name"].str.replace(r'\s+', ' ', regex=True)

# Create subset of data
# Alternative subset: df = df[df["n_ingredients"] > 20]
# na=False treats rows with a missing `tags` value as non-matching; without it the
# boolean mask contains NaN and the indexing raises ValueError.
# .copy() makes the subset an independent frame so that later column assignments
# do not trigger SettingWithCopyWarning.
df = df[df['tags'].str.contains("vegan", na=False)].copy()


In [6]:
# Create new column with relevant information for LLM packed into one string:
# name; tags; nutrition; ingredients; steps

def _flatten_list_repr(value):
    """Return ``value`` as text with leading/trailing square brackets and all single quotes removed."""
    return str(value).strip("[]").replace("'", "")


# assign() builds a new frame instead of writing into `df` in place, so no
# SettingWithCopyWarning is raised when `df` is a filtered subset.
# `steps` is appended unchanged (brackets and quotes are kept), as before.
df = df.assign(
    dense_feature=(
        df["name"]
        + "; " + df["tags"].apply(_flatten_list_repr)
        + "; " + df["nutrition"].apply(_flatten_list_repr)
        + "; " + df["ingredients"].apply(_flatten_list_repr)
        + "; " + df["steps"]
    )
)

# Single-column frame for the document loader. A missing `name` or `steps` makes the
# concatenated string NaN; such rows are dropped so every document has text content.
df_text_input = df[["dense_feature"]].dropna()


In [7]:
# Turn the single-column frame into LangChain documents that Chroma can ingest;
# the "dense_feature" column supplies each document's page content.
df_loader = DataFrameLoader(
    df_text_input,
    page_content_column="dense_feature",
)
df_document = df_loader.load()
# display(df_document[:2])  # uncomment to preview the first two documents

In [8]:
# Chunk the loaded documents before they are embedded.
# NOTE(review): CharacterTextSplitter cuts only at `separator`; text without a blank line
# ("\n\n") presumably stays in one piece even when longer than chunk_size - confirm this is intended.
splitter_settings = dict(
    separator="\n\n",
    chunk_size=256,
    chunk_overlap=20,
)
text_splitter = CharacterTextSplitter(**splitter_settings)
df_document_split = text_splitter.split_documents(df_document)

In [9]:
# Configure the Azure OpenAI embedding model (through LangChain) that is used to
# generate the embeddings for the dataset.

# Direct SDK alternative, kept for reference:
# def text_embedding(text):
#     response = client.embeddings.create(model="text-embedding-ada-002", input=[text])
#     return response.data[0].embedding

embedding_settings = {
    "deployment": "text-embedding-ada-002",
    "openai_api_key": azure_oai_key,
    "azure_endpoint": azure_oai_endpoint,
    "openai_api_version": "2024-02-01",
    # "openai_api_type": "azure",
    # "chunk_size": 1,
}
openai_ef = AzureOpenAIEmbeddings(**embedding_settings)


In [10]:
# Create the ChromaDB vector database collection from the chunked documents, embedded with the
# Azure OpenAI embeddings model. The vector database is created locally in `persist_directory`.
# Can't run this code through the proxy API.
# NOTE(review): re-running this cell against an existing persist directory presumably adds the
# documents to the stored "recipes" collection again - confirm, or clear the directory first.

vectordb = Chroma.from_documents(
    documents=df_document_split,
    embedding=openai_ef,
    collection_name="recipes",
    # Raw string: "\P", "\d" and "\c" are invalid escape sequences in a normal string literal
    # (deprecated; newer Python versions warn about them). The path value itself is unchanged.
    persist_directory=r"C:\Python\data\chromadb",  # alternative: "./chroma_db"
    collection_metadata={"hnsw:space": "cosine"},  # use cosine distance for the index
)

vectordb.persist()

In [None]:
# Load from disk

#vectordb = Chroma(collection_name = "recipes", persist_directory = r"C:\Python\data\chromadb", embedding_function = openai_ef)

In [11]:
# Run a similarity search to inspect what the vector database returns for an example query.
query = "Avocado, Coconut Milk, Tofu, Soy Sauce"
# vector = text_embedding(query)  # direct-SDK alternative, not used

vectordb_output = vectordb.similarity_search(query)

# Print the text of the first returned document
best_match = vectordb_output[0]
print(best_match.page_content)

veggie burger shepherd s pie; time-to-make, course, main-ingredient, cuisine, preparation, occasion, north-american, main-dish, vegetables, easy, fall, vegan, vegetarian, winter, dietary, seasonal, comfort-food, taste-mood, 3-steps-or-less, 4-hours-or-less; 258.2, 14.0, 22.0, 37.0, 21.0, 8.0, 11.0; olive oil, yellow onion, carrot, white mushrooms, tomato paste, tamari, vegetable broth, fresh thyme, fresh marjoram, salt & fresh ground pepper, cornstarch, water, veggie burgers, frozen green pea, ground walnuts, mashed potatoes, paprika; ['preheat oven to 375f heat 1 tbsp of olive oil in a large skillet over medium heat', 'add the onion and carrot , cover , and cook until tender , about 5 minutes', 'add the mushrooms and cook , stirring occasionally , for 3 minutes', 'stir in tomato paste , tamari , veggie stock , thyme , marjoram , and salt and pepper to taste', 'stir in cornstarch mixture and simmer to thicken slightly , about 1 minute', 'spoon the filling into a lightly oiled 2', '5 qu

In [12]:
# Initialize the Azure OpenAI chat model through LangChain
# (default deployment gpt-35-turbo; fine-tuned alternative: gpt-35-turbo-0613-ft).
# Note: this rebinds `client`, which an earlier cell assigned an AzureOpenAI instance.
# Note: the API version here ("2023-05-15") differs from the "2024-02-01" used in the other cells.
chat_settings = {
    "deployment_name": "gpt-35-turbo",  # "gpt-35-turbo-0613-ft"
    "openai_api_key": azure_oai_key,
    "azure_endpoint": azure_oai_endpoint,
    "openai_api_version": "2023-05-15",
}
client = AzureChatOpenAI(**chat_settings)

In [13]:
# Few-shot prompt examples.
# Each entry pairs an "input" (a comma-separated ingredient list) with an "output"
# (a recipe record written as a JSON-style string with the keys name, minutes, tags,
# nutrition, n_steps, steps, description, ingredients and n_ingredients).
# The strings are runtime data sent to the model - keep them unchanged.

examples = [
    # Example 1: african peanut stew. The description text contains a raw line break,
    # so this string literal spans two source lines.
    {
        "input": "canola oil, onion, garlic cloves, fresh ginger, jalapeno, curry powder, diced tomatoes with juice, low sodium vegetable broth, natural-style peanut butter, tamari soy sauce, pepper, sweet potato, carrots, chickpeas, fresh okra, frozen green beans, kale, lime, juice of, fresh cilantro, lime wedge, salted peanuts",
        "output": """{"name":"african peanut stew","minutes":90,"tags":"['time-to-make', 'course', 'main-ingredient', 'cuisine', 'preparation', 'main-dish', 'beans', 'vegetables', 'african', 'easy', 'vegan', 'vegetarian', 'stews', 'dietary', 'chick-peas-garbanzos', '3-steps-or-less', '4-hours-or-less']","nutrition":"[307.8, 20.0, 27.0, 21.0, 22.0, 10.0, 13.0]","n_steps":11,"steps":"['heat the oil in a large , heavy stockpot', 'add the onion , garlic , jalapeno , and ginger , and cook over moderate heat , stirring frequently , until the onion is lightly browned , about 6 minutes', 'add the curry powder and cook , stirring , until fragrant and lightly toasted , about 2 minutes', 'add the tomatoes , scraping up any bits stuck to the bottom of the pan', 'whisk in the broth and peanut butter , season with pepper and tamari to taste , and bring to a boil', 'cook over moderately high heat for 15 minutes , stirring frequently', 'add the sweet potatoes , carrots , and chickpeas , cover partially and cook over moderately low heat until the vegetables are just tender , about 20 minutes', 'add the okra and green beans , cover partially and cook until all the vegetables are tender , about 10 minutes longer', 'add the kale and juice of 1 lime , and cook for 10 more minutes', 'transfer to deep bowls and serve hot', 'garnish with the cilantro , lime wedges , and chopped peanuts at the table']","description":"i first had this stew at a restaurant in rochester, ny, and when i moved away, spent years recreating it!  
delicious with a lovely hearty bread, or with rice cooked in the stew (about 3\/4 cup brown rice added with the sweet potatoes and carrots).","ingredients":"['canola oil', 'onion', 'garlic cloves', 'fresh ginger', 'jalapeno', 'curry powder', 'diced tomatoes with juice', 'low sodium vegetable broth', 'natural-style peanut butter', 'tamari soy sauce', 'pepper', 'sweet potato', 'carrots', 'chickpeas', 'fresh okra', 'frozen green beans', 'kale', 'lime, juice of', 'fresh cilantro', 'lime wedge', 'salted peanuts']","n_ingredients":21}""",
    },
    # Example 2: alexander s orzo and wild rice salad.
    {
        "input": "balsamic vinegar, lemon juice, fresh garlic, french dijon mustard, sugar, canola oil, extra virgin olive oil, fresh basil, salt, pepper, orzo pasta, cooked wild rice, red onion, currants, canned corn niblet, toasted almond, parsley, red peppers, yellow peppers, green onion, garlic granules",
        "output": """{"name":"alexander s orzo and wild rice salad","minutes":20,"tags":"['30-minutes-or-less', 'time-to-make', 'course', 'main-ingredient', 'preparation', 'occasion', 'salads', 'side-dishes', 'pasta', 'rice', 'easy', 'beginner-cook', 'dinner-party', 'holiday-event', 'vegan', 'vegetarian', 'dietary', 'pasta-rice-and-grains', 'taste-mood', 'savory', 'sweet']","nutrition":"[815.5, 72.0, 45.0, 7.0, 31.0, 24.0, 28.0]","n_steps":7,"steps":"['for dressing: dissolve vinegar , lemon juice , garlic and sugar with a hand whip', 'fold in the mustard , basil , salt and pepper', 'slowly add oils , while whisking vigorously', 'refrigerate', 'use only 1 \/ 2 cup of dressing for salad', 'place all other ingredients in a mixing bowl and mix well', 'serve ice cold , 38-40f shelf life mixed is two hours']","description":"wonderful blend of flavors makes for a delightful side dish or lunch salad.","ingredients":"['balsamic vinegar', 'lemon juice', 'fresh garlic', 'french dijon mustard', 'sugar', 'canola oil', 'extra virgin olive oil', 'fresh basil', 'salt', 'pepper', 'orzo pasta', 'cooked wild rice', 'red onion', 'currants', 'canned corn niblet', 'toasted almond', 'parsley', 'red peppers', 'yellow peppers', 'green onion', 'garlic granules']","n_ingredients":21}""",
    },
]



In [14]:
# Chat template applied to each individual example: a human turn carrying the
# example's "input" followed by an AI turn carrying its "output".
example_messages = [
    ("human", "{input}"),
    ("ai", "{output}"),
]
example_prompt = ChatPromptTemplate.from_messages(example_messages)

# Few-shot block that renders every entry of `examples` with the template above.
few_shot_prompt = FewShotChatMessagePromptTemplate(
    examples=examples,
    example_prompt=example_prompt,
)

In [15]:
# Assemble the full chat prompt: system instructions (with the retrieved {context}),
# then the few-shot examples, then the user's {question}.
system_instructions = """You are an Cooking Assistant specialising in vegan recipes. your cooking style is mediterranean asian fusion, similar to a mix between Jamie Oliver and Joanne Molinaro. You will be given a set of ingredients and respond with a great tasting recipe involving those ingredients. Use the context and the examples to create the recipe:
{context}"""

prompt_messages = [
    ("system", system_instructions),
    few_shot_prompt,
    ("human", "{question}"),
]
final_prompt = ChatPromptTemplate.from_messages(prompt_messages)

In [16]:
# Build the retrieval-augmented Q&A chain: documents retrieved from the ChromaDB vector
# database enrich the prompt (RAG) that is sent to Azure OpenAI, then run it for `query`.
recipe_retriever = vectordb.as_retriever()

chain = RetrievalQA.from_chain_type(
    llm=client,
    chain_type="stuff",
    retriever=recipe_retriever,
    chain_type_kwargs={"prompt": final_prompt},
)

chain_input = {"query": query}
result = chain.invoke(chain_input)