In [1]:
#————————————————————

# Name: Azure OpenAI RAG (V1)

# Purpose: 

# Verify token count and estimate cost.

# Company: Allgeier Schweiz AG
# Author: Nicolas Rehder (nrehder@allgeier.ch)
# Created for: SDSC 2024
# Date Created: 10.01.2024
# Last Updated: 10.01.2024
# Python Version: 3.10.4

# General Sources:
# https://cscblog.ethz.ch/index.php/2024/02/06/az-open-ai-rag-chromadb-langchain/
# https://github.com/Azure-Samples/openai/blob/main/Basic_Samples/Chat/chat_with_your_own_data.ipynb
# https://thenewstack.io/tutorial-use-chroma-and-openai-to-build-a-custom-qa-bot/
# https://www.pinecone.io/learn/chunking-strategies/
# https://python.langchain.com/docs/modules/model_io/prompts/few_shot_examples/
# https://github.com/langchain-ai/langchain/issues/14123
# https://github.com/langchain-ai/langchain/issues/15878
# https://www.kaggle.com/code/peremartramanonellas/ask-your-documents-with-langchain-vectordb-hf
# https://stackoverflow.com/questions/77087460/langchain-azure-openai-api-returning-additional-information-than-the-asked-q

# Azure Openai Usage:

# Additionals:

# Download Python packages (run the below command in terminal if packages have not yet been installed)
# pip install -r C:\Python\openai-lab\support\requirements\requirements.txt

#————————————————————

hello


In [1]:
# Import required libraries
import os
import io
import time
from io import StringIO
import json
from dotenv import load_dotenv
from pathlib import Path
import pandas as pd
from openai import AzureOpenAI
import chromadb
import chromadb.utils.embedding_functions as embedding_functions
from langchain.vectorstores import Chroma
from langchain_openai import AzureOpenAIEmbeddings
from langchain.document_loaders import DataFrameLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import AzureChatOpenAI
from langchain.chains import RetrievalQA
from langchain import PromptTemplate
from langchain_core.prompts import (
    ChatPromptTemplate,
    FewShotChatMessagePromptTemplate,
)


In [2]:
# Load the Azure OpenAI key and endpoint from a local .env file. Both values are
# listed in portal.azure.com under the Azure OpenAI Service resource ("Keys and Endpoint").
# The path is a raw string: in a regular literal "\P", "\o" and "\." are invalid
# escape sequences, which Python only tolerates with a deprecation/syntax warning.
load_dotenv(dotenv_path=Path(r"C:\Python\openai-lab\.venv\.env"))  # Change path if required
# os.environ[...] raises KeyError right here if either variable is missing.
azure_oai_key = os.environ['AZURE_OPENAI_KEY']
azure_oai_endpoint = os.environ['AZURE_OPENAI_ENDPOINT']

In [3]:
# Azure OpenAI SDK client, authenticated with the key and endpoint loaded above.
# NOTE: the name `client` is rebound further down to the LangChain chat model.
client = AzureOpenAI(
    api_version="2024-02-01",
    api_key=azure_oai_key,
    azure_endpoint=azure_oai_endpoint,
)

In [4]:
# Read the recipes CSV into a DataFrame.

path_input = r"C:\Python\openai-lab\data\recipes.csv"  # Change path if required
df = pd.read_csv(
    path_input,
    sep=',',
    low_memory=False,
    on_bad_lines='skip',  # malformed rows are skipped instead of raising
)

In [5]:
# Inspect the column headers with: list(df)

# Drop the id, contributor_id and submitted columns.
df = df.drop(columns=['id', 'contributor_id', 'submitted'])

# Collapse every run of whitespace in the recipe name into a single space.
df["name"] = df["name"].str.replace(r'\s+', ' ', regex=True)

# Keep only recipes with more than 20 ingredients. The explicit .copy() makes the
# subset an independent frame, so adding columns to it afterwards does not
# trigger pandas' SettingWithCopyWarning.
df = df[df["n_ingredients"] > 20].copy()


In [6]:
# Pack the information relevant for the LLM into one "; "-separated string per recipe.

def _strip_list_markup(value):
    """Return ``str(value)`` without leading/trailing square brackets and without single quotes."""
    return str(value).strip("[]").replace("'", "")


# Columns are read with bracket indexing (attribute access breaks for column names
# that collide with DataFrame attributes), and assign() returns a new frame so no
# column is written into a filtered slice.
df = df.assign(
    dense_feature=(
        df["name"]
        + "; " + df["tags"].apply(_strip_list_markup)
        + "; " + df["nutrition"].apply(_strip_list_markup)
        + "; " + df["ingredients"].apply(_strip_list_markup)
        + "; " + df["steps"]  # appended as-is, i.e. it keeps its brackets and quotes
    )
)
# Single-column frame handed to the document loader.
df_text_input = df[["dense_feature"]]


In [7]:
# Turn every row into a LangChain document whose page content is the
# "dense_feature" text, which is the input format used for Chroma below.

df_loader = DataFrameLoader(
    df_text_input,
    page_content_column="dense_feature",
)
df_document = df_loader.load()
# Preview the first two documents with: display(df_document[:2])

In [8]:
# Embedding model (Azure OpenAI through LangChain) used to generate and store the
# word embeddings for the dataset.

# Direct-SDK alternative, kept for reference:
# def text_embedding(text):
#     response = client.embeddings.create(model="text-embedding-ada-002", input=[text])
#     return response.data[0].embedding


openai_ef = AzureOpenAIEmbeddings(
    azure_endpoint=azure_oai_endpoint,
    openai_api_key=azure_oai_key,
    openai_api_version="2024-02-01",
    deployment="text-embedding-ada-002",
    # Optional settings left disabled: openai_api_type="azure", chunk_size=1
)


In [9]:
# Chunk the documents before they are embedded.
# NOTE(review): the dense_feature strings are joined with "; ", and the
# similarity-search result printed further down is much longer than 256
# characters, so the "\n\n" separator appears to leave the documents unsplit.
# Confirm whether a different separator was intended.

text_splitter = CharacterTextSplitter(chunk_size=256, chunk_overlap=20, separator="\n\n")
df_document_split = text_splitter.split_documents(df_document)

In [10]:
# Create the ChromaDB vector database collection from the chunked documents, using
# the Azure OpenAI embeddings model. The database is created locally on disk.

vectordb = Chroma.from_documents(
    documents=df_document_split,
    embedding=openai_ef,
    collection_name="recipes",
    # Raw string: in a regular literal "\P", "\o", "\d" and "\c" are invalid
    # escape sequences, which Python only tolerates with a warning.
    persist_directory=r"C:\Python\openai-lab\data\chromadb",  # alternative: "./chroma_db"
    collection_metadata={"hnsw:space": "cosine"},
)

# Write the collection to persist_directory.
vectordb.persist()

In [None]:
# Load from disk

#vectordb = Chroma(persist_directory=r"C:\Python\openai-lab\data\chromadb", embedding_function=openai_ef, collection_name="recipes")

In [11]:
# Run a similarity search against the vector database to inspect what it
# returns for a given input query.

query = "Beef, Butter, Mushrooms, Onions, Cream"
# Direct-SDK alternative (unused): vector = text_embedding(query)

vectordb_output = vectordb.similarity_search(query)

# Print the text of the first returned match.
print(vectordb_output[0].page_content)

beef and mushroom stew; weeknight, time-to-make, course, main-ingredient, cuisine, preparation, occasion, north-american, main-dish, beef, eggs-dairy, pork, vegetables, canadian, easy, potluck, dinner-party, stove-top, dietary, low-sodium, comfort-food, low-carb, mushrooms, low-in-something, meat, taste-mood, to-go, equipment, 4-hours-or-less; 623.9, 70.0, 14.0, 20.0, 67.0, 98.0, 4.0; unsalted butter, olive oil, bacon, onion, garlic, carrot, celery, worcestershire sauce, bay leaf, dried thyme, black pepper, salt, cayenne pepper, all-purpose flour, stewing beef, red wine, chicken stock, white pearl onions, button mushroom, sour cream, flat-leaf italian parsley; ['in a dutch oven over medium heat , melt butter and add oil', 'add bacon pieces to fat and cook , stirring often , until bacon starts to brown', 'add onion , garlic , carrots , celery , worcestershire , and herbs and spices', 'cook , stirring often , until onion is softened', 'pat beef cubes dry with a paper towel , if necessary

In [13]:
# Initialize the Azure OpenAI chat model through LangChain.
# Deployment: default "gpt-35-turbo"; fine-tuned alternative "gpt-35-turbo-0613-ft".
# NOTE: this rebinds `client`, which earlier referred to the AzureOpenAI SDK client.

client = AzureChatOpenAI(
    azure_endpoint=azure_oai_endpoint,
    openai_api_key=azure_oai_key,
    openai_api_version="2023-05-15",
    deployment_name="gpt-35-turbo",
)

In [14]:
# Few-shot examples for the prompt.
# Each entry pairs a comma-separated ingredient list ("input") with the expected
# answer ("output"): a JSON-formatted string with the keys name, minutes, tags,
# nutrition, n_steps, steps, description, ingredients and n_ingredients.
# The list is passed to the few-shot prompt template defined below.

examples = [
    {
        "input": "butter, lemon, juice of, salt, white pepper, egg yolks",
        "output": """{"name":"easiest ever hollandaise sauce","minutes":25,"tags":"['30-minutes-or-less', 'time-to-make', 'course', 'main-ingredient', 'preparation', 'very-low-carbs', 'sauces', 'condiments-etc', 'eggs-dairy', 'eggs', 'stove-top', 'dietary', 'low-carb', 'savory-sauces', 'low-in-something', 'equipment', 'number-of-servings']","nutrition":"[1290.4, 213.0, 4.0, 53.0, 22.0, 417.0, 1.0]","n_steps":7,"steps":"['cut the butter into several pieces and bring to room temperature', 'in the top of a double boiler , combine egg yolks , lemon juice , salt and pepper', 'add a piece of butter', 'cook , stirring steadily with a wooden spoon or wire whisk , over , but not touching , boiling water', 'when butter melts and sauce begins to thicken , add remaining butter , stirring constantly until melted', 'continue cooking as sauce thickens , about 2 more minutes', 'immediately remove from heat']","description":"the secret to this easy hollandaise sauce is in separating the egg yolks. remove all the egg whites, as they can thin the sauce. also, it is best prepared in a double boiler to prevent overheating. serve over cooked asparagus, broccoli, or broiled tomatoes.","ingredients":"['butter', 'lemon, juice of', 'salt', 'white pepper', 'egg yolks']","n_ingredients":5}""",
    },
    {
        "input": "bacon, onion, celery, carrot, garlic, butter, olive oil, lean ground beef, ground pork, beef consomme, dry white wine, crushed tomatoes, salt, black pepper, rubbed sage, oregano, red pepper flakes, nutmeg, milk, penne pasta",
        "output": """{"name":"real italian bolognese sauce","minutes":160,"tags":"['time-to-make', 'course', 'cuisine', 'preparation', 'sauces', 'condiments-etc', 'european', 'italian', 'dietary', '4-hours-or-less']","nutrition":"[1260.7, 97.0, 11.0, 71.0, 103.0, 119.0, 38.0]","n_steps":16,"steps":"['in a dutch oven or medium size pot , heat butter and olive oil over medium heat until butter begins to froth', 'add onion , celery , carrot , garlic , and bacon', 'cook until onions are translucent', 'remove bacon and remove fat', 'chop lean portions of bacon in small pieces and return to pot', 'add ground beef and ground pork , and cook until meat loses red , raw color', 'raise heat and add wine and consomme', 'cook sauce until wine and consomme are mostly evaporated', 'turn heat down to simmer and add oregano , salt , pepper , sage , red pepper flakes , and nutmeg', 'let cook for approximately 20 minutes', 'add crushed tomatoes and bring heat to a boil', 'once the mixture comes to a boil , return to simmer', 'let sauce simmer partially covered for about 2 to 4 hours , stirring occasionally to prevent sticking', 'about 5 to 10 minutes before serving , add milk', 'sauce can now be added to cooked penne pasta , spaghetti or many other pastas to your liking', 'remaining sauce may be frozen for up to two months for future use']","description":"after traveling throughout italy, savoring the fine tastes of bolognese from the many different regions, i decided to formulate my own. try it, you'll love it.","ingredients":"['bacon', 'onion', 'celery', 'carrot', 'garlic', 'butter', 'olive oil', 'lean ground beef', 'ground pork', 'beef consomme', 'dry white wine', 'crushed tomatoes', 'salt', 'black pepper', 'rubbed sage', 'oregano', 'red pepper flakes', 'nutmeg', 'milk', 'penne pasta']","n_ingredients":20}""",
    },
]

In [15]:
# Template that renders one example as a human message followed by the AI reply.

example_prompt = ChatPromptTemplate.from_messages(
    [("human", "{input}"), ("ai", "{output}")]
)

# Few-shot block: every entry of `examples` formatted with the template above.
few_shot_prompt = FewShotChatMessagePromptTemplate(
    examples=examples,
    example_prompt=example_prompt,
)

In [16]:
# Assemble the complete chat prompt: system instruction (with the {context}
# placeholder), then the few-shot examples, then the human {question}.

final_prompt = ChatPromptTemplate.from_messages([
    (
        "system",
        """Act as a head chef and create flavourful recipe from a list of ingredients. Use the following pieces of context and the examples to create the recipe:
{context}""",
    ),
    few_shot_prompt,
    ("human", "{question}"),
])

In [17]:
# RAG step: build a RetrievalQA chain that calls the Azure OpenAI chat model,
# with the ChromaDB vector database as retriever and the custom few-shot prompt,
# then run it on the query.

chain = RetrievalQA.from_chain_type(
    llm=client,
    chain_type="stuff",
    chain_type_kwargs={"prompt": final_prompt},
    retriever=vectordb.as_retriever(),
)
result = chain.invoke({"query": query})