In [None]:
#————————————————————

# Name: Azure OpenAI API, Retrieval-Augmented Generation (RAG)

# Purpose: This notebook will implement RAG using ChromaDB and LangChain together with the Azure OpenAI model text-embedding-3-large.

# Company: Allgeier Schweiz AG
# Author: Nicolas Rehder (nrehder@allgeier.ch), Alex Dean (adean@allgeier.ch)
# Created for: SDSC 2024
# Date Created: 22.01.2024
# Last Updated: 25.05.2024
# Python Version: 3.10.4

# Troubleshooting:
# https://github.com/langchain-ai/langchain/issues/14123
# https://github.com/langchain-ai/langchain/issues/15878
# https://stackoverflow.com/questions/77087460/langchain-azure-openai-api-returning-additional-information-than-the-asked-q
# https://stackoverflow.com/questions/77839844/langchain-retrievalqa-missing-some-input-keys
# https://gist.github.com/defulmere/8b9695e415a44271061cc8e272f3c300?permalink_comment_id=4711478
# https://stackoverflow.com/questions/76921252/attributeerror-module-chromadb-has-no-attribute-config

# Additionals:

# If necessary, download Python packages (run the below command in terminal if packages have not yet been installed)
# pip install -r C:\Python\openai-lab\support\requirements\requirements.txt

#————————————————————

In [1]:
# Import Python packages
import os
import io
import time
from io import StringIO
import json
from dotenv import load_dotenv
from pathlib import Path
import pandas as pd
from openai import AzureOpenAI
import chromadb
import chromadb.utils.embedding_functions as embedding_functions
from langchain.vectorstores import Chroma
from langchain_openai import AzureOpenAIEmbeddings
from langchain.document_loaders import DataFrameLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import AzureChatOpenAI
from langchain.chains import RetrievalQA
from langchain import PromptTemplate
from langchain_core.prompts import (
    ChatPromptTemplate,
    FewShotChatMessagePromptTemplate,
)


In [2]:
# Pull the Azure OpenAI settings into the process environment from the project's .env file.
# On Windows the path sometimes fails due to \ vs \\; try one or the other, e.g. "C:\\Python\\azure-openai-lab\\.venv\\.env"
env_file = Path("/workspaces/azure-openai-lab/.venv/.env")
load_dotenv(dotenv_path=env_file)

# Key and endpoint of the Azure OpenAI Service resource (portal.azure.com -> Keys and Endpoint).
# A variable that is not set raises KeyError on the corresponding line.
azure_oai_key = os.environ['AZURE_OPENAI_KEY']
azure_oai_endpoint = os.environ['AZURE_OPENAI_ENDPOINT']

In [3]:
# Azure OpenAI SDK client, authenticated with the key and endpoint read from the environment.
# NOTE: the name `client` is bound again further down to the LangChain chat model,
# so this SDK client is only reachable until that cell runs.
client = AzureOpenAI(
    api_version="2024-02-01",
    api_key=azure_oai_key,
    azure_endpoint=azure_oai_endpoint,
)

In [4]:
# Read the preprocessed recipes from the CSV file into a DataFrame.
# Windows alternative for the path: r"C:\Python\azure-openai-lab\data\recipes-preprocessed.csv"
path_input = r"/workspaces/azure-openai-lab/data/recipes-preprocessed.csv"
df = pd.read_csv(
    path_input,
    sep=',',
    on_bad_lines='skip',   # malformed rows are dropped instead of aborting the import
    low_memory=False,
)

In [5]:
# Build LangChain documents from the recipes: only the "dense_feature" column
# is kept, and its text becomes the page content of each document.
df_text_input = df["dense_feature"].to_frame()
df_loader = DataFrameLoader(
    df_text_input,
    page_content_column="dense_feature",
)
df_document = df_loader.load()


In [6]:
# Split the documents into chunks before embedding them.
# Splitter settings: blank-line separator, chunk size 256, overlap 20.
text_splitter = CharacterTextSplitter(
    chunk_overlap=20,
    chunk_size=256,
    separator="\n\n",
)
df_document_split = text_splitter.split_documents(df_document)

In [7]:
# Embedding function for the dataset: the Azure OpenAI deployment "text-embedding-3-large",
# accessed through LangChain with the key and endpoint read from the environment.
openai_ef = AzureOpenAIEmbeddings(
    azure_endpoint = azure_oai_endpoint,
    openai_api_key = azure_oai_key,
    openai_api_version = "2024-02-01",
    deployment = "text-embedding-3-large",
)


In [8]:
# Create (or reopen) the ChromaDB Vector Database collection based on the Azure OpenAI embeddings model.
# The Vector Database is stored locally in the persist directory.
#
# Chroma.from_documents() embeds and appends every chunk each time it is called.
# Because the collection is persisted on disk, re-running this cell used to store
# all chunks again: the retriever then returned the same recipe several times and
# every re-run paid for a full round of embedding calls. The collection is
# therefore only populated when it does not contain any entries yet.
# NOTE: delete the persist directory to force a rebuild after changing the input
# data, the chunking parameters or the embedding model.

chroma_settings = dict(
    collection_name = "recipes",
    persist_directory = r"/workspaces/azure-openai-lab/data/chromadb", #r"C:\Python\azure-openai-lab\data\chromadb"
    collection_metadata = {"hnsw:space": "cosine"},
)

# Open the persisted collection (it is created empty if it does not exist yet).
vectordb = Chroma(embedding_function = openai_ef, **chroma_settings)

# get(limit=1) fetches at most one stored entry; an empty "ids" list means nothing has been indexed so far.
if not vectordb.get(limit=1)["ids"]:
    vectordb = Chroma.from_documents(
                    documents = df_document_split,
                    embedding = openai_ef,
                    **chroma_settings
                )


In [9]:
# Initialize the Azure OpenAI chat model using LangChain.
# NOTE: this rebinds `client`, replacing the AzureOpenAI SDK client created earlier in the notebook.
#
# The deployment name must match a chat deployment that exists in the Azure OpenAI resource;
# otherwise the chain call fails with a 404 "DeploymentNotFound" error. It can be overridden
# through the optional AZURE_OPENAI_CHAT_DEPLOYMENT environment variable (e.g. in the .env file)
# without editing the notebook. When the variable is not set, "gpt-35-turbo" is used as before.
chat_deployment = os.environ.get("AZURE_OPENAI_CHAT_DEPLOYMENT", "gpt-35-turbo")

client = AzureChatOpenAI(
                deployment_name = chat_deployment,
                openai_api_key = azure_oai_key,
                azure_endpoint = azure_oai_endpoint,
                openai_api_version = "2024-02-01"
        )

In [10]:
# Zero-shot prompt for the RAG chain.
# Placeholders: {question} receives the ingredient list, {context} receives the
# retrieved recipes. Doubled braces ({{ }}) are literal braces in the rendered prompt.
prompt_template = """
### INSTRUCTIONS
Persona: Act as a head chef such as Joël Robuchon who specializes in simple contemporary cuisine.
Action: Create well-thought-out and flavourful vegan recipes from a list of ingredients {question}, implementing classic culinary techniques.
Target Audience: The recipients of these vegan recipes are couples who want to cook a special meal at least once a week.

### EXAMPLE
{context}

### OUTPUT FORMAT
Output only one vegan recipe and return it as a JSON object with the following format:
{{"name":"","minutes":,"tags":"[]","nutrition":"[]","n_steps":"","steps":"[]","description":"","ingredients":"[]", "n_ingredients":}}

The variables should contain the following information:
- name: the name of the recipe.
- minutes: the time in minutes to prepare the recipe.
- tags: a list of words that characterize the recipe.
- nutrition: a list of numeric values representing calories, total fat, sugar, sodium, protein, saturated fat, and carbohydrates.
- n_steps: the number of steps to prepare the recipe.
- steps: a list of steps to prepare the recipe.
- description: a summary of the recipe.
- ingredients: a list of the ingredient names in the recipe.
- n_ingredients: the total number of ingredients used in the recipe.
"""

# Wrap the raw text in a LangChain prompt object with its two input variables.
simple_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

In [11]:
# RAG step: query the chat model with a prompt enriched by documents retrieved from the ChromaDB vector database.
ingredients = """'Tofu', 'Avocado', 'Soy Sauce', 'Chili', 'Coconut Milk', 'Broccoli'"""

# "stuff" chain built on the vector store retriever, using the custom prompt.
chain = RetrievalQA.from_chain_type(
    llm=client,
    chain_type="stuff",
    retriever=vectordb.as_retriever(),
    chain_type_kwargs=dict(prompt=simple_prompt),
)

# Run the chain with the ingredient list as the query and show the raw answer.
result = chain.invoke({"query": ingredients})
print(result)

NotFoundError: Error code: 404 - {'error': {'code': 'DeploymentNotFound', 'message': 'The API deployment for this resource does not exist. If you created the deployment within the last 5 minutes, please wait a moment and try again.'}}

In [12]:
#Transform output to pandas dataframe (the CSV export at the end of this cell is currently commented out)

def extract_json_text(raw_output):
    """Return the JSON text contained in a chat model answer.

    The answer is first cleaned as before: surrounding backticks, spaces and
    newlines are stripped and a leading 'json' fence tag is removed. If that
    text is not valid JSON (e.g. the model added a sentence before or after
    the recipe, or left a closing fence in the middle of the answer), the span
    from the first '{' to the last '}' is returned instead.

    Parameters:
        raw_output (str): text returned by the model.

    Returns:
        str: the text to hand to json.loads(). When no usable JSON object can
        be located, the cleaned text is returned unchanged so that json.loads()
        raises its usual json.JSONDecodeError.
    """
    cleaned = raw_output.strip('` \n')

    if cleaned.startswith('json'):
        cleaned = cleaned[4:]  # Remove the first 4 characters 'json'

    try:
        json.loads(cleaned)
    except json.JSONDecodeError:
        # Fallback only: answers that already parsed keep their exact previous behavior.
        first_brace = cleaned.find('{')
        last_brace = cleaned.rfind('}')
        if first_brace != -1 and last_brace > first_brace:
            return cleaned[first_brace:last_brace + 1]

    return cleaned


# Clean up Azure OpenAI Output
json_data = extract_json_text(result['result'])

recipes_from_rag_json = json.loads(json_data)
recipes_from_rag = pd.json_normalize(recipes_from_rag_json)
# path_output = r"/workspaces/azure-openai-lab/data/generated_output/recipes-from-rag.csv" #r"C:\Python\azure-openai-lab\data\generated_output\recipes-from-rag.csv"
# recipes_from_rag.to_csv(path_output, sep='\t', encoding='utf-8', index=False)

[NOT USED IN WORKSHOP]

In [16]:
# [NOT USED IN WORKSHOP]

# Few-Shot learning Prompt 
# examples = [
#     {
#         "input": "canola oil, onion, garlic cloves, fresh ginger, jalapeno, curry powder, diced tomatoes with juice, low sodium vegetable broth, natural-style peanut butter, tamari soy sauce, pepper, sweet potato, carrots, chickpeas, fresh okra, frozen green beans, kale, lime, juice of, fresh cilantro, lime wedge, salted peanuts",
#         "output": """{"name":"african peanut stew","minutes":90,"tags":"['time-to-make', 'course', 'main-ingredient', 'cuisine', 'preparation', 'main-dish', 'beans', 'vegetables', 'african', 'easy', 'vegan', 'vegetarian', 'stews', 'dietary', 'chick-peas-garbanzos', '3-steps-or-less', '4-hours-or-less']","nutrition":"[307.8, 20.0, 27.0, 21.0, 22.0, 10.0, 13.0]","n_steps":11,"steps":"['heat the oil in a large , heavy stockpot', 'add the onion , garlic , jalapeno , and ginger , and cook over moderate heat , stirring frequently , until the onion is lightly browned , about 6 minutes', 'add the curry powder and cook , stirring , until fragrant and lightly toasted , about 2 minutes', 'add the tomatoes , scraping up any bits stuck to the bottom of the pan', 'whisk in the broth and peanut butter , season with pepper and tamari to taste , and bring to a boil', 'cook over moderately high heat for 15 minutes , stirring frequently', 'add the sweet potatoes , carrots , and chickpeas , cover partially and cook over moderately low heat until the vegetables are just tender , about 20 minutes', 'add the okra and green beans , cover partially and cook until all the vegetables are tender , about 10 minutes longer', 'add the kale and juice of 1 lime , and cook for 10 more minutes', 'transfer to deep bowls and serve hot', 'garnish with the cilantro , lime wedges , and chopped peanuts at the table']","description":"i first had this stew at a restaurant in rochester, ny, and when i moved away, spent years recreating it!  
# delicious with a lovely hearty bread, or with rice cooked in the stew (about 3\/4 cup brown rice added with the sweet potatoes and carrots).","ingredients":"['canola oil', 'onion', 'garlic cloves', 'fresh ginger', 'jalapeno', 'curry powder', 'diced tomatoes with juice', 'low sodium vegetable broth', 'natural-style peanut butter', 'tamari soy sauce', 'pepper', 'sweet potato', 'carrots', 'chickpeas', 'fresh okra', 'frozen green beans', 'kale', 'lime, juice of', 'fresh cilantro', 'lime wedge', 'salted peanuts']","n_ingredients":21}""",
#     },
#     {
#         "input": "balsamic vinegar, lemon juice, fresh garlic, french dijon mustard, sugar, canola oil, extra virgin olive oil, fresh basil, salt, pepper, orzo pasta, cooked wild rice, red onion, currants, canned corn niblet, toasted almond, parsley, red peppers, yellow peppers, green onion, garlic granules",
#         "output": """{"name":"alexanders orzo and wild rice salad","minutes":20,"tags":"['30-minutes-or-less', 'time-to-make', 'course', 'main-ingredient', 'preparation', 'occasion', 'salads', 'side-dishes', 'pasta', 'rice', 'easy', 'beginner-cook', 'dinner-party', 'holiday-event', 'vegan', 'vegetarian', 'dietary', 'pasta-rice-and-grains', 'taste-mood', 'savory', 'sweet']","nutrition":"[815.5, 72.0, 45.0, 7.0, 31.0, 24.0, 28.0]","n_steps":7,"steps":"['for dressing: dissolve vinegar , lemon juice , garlic and sugar with a hand whip', 'fold in the mustard , basil , salt and pepper', 'slowly add oils , while whisking vigorously', 'refrigerate', 'use only 1 \/ 2 cup of dressing for salad', 'place all other ingredients in a mixing bowl and mix well', 'serve ice cold , 38-40f shelf life mixed is two hours']","description":"wonderful blend of flavors makes for a delightful side dish or lunch salad.","ingredients":"['balsamic vinegar', 'lemon juice', 'fresh garlic', 'french dijon mustard', 'sugar', 'canola oil', 'extra virgin olive oil', 'fresh basil', 'salt', 'pepper', 'orzo pasta', 'cooked wild rice', 'red onion', 'currants', 'canned corn niblet', 'toasted almond', 'parsley', 'red peppers', 'yellow peppers', 'green onion', 'garlic granules']","n_ingredients":21}""",
#     },
# ]



In [17]:
# Prompt template used to format each individual example. [NOT USED IN WORKSHOP]
# example_prompt = ChatPromptTemplate.from_messages(
#     [
#         ("human", "{input}"),
#         ("ai", "{output}"),
#     ]
# )

# few_shot_prompt = FewShotChatMessagePromptTemplate(
#     example_prompt=example_prompt,
#     examples=examples,
# )

In [30]:
# Bring together the examples with the system and user (human) inputs. [NOT USED IN WORKSHOP]
# systemcontent = \
# """
# ### Instructions
# Persona: Act as a head chef such as Joël Robuchon who specializes in simple contemporary cuisine.
# Action: Create well-thought-out and flavourful vegan recipes from a list of ingredients {question}, implementing classic culinary techniques.
# Target Audience: The recipients of these vegan recipes are couples who want to cook a special meal at least once a week.

# ### Example
# {context}

# ### Output format
# Output only one vegan recipe and return it as a JSON object with the following format:
# {{"name":"","minutes":,"tags":"[]","nutrition":"[]","n_steps":"","steps":"[]","description":"","ingredients":"[]", "n_ingredients":}}

# The variables should contain the following information:
# - name: the name of the recipe.
# - minutes: the time in minutes to prepare the recipe.
# - tags: a list of words that characterize the recipe.
# - nutrition: a list of numeric values representing calories, total fat, sugar, sodium, protein, saturated fat, and carbohydrates.
# - n_steps: the number of steps to prepare the recipe.
# - steps: a list of steps to prepare the recipe.
# - description: a summary of the recipe.
# - ingredients: a list of the ingredient names in the recipe.
# - n_ingredients: the total number of ingredients used in the recipe.
# """

# final_prompt = ChatPromptTemplate.from_messages(
#     [
#         ("system", systemcontent),
#         few_shot_prompt,
#         ("human", "{question}"),
#     ]
# )

In [None]:
# Run chain to call Azure OpenAI using ChromaDB vector database data to enrich the prompt (RAG). [NOT USED IN WORKSHOP]
# chain = RetrievalQA.from_chain_type(
#        llm=client,
#        retriever = vectordb.as_retriever(),
#        chain_type="stuff",
#        chain_type_kwargs={"prompt": final_prompt}
# )
# result = chain.invoke({"query": ingredients})

# # View Azure OpenAI output
# display(result)

In [33]:
#Transform output to pandas dataframe and save as CSV file

# Clean up Azure OpenAI Output
# json_data = result['result'].strip('` \n')

# if json_data.startswith('json'):
#     json_data = json_data[4:]  # Remove the first 4 characters 'json'

# recipes_from_rag_json = json.loads(json_data)
# recipes_from_rag = pd.json_normalize(recipes_from_rag_json)
# path_output = r"/workspaces/azure-openai-lab/data/generated_output/recipes-from-rag.csv" #r"C:\Python\azure-openai-lab\data\generated_output\recipes-from-rag.csv"
# recipes_from_rag.to_csv(path_output, sep='\t', encoding='utf-8', index=False)