In [None]:
#————————————————————

# Name: Azure OpenAI API, Retrieval-Augmented Generation (RAG) and Fine-tuning

# Purpose: This notebook combines Retrieval-Augmented Generation (RAG) with a fine-tuned Azure OpenAI model.

# Company: Allgeier Schweiz AG
# Author: Nicolas Rehder (nrehder@allgeier.ch), Alex Dean (adean@allgeier.ch)
# Created for: SDSC 2024
# Date Created: 22.01.2024
# Last Updated: 19.05.2024
# Python Version: 3.10.4

# Troubleshooting:
# https://github.com/langchain-ai/langchain/issues/14123
# https://github.com/langchain-ai/langchain/issues/15878
# https://stackoverflow.com/questions/77087460/langchain-azure-openai-api-returning-additional-information-than-the-asked-q
# https://stackoverflow.com/questions/77839844/langchain-retrievalqa-missing-some-input-keys
# https://gist.github.com/defulmere/8b9695e415a44271061cc8e272f3c300?permalink_comment_id=4711478
# https://stackoverflow.com/questions/76921252/attributeerror-module-chromadb-has-no-attribute-config

# Additional notes:

# If necessary, download Python packages (run the below command in terminal if packages have not yet been installed)
# pip install -r C:\Python\openai-lab\support\requirements\requirements.txt

#————————————————————

In [1]:
# Import Python packages
import os
import io
import time
from io import StringIO
import json
from dotenv import load_dotenv
from pathlib import Path
import pandas as pd
from openai import AzureOpenAI
import chromadb
import chromadb.utils.embedding_functions as embedding_functions
from langchain.vectorstores import Chroma
from langchain_openai import AzureOpenAIEmbeddings
from langchain.document_loaders import DataFrameLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import AzureChatOpenAI
from langchain.chains import RetrievalQA
from langchain import PromptTemplate
from langchain_core.prompts import (
    ChatPromptTemplate,
    FewShotChatMessagePromptTemplate,
)


In [2]:
# Populate the process environment from the lab's .env file.
# The path is machine-specific: on a Codespaces / Linux checkout the equivalent
# location is /workspaces/azure-openai-lab/.venv/.env.
# If loading fails, check the backslash escaping of the Windows path (\ vs \\).
load_dotenv(
    dotenv_path=Path("C:\\Python\\azure-openai-lab\\.venv\\.env"),
)

# Read the Azure OpenAI key and endpoint from the environment. Both values are
# listed under "Keys and Endpoint" of the Azure OpenAI Service resource in
# portal.azure.com. A missing variable raises KeyError on the lookup below.
azure_oai_key = os.environ["AZURE_OPENAI_KEY"]
azure_oai_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]

In [3]:
# Build the Azure OpenAI SDK client from the key and endpoint read from the
# environment.
# NOTE(review): the name `client` is rebound to an AzureChatOpenAI instance in a
# later cell, and no visible cell uses this SDK client in between -- confirm
# whether this cell is still needed.
client = AzureOpenAI(
    api_version="2024-02-01",
    api_key=azure_oai_key,
    azure_endpoint=azure_oai_endpoint,
)

In [4]:
# Configure the embedding function backed by the Azure OpenAI deployment
# "text-embedding-ada-002". It is handed to the Chroma vector store below as its
# `embedding_function`.
openai_ef = AzureOpenAIEmbeddings(
    openai_api_version="2024-02-01",
    azure_endpoint=azure_oai_endpoint,
    openai_api_key=azure_oai_key,
    deployment="text-embedding-ada-002",
)

In [7]:
# Re-open the Chroma vector database persisted on local disk and wire it to the
# `openai_ef` embedding function.
# Codespaces / Linux equivalent of the path: r"/workspaces/azure-openai-lab/data/chromadb"
vectordb = Chroma(
    embedding_function=openai_ef,
    persist_directory=r"C:\Python\azure-openai-lab\data\chromadb",
)


In [13]:
# Initialize the LangChain chat model against the fine-tuned deployment
# "gpt-35-turbo-0613-ft" (use "gpt-35-turbo" as deployment_name for the default,
# non-fine-tuned model).
# This rebinds `client`: from here on it refers to the AzureChatOpenAI instance,
# not the AzureOpenAI SDK client created earlier.
client = AzureChatOpenAI(
    openai_api_version="2024-02-01",
    azure_endpoint=azure_oai_endpoint,
    openai_api_key=azure_oai_key,
    deployment_name="gpt-35-turbo-0613-ft",
)

In [9]:
# Zero-shot prompt used by the RAG chain.
# {question} is filled with the ingredient list and {context} with the retrieved
# documents; the doubled braces ({{ }}) are escapes that render as literal braces
# around the JSON skeleton.
# NOTE(review): the text asks for a "JSON array" while the skeleton is a single
# JSON object -- confirm which shape is intended.
prompt_template = """
### Instructions
Persona: Act as a head chef such as Joël Robuchon who specializes in simple contemporary cuisine with a focus on vegan dishes.
Action: Create well-thought-out and flavourful vegan recipes from a list of ingredients from {question}, implementing classic culinary techniques using the provided {context}.
Target Audience: The recipients of these recipes are vegan couples who want to cook a special meal at least once a week.

### Output format
Return a JSON array with the following format:
{{"name":"","minutes":,"tags":"[]","nutrition":"[]","n_steps":"","steps":"[]","description":"","ingredients":"[]","n_ingredients":}}

The variables should contain the following information:
- name: the name of the recipe.
- minutes: the time in minutes to prepare the recipe.
- tags: a list of words that characterize the recipe.
- nutrition: a list of numeric values representing calories, total fat, sugar, sodium, protein, saturated fat, and carbohydrates.
- n_steps: the number of steps to prepare the recipe.
- steps: a list of steps to prepare the recipe.
- description: a summary of the recipe.
- ingredients: a list of ingredients used in the recipe including the amount and the units using the metric system.
- n_ingredients: the number of ingredients used in the recipe.
"""


# Wrap the raw template text in a LangChain PromptTemplate, declaring the two
# placeholders ("context" and "question") that appear in the template.
simple_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

In [16]:
# Retrieval-augmented generation: build a RetrievalQA chain ("stuff" chain type)
# that uses the ChromaDB vector store as retriever and the custom prompt, then
# run it with the ingredient list as the query.
ingredients = "'Tofu', 'Avocado', 'Soy Sauce', 'Chili', 'Coconut Milk', 'Broccoli'"

chain = RetrievalQA.from_chain_type(
    llm=client,
    chain_type="stuff",
    retriever=vectordb.as_retriever(),
    chain_type_kwargs={"prompt": simple_prompt},
)
result = chain.invoke({"query": ingredients})

# Show the chain output (a dict holding the 'query' and the model's 'result').
display(result)

{'query': "'Tofu', 'Avocado', 'Soy Sauce', 'Chili', 'Coconut Milk', 'Broccoli'",
 'result': '{"name":"Tofu and Broccoli Stir-Fry","minutes":20,"tags":["vegan","stir-fry"],"nutrition":[220,12,4,560,16,2,20],"n_steps":4,"steps":["Press the tofu to remove excess moisture, then cut it into cubes.","Heat a tablespoon of oil in a large pan or wok over medium heat.","Add the tofu, broccoli, and chili to the pan and stir-fry for about 5 minutes.","Pour in the soy sauce and coconut milk, and cook for another 2 minutes until the sauce thickens slightly."],"description":"A delicious and healthy vegan stir-fry packed with protein and flavor.","ingredients":["300g tofu","1 tablespoon oil","1 head of broccoli, cut into florets","1 red chili, sliced","3 tablespoons soy sauce","200ml coconut milk"],"n_ingredients":6}'}

In [33]:
# Parse the model's JSON answer, flatten it into a pandas DataFrame and write it
# to disk. The file is tab-separated (sep="\t") even though it carries a .csv
# extension.
# Codespaces / Linux equivalent of the path: /workspaces/azure-openai-lab/data/recipes-from-rag.csv
recipes_from_rag_json = json.loads(result["result"])
recipes_from_rag = pd.json_normalize(recipes_from_rag_json)
path_output = r"C:\Python\azure-openai-lab\data\recipes-from-rag.csv"
recipes_from_rag.to_csv(
    path_output,
    sep="\t",
    encoding="utf-8",
    index=False,
)