In [None]:
#————————————————————

# Name: Azure OpenAI API, Retrieval-Augmented Generation (RAG) and Fine-tuning

# Purpose: This notebook will implement RAG in combination with a fine-tuned model.

# Company: Allgeier Schweiz AG
# Author: Nicolas Rehder (nrehder@allgeier.ch), Alex Dean (adean@allgeier.ch)
# Create for: SDSC 2024
# Date Created: 22.01.2024
# Last Updated: 25.05.2024
# Python Version: 3.10.4

# Troubleshooting:
# https://github.com/langchain-ai/langchain/issues/14123
# https://github.com/langchain-ai/langchain/issues/15878
# https://stackoverflow.com/questions/77087460/langchain-azure-openai-api-returning-additional-information-than-the-asked-q
# https://stackoverflow.com/questions/77839844/langchain-retrievalqa-missing-some-input-keys
# https://gist.github.com/defulmere/8b9695e415a44271061cc8e272f3c300?permalink_comment_id=4711478
# https://stackoverflow.com/questions/76921252/attributeerror-module-chromadb-has-no-attribute-config

# Additionals:
# Use Region EASTUS2

# If necessary, download Python packages (run the below command in terminal if packages have not yet been installed)
# pip install -r C:\Python\openai-lab\support\requirements\requirements.txt

#————————————————————

In [1]:
# Import Python packages
import os
import io
import time
from io import StringIO
import json
from dotenv import load_dotenv
from pathlib import Path
import pandas as pd
from openai import AzureOpenAI
import chromadb
import chromadb.utils.embedding_functions as embedding_functions
from langchain.vectorstores import Chroma
from langchain_openai import AzureOpenAIEmbeddings
from langchain.document_loaders import DataFrameLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import AzureChatOpenAI
from langchain.chains import RetrievalQA
from langchain import PromptTemplate
from langchain_core.prompts import (
    ChatPromptTemplate,
    FewShotChatMessagePromptTemplate,
)


In [2]:
# Load required variables from .env file.
load_dotenv(dotenv_path=Path("/workspaces/azure-openai-lab/.venv/.env")) #Error sometimes due to \ or \\. Try one or the other. "C:\\Python\\azure-openai-lab\\.venv\\.env"

# Load Azure OpenAI Key and Endpoint. These values can be found within the Azure OpenAI Service resource in portal.azure.com under Keys and Endpoint
azure_oai_key = os.environ['AZURE_OPENAI_KEY']
azure_oai_endpoint = os.environ['AZURE_OPENAI_ENDPOINT']

In [3]:
# Initalize Azure Openai using LangChain (Default gpt-35-turbo and fine-tuned gpt-35-turbo-0613-ft)
client = AzureChatOpenAI(
                deployment_name = "gpt-35-turbo-0613-ft",
                openai_api_key = azure_oai_key,
                azure_endpoint = azure_oai_endpoint,
                openai_api_version = "2024-02-15-preview"
        )   

In [4]:
# Generate the Word Embeddings for the Dataset using Azure OpenAI with model text-embedding-ada-002
openai_ef = AzureOpenAIEmbeddings(
                deployment = "text-embedding-3-large",
                openai_api_key = azure_oai_key,
                azure_endpoint = azure_oai_endpoint,
                openai_api_version = "2024-02-01",
            )

In [5]:
# Re-load an existing vector database from a local path
vectordb = Chroma(persist_directory=r"C:\Python\azure-openai-lab\data\chromadb\backup", embedding_function=openai_ef) #r"C:\Python\azure-openai-lab\data\chromadb\backup"


In [6]:
# Zero-shot learning Prompt
prompt_template = \
"""
### INSTRUCTIONS
Persona: Act as a head chef such as Joël Robuchon who specializes in simple contemporary cuisine.
Action: Create well-thought-out and flavourful vegan recipes from a list of ingredients {question}, implementing classic culinary techniques.
Target Audience: The recipients of these vegan recipes are couples who want to cook a special meal at least once a week.

### EXAMPLE
{context}

### OUTPUT FORMAT
Output only one vegan recipe and return it as a JSON object with the following format:
{{"name":"","minutes":,"tags":"[]","nutrition":"[]","n_steps":"","steps":"[]","description":"","ingredients":"[]", "n_ingredients":}}

The variables should contain the following information:
- name: the name of the recipe.
- minutes: the time in minutes to prepare the recipe.
- tags: a list of words that characterize the recipe.
- nutrition: a list of numeric values representing calories, total fat, sugar, sodium, protein, saturated fat, and carbohydrates.
- n_steps: the number of steps to prepare the recipe.
- steps: a list of steps to prepare the recipe.
- description: a summary of the recipe.
- ingredients: a list of the ingredient names in the recipe.
- n_ingredients: the total number of ingredients used in the recipe.
"""


simple_prompt = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

In [7]:
# Run chain to call Azure OpenAI using ChromaDB vector database data to enrich the prompt (RAG).
ingredients = """'Tofu', 'Avocado', 'Soy Sauce', 'Chili', 'Coconut Milk', 'Broccoli'"""

chain = RetrievalQA.from_chain_type(
       llm=client,
       retriever = vectordb.as_retriever(),
       chain_type="stuff",
       chain_type_kwargs={"prompt": simple_prompt}
)

# View generated recipe
result = chain.invoke({"query": ingredients})
print(result)

{'query': "'Tofu', 'Avocado', 'Soy Sauce', 'Chili', 'Coconut Milk', 'Broccoli'", 'result': '{"name":"{"minutes":50,"tags":"[\'60-minutes-or-less\', \'time-to-make\', \'course\', \'main-ingredient\', \'cuisine\', \'preparation\', \'occasion\', \'main-dish\', \'beans-and-legumes\', \'fruit\', \'asian\', \'thai\', \'dinner-party\', \'vegan\', \'vegetarian\', \'thai-asian\', \'dietary\', \'inexpensive\', \'one-dish-meal\', \'brunch\', \'comfort-food\', \'stove-top\', \'citrus\', \'taste-mood\', \'savory\', \'spicy\', \'number-of-servings\']","nutrition":"[301.8, 16.0, 14.0, 16.0, 18.0, 7.0, 19.0]","n_steps":9,"steps":"[\'drain tofu\', \'avocado lime sauce : combine everything except coconut milk in a blender , then add the coconut milk last and blend until smooth\', \'heat soy sauce and chili paste in a small pan until it boils , then set aside\', \'pour coconut milk into a large skillet and bring to a boil , then turn heat to med-high\', \'add tofu and cook until browned on all sides , st

In [8]:
#Transform output to pandas dataframe and save as CSV file

# Clean up Azure OpenAI Output
json_data = result['result'].strip('` \n')

if json_data.startswith('json'):
    json_data = json_data[4:]  # Remove the first 4 characters 'json'

recipes_from_rag_ft_json = json.loads(json_data)
recipes_from_rag_ft = pd.json_normalize(recipes_from_rag_ft_json)
# path_output = r"/workspaces/azure-openai-lab/data/recipes_from-rag-ft.csv" #r"C:\Python\azure-openai-lab\data\recipes-from-rag-ft.csv"
# recipes_from_rag_ft.to_csv(path_output, sep='\t', encoding='utf-8', index=False)

JSONDecodeError: Expecting ',' delimiter: line 1 column 12 (char 11)