### **Load Environment variables from .env file**

In [1]:
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import AzureOpenAI
import openai
from dotenv import load_dotenv
import os

In [2]:
# Read the .env file so its entries become visible through os.environ.
load_dotenv()

# --- Azure OpenAI connection settings (each is None if the variable is unset) ---
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
OPENAI_DEPLOYMENT_ENDPOINT = os.environ.get("OPENAI_DEPLOYMENT_ENDPOINT")
OPENAI_DEPLOYMENT_VERSION = os.environ.get("OPENAI_DEPLOYMENT_VERSION")

# --- Completion model: deployment name and underlying model name ---
OPENAI_DEPLOYMENT_NAME = os.environ.get("OPENAI_DEPLOYMENT_NAME")
OPENAI_MODEL_NAME = os.environ.get("OPENAI_MODEL_NAME")

# --- Embedding model: deployment name and underlying model name ---
OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME = os.environ.get("OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME")
OPENAI_ADA_EMBEDDING_MODEL_NAME = os.environ.get("OPENAI_ADA_EMBEDDING_MODEL_NAME")

# Configure the module-level openai client to talk to the Azure endpoint.
openai.api_key = OPENAI_API_KEY
openai.api_base = OPENAI_DEPLOYMENT_ENDPOINT
openai.api_version = OPENAI_DEPLOYMENT_VERSION
openai.api_type = "azure"

In [3]:
# Azure-hosted embedding client. This is the `embeddings` object that the
# FAISS loading cell hands to `FAISS.load_local`.
embeddings = OpenAIEmbeddings(
    openai_api_type="azure",
    openai_api_base=OPENAI_DEPLOYMENT_ENDPOINT,
    model=OPENAI_ADA_EMBEDDING_MODEL_NAME,
    deployment=OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME,
)

#### **Run ONLY ONCE to create the embeddings - Split text into chunks** 

In [None]:
# Source document for the knowledge base.
fileName = "./data/fabric-data-engineering.pdf"

# Read the PDF and split it into documents; `pages` feeds the FAISS build cell.
loader = PyPDFLoader(file_path=fileName)
pages = loader.load_and_split()

print("Number of pages: ", len(pages))

#### **Run ONLY ONCE to create the embeddings - Create embeddings and save to FAISS**

In [None]:
# Build the index with the same Azure embedding configuration that is used at
# query time (same deployment, model, endpoint and API type). The previous
# version passed the *deployment* name as `model` and left `deployment`,
# `openai_api_base` and `openai_api_type` unset, so the vectors written to the
# index could come from a different configuration than the one later used to
# embed questions against it.
embeddings = OpenAIEmbeddings(
    deployment=OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME,
    model=OPENAI_ADA_EMBEDDING_MODEL_NAME,
    openai_api_base=OPENAI_DEPLOYMENT_ENDPOINT,
    openai_api_type="azure",
    chunk_size=1,  # send one text per embedding request, as the original cell did
)

# Embed every document in `pages` and collect the vectors in a FAISS index.
db = FAISS.from_documents(documents=pages, embedding=embeddings)

# save the FAISS index to disk
db.save_local("./dbs/documentation/faiss_index")

#### **Initialize LLM**

In [4]:
def init_llm(model=OPENAI_MODEL_NAME,
             deployment_name=OPENAI_DEPLOYMENT_NAME,
             openai_api_version=OPENAI_DEPLOYMENT_VERSION,
             temperature=0,
             max_tokens=400,
             top_p=1,
             ):
    """Create an ``AzureOpenAI`` LLM wrapper.

    The first three defaults are taken from the module-level settings at the
    moment this cell is executed, so re-run the cell after changing them.

    Args:
        model: Model name forwarded as ``model``.
        deployment_name: Azure deployment forwarded as ``deployment_name``.
        openai_api_version: API version forwarded as ``openai_api_version``.
        temperature: Sampling temperature.
        max_tokens: Upper bound on generated tokens.
        top_p: Nucleus-sampling parameter.

    Returns:
        The configured ``AzureOpenAI`` instance.
    """
    # Passed through model_kwargs as the "stop" sequence list.
    stop_sequences = ["<|im_end|>"]

    return AzureOpenAI(
        model=model,
        deployment_name=deployment_name,
        openai_api_version=openai_api_version,
        temperature=temperature,
        max_tokens=max_tokens,
        top_p=top_p,
        model_kwargs={"stop": stop_sequences},
    )

#### **Query the LLM directly WITHOUT your data - get wrong answers**

In [5]:
from IPython.display import display, HTML, JSON, Markdown

# `llm` is reused by the RetrievalQA chain built later in the notebook.
llm = init_llm()

# 1) Ask through the LangChain completion wrapper, with no document context.
#    The result gets its own name and is displayed; previously it was bound to
#    `answer` and overwritten below before anyone could see it, so the request
#    was made for nothing.
completion_answer = llm("How do I choose between a lakehouse and a data warehouse in Microsoft Fabric? ")
# The wrapper returns plain text; Markdown rendering keeps paragraph breaks.
display(Markdown(completion_answer))

# 2) Ask through the chat-completions API, again with no document context.
# prepare prompt
messages = [{"role": "system", "content": "You are a HELPFUL assistant answering users questions. Answer in a clear and concise manner."},
            {"role": "user", "content": "Can I use PowerBI datamart for 110 GB data volume? Explain which other options are available for data volumes of 110 GB?"}]

answer = openai.ChatCompletion.create(engine=OPENAI_DEPLOYMENT_NAME,
                                      messages=messages)
display(HTML(answer.choices[0].message.content))

#### **Initialize retrieval API WITH your data - get the right answers**

In [6]:
# load the vector store to memory
vectorStore = FAISS.load_local("./dbs/documentation/faiss_index", embeddings)
retriever = vectorStore.as_retriever(search_type="similarity", search_kwargs={"k": 2})  # returns 2 most similar vectors/documents
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=False)

#### **Ask questions**

In [7]:
# Same question as the chat-completions call above, now sent through the retrieval chain.
qa(
    {
        "query": "Can I use PowerBI datamart for 110 GB data volume? Explain which other options are available for data volumes of 110 GB?"
    }
)

{'query': 'Can I use PowerBI datamart for 110 GB data volume? Explain which other options are available for data volumes of 110 GB?',
 'result': ' No, PowerBI datamart is only suitable for data volumes up to 100 GB. For data volumes of 110 GB, you can use a data warehouse or a lakehouse in Microsoft Fabric. \n\nData warehouse is suitable for structured data, and is ideal for SQL engineers and data warehouse developers. It is organized by databases, schemas, and tables, and read operations can be performed using Spark and T-SQL. Write operations can be performed using T-SQL.\n\nLakehouse is suitable for unstructured, semi-structured, and structured data, and is ideal for data engineers and data scientists. It is organized by folders and files, databases, and tables, and read operations can be performed using Spark, T-SQL, and Power BI. Write operations can be performed using Spark (Scala, PySpark, Spark SQL, R) and T-SQL. \n\nBoth data warehouse and lakehouse can handle unlimited data v

In [8]:
# Second question for the retrieval chain; the cell output is the chain's result dict.
qa(
    {
        "query": "What are the steps to load a CSV file to a delta table in Microsoft Fabric?"
    }
)

{'query': 'What are the steps to load a CSV file to a delta table in Microsoft Fabric?',
 'result': ' The steps to load a CSV file to a delta table in Microsoft Fabric are:\n1. Select Synapse Data Engineering experience in Microsoft Fabric.\n2. Make sure you are in the desired workspace or select/create one.\n3. Select the Lakehouse icon under the New section in the main image.\n4. Upload the CSV file to the Lakehouse.\n5. Convert the file to a Delta table. \n6. Generate a Dataset and create a Power BI report.'}