### **Load Environment variables from .env file**

In [10]:
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import AzureOpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import AzureOpenAI
from dotenv import load_dotenv
import os

In [11]:
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_DEPLOYMENT_ENDPOINT = os.getenv("OPENAI_DEPLOYMENT_ENDPOINT")
OPENAI_DEPLOYMENT_NAME = os.getenv("OPENAI_DEPLOYMENT_NAME")
OPENAI_MODEL_NAME = os.getenv("OPENAI_MODEL_NAME")
OPENAI_DEPLOYMENT_VERSION = os.getenv("OPENAI_DEPLOYMENT_VERSION")

OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME = os.getenv("OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME")
OPENAI_ADA_EMBEDDING_MODEL_NAME = os.getenv("OPENAI_ADA_EMBEDDING_MODEL_NAME")

client = AzureOpenAI(
  azure_endpoint = OPENAI_DEPLOYMENT_ENDPOINT, 
  api_key=OPENAI_API_KEY,  
  api_version="2023-05-15"
)

In [12]:
embeddings = AzureOpenAIEmbeddings(
    deployment=OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME,
    model=OPENAI_ADA_EMBEDDING_MODEL_NAME,
    openai_api_base=OPENAI_DEPLOYMENT_ENDPOINT,
    chunk_size = 1)




#### **Run ONLY ONCE to create the embeddings - Split text into chunks** 

In [13]:
fileName = "./data/fabric-data-engineering.pdf"
loader = PyPDFLoader(fileName)
pages = loader.load_and_split()
print("Number of pages: ", len(pages))

Number of pages:  292


#### **Run ONLY ONCE to create the embeddings - Create embeddings and save to FAISS**

In [14]:
db = FAISS.from_documents(documents=pages, embedding=embeddings)
# save the FAISS index to disk
db.save_local("./dbs/documentation/faiss_index")

In [15]:
llm = AzureOpenAI(
    deployment_name=OPENAI_DEPLOYMENT_NAME,
    model_name=OPENAI_MODEL_NAME,
    api_version="2023-05-15",
    openai_api_base=OPENAI_DEPLOYMENT_ENDPOINT,
)



#### Initialize retrieval API WITH your data

In [16]:
# load the vector store to memory
vectorStore = FAISS.load_local("./dbs/documentation/faiss_index", embeddings)
retriever = vectorStore.as_retriever(search_type="similarity", search_kwargs={"k": 2})  # returns 2 most similar vectors/documents
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=False)

#### **Ask questions**

In [17]:
qa({"query": "Can I use PowerBI datamart for 110 GB data volume? Explain which other options are available for data volumes of 110 GB?"})

{'query': 'Can I use PowerBI datamart for 110 GB data volume? Explain which other options are available for data volumes of 110 GB?',
 'result': " Yes, PowerBI datamart can be used for 110 GB of data volume. Data volumes of 110 GB or less can also be stored and modeled in a lakehouse. For data volumes larger than 110 GB, a data warehouse may be a better option. \n\nScenario 3: \nA team of data scientists and machine learning engineers need to build and deploy machine\nlearning models. They're familiar with Jupyter notebooks and Python, and they have a\nstrong preference for using open source tools. They need to work with large amounts of\nunstructured and semi-structured data, and they need to be able to collaborate easily. They\nalso need to be able to easily deploy their models to production, and they need to be able\nto monitor and manage them once they're in production.\nWhich solution should they use?\nData warehouse\nLakehouse\nMachine learning\nworkspace\nPower BI Datamar t\n\nH

In [18]:
qa({"query": "What are the steps to load a CSV file to a delta table in Microsoft Fabric?"})

{'query': 'What are the steps to load a CSV file to a delta table in Microsoft Fabric?',
 'result': ' The steps to load a CSV file to a delta table in Microsoft Fabric are: select Synapse Data Engineering experience, make sure you are in desired workspace or select/create one, and select the Lakehouse icon under New section in the main image. Once the Lakehouse is created, you can upload a CSV file to it and convert it to a Delta table.<|im_end|>'}