### Building a RAG HEF Assistant with Fabric

In [None]:
%pip install openai==1.12.0 azure-kusto-data langchain tenacity langchain-openai pypdf
%pip install beautifulsoup4 langchain-community

In [None]:
%pip install openai --upgrade

In [None]:
from openai import AzureOpenAI
from IPython.display import display, HTML
import os
import textwrap
import json 
import requests
import pandas as pd
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

from notebookutils import mssparkutils
from azure.kusto.data import KustoClient, KustoConnectionStringBuilder
from azure.kusto.data.exceptions import KustoServiceError
from azure.kusto.data.helpers import dataframe_from_result_table

from langchain.text_splitter import CharacterTextSplitter,RecursiveCharacterTextSplitter
from langchain_openai import AzureOpenAIEmbeddings
from langchain.document_loaders import PyPDFLoader
from langchain_community.document_loaders import WebBaseLoader
from tenacity import retry, wait_random_exponential, stop_after_attempt
from bs4 import SoupStrainer
from bs4 import BeautifulSoup

In [None]:
# Azure OpenAI settings.
# The endpoint and key are read from the environment when the variables are set,
# so real credentials do not have to be saved inside the notebook. The original
# placeholder strings remain as fallbacks and can still be replaced by hand.
OPENAI_GPT4_DEPLOYMENT_NAME = "gpt-4o-kenya-hack"
OPENAI_DEPLOYMENT_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT", "Your-OpenAI-Endpoint")  # or replace with your OpenAI endpoint
OPENAI_API_KEY = os.environ.get("AZURE_OPENAI_API_KEY", "Your-OpenAI-API-KEY")  # or replace with your OpenAI API key
OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME = "text-embedding-ada-002-kenya-hack"

# Eventhouse (Kusto) settings.
KUSTO_URI = os.environ.get("KUSTO_URI", 'Your-Kusto-URI')  # or replace with your kusto URI
KUSTO_DATABASE = "HEF_eventhouse"
KUSTO_TABLE = "hefEmbeddings"

# Token used by the Spark Kusto connector cells below; requested for KUSTO_URI.
accessToken = mssparkutils.credentials.getToken(KUSTO_URI)

##### Creating an Azure OpenAI client and defining a function to calculate embeddings

In [None]:
# Single Azure OpenAI client, built from the configuration constants and
# reused by the embedding and chat-completion calls in this notebook.
client = AzureOpenAI(
    api_version="2023-09-01-preview",
    api_key=OPENAI_API_KEY,
    azure_endpoint=OPENAI_DEPLOYMENT_ENDPOINT,
)

# tenacity retries the call (up to 6 attempts, random exponential wait between
# 1 and 20 seconds) so that OpenAI throttling does not abort the notebook.
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def generate_embeddings(text):
    """Return the embedding vector for ``text``.

    Newlines are replaced by spaces before the request because they can
    negatively affect embedding quality. The request is sent to the
    deployment named by OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME.

    Args:
        text: the string to embed.

    Returns:
        The ``embedding`` of the first (and only) item in the response.
    """
    single_line = " ".join(text.split("\n"))
    response = client.embeddings.create(
        input=[single_line],
        model=OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME,
    )
    return response.data[0].embedding



##### Reading the PDF files and web pages, and dividing them into 1000-character chunks

In [None]:
# Shared text splitter: targets 1000-character chunks with a 30-character overlap.
# No separators are passed, so the splitter's defaults are used
# (["\n\n", "\n", " ", ""]).
splitter = RecursiveCharacterTextSplitter(chunk_overlap=30, chunk_size=1000)

# PDF sources (adjust the file names to match your lakehouse). Every file lives
# under /lakehouse/default/Files, so each entry is derived from its file name.
pdf_files = [
    {"name": file_name, "path": f"/lakehouse/default/Files/{file_name}"}
    for file_name in (
        "UF-FAQs.pdf",
        "HEF-NFM-FAQs.pdf",
        "Helb-FAQS.pdf",
        "University-Funding-FAQs2.pdf",
    )
]

# Web sources, grouped by site
web_urls = [
    # helb.co.ke
    "https://www.helb.co.ke/faqs/students-faqs/",
    "https://www.helb.co.ke/faqs/loanees-faqs/",
    "https://www.helb.co.ke/faqs/employers-faqs/",
    "https://www.helb.co.ke/faqs/institutions-faqs/",
    # hef.co.ke
    "https://www.hef.co.ke/",
    "https://www.hef.co.ke/faqs/",
    # kuccps.net
    "https://kuccps.net/frequently-asked-questions",
    # universitiesfund.go.ke
    "https://www.universitiesfund.go.ke/blog/frequently-asked-questions/",
]

In [None]:
# Load every configured PDF and split it into chunks with the shared splitter.
# all_pages is (re)initialised here and accumulates the chunks from all sources.
# A file that cannot be loaded is reported and skipped so the others still load.
all_pages = []
for entry in pdf_files:
    try:
        chunks = PyPDFLoader(entry["path"]).load_and_split(text_splitter=splitter)
        print(f"Loaded {len(chunks)} chunks from {entry['name']}")
        all_pages += chunks
    except Exception as err:
        print(f"Failed to load {entry['name']}: {err}")

In [None]:
# Load web content
from langchain.docstore.document import Document

# Upper bound, in seconds, for the fallback HTTP request so that an
# unresponsive site cannot hang the notebook indefinitely.
WEB_REQUEST_TIMEOUT_SECONDS = 30

# NOTE: chunks are appended to all_pages, so re-running this cell without first
# re-running the cell that initialises all_pages adds the web chunks again.
for url in web_urls:
    try:
        # NOTE(security): TLS certificate verification is disabled for both the
        # loader and the fallback request below; only use with trusted sites.
        loader = WebBaseLoader(url, verify_ssl=False)
        pages = loader.load_and_split(text_splitter=splitter)
        print(f"Loaded {len(pages)} chunks from {url}")
        all_pages.extend(pages)
    except Exception as e:
        print(f"WebBaseLoader failed for {url}: {e}")
        # Fallback: fetch the page with requests and extract its text manually
        try:
            response = requests.get(
                url,
                verify=False,  # Bypass SSL verification
                timeout=WEB_REQUEST_TIMEOUT_SECONDS,
            )
            response.raise_for_status()  # Check for HTTP errors
            soup = BeautifulSoup(response.text, "html.parser")
            text = soup.get_text(separator=" ")  # Extract all text

            # Wrap the page text in a single Document, tagged with its URL, and
            # split it with the same splitter used for the other sources
            doc = Document(page_content=text, metadata={"source": url})
            pages = splitter.split_documents([doc])
            print(f"Fallback loaded {len(pages)} chunks from {url}")
            all_pages.extend(pages)
        except Exception as fallback_e:
            # Best effort: report the failure and continue with the next URL
            print(f"Fallback failed for {url}: {fallback_e}")

# Total chunks
print("Total number of chunks: ", len(all_pages))

##### Saving the text chunks to a pandas dataframe

In [None]:
# Save to DataFrame
# Collect one record per chunk and build the frame in a single call; growing a
# DataFrame row by row with df.loc[...] gets slower as the number of chunks grows.
records = [
    # Use source (URL or file path) as document name; the embedding column is
    # left as an empty string here and filled in by the embeddings step.
    (page.metadata.get('source', 'Unknown PDF'), page.page_content, "")
    for page in all_pages
]
df = pd.DataFrame(records, columns=['document_name', 'content', 'embedding'])
df.head()

##### Calculating embeddings

In [None]:
# Compute one embedding per chunk (requires generate_embeddings to be defined)
df["embedding"] = df["content"].apply(generate_embeddings)
print(df.head(2))

# Optional: persist the DataFrame to a file or database for later use
# df.to_csv("/lakehouse/default/Files/combined_embeddings.csv", index=False)

##### Writing the data to MS Fabric Eventhouse

In [None]:
# Convert the pandas frame to a Spark DataFrame and append it to the Kusto
# table through the Kusto Spark connector, authenticating with accessToken.
df_sp = spark.createDataFrame(df)

(
    df_sp.write
    .format("com.microsoft.kusto.spark.synapse.datasource")
    .option("kustoCluster", KUSTO_URI)
    .option("kustoDatabase", KUSTO_DATABASE)
    .option("kustoTable", KUSTO_TABLE)
    .option("accessToken", accessToken)
    .mode("Append")
    .save()
)

### Vector search on Fabric Eventhouse

##### A function for calling GPT-4 to get a natural-language answer

In [None]:
def call_openAI(text):
    """Send chat messages to the GPT-4 deployment and return the reply text.

    Args:
        text: the chat messages; passed unchanged as ``messages`` to the
            chat-completions call.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    completion = client.chat.completions.create(
        messages=text,
        model=OPENAI_GPT4_DEPLOYMENT_NAME,
        temperature=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

##### A function for retrieving answers using embeddings with similarity search

In [None]:
def get_answer_from_eventhouse(question, nr_of_answers=1):
    """Return the stored chunks most similar to ``question``.

    The question is embedded with generate_embeddings, and a Kusto query ranks
    the rows of KUSTO_TABLE by the cosine similarity between that vector and
    each row's ``embedding`` column, keeping the top ``nr_of_answers`` rows.

    Args:
        question: the text to search for.
        nr_of_answers: number of rows to keep (inserted into the ``top`` clause).

    Returns:
        The DataFrame loaded by the Spark Kusto connector for that query.
    """
    question_vector = generate_embeddings(question)
    kusto_query = (
        f"{KUSTO_TABLE} | extend similarity = "
        f"series_cosine_similarity(dynamic({question_vector}), embedding)"
        f" | top {nr_of_answers} by similarity desc "
    )

    reader = (
        spark.read
        .format("com.microsoft.kusto.spark.synapse.datasource")
        .option("kustoCluster", KUSTO_URI)
        .option("kustoDatabase", KUSTO_DATABASE)
        .option("accessToken", accessToken)
        .option("kustoQuery", kusto_query)
    )
    return reader.load()

In [None]:
# Fetch the 2 most similar chunks from Eventhouse for the question
nr_of_answers = 2
question = "Can IGCSE graduates apply to KUCCPS for placement to universities and colleges?"
answers_df = get_answer_from_eventhouse(question, nr_of_answers)

# Concatenate the retrieved chunks, each one preceded by a single space
answer = "".join(" " + row['content'] for row in answers_df.rdd.toLocalIterator())

# Build the GPT-4 prompt from the question and the retrieved information
prompt = f"Question: {question}\nInformation: {answer}"
messages = [
    {
        "role": "system",
        "content": "You are a HELPFUL assistant answering users questions. Answer the question using the provided information and do not add anything else.",
    },
    {"role": "user", "content": prompt},
]

result = call_openAI(messages)
display(result)
