In [18]:
# Uncomment the following line if you need to initialize FAISS with no AVX2 optimization
# os.environ['FAISS_NO_AVX2'] = '1'
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import CharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings

## 1. Load documents

### 1.1. Get documents from the website

#### 1.1.1. Crawl the domain

In [174]:
from urllib.parse import urlparse
from langchain_community.document_loaders import RecursiveUrlLoader
from bs4 import BeautifulSoup
import re

def bs4_extractor(html: str) -> str:
    soup = BeautifulSoup(html, "lxml")
    # print(soup.text.strip())
    return re.sub("\s+", " ", re.sub("\n+", "", soup.get_text(" ")).strip())

# Create the RecursiveUrlLoader
loader = RecursiveUrlLoader(
    "https://www.fau.eu/studiengang/artificial-intelligence-msc/", # starting url
    max_depth=2,  # Maximum depth to crawl
    extractor=bs4_extractor,  # Custom extractor function (optional)
    metadata_extractor=None,  # Custom metadata extractor function (optional)
    exclude_dirs=(),  # Directories to exclude from crawling (optional)
    timeout=7,  # Timeout for each request (optional)
    check_response_status=True,  # Check if response status is successful (optional)
    continue_on_failure=True,  # Continue crawling even if a request fails (optional)
    prevent_outside=True,  # Prevent crawling outside the starting domain (optional)
    base_url="https://www.fau.eu/" # Base URL to use for relative links (optional)
)
# Load the documents
documents = loader.load()


  soup = BeautifulSoup(html, "lxml")
  k = self.parse_starttag(i)


#### 1.1.2. Remove non html files

In [None]:
import re
html_documents = list(filter(lambda doc: re.search(r"html", doc.metadata["content_type"]), documents))
print(len(html_documents))

#### 1.1.3. Clean html files

##### Data is already cleaned through extractor parameter in RecursiveLoader constructor

#### 1.1.4. Persist the html files (Optional)

In [None]:
import os
import shutil

def delete_all_files_in_directory(directory: str):
    # Check if the directory exists
    if os.path.exists(directory):
        # Iterate through all files and directories in the specified directory
        for filename in os.listdir(directory):
            file_path = os.path.join(directory, filename)
            try:
                # Check if it's a file and delete it
                if os.path.isfile(file_path) or os.path.islink(file_path):
                    os.unlink(file_path)
                # Check if it's a directory and remove it
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path)
            except Exception as e:
                print(f'Failed to delete {file_path}. Reason: {e}')
    else:
        print(f'The directory {directory} does not exist.')


In [None]:
import os
from typing import List
from langchain.schema import Document

html_saving_path = "./docs/fau"
# TODO: check the last changed date instead of removing all files
delete_all_files_in_directory(html_saving_path)
class HtmlSaver:
    def _save_html(self, save_path: str, url: str, content: str):
        # Generate a filename based on the URL
        filename = os.path.join(save_path, self._sanitize_filename(url) + '.html')
        if not os.path.exists(filename):
            with open(filename, 'w', encoding='utf-8') as file:
                file.write(content)
    
    def _sanitize_filename(self, url: str) -> str:
        # Replace invalid filename characters with underscores
        return url.replace('http://', '').replace('https://', '').replace('/', '_').replace(':', '_')

    def save(self, documents: List[Document], save_path: str):
        for document in documents:
            self._save_html(save_path=save_path, url=document.metadata['source'], content=str(document))
        return documents
    
saver: HtmlSaver = HtmlSaver()
saver.save(documents=html_documents, save_path=html_saving_path)


[Document(metadata={'source': 'https://www.fau.eu/studiengang/artificial-intelligence-msc/', 'content_type': 'text/html; charset=UTF-8', 'title': 'Artificial Intelligence (AI) | FAU Erlangen-Nürnberg', 'description': 'The desired goal that is to be achieved by the study of Artificial Intelligence is the education of thoroughly skilled computer scientists.', 'language': 'en-GB'}, page_content='Artificial Intelligence (AI) | FAU Erlangen-Nürnberg Skip navigation Skip to navigation Skip to the bottom Simulate organization breadcrumb open Simulate organization breadcrumb close Friedrich-Alexander-Universität Erlangen-Nürnberg Please enter the search term for searching into the documents of this website: Suche öffnen Deutsch UnivIS Campo Friedrich-Alexander-Universität Erlangen-Nürnberg Navigation Navigation close FAU News Energy-saving measures at FAU Welcome to FAU Strategy and objectives History and remembrance Campus locations at FAU Site development at FAU Organisation and committees L

#### 1.2. Load Documents from the local cache. (Optional)

In [21]:
import os, ast
from langchain.schema import Document

# Directory containing the files
directory = './docs/fau'
# Function to parse the file content
def parse_file_content(file_content):
    parts = file_content.split(" metadata=")
    page_content = parts[0].split("=", 1)[1].strip().strip("'")
    metadata = ast.literal_eval(parts[1].strip())
    return page_content, metadata
# List to store Document objects
html_documents = []
# Iterate through all files in the directory
for filename in os.listdir(directory):
    file_path = os.path.join(directory, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        file_content = file.read()
        page_content, metadata = parse_file_content(file_content)
        doc = Document(page_content=page_content, metadata=metadata)
        html_documents.append(doc)
print(html_documents[0].metadata)


{'source': 'https://www.fau.eu/', 'content_type': 'text/html; charset=UTF-8', 'title': 'FAU Erlangen-Nürnberg', 'description': 'FAU is one of the largest research universities in Germany. Its five faculties cover the entire spectrum of modern academic disciplines.', 'language': 'en-GB'}


## 2. Create Index

In [23]:
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(html_documents)

### 2.1. Load embedding model

In [24]:
model_name = 'all-MiniLM-L6-v2'
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
embed_model = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

  from tqdm.autonotebook import tqdm, trange


### 2.2. Load index

In [25]:
index_location = "./storage/fau"
index_name = "fau"
# try:
#     print("Loading index from persisted index...")
#     FAISS.load_local(folder_path=index_location, embeddings=embed_model, index_name=index_name)
# except Exception as e:
#     print("Failed loading index from persisted index: \n", e)
#     db = FAISS.from_documents(html_documents, embed_model)
#     db.save_local(folder_path=index_location, index_name=index_name)
db = FAISS.from_documents(html_documents, embed_model)
db.save_local(folder_path=index_location, index_name=index_name)
print(db.index.ntotal)

104


#### 2.2.1. Test index (Optional)

In [50]:
query = "What is the Design and Structure of the Artificial Intelligence (AI) (M.Sc.)"
query = "What are the required documents for enrolling as a foreigner."
docs = db.similarity_search(k=3, query=query)
# print(docs[0].page_content)
req = ""
for doc in docs:
    req += doc.page_content
print(req)

Application and enrollment for international applicants | FAU Erlangen-Nürnberg Skip navigation Skip to navigation Skip to the bottom Simulate organization breadcrumb open Simulate organization breadcrumb close Friedrich-Alexander-Universität Erlangen-Nürnberg Please enter the search term for searching into the documents of this website: Suche öffnen Deutsch UnivIS Campo Friedrich-Alexander-Universität Erlangen-Nürnberg Navigation Navigation close FAU News Energy-saving measures at FAU Welcome to FAU Strategy and objectives History and remembrance Campus locations at FAU Site development at FAU Organisation and committees Legal regulations Finding your way around: contacts and directions Faculties and organisational units Faculties Universitätsklinikum Erlangen Central research institutions Collaboration and partnerships FAU People Students Employees Researchers Alumni Working at FAU Jobs at FAU Professorships Energy technology and technical building services Vocational training at FAU

### 2.3. For checking availability of the models

In [27]:
import platform    # For getting the operating system name
import subprocess  # For executing a shell command

def ping(host):
    """
    Returns True if host (str) responds to a ping request.
    Remember that a host may not respond to a ping (ICMP) request even if the host name is valid.
    """
    # Option for the number of packets as a function of
    param = '-v'
    cmd = 'curl.exe' if platform.system().lower()=='windows' else 'curl'
    # Building the command. Ex: "ping -c 1 google.com"
    command = [cmd, host, param]
    return subprocess.call(command) == 0

### 2.4. Load OpenAI API key

In [28]:
import os
from dotenv import load_dotenv
load_dotenv(override=True)
OPENAI_API_KEY= os.getenv('OPENAI_API_KEY')

### 2.5. Load model

In [41]:
from langchain_community.chat_models import ChatOllama
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

if ping("localhost:11434"):
    print("ollama is online.")
    llm = ChatOllama(model="llama3", base_url="http://localhost:11434", num_gpu=1, temperature=0.0)
else:
    llm = ChatOpenAI(
        model="gpt-3.5-turbo",
        temperature=0,
        max_tokens=40000,
        timeout=None,
        max_retries=2,
        api_key=OPENAI_API_KEY,  # if you prefer to pass api key in directly instaed of using env vars
        # base_url="...",
        # organization="...",
        # other params...
    )
# prompt = ChatPromptTemplate.from_messages([
#     ("system", "You are an assistant for answering questions based on the following information: {retrieved_docs}.\
#         Ensure that your answers are solely based on this retrieved information and do not include any new or generated content."),
#     ("human", "{user_input}"),
# ])

### 2.6. Function for prompting the chat

In [42]:

def ask_fau(query: str):
    prompt = ChatPromptTemplate.from_messages([
    ("system", "You are an assistant for answering questions only based on the information the user provide."),
    # ("human", "{user_input}"),
    ("human", "{user_input} based on the following information: {retrieved_docs}."),
    ])
    top_k_docs = db.similarity_search(k=2, query=query)
    retrieved_docs = ""
    for doc in top_k_docs:
        retrieved_docs += doc.page_content
    print(prompt.format_prompt(user_input=query, retrieved_docs=retrieved_docs))
    response = llm.invoke(prompt.format_prompt(user_input=query, retrieved_docs=retrieved_docs))
    print(response.content)


In [43]:
query = "when is the application deadline."
ask_fau(query)

messages=[SystemMessage(content='You are an assistant for answering questions only based on the information the user provide.'), HumanMessage(content='when is the application deadline. based on the following information: Application and enrollment for international applicants | FAU Erlangen-Nürnberg Skip navigation Skip to navigation Skip to the bottom Simulate organization breadcrumb open Simulate organization breadcrumb close Friedrich-Alexander-Universität Erlangen-Nürnberg Please enter the search term for searching into the documents of this website: Suche öffnen Deutsch UnivIS Campo Friedrich-Alexander-Universität Erlangen-Nürnberg Navigation Navigation close FAU News Energy-saving measures at FAU Welcome to FAU Strategy and objectives History and remembrance Campus locations at FAU Site development at FAU Organisation and committees Legal regulations Finding your way around: contacts and directions Faculties and organisational units Faculties Universitätsklinikum Erlangen Central

RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [235]:
query = "What is the Design and Structure of the Artificial Intelligence (AI) (M.Sc.)? Give short answer."
ask_fau(query)

messages=[SystemMessage(content='You are an assistant for answering questions only based on the information the user provide.'), HumanMessage(content='What is the Design and Structure of the Artificial Intelligence (AI) (M.Sc.)? Give short answer.'), HumanMessage(content='Answer Questions based on the following information: Artificial Intelligence (AI) | FAU Erlangen-Nürnberg Skip navigation Skip to navigation Skip to the bottom Simulate organization breadcrumb open Simulate organization breadcrumb close Friedrich-Alexander-Universität Erlangen-Nürnberg Please enter the search term for searching into the documents of this website: Suche öffnen Deutsch UnivIS Campo Friedrich-Alexander-Universität Erlangen-Nürnberg Navigation Navigation close FAU News Energy-saving measures at FAU Welcome to FAU Strategy and objectives History and remembrance Campus locations at FAU Site development at FAU Organisation and committees Legal regulations Finding your way around: contacts and directions Facu

In [234]:
query = "How are my . I want short answer."
ask_fau(query)

messages=[SystemMessage(content='You are an assistant for answering questions only based on the information the user provide.'), HumanMessage(content='How are my 120 ECTS distributed in Artificial Intelligence (M.Sc) degree program at fau. I want short answer.'), HumanMessage(content="Answer Questions based on the following information: Artificial Intelligence (AI) | FAU Erlangen-Nürnberg Skip navigation Skip to navigation Skip to the bottom Simulate organization breadcrumb open Simulate organization breadcrumb close Friedrich-Alexander-Universität Erlangen-Nürnberg Please enter the search term for searching into the documents of this website: Suche öffnen Deutsch UnivIS Campo Friedrich-Alexander-Universität Erlangen-Nürnberg Navigation Navigation close FAU News Energy-saving measures at FAU Welcome to FAU Strategy and objectives History and remembrance Campus locations at FAU Site development at FAU Organisation and committees Legal regulations Finding your way around: contacts and di

In [233]:

query = "summarize the given information."
ask_fau(query)

messages=[SystemMessage(content='You are an assistant for answering questions only based on the information the user provide.'), HumanMessage(content='summarize the given information.'), HumanMessage(content='Answer Questions based on the following information: Research information system | FAU Erlangen-Nürnberg Skip navigation Skip to navigation Skip to the bottom Simulate organization breadcrumb open Simulate organization breadcrumb close Friedrich-Alexander-Universität Erlangen-Nürnberg Please enter the search term for searching into the documents of this website: Suche öffnen Deutsch UnivIS Campo Friedrich-Alexander-Universität Erlangen-Nürnberg Navigation Navigation close FAU News Energy-saving measures at FAU Welcome to FAU Strategy and objectives History and remembrance Campus locations at FAU Site development at FAU Organisation and committees Legal regulations Finding your way around: contacts and directions Faculties and organisational units Faculties Universitätsklinikum Erl