In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from collections import deque
import logging
import os

In [None]:
class Crawler:
    """Breadth-first crawler for the Apache Flink 1.16 documentation site.

    Pages are downloaded with ``requests``, reduced to plain text with
    BeautifulSoup and collected in ``self.pages`` as
    ``{'url': ..., 'content': ...}`` dicts. Only links that point under the
    Flink 1.16 docs prefix are followed.
    """

    # Absolute and protocol-relative forms of the only site section that is crawled.
    DOCS_PREFIX = "https://nightlies.apache.org/flink/flink-docs-release-1.16/"
    PROTOCOL_RELATIVE_PREFIX = "//nightlies.apache.org/flink/flink-docs-release-1.16/"

    def __init__(self, max_depth=2, max_pages=1, timeout=30):
        """Create an idle crawler.

        Args:
            max_depth: Largest link distance from the start URL that is fetched.
            max_pages: Crawling stops once this many pages have been collected.
            timeout: Seconds to wait on each HTTP request before giving up.
        """
        self.max_depth = max_depth
        self.max_pages = max_pages
        self.timeout = timeout
        self.seen = set()
        self.pages = []
        self.queue = deque()

    def add_to_queue(self, url, depth):
        """Schedule ``url`` to be visited at link distance ``depth``."""
        self.queue.append({'url': url, 'depth': depth})

    def should_continue_crawling(self):
        """Return True while work is queued and the page budget is not used up."""
        # bool() so callers always get True/False rather than the deque itself.
        return bool(self.queue) and len(self.pages) < self.max_pages

    def is_too_deep(self, depth):
        """Return True when ``depth`` exceeds ``max_depth``."""
        return depth > self.max_depth

    def is_already_seen(self, url):
        """Return True when ``url`` has already been taken off the queue and visited."""
        return url in self.seen

    @staticmethod
    def _strip_fragment(url):
        """Drop a ``#fragment`` so anchors within one page map to a single URL."""
        return url.split("#", 1)[0]

    def fetch_page(self, url):
        """Download ``url`` and return its body, or "" when the request fails.

        Failures (connection errors, timeouts, non-2xx statuses) are reported
        on stdout and swallowed so a single bad link does not abort the crawl.
        """
        try:
            # Without a timeout a stalled connection would block the crawl forever.
            response = requests.get(url, timeout=self.timeout)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            print(f"Failed to fetch {url}: {e}")
            return ""

    def parse_html(self, html):
        """Return the visible text of ``html`` as one space-separated string."""
        soup = BeautifulSoup(html, 'html.parser')
        for a in soup.find_all('a'):
            a.attrs = {}
        text = soup.get_text(separator=' ', strip=True)
        return text

    def extract_urls(self, html, base_url):
        """Collect the crawlable links found in ``html``.

        Args:
            html: Markup of the page that was just fetched.
            base_url: URL of that page; only used in the log message for
                links that are not followed.

        Returns:
            Absolute, fragment-free URLs under the Flink 1.16 docs prefix.
            Links with a ``zh`` path segment and pure ``#anchor`` links are
            skipped; every other link outside the prefix (including
            site-relative ones) is logged and not followed.
        """
        urls = []
        soup = BeautifulSoup(html, 'html.parser')
        hrefs = [a.get('href') for a in soup.find_all('a') if a.get('href')]
        for href in hrefs:
            if "zh" in href.split("/"):
                continue
            if href.startswith("#"):
                continue
            if href.startswith(self.PROTOCOL_RELATIVE_PREFIX):
                urls.append(self._strip_fragment(urljoin("https:", href)))
            elif href.startswith(self.DOCS_PREFIX):
                urls.append(self._strip_fragment(href))
            elif self.DOCS_PREFIX not in href:
                logging.error(f"{base_url} -- {href}")
        return urls

    def crawl(self, start_url):
        """Crawl breadth-first from ``start_url`` and return the collected pages.

        Returns:
            ``self.pages``: one ``{'url', 'content'}`` dict per page that was
            fetched successfully, in visiting order.
        """
        self.add_to_queue(start_url, 0)

        while self.should_continue_crawling():
            current = self.queue.popleft()
            url, depth = current['url'], current['depth']

            if self.is_too_deep(depth) or self.is_already_seen(url):
                continue

            self.seen.add(url)
            html = self.fetch_page(url)
            if html:
                print(f"Parsing {url}")
                self.pages.append({'url': url, 'content': self.parse_html(html)})
                next_depth = depth + 1
                # Links that would be discarded on dequeue anyway are not
                # queued at all, which keeps the queue from ballooning.
                if not self.is_too_deep(next_depth):
                    for new_url in self.extract_urls(html, url):
                        if not self.is_already_seen(new_url):
                            self.add_to_queue(new_url, next_depth)

        return self.pages




In [None]:
# Crawl the Flink 1.16 docs starting at the landing page (at most 2500 pages,
# link depth 5) and print the URL of every page that was collected.
crawler = Crawler(max_depth=5, max_pages=2500)
new_pages = crawler.crawl('https://nightlies.apache.org/flink/flink-docs-release-1.16/')
for page in new_pages:
    print(page['url'])

In [None]:
# Number of pages collected by the crawl.
len(new_pages)

In [None]:
# Rough word count of the first page (text split on single spaces).
len(new_pages[0]['content'].split(" "))

In [None]:
from langchain_text_splitters import CharacterTextSplitter, TokenTextSplitter, RecursiveCharacterTextSplitter


In [None]:
from langchain_core.documents import Document
# Splitter used to chunk page text: chunk_size=1000 with an overlap of 100
# (measured by the splitter's length function).
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

In [None]:
# Split every crawled page into chunks and wrap each chunk in a Document that
# records the URL of the page it came from.
documents = []
for page in new_pages:
    documents.extend(
        Document(
            page_content=chunk,
            metadata={'url': page['url'], 'type': "document"},
        )
        for chunk in splitter.split_text(page['content'])
    )


In [None]:
# Total number of chunks produced from all pages.
len(documents)

In [None]:
# Token-based splitter used further down to inspect how a page splits.
# An empty encoding name is not a valid tiktoken encoding, so the library
# default "gpt2" is named explicitly.
# NOTE: this rebinds `splitter`; the chunks in `documents` were produced by
# the recursive character splitter created earlier.
splitter = TokenTextSplitter(encoding_name="gpt2")

In [None]:
# Inspect the serialized form of the first chunk.
documents[0].to_json()

In [None]:
# Connection settings and model configuration. All secrets come from the
# environment; a missing variable raises KeyError here rather than failing
# later inside a client call.
MILVUS_URL = os.environ['MILVUS_URL']
MILVUS_KEY = os.environ['MILVUS_KEY']  # was mistakenly read from MILVUS_URL
DIMS = 1024  # presumably the embedding dimension; not referenced in the visible cells
EMBEDDING_MODEL = "embed-english-v3.0"
COHERE_KEY = os.environ['COHERE_KEY']

In [None]:
from langchain_cohere.embeddings import CohereEmbeddings

In [None]:
import cohere

# Tokenize the first page with Cohere's tokenizer for the embedding model to
# see how many tokens it contains.
co = cohere.Client(COHERE_KEY)

response = co.tokenize(text=new_pages[0]['content'], model=EMBEDDING_MODEL)  # optional
print(response)

In [None]:
# Split the first page with the current `splitter` to inspect the resulting chunks.
splitter.split_text(new_pages[0]['content'])

In [None]:
# Number of tokens in the tokenize response for the first page.
len(response.tokens)

In [None]:
# Word count of the first page, for comparison with the token count
# (repeats the earlier word-count cell).
len(new_pages[0]['content'].split(" "))

In [None]:
# Embedding function backed by the Cohere model named in EMBEDDING_MODEL.
embedding_fn = CohereEmbeddings(model=EMBEDDING_MODEL, cohere_api_key=COHERE_KEY)

In [None]:
from langchain_community.vectorstores.zilliz import Zilliz

# Vector store client for the "Flink" collection, connecting with the
# MILVUS_URL / MILVUS_KEY settings and embedding text with `embedding_fn`.
# auto_id=True is passed so primary keys are not supplied by this notebook.
zilliz = Zilliz(
    embedding_function = embedding_fn,
    collection_name="Flink",
    connection_args={"uri": MILVUS_URL, "token": MILVUS_KEY},
    auto_id=True
)

In [None]:
# Position of every chunk in `documents`; paired with the chunks in the
# upload loop so progress can be printed.
indexes = list(range(len(documents)))

In [None]:
# Sanity check: should equal len(documents).
len(indexes)

In [None]:
# Index of the first chunk the upload loop will send.
# NOTE(review): looks like a leftover resume point from an interrupted upload;
# on a fresh collection the chunks before this index are never inserted —
# confirm, and set to 0 for a full run.
start = 956

In [None]:
import time
# Upload the chunks from `start` onwards one at a time, printing each index
# so an interrupted run can be resumed from the last printed value.
# The one-second pause between inserts is presumably there to stay under an
# API rate limit — TODO confirm.
for index, doc in zip(indexes[start:], documents[start:]):
    print(index)
    time.sleep(1)
    zilliz.add_documents([doc], batch_size=1)

In [None]:
# Retriever over the vector store, configured with k=10 results per query.
retriever = zilliz.as_retriever(search_kwargs={"k": 10})

In [None]:
# Smoke test: run a sample query through the plain retriever.
retriever.invoke("WHat is flink")

In [None]:
import pathlib
import textwrap

import google.generativeai as genai

from IPython.display import display
from IPython.display import Markdown


def to_markdown(text):
    """Render ``text`` as a Markdown block quote.

    Bullet characters ('•') are rewritten as indented ``*`` list markers and
    every line, blank ones included, is prefixed with '> '.
    """
    bulleted = text.replace('•', '  *')
    quoted = textwrap.indent(bulleted, '> ', predicate=lambda _line: True)
    return Markdown(quoted)

In [None]:
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain_cohere import CohereRerank

In [None]:
# Cohere reranker configured with top_n=5.
compressor = CohereRerank(top_n=5, cohere_api_key=COHERE_KEY)

In [None]:
# Retriever that passes the base retriever's results through the reranker.
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever
)

In [None]:
# Smoke test: run a sample query through the reranking retriever.
compression_retriever.invoke("What is flink")

In [None]:
from langchain import hub
# Fetch the "rlm/rag-prompt" template from the LangChain hub (network call).
prompt = hub.pull("rlm/rag-prompt")


In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

from langchain_cohere import ChatCohere

In [None]:
# Cohere chat model used by the first RAG chain; temperature is set to 0.0.
llm = ChatCohere(model="command-r-plus", temperature=0.0, cohere_api_key=COHERE_KEY)

In [None]:
def format_docs(docs: list[Document]):
    """Concatenate retrieved documents into a single context string.

    Each document's text is wrapped in a pair of XML-like tags whose name is
    the lower-cased ``url`` from its metadata; every wrapped block is followed
    by a blank line. An empty ``docs`` yields an empty string.
    """
    blocks = []
    for doc in docs:
        tag = doc.metadata['url'].lower()
        blocks.append(f"<{tag}>\n{doc.page_content}\n</{tag}>\n\n")
    return "".join(blocks)

In [None]:
# RAG chain: retrieve context for the question, fill the hub prompt, call the
# Cohere chat model and return the answer as a plain string.
# NOTE(review): this uses the plain `retriever`, not `compression_retriever`;
# confirm whether the reranker was meant to be part of the chain.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)



In [None]:
# Ask the Cohere-backed chain a sample question. Rebinds `response`, which
# previously held the tokenizer output.
response = rag_chain.invoke("List down all the commands used in the flink documenatation along with explanation of the command")

In [None]:
# Show the chain's answer as plain text.
print(response)

In [None]:
# API keys must never be committed in the notebook; read the Gemini key from
# the environment like the other credentials. A missing variable raises KeyError.
GEMINI_KEY = os.environ['GEMINI_KEY']

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI

In [None]:
# Gemini chat model used by the second RAG chain; temperature is set to 0.0.
google_llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro", google_api_key=GEMINI_KEY, temperature=0.0)

In [None]:
from langchain_core.prompts import PromptTemplate
# Prompt for the Gemini chain: answer from the retrieved context and finish
# with a numbered "Citation" list of source URLs. The URLs are the tag names
# that format_docs wraps around each chunk. Template variables: {question}
# and {context}.
example_prompt = PromptTemplate.from_template("""You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know.Keep the answer concise and to the point. Write down the citation at the end of the answer that you have taken reference from. The citation names are in form of urls, that are provided in the xml tags.
Follow below mention format for citation
Citation:
        (1) Source URL 1
        (2) Source URL 2
Question: {question} \nContext: {context} \nAnswer""")

In [None]:
# Same pipeline as rag_chain, but with the citation prompt and the Gemini model.
# NOTE(review): also uses the plain `retriever` rather than
# `compression_retriever` — confirm this is intended.
google_rag = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | example_prompt
    | google_llm
    | StrOutputParser()
)

In [None]:
# Print an ASCII diagram of the chain's steps.
google_rag.get_graph().print_ascii()

In [None]:
# Ask the Gemini-backed chain the same sample question; rebinds `response`.
response = google_rag.invoke("List down all the commands used in the flink documenatation along with explanation of the command.")

In [None]:
# Render the answer as a Markdown block quote.
to_markdown(response)

In [None]:
# Display the hub prompt template for reference.
prompt