In [3]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from collections import deque
import logging
import os

In [2]:
class Crawler:
    def __init__(self, max_depth=2, max_pages=1):
        self.max_depth = max_depth
        self.max_pages = max_pages
        self.seen = set()
        self.pages = []
        self.queue = deque()

    def add_to_queue(self, url, depth):
        self.queue.append({'url': url, 'depth': depth})

    def should_continue_crawling(self):
        return self.queue and len(self.pages) < self.max_pages

    def is_too_deep(self, depth):
        return depth > self.max_depth

    def is_already_seen(self, url):
        return url in self.seen

    def fetch_page(self, url):
        try:
            response = requests.get(url)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            print(f"Failed to fetch {url}: {e}")
            return ""

    def parse_html(self, html):
        soup = BeautifulSoup(html, 'html.parser')
        for a in soup.find_all('a'):
            a.attrs = {}
        text = soup.get_text(separator=' ', strip=True)
        return text

    def extract_urls(self, html, base_url):
        urls = list()
        soup = BeautifulSoup(html, 'html.parser')
        relative_urls = [a.get('href') for a in soup.find_all('a') if a.get('href')]
        for relative_url in relative_urls:
            if "zh" in relative_url.split("/"):
                continue
            if "#" in relative_url:
                continue
            if  relative_url.startswith("//nightlies.apache.org/flink/flink-docs-release-1.16/docs/deployment/"):
                urls.append(urljoin("https:", relative_url))
                # logging.error(f" -- {relative_url}, {urls[-1]}")
            elif relative_url.startswith("https://nightlies.apache.org/flink/flink-docs-release-1.16/docs/deployment/"):
                urls.append(relative_url)
                # logging.error(f" -- {relative_url}, {urls[-1]}")
            elif "https://nightlies.apache.org/flink/flink-docs-release-1.16/docs/deployment/" not in relative_url:
                #urls.append(urljoin("https://nightlies.apache.org/flink/flink-docs-release-1.16/", relative_url))
                # logging.error(f"{base_url} -- {relative_url}")
                pass
        return urls

    def crawl(self, start_url):
        self.add_to_queue(start_url, 0)

        while self.should_continue_crawling():
            current = self.queue.popleft()
            url, depth = current['url'], current['depth']

            if self.is_too_deep(depth) or self.is_already_seen(url):
                continue

            self.seen.add(url)
            html = self.fetch_page(url)
            # print(f"Fetched {url}")
            if html:
                print(f"Parsing {url}")
                self.pages.append({'url': url, 'content': self.parse_html(html)})
                new_urls = self.extract_urls(html, url)
                for new_url in new_urls:
                    self.add_to_queue(new_url, depth + 1)

        return self.pages




In [3]:
crawler = Crawler(max_depth=5, max_pages=2500)
new_pages = crawler.crawl('https://nightlies.apache.org/flink/flink-docs-release-1.16/docs/')
for page in new_pages:
    print(page['url'])

Parsing https://nightlies.apache.org/flink/flink-docs-release-1.16/docs/
Parsing https://nightlies.apache.org/flink/flink-docs-release-1.16/docs/deployment/overview/
Parsing https://nightlies.apache.org/flink/flink-docs-release-1.16/docs/deployment/resource-providers/standalone/overview/
Parsing https://nightlies.apache.org/flink/flink-docs-release-1.16/docs/deployment/resource-providers/standalone/working_directory/
Parsing https://nightlies.apache.org/flink/flink-docs-release-1.16/docs/deployment/resource-providers/standalone/docker/
Parsing https://nightlies.apache.org/flink/flink-docs-release-1.16/docs/deployment/resource-providers/standalone/kubernetes/
Parsing https://nightlies.apache.org/flink/flink-docs-release-1.16/docs/deployment/resource-providers/native_kubernetes/
Parsing https://nightlies.apache.org/flink/flink-docs-release-1.16/docs/deployment/resource-providers/yarn/
Parsing https://nightlies.apache.org/flink/flink-docs-release-1.16/docs/deployment/config/
Parsing https

In [4]:
len(new_pages)

38

In [5]:
len(new_pages[0]['content'].split(" "))

760

In [6]:
from langchain_text_splitters import CharacterTextSplitter, TokenTextSplitter, RecursiveCharacterTextSplitter


In [7]:
from langchain_core.documents import Document
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

In [8]:
documents = list()
for page in new_pages:
    chunks = splitter.split_text(page['content'])
    for chunk in chunks:
        documents.append(Document(
            page_content=chunk,
            metadata={'url': page['url'], 'type':"document"}
        ))


In [9]:
len(documents)

901

In [11]:
documents[10].to_json()

{'lc': 1,
 'type': 'constructor',
 'id': ['langchain', 'schema', 'document', 'Document'],
 'kwargs': {'page_content': 'Python Table API TableEnvironment Operations â\x96¾ Overview Row-based Operations Data Types System (Built-in) Functions User Defined Functions â\x96¾ Overview General User-defined Functions Vectorized User-defined Functions Conversions between PyFlink Table and Pandas DataFrame Conversions between Table and DataStream SQL Catalogs Metrics Connectors DataStream API â\x96¾ Intro to the Python DataStream API Operators â\x96¾ Overview Windows Process Function Data Types State Dependency Management Execution Mode Configuration Debugging Environment Variables FAQ DataSet API (Legacy) â\x96¾ Overview Transformations Iterations Zipping Elements Hadoop MapReduce compatibility with Flink Local Execution Cluster Execution Batch Examples Libraries â\x96¾ Event Processing (CEP) State Processor API Graphs â\x96¾ Overview Graph API Iterative Graph Processing Library Methods Graph Al

In [2]:
MILVUS_URL = os.environ['MILVUS_URL']
MILVUS_KEY = os.environ['MILVUS_URL']
DIMS = 1024
EMBEDDING_MODEL = "embed-english-v3.0"
COHERE_KEY=os.environ['COHERE_KEY']

In [2]:
# from langchain_cohere.embeddings import CohereEmbeddings

In [14]:
# import cohere

# co = cohere.Client(COHERE_KEY)

# response = co.tokenize(text=new_pages[0]['content'], model=EMBEDDING_MODEL)  # optional
# print(response)

In [15]:
splitter.split_text(new_pages[0]['content'])

['Docs | Apache Flink v1.16.2 Try Flink â\x96¾ First steps Fraud Detection with the DataStream API Real Time Reporting with the Table API Flink Operations Playground Learn Flink â\x96¾ Overview Intro to the DataStream API Data Pipelines & ETL Streaming Analytics Event-driven Applications Fault Tolerance Concepts â\x96¾ Overview Stateful Stream Processing Timely Stream Processing Flink Architecture Glossary Application Development â\x96¾ Project Configuration â\x96¾ Overview Using Maven Using Gradle Connectors and Formats Test Dependencies Advanced Configuration DataStream API â\x96¾ Overview Execution Mode (Batch/Streaming) Event Time â\x96¾ Generating Watermarks Builtin Watermark Generators State & Fault Tolerance â\x96¾ Working with State The Broadcast State Pattern Checkpointing Queryable State State Backends Data Types & Serialization â\x96¾ Overview State Schema Evolution Custom State Serialization 3rd Party Serializers User-Defined Functions Operators â\x96¾ Overview Windows Join

In [16]:
len(new_pages[0]['content'].split(" "))

760

In [17]:
# embedding_fn = CohereEmbeddings(model=EMBEDDING_MODEL, cohere_api_key=COHERE_KEY)

In [18]:
from langchain_huggingface import HuggingFaceEmbeddings

model_name = "Alibaba-NLP/gte-large-en-v1.5"
model_kwargs = {'device': 'mps', "trust_remote_code": True}
encode_kwargs = {'normalize_embeddings': False}
hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs)

  from tqdm.autonotebook import tqdm, trange


In [19]:
from langchain_community.vectorstores.zilliz import Zilliz

zilliz = Zilliz(
    embedding_function = hf,
    collection_name="Flink",
    connection_args={"uri": MILVUS_URL, "token": MILVUS_KEY},
    auto_id=True
)

In [20]:
indexes = list(range(len(documents)))

In [21]:
len(indexes)

901

In [22]:
start = 0

In [23]:
for index, doc in zip(indexes[start:], documents[start:]):
    print(index)
    zilliz.add_documents([doc], batch_size=1)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [24]:
retriever = zilliz.as_retriever(search_kwargs={"k": 10})

In [25]:
retriever.invoke("WHat is flink")

[Document(page_content='Flink’s native Kubernetes integration allows you to directly deploy Flink on a running Kubernetes cluster.', metadata={'url': 'https://nightlies.apache.org/flink/flink-docs-release-1.16/docs/deployment/resource-providers/native_kubernetes/', 'type': 'document', 'pk': 450143955092274204}),
 Document(page_content='Hence, you need to build a dedicated Flink Image per application.\nPlease check here for the details.', metadata={'url': 'https://nightlies.apache.org/flink/flink-docs-release-1.16/docs/deployment/resource-providers/standalone/docker/', 'type': 'document', 'pk': 450143955092274072}),
 Document(page_content='is that Flink might immediately build an incremental checkpoint on top of the restored one. Therefore,', metadata={'url': 'https://nightlies.apache.org/flink/flink-docs-release-1.16/docs/ops/state/savepoints/', 'type': 'document', 'pk': 450143955092273016}),
 Document(page_content='It will be dropped in FLINK-26000 .', metadata={'url': 'https://nightl

In [26]:
import pathlib
import textwrap

import google.generativeai as genai

from IPython.display import display
from IPython.display import Markdown


def to_markdown(text):
  text = text.replace('•', '  *')
  return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))

In [27]:
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain_cohere import CohereRerank

In [28]:
compressor = CohereRerank(top_n=5, cohere_api_key=COHERE_KEY)

In [110]:
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever
)

In [19]:
compression_retriever.invoke("What is flink")

[Document(page_content='Application Profiling & Debugging Monitoring â\x96¾ Monitoring Checkpointing Monitoring Back Pressure Upgrading Applications and Flink Versions Production Readiness Checklist Flink Development â\x96¾ Importing Flink into an IDE Building Flink from Source Internals â\x96¾ Jobs and Scheduling Task Lifecycle File Systems Project Homepage JavaDocs ScalaDocs PyDocs Pick Docs Version â\x96¾ 1.16 (â\x9c\x93) v1.16 v1.15 All Versions ä¸\xadæ\x96\x87ç\x89\x88 Apache Flink Documentation Apache Flink Documentation # Apache Flink is a framework and distributed processing engine for stateful computations over unbounded and bounded data streams. Flink has been designed to run in all common cluster environments perform computations at in-memory speed and at any scale . Try Flink # If youâ\x80\x99re interested in playing around with Flink, try one of our tutorials: Fraud Detection with the DataStream API Real Time Reporting with the Table API Intro to PyFlink Flink Operations P

In [20]:
from langchain import hub
prompt = hub.pull("rlm/rag-prompt")


In [111]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

from langchain_cohere import ChatCohere

In [112]:
llm = ChatCohere(model="command-r-plus", temperature=0.0, cohere_api_key=COHERE_KEY)

In [113]:
def format_docs(docs: list[Document]):
    
    text = ""

    for doc in docs:
        xml_tag_start = f"<{doc.metadata['url'].lower()}>"
        xml_tag_end = f"</{doc.metadata['url'].lower()}>"
        content = doc.page_content
        text += f"{xml_tag_start}\n{content}\n{xml_tag_end}\n\n"

    return text

In [114]:
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)



In [115]:
response = rag_chain.invoke("List down all the commands used in the flink documenatation along with explanation of the command")



TooManyRequestsError: status_code: 429, body: data=None

In [117]:
print(response)

Here are some of the commands mentioned in the provided Flink documentation:

* `tar -xzf flink-*.tgz`: This command is used to extract the contents of a tar.gz file named "flink-*.tgz". 

* `cd flink-* && ls -l`: This command navigates to the directory named "flink-*" and lists the contents of the directory in long format. 

* `./bin/start-cluster.sh`: This command starts a local Flink cluster in the background. 

* `ps aux | grep flink`: This command is used to check the status of the Flink cluster. It lists all the processes running on the system and filters the output to show only the processes related to Flink. 

* `./bin/stop-cluster.sh`: This command stops the local Flink cluster and all its running components. 

* `./bin/flink run examples/streaming/WordCount.jar`: This command submits a Flink job to the running cluster. In this case, it deploys an example word count job located in the "examples/streaming/" directory. 

* `tail log/flink-*-taskexecutor-*.out`: This command disp

In [118]:
GEMINI_KEY = "AIzaSyBVI2jAHepUzLwWoK6qwXCOYxD0NFzZIns"

In [119]:
from langchain_google_genai import ChatGoogleGenerativeAI

In [120]:
from langchain_google_genai import ChatGoogleGenerativeAI

In [121]:
google_llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro", google_api_key=GEMINI_KEY, temperature=0.0)

In [122]:
from langchain_core.prompts import PromptTemplate
example_prompt = PromptTemplate.from_template("""You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know.Keep the answer concise and to the point. Write down the citation at the end of the answer that you have taken reference from. The citation names are in form of urls, that are provided in the xml tags.
Follow below mention format for citation
Citation:
        (1) Source URL 1
        (2) Source URL 2
Question: {question} \nContext: {context} \nAnswer""")

In [123]:
google_rag = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | example_prompt
    | google_llm
    | StrOutputParser()
)

In [124]:
google_rag.get_graph().print_ascii()

              +---------------------------------+           
              | Parallel<context,question>Input |           
              +---------------------------------+           
                    ****                ****                
                 ***                        ***             
               **                              ***          
+----------------------+                          **        
| VectorStoreRetriever |                           *        
+----------------------+                           *        
            *                                      *        
            *                                      *        
            *                                      *        
+---------------------+                     +-------------+ 
| Lambda(format_docs) |                     | Passthrough | 
+---------------------+                     +-------------+ 
                    ****                ****                
                        

In [125]:
response = google_rag.invoke("List down all the commands used in the flink documenatation along with explanation of the command.")



TooManyRequestsError: status_code: 429, body: data=None

In [116]:
to_markdown(response)

> Here are some of the commands mentioned in the provided Flink documentation:
> 
> * `tar -xzf flink-*.tgz`: This command is used to extract the contents of a tar.gz file named "flink-*.tgz". 
> 
> * `cd flink-* && ls -l`: This command navigates to the directory named "flink-*" and lists the contents of the directory in long format. 
> 
> * `./bin/start-cluster.sh`: This command starts a local Flink cluster in the background. 
> 
> * `ps aux | grep flink`: This command is used to check the status of the Flink cluster. It lists all the processes running on the system and filters the output to show only the processes related to Flink. 
> 
> * `./bin/stop-cluster.sh`: This command stops the local Flink cluster and all its running components. 
> 
> * `./bin/flink run examples/streaming/WordCount.jar`: This command submits a Flink job to the running cluster. In this case, it deploys an example word count job located in the "examples/streaming/" directory. 
> 
> * `tail log/flink-*-taskexecutor-*.out`: This command displays the last few lines of the log file for the Flink task executor. This is useful for verifying the output of the Flink job.
> 
> * `docker-compose build`: This command builds the Docker image for the Flink playground.
> 
> * `mkdir -p /tmp/flink-checkpoints-directory`: This command creates a directory for Flink checkpoints.
> 
> * `mkdir -p /tmp/flink-savepoints-directory`: This command creates a directory for Flink savepoints.
> 
> * `docker-compose up -d`: This command starts the Flink playground in detached mode.
> 
> * `docker-compose ps`: This command lists the running Docker containers for the Flink playground.
> 
> * `docker-compose run --no-deps client flink list`: This command lists the running Flink jobs.
> 
> * `curl localhost:8081/jobs`: This command retrieves information about running jobs from the Flink REST API.
> 
> Citation:
>  (1) <https://nightlies.apache.org/flink/flink-docs-release-1.16/docs/try-flink/local_installation/>
>  (2) <https://nightlies.apache.org/flink/flink-docs-release-1.16/docs/try-flink/flink-operations-playground/> 


In [50]:
prompt

ChatPromptTemplate(input_variables=['context', 'question'], metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '50442af133e61576e74536c6556cefe1fac147cad032f4377b60c436e6cdcb6e'}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"))])