In [4]:
!pip install -U crawl4ai
!pip install nest_asyncio



## Crawl the URL and save the output in a file

In [5]:
from importlib.metadata import version

import crawl4ai

# Read the version from the installed distribution's metadata. The previous
# crawl4ai.__version__.__version__ lookup only works while __version__ is a
# submodule of the package, which is fragile given the unpinned -U install.
print(version("crawl4ai"))

0.4.248


In [6]:
%%capture
# Run crawl4ai's post-install setup command (presumably installs the browser
# dependencies the crawler needs - confirm against the crawl4ai docs).
# %%capture above silences the command's output for this cell.
!crawl4ai-setup

In [7]:
# Run crawl4ai's health check; it performs a test crawl and reports whether
# the installation is working.
!crawl4ai-doctor

[INIT].... → Running Crawl4AI health check...
[INIT].... → Crawl4AI 0.4.248
[TEST].... ℹ Testing crawling capabilities...
[EXPORT].. ℹ Exporting PDF and taking screenshot took 1.80s
[FETCH]... ↓ https://crawl4ai.com... | Status: True | Time: 3.87s
[SCRAPE].. ◆ Processed https://crawl4ai.com... | Time: 85ms
[COMPLETE] ● https://crawl4ai.com... | Status: True | Total: 3.96s
[COMPLETE] ● ✅ Crawling test passed!


In [10]:
# Page to crawl. simple_crawl() reads this notebook-level variable.
url = "https://www2.deloitte.com/us/en/insights/economy/global-economic-outlook/weekly-update/weekly-update-2023-10.html"

In [11]:
import asyncio
import nest_asyncio
# Patch asyncio so asyncio.run() can be called from inside the notebook, which
# presumably already has an event loop running - this is nest_asyncio's purpose.
nest_asyncio.apply()

In [12]:
from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode, CrawlerRunConfig

In [25]:
async def simple_crawl(target_url=None, output_path="output.md"):
    """Crawl a single page and save its markdown rendering to a file.

    Args:
        target_url: Page to crawl. When omitted, the notebook-level ``url``
            variable is used, so the original zero-argument call still works.
        output_path: Destination file for the markdown, written as UTF-8.

    Raises:
        RuntimeError: If the crawl reports failure or produces no markdown,
            so a bad crawl cannot silently feed an empty file to later cells.
    """
    page_url = url if target_url is None else target_url

    # CacheMode.BYPASS: presumably forces a fresh fetch instead of reusing a
    # cached copy of the page - confirm against the crawl4ai docs.
    crawler_run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url=page_url,
            config=crawler_run_config
        )

    # Fail loudly instead of writing a missing/empty result to disk.
    if not result.success:
        raise RuntimeError(f"Crawl failed for {page_url}: {result.error_message}")

    # Process the output
    output = result.markdown
    if not output:
        raise RuntimeError(f"Crawl of {page_url} returned no markdown content")

    # Write the output to a markdown file
    with open(output_path, "w", encoding="utf-8") as md_file:
        md_file.write(output)

    print(f"Output saved to {output_path}")

In [26]:
# Drive the async crawl to completion; this writes output.md as a side effect.
# Relies on nest_asyncio.apply() having been run in an earlier cell.
asyncio.run(simple_crawl())

[INIT].... → Crawl4AI 0.4.248
[FETCH]... ↓ https://www2.deloitte.com/us/en/insights/economy/g... | Status: True | Time: 7.15s
[SCRAPE].. ◆ Processed https://www2.deloitte.com/us/en/insights/economy/g... | Time: 442ms
[COMPLETE] ● https://www2.deloitte.com/us/en/insights/economy/g... | Status: True | Total: 7.62s
Output saved to output.md


## RAG (Retrieval-Augmented Generation)

In [30]:
!pip install langchain langchain_community langchain_core faiss-cpu openai unstructured

Collecting unstructured
  Downloading unstructured-0.16.23-py3-none-any.whl.metadata (24 kB)
Collecting filetype (from unstructured)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting python-magic (from unstructured)
  Downloading python_magic-0.4.27-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting emoji (from unstructured)
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Collecting python-iso639 (from unstructured)
  Downloading python_iso639-2025.2.18-py3-none-any.whl.metadata (14 kB)
Collecting langdetect (from unstructured)
  Downloading langdetect-1.0.9.tar.gz (981 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 981.5/981.5 kB 12.8 MB/s eta 0:00:00
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting rapidfuzz (from unstructured)
  Downloading rapidfuzz-3.12.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting backoff (from unstructured)
  Downloading backoff-2.2.1-py3-none-any.whl.met

In [31]:
from langchain_community.document_loaders import UnstructuredMarkdownLoader

In [32]:
# Point the loader at the markdown file written by simple_crawl().
markdown_path = "output.md"
loader = UnstructuredMarkdownLoader(markdown_path)

In [33]:
# Load the markdown file into a list of LangChain documents.
data = loader.load()

In [34]:
# Inspect the first loaded document.
data[0]



In [42]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [43]:
# Split the loaded document into chunks of chunk_size=1000 with
# chunk_overlap=100 (units are presumably characters - confirm).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

texts = text_splitter.split_documents(data)

# Spot-check one chunk.
# NOTE(review): the chunk printed here is site navigation menu text, so the
# crawled markdown appears to include page boilerplate that will also be
# embedded and retrieved - consider filtering it out before splitting.
print(texts[1])

page_content='Hospitality

Airlines & Transportation

Energy & Industrials

Home

Aerospace & Defense

Chemicals & Specialty Materials

Engineering & Construction

Industrial Manufacturing

Mining & Metals

Oil & Gas

Power & Utilities

Renewable Energy

Financial Services

Home

Banking & Capital Markets

Commercial Real Estate

Insurance

Investment Management

Cross Financial Services

Government & Public Services

Home

Defense, Security & Justice

Government Health

State & Local Government

Whole of Government

Transportation & Infrastructure

Human Services

Higher Education

Life Sciences & Health Care

Home

Hospitals, Health Systems & Providers​

Pharmaceutical Manufacturers​

Health Plans & Payers​

Medtech & Health Tech Organizations

Tech, Media & Telecom

Home

Technology

Media & Entertainment

Telecommunications

Semiconductor

Sports' metadata={'source': 'output.md'}


In [36]:
!pip install langchain_openai

Collecting langchain_openai
  Downloading langchain_openai-0.3.6-py3-none-any.whl.metadata (2.3 kB)
Downloading langchain_openai-0.3.6-py3-none-any.whl (54 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 54.9/54.9 kB 2.0 MB/s eta 0:00:00
[?25hInstalling collected packages: langchain_openai
Successfully installed langchain_openai-0.3.6


In [37]:
from langchain_openai import OpenAIEmbeddings

In [38]:
import os
from google.colab import userdata

In [39]:
# Copy the API key from Colab's userdata secrets into the process environment
# (presumably where the OpenAI clients created later look for it). This cell
# depends on google.colab, so it only runs on Colab.
os.environ["OPENAI_API_KEY"] = userdata.get("OPENAI_API_KEY")

In [40]:
# Embedding model used both to build the index and to reload it later.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

In [41]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS

In [45]:
# Build a FAISS vector store from the split chunks using the embedding model.
vectorstore = FAISS.from_documents(texts, embeddings)

## Persist Vectors

In [46]:
# Persist the store to the local "faiss_index" path; it is reloaded from there
# with FAISS.load_local further down.
vectorstore.save_local("faiss_index")

## Retrieval Chains

In [47]:
# PromptTemplate lives in langchain_core.prompts; the root-level
# `from langchain import PromptTemplate` is a deprecated import path.
from langchain_core.prompts import PromptTemplate
from langchain.chains import RetrievalQA

In [58]:
# Prompt for the QA chain. It has two placeholders, {context} and {question},
# and instructs the model to answer only from the context, format the answer
# as Markdown, and wrap the whole answer in <response>...</response> tags.
prompt_template = """
You are an AI assistant tasked with answering questions based solely
on the provided context. Your goal is to generate a comprehensive answer
for the given question using only the information available in the context. Follow these instructions carefully:

1. First, you will be given a context to work with:

context: {context}

2. Then, you will be presented with a question:

question: {question}

3. Carefully analyze the context:
   - Read through the entire context thoroughly.
   - Identify key information relevant to the question.
   - Note any specific facts, figures, or statements that
   directly relate to the question.

4. Generate your answer:
   - Use only the information provided in the context.
   - Do not include any external knowledge or assumptions not
   present in the given context.
   - If the context does not contain enough information to fully
   answer the question, state this clearly in your response.
   - Ensure your answer is comprehensive and addresses all aspects
    of the question that can be answered using the context.

5. Format your answer in Markdown:
   - Use appropriate Markdown syntax to structure your response.
   - Utilize headings, bullet points, or numbered lists where
   applicable to organize information clearly.
   - If quoting directly from the context, use quotation marks
   and consider using blockquote formatting (>).

6. Provide your final answer:
   - Begin your response with <response> and end it with </response>.
   - Ensure your entire answer, including all Markdown formatting,
   is contained within these tags.

Remember, your task is to answer the question based solely on the
given context. Do not include any information or knowledge that
is not explicitly stated in the provided context.
"""

In [59]:
# Wrap the template string; the declared input variables match its
# {context} and {question} placeholders.
prompt = PromptTemplate(template=prompt_template, input_variables=['context', 'question'])

In [60]:
# Reload the index saved to "faiss_index" earlier in this notebook.
# NOTE(review): allow_dangerous_deserialization=True opts in to deserializing
# the stored index data (presumably pickle-based - confirm). That is acceptable
# here only because this notebook wrote the index itself; never enable it for
# an index obtained from an untrusted source.
new_db = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)

In [61]:
# Display the reloaded store's repr as a quick check that loading succeeded.
new_db

<langchain_community.vectorstores.faiss.FAISS at 0x78907ab641d0>

In [62]:
# Expose the reloaded store as a retriever; k=5 is passed as a search option
# (presumably the number of chunks returned per query - confirm).
retriever = new_db.as_retriever(search_kwargs={"k":5})

In [63]:
# Extra arguments for the QA chain: supplies the custom prompt defined above.
chain_type_kwargs = {"prompt": prompt}

In [77]:
from langchain_openai import ChatOpenAI

In [78]:
# Chat model that generates the answers: gpt-4o-mini with temperature 0.3,
# and max_tokens=1000 (presumably a cap on each generated answer - confirm).
llm = ChatOpenAI(
    model_name="gpt-4o-mini",
    temperature=0.3,
    max_tokens=1000
)

In [79]:
# Assemble the retrieval QA chain from the chat model, the FAISS retriever and
# the custom prompt. Source documents are returned alongside each answer, and
# verbose=True logs when the chain is entered and finished.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs=chain_type_kwargs,
    return_source_documents=True,
    verbose=True,
)

In [80]:
# Sample question used to smoke-test the QA chain.
test_query = "Can you provide an overview of the US economy?"

In [81]:
# Run the chain through .invoke(); calling the chain object directly
# (qa(...)) is deprecated in LangChain. RetrievalQA reads the question from
# the "query" input key.
response = qa.invoke({"query": test_query})
print(response)
answer = response['result']
# Keep the first retrieved chunk and the file it came from for inspection.
source_document = response['source_documents'][0].page_content
doc = response['source_documents'][0].metadata['source']



> Entering new RetrievalQA chain...

> Finished chain.


In [82]:
# Show the generated answer for the test query.
answer



## Batch Processing

In [84]:
# Answer every non-empty line of questions.txt and save the tagged answers to
# responses.txt. Both files are opened as UTF-8 explicitly (matching how
# output.md is written) so non-ASCII text in questions or answers does not
# depend on the platform's default encoding.
with open("questions.txt", "r", encoding="utf-8") as qfile:
    questions = [line.strip() for line in qfile if line.strip()]

responses = []

for idx, question in enumerate(questions, start=1):
    # .invoke() replaces the deprecated direct call qa(question).
    response = qa.invoke({"query": question})
    answer = response["result"]
    # NOTE(review): the prompt already asks the model to wrap its answer in
    # <response>...</response>, so these numbered tags end up nested around it.
    formatted_response = f"<response_{idx}>\n{answer}\n</response_{idx}>"
    responses.append(formatted_response)

with open("responses.txt", "w", encoding="utf-8") as rfile:
    rfile.write("\n\n".join(responses))



> Entering new RetrievalQA chain...

> Finished chain.


> Entering new RetrievalQA chain...

> Finished chain.


> Entering new RetrievalQA chain...

> Finished chain.


> Entering new RetrievalQA chain...

> Finished chain.


> Entering new RetrievalQA chain...

> Finished chain.


> Entering new RetrievalQA chain...

> Finished chain.


> Entering new RetrievalQA chain...

> Finished chain.


> Entering new RetrievalQA chain...

> Finished chain.


> Entering new RetrievalQA chain...

> Finished chain.


> Entering new RetrievalQA chain...

> Finished chain.
