In [None]:
!pip install scrapy==2.11.0
!pip install langchain==0.0.350 unstructured==0.11.5 nltk==3.8.1
!pip install openai==1.5.0 tiktoken==0.5.2

## Auto Crawling using Scrapy

In [None]:
%%writefile aws.py

import os
import re

import scrapy
from scrapy.exceptions import CloseSpider
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class AWSSpider(CrawlSpider):
    """Crawl aws.amazon.com starting at the FAQ index and save page content.

    Every followed link is handled by ``parse_item``, which writes the HTML of
    the ``#aws-page-content-main`` element to ``<output_dir>/<page title>.html``.
    The crawl is closed once ``max_pages`` pages have been saved.
    """

    name = "aws"
    base_url = "https://aws.amazon.com"
    start_urls = ["https://aws.amazon.com/faqs/"]
    allowed_domains = ["aws.amazon.com"]
    # Upper bound on the number of pages written to disk.
    max_pages = 100
    # Pages saved so far; incremented by parse_item after each successful write.
    count = 0
    # Directory that receives the saved HTML fragments (created on demand).
    output_dir = "aws_data"

    # Characters replaced by "_" in file names: whitespace, path separators and
    # the characters Windows forbids in file names.
    _unsafe_filename_chars = re.compile(r'[\s\\/:*?"<>|]')

    custom_settings = {
        # Only one domain is crawled, so the lower global cap (5) is the
        # effective concurrency limit; the per-domain cap (10) is never reached.
        'CONCURRENT_REQUESTS': 5,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 10,
        # NOTE(review): robots.txt is ignored -- confirm this crawl is permitted.
        'ROBOTSTXT_OBEY': False,
        'CONCURRENT_ITEMS': 100,
        'REACTOR_THREADPOOL_MAXSIZE': 400,
        'LOG_LEVEL': 'INFO',
        'RETRY_ENABLED': False,
        'REDIRECT_MAX_TIMES': 1,
        # Do not download responses larger than 5592405 bytes (about 5.3 MiB).
        'DOWNLOAD_MAXSIZE': 5592405,

        # Hand truncated/broken responses to the callback instead of failing them.
        'DOWNLOAD_FAIL_ON_DATALOSS': False,

        # Positive depth priority together with FIFO queues gives a
        # breadth-first crawl order.
        'DEPTH_PRIORITY': 1,
        'SCHEDULER_DISK_QUEUE': 'scrapy.squeues.PickleFifoDiskQueue',
        'SCHEDULER_MEMORY_QUEUE': 'scrapy.squeues.FifoMemoryQueue',
    }

    rules = (
        Rule(LinkExtractor(), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Save the main content of one crawled page under ``output_dir``.

        Args:
            response: The Scrapy response for a followed link.

        Raises:
            CloseSpider: When ``max_pages`` pages have already been saved.
        """
        # ">=" (not ">") so the crawl stops after exactly max_pages saved pages.
        if self.count >= self.max_pages:
            raise CloseSpider('Page limit reached')

        data = response.css('#aws-page-content-main').getall()
        # Strip before validating so a whitespace-only title counts as missing.
        page_title = (response.css('title::text').get() or '').strip()

        if not page_title or not data:
            self.logger.warning(
                "Skipping %s: missing title or main content", response.url
            )
            return

        safe_title = self._unsafe_filename_chars.sub('_', page_title)

        # The output directory may not exist on a fresh run.
        os.makedirs(self.output_dir, exist_ok=True)
        # NOTE: pages sharing a title map to the same file; the later one wins.
        filename = os.path.join(self.output_dir, safe_title + '.html')

        with open(filename, 'w', encoding="utf8") as f:
            f.write('\n'.join(data))

        # Count only pages that were actually written.
        self.count += 1
        self.logger.info("Downloaded %s ... %s", safe_title, response.url)

Overwriting aws.py


In [None]:
# Run the spider in aws.py (the file written by the %%writefile cell) via the scrapy CLI.
!scrapy runspider aws.py

## Extract Data from HTML Files

In [11]:
from glob import glob
from langchain.text_splitter import HTMLHeaderTextSplitter

# glob() returns paths in arbitrary, filesystem-dependent order. Sort them so
# the chunk order -- and therefore the positional ids assigned later -- is the
# same on every run.
all_html_files = sorted(glob('aws_data/*.html'))

In [15]:
# Accumulator for the chunks produced from every HTML file; the splitting
# loop appends to it.
aws_contents = []

# (tag, label) pairs passed to HTMLHeaderTextSplitter as `headers_to_split_on`:
# the three heading levels followed by the paragraph tag.
headers_to_split_on = [(f"h{level}", f"Header {level}") for level in (1, 2, 3)]
headers_to_split_on.append(("p", "Paragraph"))

# Splitter configured with the (tag, label) pairs in `headers_to_split_on`.
html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

In [19]:
# Start from an empty list so that re-running this cell rebuilds the chunks
# instead of appending a second copy of every document.
aws_contents = []

for html_file in all_html_files:
    with open(html_file, encoding="utf8") as f:
        html_text = f.read()

    # Nothing to split in an empty or whitespace-only file.
    if not html_text.strip():
        continue

    html_header_splits = html_splitter.split_text(html_text)

    # Guard against a falsy result (e.g. no matching sections in the page).
    if html_header_splits:
        aws_contents.extend(html_header_splits)

## Storing Documents

In [27]:
from langchain.embeddings import OpenAIEmbeddings
import requests

import os

# Secrets must not live in the notebook: read the OpenAI key from the
# environment. NOTE: a key was previously hard-coded in this cell and must be
# treated as compromised -- revoke it and issue a new one.
openai_api_key = os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "Set the OPENAI_API_KEY environment variable before running this cell."
    )

# Base URL of the Chroma REST API; override with CHROMADB_SERVER_API to point
# the notebook at a different server.
cromadb_server_api = os.environ.get(
    "CHROMADB_SERVER_API", "http://34.221.246.196:4001/api/v1"
)
collection_name = "aws_db"
model = "text-embedding-ada-002"

embeddings_model = OpenAIEmbeddings(openai_api_key=openai_api_key, model=model)

In [25]:
import requests

# Create the collection, or fetch the existing one when the name is already
# taken ("get_or_create").
resp = requests.post(
    f"{cromadb_server_api}/collections",
    json={
        "name": collection_name,
        "get_or_create": True,
        "dimension": 1536,
    },
    # Do not hang the notebook forever if the server is unreachable.
    timeout=30,
)
# Surface HTTP errors here instead of printing an error body as if it were
# a collection description.
resp.raise_for_status()

print(resp.json())

{'name': 'aws_db', 'id': '6e8b13e7-2d30-45c0-9c86-8885c50fc86c', 'metadata': None, 'tenant': 'default_tenant', 'database': 'default_database'}


In [None]:
# Resolve the collection id by name instead of pasting it from a previous
# run's output: the id differs on another server or after the collection is
# recreated.
collection_resp = requests.get(
    f"{cromadb_server_api}/collections/{collection_name}", timeout=30
)
collection_resp.raise_for_status()
collection_id = collection_resp.json()["id"]

In [30]:
# Text of every chunk, in the order used for the positional ids below.
texts = [content.page_content for content in aws_contents]

# Payload for the collection "add" request: parallel lists with one entry per
# chunk.
docs = {
    # One batched embedding call instead of one API request per chunk.
    "embeddings": embeddings_model.embed_documents(texts),
    "metadatas": [None] * len(texts),
    "documents": texts,
    # Ids are the chunk positions, sent as strings.
    "ids": [str(i) for i in range(len(texts))],
    "uris": [None] * len(texts),
}

In [32]:
# Coerce every id to a string.
docs["ids"] = list(map(str, docs["ids"]))

In [33]:
# index the documents
resp = requests.post(
    f"{cromadb_server_api}/collections/{collection_id}/add",
    json=docs,
    # The payload carries every embedding, so allow a generous but finite wait.
    timeout=120,
)
# Raise on HTTP errors so a failed upload is not mistaken for success.
resp.raise_for_status()
print(resp.json())

True


In [None]:
# delete the collection
# resp = requests.delete(f"{cromadb_server_api}/collections/{collection_name}", json={})