# Test crawler

In [None]:
%load_ext autoreload
%autoreload 2

import asyncio
import httpx
from crawler import crawler, pattern_filter


# Queue of URLs handed to the crawler; seeded with the start page below.
url_queue = asyncio.Queue()
# Passed to the crawler as `response_queue` (presumably filled with fetched
# responses — confirm in crawler.py).
response_queue = asyncio.Queue()
# Single event that is later passed to the crawler as both `pause` and `end`.
stop_crawling = asyncio.Event()
# Top-level `await` relies on IPython/Jupyter; it is a syntax error in a plain script.
await url_queue.put("https://caseyhandmer.wordpress.com/")


import time

# Placeholder; overwritten with the crawler's return value below.
addable_urls = []
# One HTTP client for the whole crawl; the context manager closes it on exit.
async with httpx.AsyncClient() as client:
    # url_filter pairs a filter function with the kwargs to call it with
    # (presumably only URLs matching "https://" pass — confirm in pattern_filter).
    # max_iter=1 presumably limits the crawl to a single iteration — confirm.
    # NOTE(review): the same Event is passed as both `pause` and `end`; confirm
    # the crawler is meant to receive one signal for both roles.
    addable_urls = await crawler(
        url_queue,
        url_filter={"filter_func": pattern_filter, "kwargs": {"regex_patterns": ["https://"]}},
        client=client,
        response_queue=response_queue,
        pause=stop_crawling,
        end=stop_crawling,
        max_iter=1,
    )
    print("addable urls:", addable_urls)
    # Set only after crawler() has already returned.
    stop_crawling.set()

# Test Cleaner

In [None]:
from urllib.parse import urlparse, urlunparse
from joblib import load

# Load previously saved URLs (presumably a list of strings) to run clean_urls on.
# NOTE(review): joblib.load unpickles the file, so only load files you trust.
addable_urls = load("cleanable_urls.joblib")

def clean_urls(urls):
    """
    Normalise URLs and drop duplicates.

    Each URL has its fragment (`#...`) removed and, when it has no query
    string, its trailing slashes stripped. URLs that become identical after
    this are returned once.

    Args:
        urls: Iterable of URL strings.

    Returns:
        A list of the cleaned URLs in order of first appearance, so the result
        is reproducible from run to run (a plain set would not guarantee that).
    """
    # A dict keeps insertion order while still de-duplicating on its keys
    cleaned_urls = {}

    for url in urls:
        parsed_url = urlparse(url)._replace(fragment="")
        final_url = urlunparse(parsed_url)

        # Only strip when the URL ends in its path/host: with a query string
        # present, a trailing "/" belongs to the query value and must be kept
        if not parsed_url.query:
            final_url = final_url.rstrip("/")

        cleaned_urls[final_url] = None

    return list(cleaned_urls)

# Compare the URL count before and after cleaning to see how many duplicates collapsed.
print(len(addable_urls))
cleaned_urls = clean_urls(addable_urls)
print(len(cleaned_urls))


# Test the processor

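- `multi-qa-MiniLM-L6-cos-v1` is one of the best small sentence-transformers models for semantic search. It maps each text to a 384-dimensional vector, which is why the Qdrant collections below are created with `size=384`.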

In [None]:
import sentence_transformers
from joblib import load
import qdrant_client

# Load the saved page fixture; it is passed on below as the `soup` argument.
# NOTE(review): joblib.load unpickles the file, so only load files you trust.
soup = load("test_data/html_page.joblib")

# Embedding model used for the page chunks and, in a later cell, the search query.
model = sentence_transformers.SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")

# Qdrant client in ":memory:" mode, i.e. no server and nothing written to disk.
client = qdrant_client.QdrantClient(":memory:") 

In [40]:
from bs4 import BeautifulSoup
import re
import numpy as np
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct

def extract_visible_text(soup):
    """
    Return the user-visible text of a parsed page as one whitespace-normalised string.

    The non-visible elements are removed from `soup` itself, so the caller's
    tree is modified by this call.
    """
    non_visible_tags = ['script', 'style', 'meta', 'header', 'footer', 'nav', 'noscript']

    # Drop every element whose content is not meant to be read by the user
    for tag in soup(non_visible_tags):
        tag.decompose()

    # Join the remaining text nodes with spaces, then squash each run of
    # whitespace into a single space and trim both ends
    text = soup.get_text(separator=' ')
    return re.sub(r'\s+', ' ', text).strip()

def process_html_to_vectors(
    soup: BeautifulSoup,
    model: sentence_transformers.SentenceTransformer,
    db_client: QdrantClient,
    max_length: int = 450,
    collection_name: str = "my_collection",
) -> None:
    """
    Processes a BeautifulSoup object into chunks of at most `max_length` words and
    turns each of them into a vector using the sentence_transformers model. Puts the
    vectors into a Qdrant collection, with the chunk text stored as payload.

    This is test code for now.

    Args:
        soup: Parsed page. Its non-visible elements are removed in place by
            extract_visible_text.
        model: Model used to embed each chunk.
        db_client: Qdrant client that receives the vectors.
        max_length: Maximum number of words per chunk.
        collection_name: Collection to write to; created if it does not exist.

    Raises:
        ValueError: If `max_length` is smaller than 1.

    NOTE(review): point ids restart at 0 on every call, so processing a second
    page into the same collection overwrites the points of the first one.
    """
    if max_length < 1:
        raise ValueError("max_length must be a positive integer")

    # Extract visible text from the soup
    visible_text = extract_visible_text(soup)

    # A page without visible text yields no chunks: skip it instead of
    # embedding and storing an empty string
    words = visible_text.split()
    if not words:
        return

    # Create the sequences: consecutive chunks of at most max_length words
    sequences = [
        ' '.join(words[start:start + max_length])
        for start in range(0, len(words), max_length)
    ]

    # Turns the sequences into float32 vectors
    vectors = model.encode(sequences, convert_to_numpy=True)
    vectors = vectors.astype(np.float32)

    # Create the collection, sized from the vectors actually produced rather
    # than a hard-coded dimension, so other models work as well
    if not db_client.collection_exists(collection_name):
        db_client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=vectors.shape[1], distance=Distance.COSINE),
        )

    # Add the vectors to the collection
    db_client.upsert(
        collection_name=collection_name,
        points=[
            PointStruct(id=idx, vector=vector.tolist(), payload={"text": sequence})
            for idx, (sequence, vector) in enumerate(zip(sequences, vectors))
        ]
    )

# Embed the loaded page and store its chunks through the Qdrant `client`.
process_html_to_vectors(soup, model, client)

In [43]:
# Embed the question with the same `model` object used for the stored chunks.
vec = model.encode("what does casey handmer do?", convert_to_numpy=True)
# NOTE(review): `client` must still be the Qdrant client here; the Postgres setup
# cell further down rebinds `client` to a psycopg2 connection.
# NOTE(review): recent qdrant-client releases steer users from `search` towards
# `query_points` — confirm against the installed version.
hits = client.search(
   collection_name="my_collection",
   query_vector=vec,
   limit=5  # Return 5 closest points
)

In [None]:
# Bare expression: the notebook shows the search results as the cell output.
hits

# Crawl CJ Handmer's blog for 300 links

- Do this and store all the data in the Postgres and vector (Qdrant) databases.
- Use this data to run tests against the search module.
- Use this data to serve a local site with the same simple search interface that OpenEngine will have.
- Build the interface using Svelte.

## Setting up the inputs for `gather.py`

In [None]:
# Set up code for the process
import setup_postgres
import psycopg2
import qdrant_client
from datetime import datetime

# Set up the postgres connection
# temp_dir and port come from the helper; port is used to connect below and
# temp_dir is what the teardown cell passes back to stop the server.
temp_dir, port = setup_postgres.start_ephemeral_postgres()

# NOTE(review): this rebinds `client`, which earlier cells use for the Qdrant client.
client = psycopg2.connect(
    dbname='postgres',
    user='postgres',
    host="localhost",
    port=port
)
print("connected to postgres")

cursor = client.cursor()

# Create the table
table_sql = """CREATE TABLE resources ( 
    id SERIAL PRIMARY KEY,
    url VARCHAR(2048) NOT NULL,
    firstVisited TIMESTAMP NOT NULL,
    lastVisited TIMESTAMP NOT NULL,
    allVisits INT DEFAULT 1,
    externalLinks TEXT[]
);"""

cursor.execute(table_sql)

# Add handmer's website as the seed row. The timestamp is taken once and used
# for both columns so firstVisited and lastVisited are identical for the new row.
insert_handmer = """INSERT INTO resources (url, firstVisited, lastVisited, allVisits, externalLinks) VALUES (%s, %s, %s, %s, %s)"""
seed_time = datetime.now()
handmer_data = ("https://caseyhandmer.wordpress.com", seed_time, seed_time, 0, [])

cursor.execute(insert_handmer, handmer_data)
# NOTE(review): nothing is committed here, so the table and the seed row exist
# only inside this connection's open transaction — confirm that is intended.

# Set up the qdrant connection
# In-memory Qdrant with a single "embeddings" collection using cosine distance.
# size=384 has to equal the embedding width of the model loaded further down
# (TODO confirm the model outputs 384 dimensions).
vector_client = qdrant_client.QdrantClient(":memory:")
vector_client.create_collection(
    collection_name="embeddings",
    vectors_config=qdrant_client.models.VectorParams(size=384, distance=qdrant_client.models.Distance.COSINE),
)

# Set up the model
# Same model name as the processor test earlier in this notebook.
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")

# Set up the revisit delta
# One microsecond; presumably this makes every stored URL count as due for a
# revisit immediately — confirm how gather uses it.
from datetime import timedelta
revisit_delta = timedelta(microseconds=1)

In [6]:
# End the postgres and qdrant servers
# NOTE(review): the psycopg2 `cursor` and `client` are not closed before the
# server is stopped — consider closing them first.
setup_postgres.stop_ephemeral_postgres(temp_dir)
# Removes the name `vector_client`; cells that use it afterwards (the gather and
# scroll cells below) need the setup cell re-run first.
del vector_client

In [None]:
%load_ext autoreload
%autoreload 2

import gather

# Set up the max iterations
# NOTE(review): the section heading mentions 300 links while this is 75 —
# confirm how gather's iteration count maps to the number of links crawled.
max_iter = 75

# Call gather with the objects built in the setup cell: Qdrant client, Postgres
# connection, embedding model and revisit delta. Top-level `await` relies on
# IPython/Jupyter.
await gather.gather(vector_client, client, model, revisit_delta, max_iter)


## Check the vector database (Qdrant) and the relational database (Postgres) for the stored URLs, vectors and metadata

In [None]:
# Check the vector and metadata were stored correctly
# scroll() returns a pair; [0] keeps the list of points and discards the second
# element (presumably the offset for the next page — confirm).
# NOTE(review): at most 10_000 points are fetched, so a larger collection would
# be truncated without any warning.
points = vector_client.scroll(
    collection_name="embeddings",
    with_payload=True,
    with_vectors=True,
    limit=10_000,
)[0]

# Bare expression: the notebook shows the number of retrieved points as cell output.
len(points)

In [None]:
# Get resources from the postgres server
# Reuses the `cursor` created in the Postgres setup cell and reads every row of
# the resources table into memory.
sql = "select * from resources;"
cursor.execute(sql)
all_meta = cursor.fetchall()

# Number of rows currently stored in resources.
print(len(all_meta))

## Save the metadata and vector data for use in testing

In [None]:
from joblib import dump

# Save the Postgres rows and the Qdrant points fetched above as joblib files.
# NOTE(review): test_data/search_engine/ presumably has to exist beforehand — confirm.
dump(all_meta, "test_data/search_engine/postgres_meta.joblib")
dump(points, "test_data/search_engine/qdrant_vectors.joblib")