# Crawl

> A module to help you crawl policing websites and ingest them for later use with your LLM

This notebook and the associated module and functions assist in scraping and cleaning websites from a given domain.

In [None]:
#| default_exp crawl

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
import requests
import re
import urllib.request
from bs4 import BeautifulSoup
from collections import deque
from html.parser import HTMLParser
from urllib.parse import urlparse
import os
import pandas as pd
import tiktoken
import numpy as np

import openai



This notebook is intended to follow on from `police_risk_open_ai.core.llm`, and mostly replicates the opening stages of the [OpenAI embedding tutorial](https://platform.openai.com/docs/tutorials/web-qa-embeddings), though we make some code changes as we go.

We start by replicating their code that helps scrape pages using beautifulsoup.

In [None]:
#| export

# Regex pattern used to recognise absolute links: the text must start with
# "http", followed by any number of "s" characters, then "://" and at least
# one more character.
HTTP_URL_PATTERN = r'^http[s]*://.+'


class HyperlinkParser(HTMLParser):
    """HTML parser that records the href of every anchor tag it is fed.

    After calling `feed(html)`, the collected values are available, in
    document order, in the `hyperlinks` list.
    """

    def __init__(self):
        super().__init__()
        # href values gathered so far
        self.hyperlinks = []

    def handle_starttag(self, tag, attrs):
        """Store the href attribute of each <a> start tag."""
        # Only anchor tags are of interest
        if tag != "a":
            return

        attributes = dict(attrs)
        if "href" in attributes:
            self.hyperlinks.append(attributes["href"])

def get_hyperlinks(url):
    """Return the href of every anchor tag found on the page at `url`.

    The page is fetched with `urllib.request.urlopen`. An empty list is
    returned when the response is not served as "text/html", or when anything
    goes wrong while fetching or decoding (the error is printed).
    """
    try:
        with urllib.request.urlopen(url) as page:
            # Only HTML responses are read and decoded
            is_html = page.info().get('Content-Type').startswith("text/html")
            markup = page.read().decode('utf-8') if is_html else None
    except Exception as error:
        print(error)
        return []

    if markup is None:
        return []

    # Let the parser collect the anchors
    link_collector = HyperlinkParser()
    link_collector.feed(markup)
    return link_collector.hyperlinks

def get_domain_hyperlinks(local_domain, url):
    """Return the de-duplicated links on `url` that belong to `local_domain`.

    Absolute links (matching HTTP_URL_PATTERN) are kept only when their
    network location equals `local_domain`. Every other link is appended to
    "https://<local_domain>/", except links starting with "#" or "mailto:",
    which are dropped. One trailing "/" is removed from each kept link.
    """
    in_domain = set()

    for href in set(get_hyperlinks(url)):
        if re.search(HTTP_URL_PATTERN, href):
            # Absolute link: discard it when it points at another host
            if urlparse(href).netloc != local_domain:
                continue
            candidate = href
        elif href.startswith("/"):
            # Root-relative link: drop the leading slash before joining
            candidate = "https://" + local_domain + "/" + href[1:]
        elif href.startswith(("#", "mailto:")):
            continue
        else:
            candidate = "https://" + local_domain + "/" + href

        if candidate.endswith("/"):
            candidate = candidate[:-1]
        in_domain.add(candidate)

    return list(in_domain)

def crawl(url):
    """Crawl the pages reachable from `url` on its own domain and save their text.

    Each visited page is written to "text/<domain>/<name>.txt", where <name>
    is the URL without its first 8 characters and with "/" replaced by "_".
    A "processed" folder is also created. Nothing is returned.
    """
    local_domain = urlparse(url).netloc

    # Make sure the output folders exist: page text per domain, plus one for CSV files
    for folder in ("text/", "text/" + local_domain + "/", "processed"):
        if not os.path.exists(folder):
            os.mkdir(folder)

    # URLs still to visit, and every URL ever queued (so nothing is queued twice)
    pending = deque([url])
    seen = {url}

    while pending:
        # Take the most recently added URL
        url = pending.pop()
        print(url) # for debugging and to see the progress

        page_file = 'text/' + local_domain + '/' + url[8:].replace("/", "_") + ".txt"
        with open(page_file, "w", encoding="UTF-8") as f:
            # Download the page and keep only its text, without the tags
            text = BeautifulSoup(requests.get(url).text, "html.parser").get_text()

            # Pages that need JavaScript are reported; their text is still written
            if "You need to enable JavaScript to run this app." in text:
                print("Unable to parse page " + url + " due to JavaScript being required")

            f.write(text)

        # Queue every same-domain link that has not been seen before
        for link in get_domain_hyperlinks(local_domain, url):
            if link in seen:
                continue
            pending.append(link)
            seen.add(link)



Using the above function "out of the box" on the College of Policing APP website though doesn't work as intended.

In [None]:
#|eval: false


# NOTE: `domain` is informational only; crawl() works out the domain from `full_url`.
domain = "college.police.uk/app" # <- put your domain to be crawled
full_url = "https://www.college.police.uk/app" # <- put your domain to be crawled with https or http


# Start crawling from the APP landing page (the output below shows the 403 this produces)
crawl(full_url)

https://www.college.police.uk/app
HTTP Error 403: Forbidden


This seems to be a Cloudflare response to prevent crawlers, but we can get around it by modifying our User-Agent header. Rather than requesting the URL directly, we'll inject a header that looks like the Firefox browser.

That said, when you do scrape websites, make sure you do it ethically: consider how much you're pulling in, what the audience is, and whether you might be impacting service for other users.  Given the APP is public and in high use, I feel that's okay here.

```

    request = urllib.request.Request(url, headers={'User-Agent':'Mozilla/5.0'})
    
    # Try to open the URL and read the HTML
    try:
        # Open the URL and read the HTML
        with urllib.request.urlopen(request) as response:
```
I also add in a max length of the URL, because the College has some real weird stuff that is breaking my code.

```

        # Save text from the url to a <url>.txt file
        if len(url) < 500:
            with open('text/'+local_domain+'/'+url[8:].replace("/", "_") + ".txt", "w", encoding="UTF-8") as f:

```


That code does successfully manage to run through the entirety of the APP domain, so we bundle it up below.

In [None]:
#| export


class HyperlinkParser(HTMLParser):
    """HTML parser that collects the href of every anchor tag it is fed.

    After `feed(html)`, `hyperlinks` holds the non-empty href values in
    document order.
    """

    def __init__(self):
        super().__init__()
        # href values gathered so far
        self.hyperlinks = []

    def handle_starttag(self, tag, attrs):
        """Record the href of an <a> start tag, ignoring missing or empty values."""
        if tag != "a":
            return

        # HTMLParser reports a value-less attribute (`<a href>`) as None; such
        # entries and empty strings are not usable links and would break the
        # string handling done on the collected hyperlinks.
        href = dict(attrs).get("href")
        if href:
            self.hyperlinks.append(href)

def get_hyperlinks(url):
    """Return the href of every anchor tag found on the page at `url`.

    The page is requested with a browser-like User-Agent header. An empty
    list is returned when the URL is malformed, the request fails, or the
    response is not served as "text/html"; errors are printed, never raised.
    """
    try:
        # Built inside the try block: Request() itself raises ValueError for a
        # URL without a scheme, and that should skip the page, not end the crawl.
        request = urllib.request.Request(url, headers={'User-Agent':'Mozilla/5.0'})

        with urllib.request.urlopen(request) as response:
            headers = response.info()

            # If the response is not HTML (or declares no type), there is nothing to parse
            content_type = headers.get('Content-Type') or ""
            if not content_type.startswith("text/html"):
                return []

            # Use the charset the server declares, defaulting to UTF-8, and
            # replace undecodable bytes rather than discarding the whole page.
            charset = headers.get_content_charset() or 'utf-8'
            html = response.read().decode(charset, errors='replace')
    except Exception as e:
        print(e)
        return []

    # Create the HTML Parser and then parse the HTML to get hyperlinks
    parser = HyperlinkParser()
    parser.feed(html)

    return parser.hyperlinks

def get_domain_hyperlinks(local_domain, url):
    """Return the de-duplicated http(s) links on `url` that stay on `local_domain`.

    Every href is resolved against `url`, so "./page.html", "../index.xml",
    "/about" and "//host/path" all become proper absolute URLs. Links to other
    hosts or other schemes (mailto:, tel:, javascript:, ...) are dropped.
    Fragments are removed because they address the same document, and one
    trailing "/" is stripped from each result.
    """
    # Imported locally: the notebook's import cell only brings in urlparse.
    from urllib.parse import urldefrag, urljoin

    clean_links = set()
    for link in get_hyperlinks(url):
        # Skip missing hrefs and links that only point at an anchor on this page
        if not link:
            continue
        link = link.strip()
        if not link or link.startswith("#"):
            continue

        try:
            # Resolve relative to the current page, then drop any "#fragment"
            absolute_link = urldefrag(urljoin(url, link)).url
            parsed = urlparse(absolute_link)
        except ValueError:
            # Unparseable href: ignore it
            continue

        # Keep only web links that remain on the crawled domain
        if parsed.scheme not in ("http", "https") or parsed.netloc != local_domain:
            continue

        if absolute_link.endswith("/"):
            absolute_link = absolute_link[:-1]
        clean_links.add(absolute_link)

    # Return the list of hyperlinks that are within the same domain
    return list(clean_links)

def crawl(url):
    """Crawl the pages reachable from `url` on its own domain and save their text.

    Each visited page is written to "text/<domain>/<name>.txt", where <name>
    is the URL without its first 8 characters and with "/" replaced by "_".
    A "processed" folder is also created for later CSV output.

    URLs of 500 characters or more are skipped. A page whose download fails is
    reported and skipped, so no empty file is left behind and the crawl
    carries on. Nothing is returned.
    """
    # Parse the URL and get the domain
    local_domain = urlparse(url).netloc

    # URLs still to visit, and every URL ever queued (no duplicates)
    queue = deque([url])
    seen = {url}

    # Create the directories for the text files and the csv files
    text_dir = os.path.join("text", local_domain)
    os.makedirs(text_dir, exist_ok=True)
    os.makedirs("processed", exist_ok=True)

    # While the queue is not empty, continue crawling
    while queue:

        # Get the most recently added URL
        url = queue.pop()
        print(url) # for debugging and to see the progress

        # Very long URLs are neither saved nor followed
        if len(url) >= 500:
            continue

        # Download before opening the output file, so a failed request does
        # not leave an empty .txt file behind or abort the whole crawl.
        try:
            response = requests.get(url)
        except requests.RequestException as e:
            print(e)
            continue

        # Get the text but remove the tags
        text = BeautifulSoup(response.text, "html.parser").get_text()

        # Pages that require JavaScript are reported; their text is still saved
        if "You need to enable JavaScript to run this app." in text:
            print("Unable to parse page " + url + " due to JavaScript being required")

        # Save text from the url to a <url>.txt file
        file_name = url[8:].replace("/", "_") + ".txt"
        with open(os.path.join(text_dir, file_name), "w", encoding="UTF-8") as f:
            f.write(text)

        # Get the hyperlinks from the URL and add them to the queue
        for link in get_domain_hyperlinks(local_domain, url):
            if link not in seen:
                queue.append(link)
                seen.add(link)



Let's test it out on my website.  As you can see, it loops through each hyperlink and crawls it in turn, saving each page as a local file.

In [None]:
#|eval: false


# Regex pattern to match a URL (same value as the exported HTTP_URL_PATTERN above)
HTTP_URL_PATTERN = r'^http[s]*://.+'

# NOTE: `domain` is informational only; crawl() works out the domain from `full_url`.
domain = "andreasthinks.me" # <- put your domain to be crawled
full_url = "https://andreasthinks.me/" # <- put your domain to be crawled with https or http


# Crawl the site; each visited URL is printed as it is processed
crawl(full_url)

https://andreasthinks.me/
https://andreasthinks.me/./recent_work.html
https://andreasthinks.me/./index.xml




https://andreasthinks.me/./posts/lockdown_effect/index.html
https://andreasthinks.me/../../index.xml
HTTP Error 400: Bad Request
https://andreasthinks.me/../../recent_work.html
HTTP Error 400: Bad Request
https://andreasthinks.me/../../about.html
HTTP Error 400: Bad Request
https://andreasthinks.me/../../index.html
HTTP Error 400: Bad Request
https://andreasthinks.me/./posts/migrated_to_quarto/index.html
https://andreasthinks.me/./posts/burglary_attendance/index.html


KeyboardInterrupt: 

With the APP website now scraped, we can move on to cleaning irrelevant data, and outputting something we can work with.

In [None]:
#| export

def remove_newlines(serie):
    """Return the string Series with newlines replaced by single spaces.

    Real newline characters and literal backslash-n sequences both become a
    space; double spaces are then collapsed in two passes, which reduces runs
    of up to four spaces to one.
    """
    # regex=False is explicit because the default for `regex` differs between
    # pandas versions: as a regex, '\\n' matches a newline character instead
    # of the literal two-character sequence this step is meant to remove.
    for old in ('\n', '\\n', '  ', '  '):
        serie = serie.str.replace(old, ' ', regex=False)
    return serie

def clean_scrapped_data(scrape_directory, output_file='processed/scraped.csv'):
    """Load every scraped page in `scrape_directory` into a cleaned dataframe.

    `scrape_directory` is the per-domain folder written by crawl(), whose
    files are named "<domain>_<path>.txt". Each file becomes one row:

    - `fname`: the file name without the domain prefix and ".txt" suffix, with
      "-" and "_" turned into spaces and "#update" removed. The domain itself
      is used for the site's root page, which has nothing after the prefix.
    - `text`: `fname`, then ". ", then the page text with newlines removed.

    The dataframe is saved to `output_file` as CSV (its folder is created if
    needed) and returned. Pass `output_file=None` to skip saving.
    """
    # The folder name is the domain prefix carried by every file name in it
    domain = os.path.basename(os.path.normpath(scrape_directory))

    # Create a list to store (title, text) pairs
    texts = []

    # Go through the files in a stable order so the output is reproducible
    for file in sorted(os.listdir(scrape_directory)):
        path = os.path.join(scrape_directory, file)
        if not os.path.isfile(path):
            continue

        # Open the file and read the text
        with open(path, "r", encoding="UTF-8") as f:
            text = f.read()

        # Strip the ".txt" suffix and the domain prefix from the file name
        stem = file[:-4] if file.endswith(".txt") else file
        if stem.startswith(domain):
            stem = stem[len(domain):].lstrip("_")

        # Replace - and _ with spaces and drop "#update"
        title = stem.replace('-', ' ').replace('_', ' ').replace('#update', '')
        texts.append((title or domain, text))

    # Create a dataframe from the list of texts
    df = pd.DataFrame(texts, columns = ['fname', 'text'])

    # Prefix each page's text with its title, with the newlines removed
    df['text'] = df.fname + ". " + remove_newlines(df.text)

    if output_file is not None:
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        df.to_csv(output_file)
    return df

In [None]:
#|eval: false


# Clean everything saved for the College of Policing site. With the default
# `output_file`, the result is also written to processed/scraped.csv.
cleaned_df = clean_scrapped_data("text/www.college.police.uk")
cleaned_df

  serie = serie.str.replace('\\n', ' ')


Unnamed: 0,fname,text
0,.police.uk app,.police.uk app. APP (authorised prof...
1,.police.uk,.police.uk. Working together | Coll...
2,.police.uk about,.police.uk about. About us | College...
3,.police.uk about concordats,.police.uk about concordats. Concord...
4,.police.uk about publication scheme,.police.uk about publication scheme. ...
...,...,...
4441,.police.uk cdn cgi l email protection#cb8fedaa...,.police.uk cdn cgi l email protection#cb8fedaa...
4442,.police.uk cdn cgi l email protection#d3b7f5b2...,.police.uk cdn cgi l email protection#d3b7f5b2...
4443,.police.uk cdn cgi l email protection#206f6446...,.police.uk cdn cgi l email protection#206f6446...
4444,.police.uk cdn cgi l email protection#97f4f8f9...,.police.uk cdn cgi l email protection#97f4f8f9...


So there you have it! An entire website, scraped and cleaned, ready to be ingested into our AI model.

First, we load the cleaned CSV back in and rename its columns.

In [None]:
#|eval: false

# Reload the cleaned pages; the first CSV column holds the saved dataframe index
df = pd.read_csv("processed/scraped.csv",index_col=0)
# Rename the two remaining columns (written as fname/text) to title/text
df.columns = ['title', 'text']

df

Before we can begin analysis, we need to split our text into `tokens`, recognisable chunks of text-data our model will recognise.

In [None]:
#| export

# Load the cl100k_base tokenizer which is designed to work with the ada-002 model.
# It is module-level state: split_into_many() reads this `tokenizer` global.
tokenizer = tiktoken.get_encoding("cl100k_base")


In [None]:
#|eval: false

# Encode each page's text and store how many tokens it produces in a new column
df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x)))

# Plot a histogram of the per-page token counts
df.n_tokens.hist()

We can't handle that many tokens, so we build a function to break them into chunks.

In [None]:
#| export


def split_into_many(text, max_tokens = 500):
    """Split `text` into chunks of whole sentences of at most `max_tokens` tokens.

    The text is cut on ". " and sentences are packed greedily into chunks;
    each chunk is the sentences re-joined with ". " plus a closing ".".
    Token counts come from the module-level `tokenizer`. A single sentence
    longer than `max_tokens` is dropped. Blank input gives an empty list.

    Returns a list of chunk strings in their original order.
    """
    if not text or not text.strip():
        return []

    # Split the text into sentences
    sentences = text.split('. ')

    # Get the number of tokens for each sentence
    n_tokens = [len(tokenizer.encode(" " + sentence)) for sentence in sentences]

    chunks = []
    tokens_so_far = 0
    chunk = []

    # Loop through the sentences together with their token counts
    for sentence, token in zip(sentences, n_tokens):

        # If this sentence would push the current chunk past the limit, close
        # the chunk first. An empty chunk is never emitted.
        if chunk and tokens_so_far + token > max_tokens:
            chunks.append(". ".join(chunk) + ".")
            chunk = []
            tokens_so_far = 0

        # A sentence that is too long on its own cannot fit in any chunk
        if token > max_tokens:
            continue

        # Otherwise, add the sentence to the chunk; the extra 1 accounts for
        # the separator re-inserted when the chunk is joined
        chunk.append(sentence)
        tokens_so_far += token + 1

    # Add the sentences still buffered after the loop as the final chunk
    if chunk:
        chunks.append(". ".join(chunk) + ".")

    return chunks
    
    


In [None]:
#| export
def produce_df_embeddings(df, chunk_size=100, output_file='processed/embeddings.csv', max_retries=5):
    """Add an `embeddings` column to `df` using the OpenAI embeddings API.

    Rows with duplicate `text` are dropped, then the remaining rows are
    embedded `chunk_size` rows at a time. After each finished chunk, the rows
    embedded so far are saved to `output_file` as a checkpoint.

    Parameters:
        df: dataframe with a `text` column.
        chunk_size: number of rows embedded between checkpoints (at least 1).
        output_file: CSV path for the checkpoints; its folder is created if
            needed. Pass None to skip saving.
        max_retries: attempts made on a chunk before giving up.

    Returns the de-duplicated dataframe, sorted by index, with each embedding
    stored as a numpy array.

    Raises ValueError for a `chunk_size` below 1, and RuntimeError when a
    chunk still fails after `max_retries` attempts.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    df = df.drop_duplicates(subset=['text'])

    if output_file is not None:
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

    attempts_allowed = max(1, max_retries)
    embedded_chunks = []

    for start in range(0, len(df), chunk_size):
        print('remaining length')
        print(len(df) - start)

        # Copy the slice so the new column is not written into a view of `df`
        new_chunk = df.iloc[start:start + chunk_size].copy()

        for attempt in range(1, attempts_allowed + 1):
            try:
                new_chunk['embeddings'] = new_chunk.text.apply(lambda x: openai.Embedding.create(input=x, engine='text-embedding-ada-002')['data'][0]['embedding'])
                break
            except Exception as error:
                print('chunk failed')
                print(error)
                # Stop after a bounded number of attempts instead of calling
                # the API forever on a permanent failure
                if attempt == attempts_allowed:
                    raise RuntimeError(
                        "embedding failed after " + str(attempts_allowed) + " attempts"
                    ) from error

        embedded_chunks.append(new_chunk)

        # Checkpoint everything embedded so far
        if output_file is not None:
            pd.concat(embedded_chunks).to_csv(output_file)

    # Nothing to embed: return the empty frame with the expected column
    if not embedded_chunks:
        return df.assign(embeddings=pd.Series(dtype=object))

    chunked_df = pd.concat(embedded_chunks)
    chunked_df['embeddings'] = chunked_df['embeddings'].apply(np.array)

    return chunked_df.sort_index()

## Bringing it Together
Let's now make a master function that scrapes a website, cleans it, tokenises it, converts to embeddings, and saves.

In [None]:

#| export
# Exported because scrape_website() calls crawl(url, export_dir=...), which
# the earlier one-argument crawl(url) definitions do not accept.
def crawl(url, export_dir='scrape_export'):
    """Crawl the pages reachable from `url` on its own domain and save their text.

    Everything is written below `export_dir` (created if missing):

    - "<export_dir>/text/<domain>/<name>.txt" holds each page's text, where
      <name> is the URL without its first 8 characters and with "/" replaced
      by "_".
    - "<export_dir>/processed" is created for later CSV output.

    URLs of 500 characters or more are skipped, matching the guard in the
    earlier crawl(url) definition. A page whose download fails is reported
    and skipped, so no empty file is left behind and the crawl carries on.
    Nothing is returned.
    """
    # Parse the URL and get the domain
    local_domain = urlparse(url).netloc

    # Create the directories for the text files and the csv files
    text_dir = os.path.join(export_dir, "text", local_domain)
    os.makedirs(text_dir, exist_ok=True)
    os.makedirs(os.path.join(export_dir, "processed"), exist_ok=True)

    # URLs still to visit, and every URL ever queued (no duplicates)
    queue = deque([url])
    seen = {url}

    # While the queue is not empty, continue crawling
    while queue:

        # Get the most recently added URL
        url = queue.pop()
        print(url) # for debugging and to see the progress

        # Very long URLs are neither saved nor followed
        if len(url) >= 500:
            continue

        # Download before opening the output file, so a failed request does
        # not leave an empty .txt file behind or abort the whole crawl.
        try:
            response = requests.get(url)
        except requests.RequestException as e:
            print(e)
            continue

        # Get the text but remove the tags
        text = BeautifulSoup(response.text, "html.parser").get_text()

        # Pages that require JavaScript are reported; their text is still saved
        if "You need to enable JavaScript to run this app." in text:
            print("Unable to parse page " + url + " due to JavaScript being required")

        # Save text from the url to a <url>.txt file
        file_name = url[8:].replace("/", "_") + ".txt"
        with open(os.path.join(text_dir, file_name), "w", encoding="UTF-8") as f:
            f.write(text)

        # Get the hyperlinks from the URL and add them to the queue
        for link in get_domain_hyperlinks(local_domain, url):
            if link not in seen:
                queue.append(link)
                seen.add(link)


In [None]:
#|eval: false

# NOTE: `domain` is informational only; crawl() works out the domain from `full_url`.
domain = "andreasthinks.me" # <- put your domain to be crawled
full_url = "https://andreasthinks.me/" # <- put your domain to be crawled with https or http


# Uses the default export_dir, so files are written under scrape_export/
crawl(full_url)

https://andreasthinks.me/
https://andreasthinks.me/./recent_work.html
https://andreasthinks.me/./index.xml




https://andreasthinks.me/./posts/lockdown_effect/index.html
https://andreasthinks.me/../../index.xml


KeyboardInterrupt: 

In [None]:
#| export

def scrape_website(url, export_dir='scrape_export'):
    """Crawl `url`, clean the saved pages and return them with embeddings.

    Steps:
    1. crawl() saves each page's text under "<export_dir>/text/<domain>/".
    2. clean_scrapped_data() turns that folder into
       "<export_dir>/processed/scraped.csv".
    3. The CSV is reloaded with columns `title` and `text`.
    4. produce_df_embeddings() adds an `embeddings` column.

    Returns the dataframe produced by produce_df_embeddings().
    """
    crawl(url, export_dir=export_dir)

    # crawl() stores pages in a folder named after the URL's network location
    local_domain = urlparse(url).netloc
    scraped_csv = os.path.join(export_dir, "processed", "scraped.csv")

    # Build the CSV that is read back below; crawl() only writes the raw text files
    clean_scrapped_data(os.path.join(export_dir, "text", local_domain), output_file=scraped_csv)

    df = pd.read_csv(scraped_csv, index_col=0)
    df.columns = ['title', 'text']

    # NOTE(review): each page is embedded whole; split_into_many() is not
    # applied here, so very long pages are not chunked first.
    chunked_df = produce_df_embeddings(df, chunk_size=100)

    return chunked_df
    

In [None]:
#| hide
# Run nbdev's export step so the cells flagged `#| export` are written to the library module
import nbdev; nbdev.nbdev_export()