# Web Scraping and Ingestion


This notebook shows how to scrape data starting from a root URL, following the links found on each page down to a specified depth.

`depth` specifies how many levels of links are explored from the root URL. Limiting it is necessary to avoid effectively endless crawls on sites whose pages contain a lot of references (e.g. Wikipedia).

`depth = 2`: the crawler visits the sublinks of the root link, then all the sublinks of those sublinks, and then stops.

In [1]:
# imports

%pip install --upgrade openai beautifulsoup4 requests tiktoken langchain

Collecting openai
  Using cached openai-1.20.0-py3-none-any.whl.metadata (21 kB)
Collecting langchain
  Downloading langchain-0.1.16-py3-none-any.whl.metadata (13 kB)
Collecting langchain-community<0.1,>=0.0.32 (from langchain)
  Downloading langchain_community-0.0.33-py3-none-any.whl.metadata (8.5 kB)
Collecting langchain-core<0.2.0,>=0.1.42 (from langchain)
  Downloading langchain_core-0.1.43-py3-none-any.whl.metadata (5.9 kB)
Using cached openai-1.20.0-py3-none-any.whl (292 kB)
Downloading langchain-0.1.16-py3-none-any.whl (817 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m817.7/817.7 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading langchain_community-0.0.33-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading langchain_core-0.1.43-py3-none-any.whl (289 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append("../code")

import os
from openai import AzureOpenAI
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def extract_html_and_media_urls(url, timeout=10):
    """Fetch a page and return its prettified HTML plus image and video URLs.

    Parameters:
    - url: Address of the page to download.
    - timeout: Seconds to wait for the server before giving up. Without a
      timeout, `requests.get` can block indefinitely on an unresponsive host.

    Returns:
    A `(html_content, images, videos)` tuple. `images` and `videos` are lists
    of absolute URLs taken from the `src` attribute of `<img>` and `<video>`
    tags and resolved against `url`. If the request fails (including on a
    timeout), the first element is an "An error occurred: ..." message and
    both lists are empty.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raises an HTTPError if the status is 4xx, 5xx
    except requests.RequestException as e:
        # Timeouts are RequestException subclasses, so they end up here too
        return f"An error occurred: {e}", [], []

    soup = BeautifulSoup(response.text, 'html.parser')
    html_content = soup.prettify()

    # Relative `src` values are made absolute so they stay usable outside the page
    images = [urljoin(url, img['src']) for img in soup.find_all('img') if 'src' in img.attrs]
    videos = [urljoin(url, video['src']) for video in soup.find_all('video') if 'src' in video.attrs]

    return html_content, images, videos

In [5]:
# Example usage: fetch a sample page. `html_content` and `images` are reused by the
# token-count cell further down, so run this cell first.
url = "https://example.com/"
html_content, images, videos = extract_html_and_media_urls(url)

In [6]:
import tiktoken

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count how many tokens `string` occupies under a tiktoken encoding.

    Parameters:
    - string: Text to tokenize.
    - encoding_name: Name of the tiktoken encoding to look up (e.g. "cl100k_base").

    Returns:
    The length of the token sequence produced by encoding `string`.
    """
    tokens = tiktoken.get_encoding(encoding_name).encode(string)
    return len(tokens)


In [7]:
# Report the size of the raw page in cl100k_base tokens and how many image URLs were found
print("HTML Number of Tokens:", num_tokens_from_string(html_content, "cl100k_base"))
print("Number of Image URLs:", len(images))

HTML Number of Tokens: 376
Number of Image URLs: 0


### Make sure we have the OpenAI Models information

We will need the GPT-4-Turbo and GPT-4-Vision models for this notebook.

When running the cell below, the values should reflect the OpenAI resource you have created, as configured in the `.env` file.

In [8]:
# Azure OpenAI connection settings read from the environment (load_dotenv() was called earlier).
# os.getenv returns None for any variable that is not set.
AZURE_OPENAI_API_BASE = os.getenv("AZURE_OPENAI_RESOURCE")
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_KEY")
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")

In [9]:
# Azure OpenAI client used for the chat-completion calls in run_gpt_cleaning
oai_client = AzureOpenAI(
    azure_endpoint = AZURE_OPENAI_API_BASE, 
    api_key= AZURE_OPENAI_API_KEY,  
    api_version= AZURE_OPENAI_API_VERSION,
)

In [10]:
# Deployment name passed as `model` to the chat-completions call in run_gpt_cleaning
deployment = "gpt-4" # Fill in the deployment name from the portal here

In [11]:

def run_gpt_cleaning(html_content: str) -> str:
    """Ask the chat model to condense raw HTML into Markdown.

    Sends `html_content` as the user message to the `deployment` model through
    `oai_client`, echoes the reply to stdout, and returns it.

    Parameters:
    - html_content: Raw HTML of the page to clean up.

    Returns:
    The text of the first choice in the response, or an empty string when the
    reply carries no text content.
    """
    system_prompt = """
    You are an efficient web scraper. Your goal is to take large HTML files and clean them up. The focus should be on reducing all the redundant HTML tags but keep the overall structure the same.
    Some of the things you should do are:
    - Remove all the redundant tags.
    - Extract the content from the tags and keep it in the same order.
    - The output should be in markdown format
    """

    response = oai_client.chat.completions.create(
        model=deployment,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": html_content},
        ],
        temperature=0,
    )

    message = response.choices[0].message
    # `content` is Optional in the OpenAI SDK; normalise a missing value to ""
    # so the declared `-> str` holds and callers can write the result to a file.
    cleaned_markdown = message.content or ""
    print(f"{message.role}: {cleaned_markdown}")
    return cleaned_markdown
    

In [12]:
import os

def _directory_name_from_url(url):
    """Turn a URL into a single path component usable as a directory name."""
    # Strip only a leading scheme. A blanket replace would also delete an
    # "http://" embedded later in the URL (e.g. in a redirect query parameter)
    # and make distinct URLs map to the same directory.
    for protocol in ('https://', 'http://'):
        if url.startswith(protocol):
            url = url[len(protocol):]
            break
    # Characters that are path separators or not allowed in file names
    forbidden_chars = '/\\:*?"<>|'
    return url.translate(str.maketrans(forbidden_chars, '_' * len(forbidden_chars)))


def save_content(url_string, html_content, markdown_content):
    """
    Saves HTML and Markdown content into separate files within a directory named after the given URL.

    The directory is `data/<sanitized url>` relative to the current working
    directory; the files are `content.html` and `content.md`, written as UTF-8.
    Existing files are overwritten.

    Parameters:
    - url_string: The URL string used to name the directory.
    - html_content: The HTML content to be saved.
    - markdown_content: The Markdown content to be saved.
    """
    directory_path = os.path.join('data', _directory_name_from_url(url_string))

    # exist_ok avoids the check-then-create race of testing os.path.exists first
    os.makedirs(directory_path, exist_ok=True)

    outputs = (
        ('content.html', html_content),
        ('content.md', markdown_content),
    )
    for file_name, content in outputs:
        with open(os.path.join(directory_path, file_name), 'w', encoding='utf-8') as file:
            file.write(content)

    print(f"Files saved in {directory_path}")

In [13]:
# Example usage: writes content.html and content.md under data/example.com_article
save_content("http://example.com/article", "<html>Your HTML content here</html>", "Your Markdown content here")


Files saved in data/example.com_article


In [14]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse



def is_valid_url(url):
    """Return True when `url` is an absolute http(s) URL that can be fetched.

    Relative links, scheme-only links such as `mailto:` or `javascript:`, and
    schemes other than http/https (which `requests` cannot download) are
    rejected, as is input that `urlparse` refuses to parse.
    """
    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse raises on some malformed input, e.g. an unbalanced IPv6 bracket
        return False
    # urlparse lowercases the scheme, so "HTTPS://..." is accepted as well
    return parsed.scheme in ('http', 'https') and bool(parsed.netloc)

def get_links(url, session):
    """Return the distinct hyperlinks found on the page at `url`.

    Parameters:
    - url: Page to download; relative hrefs are resolved against it.
    - session: Object whose `get(url, timeout=...)` performs the request
      (a `requests.Session` in this notebook).

    Returns:
    A set of absolute URLs, with any `#fragment` removed, that pass
    `is_valid_url`. An empty set is returned when the page cannot be fetched.
    """
    from urllib.parse import urldefrag  # local import keeps this cell self-contained

    try:
        response = session.get(url, timeout=5)
        response.raise_for_status()  # Ensure we notice bad responses
    except requests.RequestException as e:
        print(f"Error accessing {url}: {e}")
        return set()

    soup = BeautifulSoup(response.text, 'html.parser')
    links = set()
    for anchor in soup.find_all('a', href=True):
        try:
            # A fragment only addresses a position inside the same document;
            # dropping it keeps page#a and page#b from counting as two pages.
            absolute = urldefrag(urljoin(url, anchor['href'])).url
        except ValueError:
            # One malformed href must not abort link extraction for the whole page
            continue
        if is_valid_url(absolute):
            links.add(absolute)
    return links


def scrape_site(root_url, max_depth=3, max_links=100):
    """Crawl depth-first from `root_url`, cleaning and saving every page visited.

    Each visited page is downloaded with `extract_html_and_media_urls`, converted
    to Markdown with `run_gpt_cleaning`, written to disk with `save_content`, and
    then its links (from `get_links`) are followed recursively.

    Parameters:
    - root_url: URL where the crawl starts (depth 0).
    - max_depth: Pages deeper than this many link hops from the root are skipped.
    - max_links: The crawl stops visiting new pages once this many have been visited.
    """
    visited = set()

    # The context manager closes the session when the crawl ends,
    # including when a page raises part-way through.
    with requests.Session() as session:

        def _scrape(url, depth):
            if url in visited or depth > max_depth or len(visited) >= max_links:
                return
            visited.add(url)
            print(f"Visiting: {url} | Depth: {depth} | Total Visited: {len(visited)}")
            print(f"Processing: {url}")
            html_content, _, _ = extract_html_and_media_urls(url)
            processed_content = run_gpt_cleaning(html_content)
            save_content(url, html_content, processed_content)

            # NOTE: get_links downloads the page again to read its anchors
            for link in get_links(url, session):
                _scrape(link, depth + 1)

        _scrape(root_url, 0)


In [15]:
# Demo crawl: start at example.com, follow links at most 3 levels deep, and stop after 3 pages
scrape_site('https://example.com/', max_depth=3, max_links=3)

Visiting: https://example.com/ | Depth: 0 | Total Visited: 1
Processing: https://example.com/
assistant: # Example Domain

This domain is for use in illustrative examples in documents. You may use this domain in literature without prior coordination or asking for permission.

[More information...](https://www.iana.org/domains/example)
Files saved in data/example.com_
Visiting: https://www.iana.org/domains/example | Depth: 1 | Total Visited: 2
Processing: https://www.iana.org/domains/example
assistant: # Example Domains

As described in [RFC 2606](/go/rfc2606) and [RFC 6761](/go/rfc6761), a number of domains such as example.com and example.org are maintained for documentation purposes. These domains may be used as illustrative examples in documents without prior coordination with us. They are not available for registration or transfer.

We provide a web service on the example domain hosts to provide basic information on the purpose of the domain. These web services are provided as best 

In [None]:

# Second Azure OpenAI client, built from the same endpoint, key and API version as `oai_client`.
# NOTE(review): presumably meant for embedding calls (going by the name), but no cell
# shown here references it — confirm whether it is still needed.
oai_emb_client = AzureOpenAI(
    azure_endpoint = AZURE_OPENAI_API_BASE, 
    api_key= AZURE_OPENAI_API_KEY,
    api_version= AZURE_OPENAI_API_VERSION,
)

## Setup AI Search

## Importing AI Search Credentials

In [None]:
# Azure AI Search endpoint and admin key, read from the COG_SEARCH_* environment
# variables; os.environ.get yields None when a variable is not set.
AI_SEARCH_ENDPOINT = os.environ.get('COG_SEARCH_ENDPOINT')
AI_SEARCH_ADMIN_KEY = os.environ.get('COG_SEARCH_ADMIN_KEY')

In [None]:
from utils.cogsearch_rest import *

# Name of the search index.
# NOTE(review): `index_name`, `fields` and `index` are all reassigned by later cells.
index_name = 'wikipedia_resources'

# Index schema as plain dicts: a string key (`id`), a 1536-dimension vector field bound
# to the "my-vector-profile" profile, and two searchable text fields (`tags`, `text`).
fields = [
            {"name": "id", "type": "Edm.String", "key": True, "searchable": True, "filterable": True, "retrievable": True, "sortable": True},
            {"name": "vector", "type": "Collection(Edm.Single)", "searchable": True,"retrievable": True, "dimensions": 1536,"vectorSearchProfile": "my-vector-profile"},
            {"name": "tags", "type": "Edm.String","searchable": True, "filterable": False, "retrievable": True, "sortable": False, "facetable": False},
            {"name": "text", "type": "Edm.String","searchable": True, "filterable": False, "retrievable": True, "sortable": False, "facetable": False},
]

# CogSearchRestAPI is not imported by name; it presumably comes from the wildcard import
# of utils.cogsearch_rest. Its behaviour is not visible in this notebook.
index = CogSearchRestAPI(index_name, fields=fields)

In [None]:
%pip install azure-identity

In [None]:
from dotenv import load_dotenv
from azure.identity import DefaultAzureCredential
import os

load_dotenv(override=True) # take environment variables from .env.

# Variables not used here do not need to be updated in your .env file.
# Required settings are read with os.environ[...] so a missing one fails immediately
# with a KeyError naming the variable. The two keys are optional: an unset or empty
# value becomes None, which selects Azure AD (DefaultAzureCredential) authentication.
endpoint = os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"]
key_credential = os.environ.get("AZURE_SEARCH_ADMIN_KEY") or None
index_name = os.environ["AZURE_SEARCH_INDEX"]
azure_openai_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]
azure_openai_key = os.environ.get("AZURE_OPENAI_KEY") or None
azure_openai_embedding_deployment = os.environ["AZURE_OPENAI_EMBEDDING_DEPLOYMENT"]
embedding_model_name = os.environ["AZURE_OPENAI_EMBEDDING_MODEL_NAME"]
azure_openai_api_version = os.environ["AZURE_OPENAI_API_VERSION"]

# Either the admin key string or, when no key is configured, a token credential
credential = key_credential or DefaultAzureCredential()

In [None]:
from openai import AzureOpenAI
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
import json

# Bearer-token provider for the Cognitive Services scope; it is only handed to the
# client below when no API key is configured.
openai_credential = DefaultAzureCredential()
token_provider = get_bearer_token_provider(openai_credential, "https://cognitiveservices.azure.com/.default")

# Azure OpenAI client bound to the embedding deployment. Authenticates with the API key
# when `azure_openai_key` is set, otherwise with the token provider.
client = AzureOpenAI(
    azure_deployment=azure_openai_embedding_deployment,
    api_version=azure_openai_api_version,
    azure_endpoint=azure_openai_endpoint,
    api_key=azure_openai_key,
    azure_ad_token_provider=token_provider if not azure_openai_key else None
)


# NOTE(review): `input_data` is not defined in any cell shown in this notebook. It is
# presumably a list of dicts with 'title' and 'content' keys — confirm, and define it
# before this point so the notebook survives Restart & Run All.
titles = [item['title'] for item in input_data]
content = [item['content'] for item in input_data]

# One embeddings request per field; each response carries one vector per input string
title_response = client.embeddings.create(input=titles, model=embedding_model_name)
title_embeddings = [item.embedding for item in title_response.data]
content_response = client.embeddings.create(input=content, model=embedding_model_name)
content_embeddings = [item.embedding for item in content_response.data]

# Attach the vectors to their documents by position. The loop no longer rebinds
# `title`/`content`, which previously replaced the `content` list with a string.
for i, item in enumerate(input_data):
    item['titleVector'] = title_embeddings[i]
    item['contentVector'] = content_embeddings[i]

# Output embeddings to docVectors.json file
output_path = os.path.join('..', 'output', 'docVectors.json')
output_directory = os.path.dirname(output_path)
os.makedirs(output_directory, exist_ok=True)  # no separate existence check needed
with open(output_path, "w") as f:
    json.dump(input_data, f)

In [None]:
from azure.search.documents.indexes import SearchIndexClient
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes.models import (
    SimpleField,
    SearchFieldDataType,
    SearchableField,
    SearchField,
    VectorSearch,
    HnswAlgorithmConfiguration,
    VectorSearchProfile,
    SemanticConfiguration,
    SemanticPrioritizedFields,
    SemanticField,
    SemanticSearch,
    SearchIndex
)


# Create a search index
# `credential` is either the admin key string or a DefaultAzureCredential. AzureKeyCredential
# only accepts a string key, so wrap the value only in that case and pass a token
# credential through unchanged.
index_client = SearchIndexClient(
    endpoint=endpoint,
    credential=AzureKeyCredential(credential) if isinstance(credential, str) else credential,
)
# Index schema: a string key, three searchable text fields, and two 1536-dimension
# vector fields that both use the "myHnswProfile" vector search profile.
# NOTE(review): this rebinds `fields` (and `index` below) from the earlier REST-based cell.
fields = [
    SimpleField(name="id", type=SearchFieldDataType.String, key=True, sortable=True, filterable=True, facetable=True),
    SearchableField(name="title", type=SearchFieldDataType.String),
    SearchableField(name="url", type=SearchFieldDataType.String),
    SearchableField(name="content", type=SearchFieldDataType.String),
    SearchField(name="titleVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True, vector_search_dimensions=1536, vector_search_profile_name="myHnswProfile"),
    SearchField(name="contentVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True, vector_search_dimensions=1536, vector_search_profile_name="myHnswProfile"),
]

# Configure the vector search configuration: one HNSW algorithm configuration ("myHnsw")
# and one profile ("myHnswProfile") that points at it; the vector fields above name the profile.
vector_search = VectorSearch(
    algorithms=[
        HnswAlgorithmConfiguration(
            name="myHnsw"
        )
    ],
    profiles=[
        VectorSearchProfile(
            name="myHnswProfile",
            algorithm_configuration_name="myHnsw",
        )
    ]
)



# Semantic configuration: `title` is the title field and `content` the only content field
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=SemanticPrioritizedFields(
        title_field=SemanticField(field_name="title"),
        content_fields=[SemanticField(field_name="content")]
    )
)

# Create the semantic settings with the configuration
semantic_search = SemanticSearch(configurations=[semantic_config])

# Create the search index with the semantic settings
index = SearchIndex(name=index_name, fields=fields,
                    vector_search=vector_search, semantic_search=semantic_search)
# create_or_update_index is called on the client, so re-running this cell reuses the same index name
result = index_client.create_or_update_index(index)
print(f' {result.name} created')


In [None]:
from azure.search.documents import SearchClient
from azure.core.credentials import AzureKeyCredential

# Upload some documents to the index
# The documents are read from the docVectors.json file written by the embedding step.
# (The directory is not created here: creating an empty directory right before
# reading a file from it cannot make the read succeed.)
output_path = os.path.join('..', 'output', 'docVectors.json')
with open(output_path, 'r') as file:
    documents = json.load(file)

# `credential` is either the admin key string or a DefaultAzureCredential. AzureKeyCredential
# only accepts a string key, so wrap the value only in that case.
search_client = SearchClient(
    endpoint=endpoint,
    index_name=index_name,
    credential=AzureKeyCredential(credential) if isinstance(credential, str) else credential,
)
result = search_client.upload_documents(documents)
print(f"Uploaded {len(documents)} documents")

In [None]:
from azure.search.documents.models import VectorizedQuery

# Pure Vector Search: the query text is embedded and matched against `contentVector`;
# no keyword search text is sent (search_text=None).
# Alternative sample query:
# query = "Where does the word Mathematics come from?"  
query = "When was the first international football match played?"  
  
# Embed the query through the same client and model name used for the documents
embedding = client.embeddings.create(input=query, model=embedding_model_name).data[0].embedding
# Request the 3 nearest neighbours on the contentVector field
vector_query = VectorizedQuery(vector=embedding, k_nearest_neighbors=3, fields="contentVector")
  
results = search_client.search(  
    search_text=None,  
    vector_queries= [vector_query],
    select=["title", "content", "url"],
)  
  
# Print each hit with its score
for result in results:  
    print(f"Title: {result['title']}")  
    print(f"Score: {result['@search.score']}")  
    print(f"Content: {result['content']}")  
    print(f"URL: {result['url']}\n")  
