In [None]:
!pip install selenium beautifulsoup4 readability-lxml requests langchain sentence-transformers lxml

In [8]:
# For link Scrapping
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup

# For web scrapping
from readability.readability import Document as ReadabilityDocument
import requests
#from bs4 import BeautifulSoup
import urllib.parse

# For semantic search
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema import Document
from sentence_transformers.util import cos_sim
from sentence_transformers import SentenceTransformer


In [24]:
def link_scrape(query):
    # DuckDuckGo search URL with query
    duckduckgo_url = f"https://html.duckduckgo.com/html/?q={query.replace(' ', '+')}"
    
    # Path to Chrome browser
    chrome_binary_path = r"C:\Users\dtafm\Downloads\chrome-win64\chrome-win64\chrome.exe"
    
    options = Options()
    options.binary_location = chrome_binary_path
    
    # Path to ChromeDriver
    chromedriver_path = r'C:\Users\dtafm\Downloads\chromedriver-win64\chromedriver-win64\chromedriver.exe'
    service = Service(chromedriver_path)
    
    # Initialize WebDriver
    driver = webdriver.Chrome(service=service, options=options)
    driver.get(duckduckgo_url)
    
    # Retrieve page content
    html_context = driver.page_source
    soup = BeautifulSoup(html_context, "html.parser")
    
    # Extract links
    links = []
    for result in soup.find_all("a", class_="result__a"):
        raw_link = result['href']
        if "uddg=" in raw_link:
            decoded_link = urllib.parse.parse_qs(urllib.parse.urlparse(raw_link).query).get('uddg', [None])[0]
            if decoded_link:
                links.append(decoded_link) 
    
    driver.quit()  # close the driver
    return links

x = link_scrape("santos bike trails")
print(x)

['https://www.floridastateparks.org/parks-and-trails/santos-trailhead-campground', 'https://floridahikes.com/santos-trails/', 'https://www.singletracks.com/bike-trails/santos/', 'https://omba.org/trail-maps/', 'https://www.visitflorida.com/travel-ideas/articles/adventure-biking-santos-trailhead/', 'https://www.floridastateparks.org/learn/biking-cross-florida-greenway', 'https://www.floridastateparks.org/parks-and-trails/santos-trailhead-campground/experiences-amenities', 'https://www.alltrails.com/trail/us/florida/cross-florida-greenway-trail-santos-to-cr-200', 'https://stokedmtb.com/santos-bike-trail-ocala-florida-mountain-bike-trails/', 'https://en.wikipedia.org/wiki/Santos_Trail_System']


In [25]:
def web_scrape(links, max_texts=3):
    combined_text = "" # Initialize text string
    i = 0  # Initialize counter
    
    for link in links:
        if i >= max_texts: 
            break
        
        # Initialize headers for http get
        headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}
        response = requests.get(link, headers=headers, timeout=10)
        
        # Extract text from html
        if response.status_code == 200:
            doc = ReadabilityDocument(response.text)  # Initialize Document
            main_content_html = doc.summary()  # Extract main context as HTML
            soup = BeautifulSoup(main_content_html, "html.parser") # Parse main content
            main_content_text = soup.get_text(strip=True)  # Extract clean text
            combined_text += main_content_text + " " # Add to combined text
            i += 1  
        else:
            continue
        
    return combined_text


y = web_scrape(x)
print(y)




In [26]:
from sentence_transformers import SentenceTransformer

# Load the embedding model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [30]:
def relevant_context(combined_text, query):
    # Create langchain document
    documents = [Document(page_content=combined_text, metadata={"source": "local"})]

    # Initialize the text splitter
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50, add_start_index=True)

    # Split the document into chunks
    all_splits = text_splitter.split_documents(documents)
    
    # Embed the chunks using model
    embeddings = [
        {
            "metadata": doc.metadata,
            "embedding": model.encode(doc.page_content, convert_to_tensor=True)
        }
        for doc in all_splits
    ]
    
    # Embed the query
    query_embedding = model.encode(query, convert_to_tensor=True)

    # Compute cosine similarity
    results = []
    for entry in embeddings:
        score = cos_sim(query_embedding, entry["embedding"]).item()
        results.append({"metadata": entry["metadata"], "score": score})

    # Sort results by score in descending order
    sorted_results = sorted(results, key=lambda x: x['score'], reverse=True)

    # Access top 5 results and their content
    top_5_results = sorted_results[:5]
    for i, result in enumerate(top_5_results, start=1):
        # Extract the start_index from metadata
        start_index = result['metadata'].get('start_index', None)
        if start_index is not None:
            # Find matching content
            content = [
                doc.page_content for doc in all_splits
                if doc.metadata.get('start_index') == start_index
            ]
        else:
            content = ["context not found"]

        # Print the result
        print(f"Top {i} Document:")
        print("Score:", result['score'])
        print("content:", content)
        print()
        
        
relevant_context(y, "santos bike park amenities")


Top 1 Document:
Score: 0.6293128728866577
content: ['riders.A picnic area and bike wash stations are at the trailhead.  Restrooms are down a short path to the hiking trailhead. The Santos Campground adjoins that trailhead, and makes a nice base camp for exploring these trails.Bike wash and picnic pavilion at the Santos TrailheadUse caution that you don’t ride right up on a slower rider or pedestrian. Cyclists of all experience levels use these trails, and the bike paths are also open to hikers.As with all off-road trails, avoid riding after heavy rains so as']

Top 2 Document:
Score: 0.6011033058166504
content: ['sure to sign up as soon as it is announced.The Santos Trailhead has restrooms, potable water, a bike wash rack, and several picnic pavilions\xa0for relaxing\xa0and telling stories of “the ride." There are also several bike shops located very close to the trailhead for needed supplies, repairs or refreshments.The trailhead and campground are located just west off\xa0U.S. 441 on

In [31]:
# Combine the functions to retrieve context docs
def retrieve_context_docs(query):
    links = link_scrape(query)
    combined_text = web_scrape(links, max_texts=3)
    relevant_context(combined_text, query)


query = "santos bike park amenities"

retrieve_context_docs(query)

Top 1 Document:
Score: 0.6135956048965454
content: ['for the vendor village, demo rides and over 400 registered participants who enjoy organized trail rides and related events. It sells out every year quickly, so make sure to sign up as soon as it is announced.The bicycle pump track at Santos TrailheadThe Santos Trailhead has restrooms, potable water, a bike wash rack and several picnic pavilions for relaxing and telling stories of â\x80\x9cthe ride." There are also several bike shops located very close to the trailhead for needed supplies, repairs or']

Top 2 Document:
Score: 0.6011033058166504
content: ['sure to sign up as soon as it is announced.The Santos Trailhead has restrooms, potable water, a bike wash rack, and several picnic pavilions\xa0for relaxing\xa0and telling stories of “the ride." There are also several bike shops located very close to the trailhead for needed supplies, repairs or refreshments.The trailhead and campground are located just west off\xa0U.S. 441 on Southe