In [None]:
!pip install selenium beautifulsoup4 readability-lxml requests langchain sentence-transformers lxml

In [1]:
# For link Scrapping
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup

# For web scrapping
from readability.readability import Document as ReadabilityDocument
import requests
#from bs4 import BeautifulSoup
import urllib.parse
import uuid

# For semantic search
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema import Document
from sentence_transformers.util import cos_sim
from sentence_transformers import SentenceTransformer


ImportError: tokenizers>=0.21,<0.22 is required for a normal functioning of this module, but found tokenizers==0.20.3.
Try: `pip install transformers -U` or `pip install -e '.[dev]'` if you're working with git main

In [11]:
def link_scrape(query):
    # DuckDuckGo search URL with query
    duckduckgo_url = f"https://html.duckduckgo.com/html/?q={query.replace(' ', '+')}"
    
    # Path to Chrome browser
    chrome_binary_path = r"chrome-win64/chrome-win64/chrome.exe"
    
    options = Options()
    options.binary_location = chrome_binary_path
    
    # Path to ChromeDriver
    chromedriver_path = r"chromedriver-win64/chromedriver-win64/chromedriver.exe"
    service = Service(chromedriver_path)
    
    # Initialize WebDriver
    driver = webdriver.Chrome(service=service, options=options)
    driver.get(duckduckgo_url)
    
    # Retrieve page content
    html_context = driver.page_source
    soup = BeautifulSoup(html_context, "html.parser")
    
    # Extract links
    links = []
    for result in soup.find_all("a", class_="result__a"):
        raw_link = result['href']
        if "uddg=" in raw_link:
            decoded_link = urllib.parse.parse_qs(urllib.parse.urlparse(raw_link).query).get('uddg', [None])[0]
            if decoded_link:
                links.append(decoded_link) 
    
    driver.quit()  # close the driver
    return links

x = link_scrape("what is the suspension forks purpose on a bike")
print(x)

['https://www.bikeperfect.com/features/mountain-bike-suspension-forks-explained', 'https://www.choosemybicycle.com/en/woc/suspension-fork-everything-you-need-to-know-about-a-bicycle-suspension', 'https://outdoorlabwithj.com/suspension-fork-what-you-need-to-know/', 'https://en.wikipedia.org/wiki/Bicycle_fork', 'https://bike.bikegremlin.com/3832/advantages-and-disadvantages-of-suspension-shock-absorbers-on-bicycles/', 'https://cyclistguy.com/suspension-forks-for-mountain-bikes/', 'https://www.merlincycles.com/blog/buyers-guide-to-suspension-forks/', 'https://brainybiker.com/archives/293', 'https://en.wikipedia.org/wiki/Bicycle_suspension', 'https://bikexchange.com/mountain-bike-suspension-guide/']


In [12]:
def web_scrape(links, max_texts=3):
    documents = []  # List to store documents with metadata
    i = 0  # Initialize counter
    
    for link in links:
        if i >= max_texts: 
            break
        
        # Initialize headers for HTTP GET
        headers = {
            'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        try:
            response = requests.get(link, headers=headers, timeout=10)
            
            if response.status_code == 200:
                # Extract text from HTML
                doc = ReadabilityDocument(response.text)  # Initialize Readability Document
                main_content_html = doc.summary()  # Extract main content as HTML
                soup = BeautifulSoup(main_content_html, "html.parser")  # Parse HTML
                main_content_text = soup.get_text(strip=True)  # Extract clean text                
                # Append document metadata to the list
                doc = Document(page_content=main_content_text, metadata={"source": link})
                documents.append(doc)
                i += 1
        except Exception as e:
            print(f"Error processing link {link}: {e}")
            continue
    
    return documents

# Example usage:
documents = web_scrape(x, max_texts=3)
print(documents)

[Document(metadata={'source': 'https://www.bikeperfect.com/features/mountain-bike-suspension-forks-explained'}, page_content="Mountain bike suspension has a whole language of its own, and some of it seems to make no sense at all. Learn how to talk fork with our experts though and you’ll know what everything in your front suspension is called and what it does. We’ve also included the common descriptors of fork performance and problems, complete with advice on how to adjust your fork so it rides perfectly for you.You can navigate your way through the terms using the side navigation, we've also linked the explanations where relevant, so click on the hyperlinks for further info.For more on forks, see ourbest mountain bike forksguide andA to Z of MTB suspension.Air springSpring formed by a sealed air chamber. Lighter than acoil springand can be easily adjusted by altering the air pressure. Will naturally ramp up in resistance deeper in the stroke though and can theoretically change spring r

In [5]:
from sentence_transformers import SentenceTransformer

# Load the embedding model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [14]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer, util as st_utils
from sentence_transformers.util import cos_sim

def text_splitter(documents):
    # Initialize the text splitter
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50, add_start_index=True)

    # Split the documents into chunks
    all_splits = text_splitter.split_documents(documents)
    
    # Add a unique ID to each split
    for split in all_splits:
        unique_id = str(uuid.uuid4())  # Generate a unique ID for each split
        split.metadata["id"] = unique_id  # Add the unique ID to the split's metadata
    
    return all_splits


def relevant_context(all_splits, query):


    # Embed the chunks using model
    embeddings = [
        {
            "metadata": doc.metadata,
            "embedding": model.encode(doc.page_content, convert_to_tensor=True)
        }
        for doc in all_splits
    ]
    
    # Embed the query
    query_embedding = model.encode(query, convert_to_tensor=True)

    # Compute cosine similarity
    results = []
    for entry in embeddings:
        score = cos_sim(query_embedding, entry["embedding"]).item()
        results.append({"metadata": entry["metadata"], "score": score})

    # Sort results by score in descending order
    sorted_results = sorted(results, key=lambda x: x['score'], reverse=True)

    # Access top 5 results and their content
    top_5_results = sorted_results[:5]
    for i, result in enumerate(top_5_results, start=1):
        # Extract the start_index from metadata
        start_index = result['metadata'].get('start_index', None)
        if start_index is not None:
            # Find matching content
            content = [
                doc.page_content for doc in all_splits
                if doc.metadata.get('start_index') == start_index
            ]
        else:
            content = ["context not found"]

        # Print the result
        print(f"Top {i} Document:")
        print("Score:", result['score'])
        print("content:", content)
        print("metadata:", result['metadata'])
        print()
        
        
all_splits = text_splitter(documents)       
relevant_context(all_splits, "suspension forks purpose on a bike")


Top 1 Document:
Score: 0.7459707260131836
content: ['By adjusting your air pressure, compression damping, and rebound damping, you can fine-tune your suspension to suit your riding style and the type of terrain you’ll be riding on.ConclusionUnderstanding suspension forks is essential for any bike enthusiast looking to enhance their performance and experience a smoother ride. Suspension forks come in a variety of types, each suited for different types of biking. When selecting a suspension fork, factors such as bike frame and riding style should be considered']
metadata: {'source': 'https://outdoorlabwithj.com/suspension-fork-what-you-need-to-know/', 'start_index': 15657, 'id': '75b57e95-5d16-439c-aaeb-190206514551'}

Top 2 Document:
Score: 0.7111109495162964
content: ['Another important factor to consider is thewheel axle. Some forks are designed for traditional quick-release axles, while others use through-axles, which provide increased stiffness and stability. Your bike’s frame will 

In [None]:
# Combine the functions to retrieve context docs
def retrieve_context_docs(query):
    links = link_scrape(query)
    combined_text = web_scrape(links, max_texts=3)
    relevant_context(combined_text, query)


query = "santos bike park amenities"

retrieve_context_docs(query)

In [100]:
print(all_splits)



In [8]:
import chromadb

client = chromadb.PersistentClient(path="./my_chroma_data")

collection = client.create_collection(name="trailBud.2")


In [None]:
#erase all content from the database
#collection.delete()

In [15]:
for split in all_splits:
    collection.add(
    documents=split.page_content,
    metadatas={"source": split.metadata.get("source"), "id": split.metadata.get("id")},
    ids= split.metadata.get("id")
)

In [22]:
# Assuming you already have your Chroma client and collection set up

# Example query
query = "what is a suspension fork?"

# Step 1: Convert the query into an embedding
query_embedding = model.encode(query)  # replace 'model' with your model used for embeddings

# Step 2: Perform the semantic search in your Chroma collection
results = collection.query(
    query_embeddings=query_embedding,
    n_results=3  # Adjust the number of results you want
)

# Step 3: Display the results
for i, doc_list in enumerate(results['documents'], start=1):
    print(f"Result {i}:")
    for j, doc in enumerate(doc_list, start=1):
        print(f"  Part {j}: {doc}\n")  # Add a prefix for sub-documents if needed


Result 1:
  Part 1: the terminology is key to making informed decisions. Here are some common terms you should know:TermDefinitionRight suspension forkA suspension fork designed for the specific geometry of abike frame. Using the correct right suspension fork can greatly improve the performance and comfort of your ride.Rear suspensionArear suspensionsystem, such as a shock absorber, can work in conjunction with a front suspension fork to provide a smoother ride over rough terrain.Metal coil springA type of suspension fork

  Part 2: By adjusting your air pressure, compression damping, and rebound damping, you can fine-tune your suspension to suit your riding style and the type of terrain you’ll be riding on.ConclusionUnderstanding suspension forks is essential for any bike enthusiast looking to enhance their performance and experience a smoother ride. Suspension forks come in a variety of types, each suited for different types of biking. When selecting a suspension fork, factors such a

In [19]:
similarity_scores = results["distances"]
print(similarity_scores)

[[0.6323789392157456, 0.7505471482812076, 0.7618706226348877]]
