# Apexon ChatBot (Success Stories)

# Extracting Linked URLs
Webpage used: https://www.apexon.com/success-stories/

In [3]:
import os
import time
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Function to find all linked pages from the main page
def find_linked_pages(url, link_prefix='https://www.apexon.com/resources/case-studies/',
                      scroll_pause=1.0, max_scrolls=10):
    """Return the distinct URLs linked from ``url`` that start with ``link_prefix``.

    The page is opened in a Selenium-driven Chrome instance and scrolled to the
    bottom repeatedly until its height stops growing (or ``max_scrolls`` is
    reached). The rendered HTML is then scanned for ``<a href=...>`` tags.

    Args:
        url: Address of the page to scan.
        link_prefix: Only links whose absolute URL starts with this prefix are kept.
        scroll_pause: Seconds to wait after each scroll before re-measuring the
            page height, giving lazily loaded content time to appear.
        max_scrolls: Upper bound on the number of scroll attempts.

    Returns:
        A sorted list of distinct absolute URLs.
    """
    # Set up the Selenium WebDriver
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    try:
        driver.get(url)

        # Keep scrolling until the document height stops changing, so content
        # added on scroll is present in the page source we read.
        last_height = driver.execute_script("return document.body.scrollHeight")
        for _ in range(max_scrolls):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(scroll_pause)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        page_source = driver.page_source
    finally:
        # Always release the browser, even if loading or scrolling failed.
        driver.quit()

    soup = BeautifulSoup(page_source, 'html.parser')

    # Use a set to collect distinct URLs
    linked_pages = set()
    for link in soup.find_all('a', href=True):
        # Resolve relative hrefs against the page URL so they can match the prefix.
        absolute_href = urljoin(url, link['href'].strip())
        if absolute_href.startswith(link_prefix):
            linked_pages.add(absolute_href)

    # Sort so the result order is the same on every run.
    return sorted(linked_pages)

# Main URL to process
main_url = "https://www.apexon.com/success-stories/"

# Extract and print linked URLs
linked_urls = find_linked_pages(main_url)
print("Distinct Linked URLs found:")
for url in linked_urls:
    print(url)

# `x` keeps its original name in case other cells read the count.
x = len(linked_urls)
print(f"Number of Linked URLs found: {x}")

# An empty result usually means the page did not load fully or the link
# pattern no longer matches; say so instead of continuing silently.
if not linked_urls:
    print("Warning: no linked URLs were found; check that the page loaded fully "
          "and that the link pattern still matches.")

Distinct Linked URLs found:
Number of Linked URLs found:, 0


# Storing the text and embeddings of the webpage and its linked URLs in a file

In [4]:
import requests
from bs4 import BeautifulSoup
import openai
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed

# Function to extract text from a webpage
def extract_text_from_url(url, timeout=30):
    """Download a web page and return its visible text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        The page text with ``<script>`` and ``<style>`` contents removed and
        text nodes joined by single spaces.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx status.
        requests.exceptions.RequestException: On connection errors or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error responses rather than treating an error page as content.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Remove script and style elements
    for script_or_style in soup(['script', 'style']):
        script_or_style.decompose()

    # Extract and return the cleaned text content
    return soup.get_text(separator=' ')

# Function to split text into sections
def split_text_into_sections(text, max_length=1000):
    """Split ``text`` into sections of whole words, each bounded by ``max_length``.

    Words are the whitespace-delimited tokens of ``text``. Each word is counted
    as its length plus one separator character; a new section is started when
    adding the next word would push that running count above ``max_length``.
    A single word that is longer than ``max_length`` is kept whole in a section
    of its own.

    Args:
        text: The text to split.
        max_length: Character budget per section.

    Returns:
        A list of non-empty strings, each made of words joined by single
        spaces. Empty or whitespace-only input yields an empty list.
    """
    sections = []
    current_section = []
    current_length = 0
    for word in text.split():
        current_length += len(word) + 1
        if current_length > max_length:
            # Only flush when there is something to flush: an oversized word at
            # the start of a section must not produce an empty section.
            if current_section:
                sections.append(' '.join(current_section))
            current_section = []
            current_length = len(word) + 1
        current_section.append(word)
    if current_section:
        sections.append(' '.join(current_section))
    return sections

# Function to convert text sections into OpenAI embeddings using batching
def get_openai_embeddings_batch(sections, batch_size=1000):
    """Embed text sections with the ``text-embedding-ada-002`` model.

    Args:
        sections: The texts to embed.
        batch_size: Maximum number of texts sent per API request; must be
            positive. Larger inputs are split across several requests.

    Returns:
        One embedding per input, in the same order the inputs were sent.
        Empty input returns an empty list without calling the API.
    """
    # Nothing to embed: skip the request, which would fail on empty input.
    if not sections:
        return []

    embeddings = []
    for start in range(0, len(sections), batch_size):
        response = openai.Embedding.create(
            input=sections[start:start + batch_size],
            model="text-embedding-ada-002"  # Use the correct model name
        )
        embeddings.extend(item['embedding'] for item in response['data'])
    return embeddings

# Setting OpenAI API key
# The key is read from the OPENAI_API_KEY environment variable so that no
# secret is stored in the notebook. NOTE: any key that was ever written
# directly into this cell should be treated as compromised and revoked.
openai.api_key = os.environ.get("OPENAI_API_KEY")
if not openai.api_key:
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this cell.")

# Function to process a single URL
def process_url(url):
    """Fetch one page, split its text into sections and embed every section.

    Returns:
        A ``(url, sections, embeddings)`` tuple, where ``embeddings`` is the
        result of embedding ``sections``.
    """
    page_sections = split_text_into_sections(extract_text_from_url(url))
    return url, page_sections, get_openai_embeddings_batch(page_sections)

# Main URL to process
main_url = "https://www.apexon.com/success-stories/"

# Process the main URL
print(f"Processing main URL: {main_url}")
main_url_text = extract_text_from_url(main_url)
main_url_sections = split_text_into_sections(main_url_text)
main_url_embeddings = get_openai_embeddings_batch(main_url_sections)

# One record per text section; converted to a DataFrame in a single step below.
data_to_save = [
    {"URL": main_url, "Text": section, "Embedding": embedding}
    for section, embedding in zip(main_url_sections, main_url_embeddings)
]

# `linked_urls` comes from the link-extraction cell, which must be run first.
# URLs that could not be processed are collected here for follow-up.
failed_urls = []

# Use ThreadPoolExecutor to process URLs concurrently
with ThreadPoolExecutor(max_workers=5) as executor:
    futures = {executor.submit(process_url, url): url for url in linked_urls}
    # Collect results in submission order so the saved rows are in the same
    # order on every run; the work itself still runs concurrently.
    for future, url in futures.items():
        try:
            _, sections, embeddings = future.result()
        except Exception as exc:
            # Report and move on: one bad page must not discard the embeddings
            # already computed for the other pages.
            failed_urls.append(url)
            print(f"Failed to process {url}: {exc}")
            continue
        for section, embedding in zip(sections, embeddings):
            data_to_save.append({"URL": url, "Text": section, "Embedding": embedding})

# Convert the list of dictionaries to a DataFrame
df = pd.DataFrame(data_to_save)

# Save the DataFrame to an Excel file
# NOTE(review): each embedding is a list stored in a single cell, and Excel
# limits a cell to 32,767 characters — long vectors may be truncated on write.
# Confirm the saved file round-trips before relying on it.
df.to_excel("extracted_data_and_embeddings.xlsx", index=False)

print("Data has been saved to extracted_data_and_embeddings.xlsx")


Processing main URL: https://www.apexon.com/success-stories/
Data has been saved to extracted_data_and_embeddings.xlsx
