# Apexon ChatBot

# Extracting Linked URLs
Webpage used : https://www.apexon.com/success-stories/

In [18]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

# Function to find all linked pages from the main page
def find_linked_pages(url):
    # Set up the Selenium WebDriver
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    driver.get(url)
    
    # Scroll down to the bottom of the page to load all content (if infinite scrolling is used)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    
    # Get the page source after it has been fully loaded
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    
    # Close the driver
    driver.quit()
    
    # Find all 'a' tags with href attributes
    links = soup.find_all('a', href=True)
    
    # Use a set to collect distinct URLs
    linked_pages = set()
    for link in links:
        href = link['href']
        # Only consider URLs that match the pattern
        if href.startswith('https://www.apexon.com/resources/case-studies/'):
            linked_pages.add(href)
    
    # Convert the set back to a list for ordered printing
    return list(linked_pages)

# Main URL to process
main_url = "https://www.apexon.com/success-stories/"

# Extract and print linked URLs
linked_urls = find_linked_pages(main_url)
x = 0
print("Distinct Linked URLs found:")
for url in linked_urls:
    print(url)
    x = x+1
print(f"Number of Linked URLs found:, {x}")

Distinct Linked URLs found:
https://www.apexon.com/resources/case-studies/engineering-giant-gets-95-accuracy-in-customer-service-automation/
https://www.apexon.com/resources/case-studies/modernizing-insurance-business-operations-with-data-visualization/
https://www.apexon.com/resources/case-studies/german-automaker-labels-12m-objects-per-annum-from-street-videos/
https://www.apexon.com/resources/case-studies/leading-us-bank-accelerates-their-path-to-digital-banking/
https://www.apexon.com/resources/case-studies/manufacturing-solution-distributor-grows-organic-sales-with-new-ecommerce-platform/
https://www.apexon.com/resources/case-studies/modern-data-infrastructure-provides-new-consumer-insights-for-global-alcoholic-beverage-company/
https://www.apexon.com/resources/case-studies/bring-healthcare-innovation-to-market-faster-willow/
https://www.apexon.com/resources/case-studies/healthcare-payer-saves-overpayments-worth-1-5m-per-annum/
https://www.apexon.com/resources/case-studies/west-hi

# Extracting, Chunking the content, & generating embeddings of linked URLs
Webpage: Success Stories

The below code took 15+ minutes to run before throwing "IOPub data rate exceeded" **error** due to individual API call for each section-


In [25]:
import requests
from bs4 import BeautifulSoup
import openai

#Function to extract text from a webpage
def extract_text_from_url(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Remove script and style elements
    for script_or_style in soup(['script', 'style']):
        script_or_style.decompose()
    
    # Extract and return the cleaned text content
    text = soup.get_text(separator=' ')
    return text

#Function to split text into sections
def split_text_into_sections(text, max_length=1000):
    words = text.split()
    sections = []
    current_section = []
    current_length = 0
    for word in words:
        current_length += len(word) + 1
        if current_length > max_length:
            sections.append(' '.join(current_section))
            current_section = []
            current_length = len(word) + 1
        current_section.append(word)
    if current_section:
        sections.append(' '.join(current_section))
    return sections


#Function to convert text sections into OpenAI embeddings
def get_openai_embedding(text):
    response = openai.Embedding.create(
        input=text,
        model="text-embedding-3-small"
    )
    return response['data'][0]['embedding']

#Setting OpenAI API key
openai.api_key = 'sk-None-dvpc3JgnrJeJTsYUkTs9T3BlbkFJaLbQpougGlLcVMLDtRrI'

# Main URL to process
main_url = "https://www.apexon.com/success-stories/"

#Extract data of main URL
print(f"Processing main URL: {main_url}")
main_url_text = extract_text_from_url(main_url)

#Split into sections
main_url_sections = split_text_into_sections(main_url_text)

#Generate Embeddings
main_url_embeddings = [get_openai_embedding(section) for section in main_url_sections]

all_embeddings_dict = {main_url: main_url_embeddings}

#Extract data, split into sections and load embeddings of the linked URLs into the all_embeddings_dict dictionary
for url in linked_urls:
    print(f"Processing linked URL: {url}")
    text = extract_text_from_url(url)
    sections = split_text_into_sections(text)
    
    embeddings = [get_openai_embedding(section) for section in sections]
    all_embeddings_dict[url] = embeddings

#Print the first few embeddings from each URL
for url, embeddings in all_embeddings_dict.items():
    print(url, embeddings)

Processing main URL: https://www.apexon.com/success-stories/
Processing linked URL: https://www.apexon.com/resources/case-studies/engineering-giant-gets-95-accuracy-in-customer-service-automation/
Processing linked URL: https://www.apexon.com/resources/case-studies/modernizing-insurance-business-operations-with-data-visualization/
Processing linked URL: https://www.apexon.com/resources/case-studies/german-automaker-labels-12m-objects-per-annum-from-street-videos/
Processing linked URL: https://www.apexon.com/resources/case-studies/leading-us-bank-accelerates-their-path-to-digital-banking/
Processing linked URL: https://www.apexon.com/resources/case-studies/manufacturing-solution-distributor-grows-organic-sales-with-new-ecommerce-platform/
Processing linked URL: https://www.apexon.com/resources/case-studies/modern-data-infrastructure-provides-new-consumer-insights-for-global-alcoholic-beverage-company/
Processing linked URL: https://www.apexon.com/resources/case-studies/bring-healthcare

APIError: HTTP code 500 from API (<html>
<head><title>500 Internal Server Error</title></head>
<body>
<center><h1>500 Internal Server Error</h1></center>
<hr><center>nginx</center>
</body>
</html>
)

The below code took 2 minutes 8 seconds to run using **Batch Learning Technique**-

In [23]:
import requests
from bs4 import BeautifulSoup
import openai
from concurrent.futures import ThreadPoolExecutor, as_completed

# Function to extract text from a webpage
def extract_text_from_url(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Remove script and style elements
    for script_or_style in soup(['script', 'style']):
        script_or_style.decompose()
    
    # Extract and return the cleaned text content
    text = soup.get_text(separator=' ')
    return text

# Function to split text into sections
def split_text_into_sections(text, max_length=1000):
    words = text.split()
    sections = []
    current_section = []
    current_length = 0
    for word in words:
        current_length += len(word) + 1
        if current_length > max_length:
            sections.append(' '.join(current_section))
            current_section = []
            current_length = len(word) + 1
        current_section.append(word)
    if current_section:
        sections.append(' '.join(current_section))
    return sections

# Function to convert text sections into OpenAI embeddings using batching
def get_openai_embeddings_batch(sections):
    response = openai.Embedding.create(
        input=sections,
        model="text-embedding-3-small"
    )
    return [item['embedding'] for item in response['data']]

# Setting OpenAI API key
openai.api_key = 'sk-None-dvpc3JgnrJeJTsYUkTs9T3BlbkFJaLbQpougGlLcVMLDtRrI'  # Ensure this is correct

# Function to process a single URL
def process_url(url):
    text = extract_text_from_url(url)
    sections = split_text_into_sections(text)
    embeddings = get_openai_embeddings_batch(sections)
    return url, embeddings

# Main URL to process
main_url = "https://www.apexon.com/success-stories/"

# Process the main URL
print(f"Processing main URL: {main_url}")
main_url_text = extract_text_from_url(main_url)
main_url_sections = split_text_into_sections(main_url_text)
main_url_embeddings = get_openai_embeddings_batch(main_url_sections)

# Dictionary to store all embeddings
all_embeddings_dict = {main_url: main_url_embeddings}

# Use ThreadPoolExecutor to process URLs concurrently
with ThreadPoolExecutor(max_workers=5) as executor:
    futures = {executor.submit(process_url, url): url for url in linked_urls}
    for future in as_completed(futures):
        url, embeddings = future.result()
        all_embeddings_dict[url] = embeddings

# Print the first few embeddings from each URL
for url, embeddings in all_embeddings_dict.items():
    print(f"\nEmbeddings for {url} :")
    for i, embedding in enumerate(embeddings[:3]):
        print(i, f"Embedding {i+1}: {embedding[:10]}...")  # Print the first 10 dimensions for brevity

Processing main URL: https://www.apexon.com/success-stories/

Embeddings for https://www.apexon.com/success-stories/ :
0 Embedding 1: [0.047493379563093185, -0.008638953790068626, 0.013236657716333866, 0.030215471982955933, 0.02175736613571644, -0.0011711625847965479, -0.03397153690457344, 0.027934009209275246, -0.0014076556544750929, 0.002514478052034974]...
1 Embedding 2: [0.021187709644436836, -0.0005492068594321609, 0.019592495635151863, 0.03583095222711563, 0.049547065049409866, -0.027827616780996323, -0.034194838255643845, 0.008698686957359314, 0.038585081696510315, -0.022142110392451286]...
2 Embedding 3: [0.004289904609322548, -0.005249622743576765, 0.044520054012537, 0.043380603194236755, 0.021066315472126007, -0.020727191120386124, -0.0488608293235302, 0.031877551227808, 0.005934651009738445, -0.01889592781662941]...

Embeddings for https://www.apexon.com/resources/case-studies/modernizing-insurance-business-operations-with-data-visualization/ :
0 Embedding 1: [0.015915801748

# Sample Data Extraction & Conversion of text into embeddings 
(Webpage used - https://www.apexon.com)

In [42]:
import requests
from bs4 import BeautifulSoup
import openai

# Function to extract text from a webpage
def extract_text_from_url(url):
    # Fetch the webpage content
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Remove script and style elements
    for script_or_style in soup(['script', 'style']):
        script_or_style.decompose()  # This removes the tag from the soup
    
    # Extract and return the cleaned text content
    text = soup.get_text(separator=' ')
    return text


# Function to split the text into manageable sections
def split_text_into_sections(text, max_length=1000):
    # Split the text into sections based on a max character length
    words = text.split()
    sections = []
    
    current_section = []
    current_length = 0
    
    for word in words:
        current_length += len(word) + 1
        if current_length > max_length:
            sections.append(' '.join(current_section))
            current_section = []
            current_length = len(word) + 1
            
        current_section.append(word)
    
    if current_section:
        sections.append(' '.join(current_section))
    
    return sections

# Function to convert text sections into OpenAI embeddings
def get_openai_embedding(text):
    response = openai.Embedding.create(
        input=text,
        model="text-embedding-3-small"
    )
    return response['data'][0]['embedding']

#OpenAI API key
openai.api_key = 'sk-None-dvpc3JgnrJeJTsYUkTs9T3BlbkFJaLbQpougGlLcVMLDtRrI'

# Example usage
url = "https://www.apexon.com"
text = extract_text_from_url(url)
sections = split_text_into_sections(text)

# Converting each text section into an embedding
embeddings = [get_openai_embedding(section) for section in sections]
print(embeddings)

# Print the first few embeddings
#for i, embedding in enumerate(embeddings[:3]):
#    print(f"Embedding {i+1}: {embedding[:10]}...")  # Print the first 10 dimensions for brevity
print("\nText from website:", "\n", sections)

[[0.02907681092619896, 0.006551130674779415, 0.023223333060741425, 0.02271966077387333, 0.018186619505286217, -0.006476260721683502, -0.02940351702272892, 0.031009819358587265, -0.004206336569041014, 0.014797047711908817, 0.03335120901465416, -0.08401782810688019, -0.012523720040917397, -0.040947120636701584, 0.03226219117641449, 0.054723210632801056, 0.006319714244455099, -0.005428079515695572, 0.005438289139419794, 0.04503094032406807, 0.04048428684473038, 0.02904958464205265, -0.003961306996643543, 0.02119503542780876, 0.02408093586564064, -0.031989935785532, -0.04015757888555527, 0.0445408821105957, 0.004322044551372528, -0.04759013652801514, 0.051592279225587845, -0.026340650394558907, -0.014170861802995205, 0.0037026649806648493, 0.03650936484336853, 0.03234386816620827, 0.041627757251262665, 0.03046531043946743, -0.021113358438014984, 0.02088194154202938, 0.03215328976511955, 0.02369977906346321, 0.01310226134955883, 0.05946044251322746, -0.014565630815923214, 0.0065136956982314

# Extracting links from main page
Webpage used: https://www.apexon.com/success-stories/

In [36]:
import requests
from bs4 import BeautifulSoup
import openai

# Function to extract text from a webpage
def extract_text_from_url(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Remove script and style elements
    for script_or_style in soup(['script', 'style']):
        script_or_style.decompose()
    
    # Extract and return the cleaned text content
    text = soup.get_text(separator=' ')
    return text

# Function to find all linked pages from the main page
def find_linked_pages(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find all links on the page
    links = soup.find_all('a', href=True)
    
    # Debug: Print all hrefs to understand what's captured
    for link in links:
        print(link['href'])

    # Filter and return only the full URLs of internal links
    base_url = 'https://www.apexon.com'
    linked_pages = [base_url + link['href'] if link['href'].startswith('/') else link['href'] for link in links if '/resources/case-studies/' in link['href']]
    
    return linked_pages

# Function to convert text sections into OpenAI embeddings
def get_openai_embedding(text):
    response = openai.Embedding.create(
        input=text,
        model="text-embedding-ada-002"  # Changed to the correct embedding model
    )
    return response['data'][0]['embedding']

# Set your OpenAI API key
openai.api_key = 'sk-None-dvpc3JgnrJeJTsYUkTs9T3BlbkFJaLbQpougGlLcVMLDtRrI'

# Main URL to process
main_url = "https://www.apexon.com/success-stories/"

# Extract data from the main URL
print(f"Processing main URL: {main_url}")
main_text = extract_text_from_url(main_url)

def split_text_into_sections(text, max_length=1000):
    words = text.split()
    sections = []
    current_section = []
    current_length = 0
    for word in words:
        current_length += len(word) + 1
        if current_length > max_length:
            sections.append(' '.join(current_section))
            current_section = []
            current_length = len(word) + 1
        current_section.append(word)
    if current_section:
        sections.append(' '.join(current_section))
    return sections

main_sections = split_text_into_sections(main_text)
main_embeddings = [get_openai_embedding(section) for section in main_sections]

# Find and process all linked pages
linked_urls = find_linked_pages(main_url)
all_embeddings = {main_url: main_embeddings}

for url in linked_urls:
    print(f"Processing linked URL: {url}")
    text = extract_text_from_url(url)
    sections = split_text_into_sections(text)
    
    embeddings = [get_openai_embedding(section) for section in sections]
    all_embeddings[url] = embeddings

    #Print the first few embeddings from each URL
for url, embeddings in all_embeddings.items():
    print(f"\nEmbeddings for {url}:")
    for i, embedding in enumerate(embeddings[:3]):
        print(i, f"Embedding {i+1}: {embedding[:10]}...")  # Print the first 10 dimensions for brevity


Processing main URL: https://www.apexon.com/success-stories/
https://www.apexon.com/privacy-policy/
https://www.apexon.com/
/digital-journey/
/digital-journey/digital-lifecycle-methodology/
/digital-journey/apexon-compass/
/our-services/
/our-services/experience/
/our-services/experience/ui-ux-services/
/our-services/experience/salesforce/
/our-services/experience/digital-commerce/
/our-services/digital-engineering/
/our-services/digital-engineering/cloud-native-platform-engineering/
/our-services/digital-engineering/iot-development/
/our-services/digital-engineering/application-development/
/our-services/digital-engineering/quality-engineering-overview/
/our-services/data-analytics/
/our-services/data-analytics/data-strategy/
/our-services/data-analytics/data-engineering/
/our-services/data-analytics/data-visualization/
/our-services/data-analytics/data-management-governance/
/our-services/data-analytics/managed-data-services/
/our-services/artificial-intelligence/
/our-services/data-