# Apexon ChatBot (Success Stories)

# 1. Extracting Linked URLs
Webpage used: https://www.apexon.com/success-stories/

In [7]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

# Function to find all linked pages from the main page
def find_linked_pages(url):
    """Collect the distinct case-study links found on a rendered page.

    The page is opened in a Selenium-driven Chrome session, scrolled once to
    the bottom, and the resulting HTML is parsed with BeautifulSoup.

    Args:
        url: Address of the page to open.

    Returns:
        A sorted list of the distinct ``href`` values that start with
        ``https://www.apexon.com/resources/case-studies/``.
    """
    # Set up the Selenium WebDriver
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    try:
        driver.get(url)

        # Scroll to the bottom once so lazily loaded content can be requested.
        # NOTE(review): there is no wait after the scroll, so content that
        # loads asynchronously may not be in page_source yet - confirm.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        page_source = driver.page_source
    finally:
        # Always release the browser, even when navigation or scripting fails.
        driver.quit()

    soup = BeautifulSoup(page_source, 'html.parser')

    # A set comprehension drops duplicate links.
    case_study_prefix = 'https://www.apexon.com/resources/case-studies/'
    linked_pages = {
        anchor['href']
        for anchor in soup.find_all('a', href=True)
        if anchor['href'].startswith(case_study_prefix)
    }

    # Sets are unordered; sort so the output is stable from run to run.
    return sorted(linked_pages)

# Main URL to process
main_url = "https://www.apexon.com/success-stories/"

# Extract the linked case-study URLs, list them, then report how many there are.
linked_urls = find_linked_pages(main_url)
print("Distinct Linked URLs found:")
for url in linked_urls:
    print(url)
# len() replaces the hand-maintained counter; the stray comma in the message is removed.
print(f"Number of Linked URLs found: {len(linked_urls)}")

Distinct Linked URLs found:
https://www.apexon.com/resources/case-studies/cloud-data-system-accelerates-performance-for-a-fortune-500-company/
https://www.apexon.com/resources/case-studies/peloton-conquers-mobile-testing-challenges/
https://www.apexon.com/resources/case-studies/insurer-collects-800000-in-additional-premiums-through-website-redesign/
https://www.apexon.com/resources/case-studies/medical-dental-supplier-modernizes-its-commerce-experience/
https://www.apexon.com/resources/case-studies/biotech-leader-takes-vision-care-and-treatment-mobile/
https://www.apexon.com/resources/case-studies/leading-weight-management-company-improves-customer-ratings/
https://www.apexon.com/resources/case-studies/disrupting-the-pharmacy-benefit-management-market/
https://www.apexon.com/resources/case-studies/luxury-lifestyle-brand-modernizes-systems-for-seamless-customer-experience/
https://www.apexon.com/resources/case-studies/increasing-maturity-in-agile-software-development/
https://www.apexon

# 2. Extracting Content from URLs

In [10]:
import requests
from bs4 import BeautifulSoup

# Function to extract text from a webpage
def extract_text_from_url(url, timeout=30):
    """Download a page and return its visible text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely.

    Returns:
        The page text with ``<script>`` and ``<style>`` elements removed and
        text fragments separated by single spaces.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so
            that an error page is not mistaken for real content.
        requests.RequestException: For connection problems and timeouts.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Script and style bodies are not human-readable content; drop them.
    for script_or_style in soup(['script', 'style']):
        script_or_style.decompose()

    return soup.get_text(separator=' ')

# Main URL to process (restated here so this cell can run without the link-extraction cell)
main_url = "https://www.apexon.com/success-stories/"

# Fetch the main page and keep its raw text in main_url_text for the later cleaning step
print(f"Processing main URL: {main_url}")
main_url_text = extract_text_from_url(main_url)


Processing main URL: https://www.apexon.com/success-stories/


# 3. Removing Repetitions

In [11]:
from difflib import SequenceMatcher

# Function to remove repeated lines or phrases
def remove_repetitions(text, threshold=0.9):
    """Drop blank lines and lines that nearly repeat an earlier kept line.

    Each stripped, non-empty line is compared against every line kept so far
    (not just the previous one). It is discarded when its
    ``SequenceMatcher`` ratio against any kept line is greater than
    ``threshold``.

    Args:
        text: Multi-line input text.
        threshold: Similarity ratio above which a line counts as a
            repetition. Defaults to 0.9.

    Returns:
        The surviving lines, in original order, joined by single spaces.
    """
    unique_lines = []
    kept = set()      # exact text of every kept line
    rejected = set()  # exact text of lines already found to be repetitions
    # An identical line has ratio 1.0, which only exceeds thresholds below 1.
    exact_match_is_repeat = threshold < 1.0

    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line or line in rejected:
            continue
        if exact_match_is_repeat and line in kept:
            continue
        if any(_is_near_duplicate(line, earlier, threshold) for earlier in unique_lines):
            # Kept lines only grow, so a rejected line stays rejected.
            rejected.add(line)
            continue
        unique_lines.append(line)
        kept.add(line)

    return ' '.join(unique_lines)


def _is_near_duplicate(line, earlier, threshold):
    """Return True when ``ratio()`` of ``line`` against ``earlier`` exceeds ``threshold``."""
    matcher = SequenceMatcher(None, line, earlier)
    # real_quick_ratio() and quick_ratio() are cheap upper bounds on ratio(),
    # so failing either one rules the pair out without the expensive call.
    return (
        matcher.real_quick_ratio() > threshold
        and matcher.quick_ratio() > threshold
        and matcher.ratio() > threshold
    )

# Remove repeated content from the extracted text; the result is kept in
# main_url_cleaned_text for the section-splitting step
main_url_cleaned_text = remove_repetitions(main_url_text)


# 4. Splitting into sections and storing URL & Cleaned Content into .csv file

In [12]:
import pandas as pd

# Function to split text into sections
def split_text_into_sections(text, max_length=1000):
    """Split text into whitespace-delimited sections of bounded size.

    Words are packed greedily. Every word costs ``len(word) + 1`` (the word
    plus one separator), and a new section starts as soon as adding the next
    word would push the running cost above ``max_length``.

    Args:
        text: Text to split; any run of whitespace separates words.
        max_length: Cost budget per section. Defaults to 1000.

    Returns:
        A list of non-empty sections, each made of whole words joined by
        single spaces. A word whose own cost exceeds ``max_length`` gets a
        section to itself. Empty or whitespace-only input gives ``[]``.
    """
    sections = []
    current_section = []
    current_length = 0

    for word in text.split():
        word_cost = len(word) + 1
        # Flush only when something has been collected. Otherwise an oversized
        # first word would emit an empty section.
        if current_section and current_length + word_cost > max_length:
            sections.append(' '.join(current_section))
            current_section = []
            current_length = 0
        current_section.append(word)
        current_length += word_cost

    if current_section:
        sections.append(' '.join(current_section))
    return sections

# Break the cleaned page text into sections
main_url_sections = split_text_into_sections(main_url_cleaned_text)

# One CSV row per section: every row carries the main URL next to its section text
contents = list(main_url_sections)
urls = [main_url] * len(contents)

# Assemble the two columns and write them out without the index column
df = pd.DataFrame({"URL": urls, "Content": contents})
df.to_csv("/Users/raghav/Downloads/urls_and_contents.csv", index=False)

print("URLs and contents have been saved to urls_and_contents.csv")


URLs and contents have been saved to urls_and_contents.csv


# 5. Generating and storing Embeddings

In [13]:
import openai
import numpy as np
import faiss
from concurrent.futures import ThreadPoolExecutor, as_completed

# Function to convert text sections into OpenAI embeddings using batching
def get_openai_embeddings_batch(sections, batch_size=2048):
    """Embed text sections with the OpenAI embeddings endpoint, in batches.

    Args:
        sections: Iterable of strings to embed. A single string is treated as
            one section.
        batch_size: Maximum number of sections sent per request.
            NOTE(review): the default mirrors the per-request input-array
            limit described in the OpenAI API reference - confirm against the
            current documentation.

    Returns:
        A list with one embedding (list of floats) per section, in input
        order. Empty input returns ``[]`` without calling the API.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Slicing a bare string would split it by character count, so wrap it.
    if isinstance(sections, str):
        sections = [sections]
    else:
        sections = list(sections)

    embeddings = []
    for start in range(0, len(sections), batch_size):
        # NOTE(review): openai.Embedding.create is the pre-1.0 SDK interface;
        # verify the installed openai version still provides it.
        response = openai.Embedding.create(
            input=sections[start:start + batch_size],
            model="text-embedding-3-small"
        )
        embeddings.extend(item['embedding'] for item in response['data'])
    return embeddings

# Setting OpenAI API key
# The key is read from the environment so the secret is never stored in the
# notebook. Any key that was previously committed here should be revoked.
import os

openai.api_key = os.environ.get("OPENAI_API_KEY")
if not openai.api_key:
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this cell.")

# Generate embeddings for the sections
main_url_embeddings = get_openai_embeddings_batch(main_url_sections)

# Embeddings destined for the FAISS index, starting with the main URL's sections.
# The order of this list becomes the row order of the matrix built from it.
embeddings = list(main_url_embeddings)

# Sample list of linked URLs (replace with actual URLs)
# NOTE(review): this rebinds `linked_urls` to placeholder addresses, replacing
# any list of scraped URLs already held in that name - confirm this is intended.
linked_urls = ["https://www.apexon.com/resources/case-studies/example1", 
               "https://www.apexon.com/resources/case-studies/example2"]

# Process the URLs concurrently. Executor.map yields results in the order of
# `linked_urls` (not completion order), so the embedding order is reproducible.
# NOTE(review): `process_url` is not defined in the cells shown here; it is
# unpacked as (url, sections, embeddings) - verify against its definition.
with ThreadPoolExecutor(max_workers=5) as executor:
    for url, sections, url_embeddings in executor.map(process_url, linked_urls):
        embeddings.extend(url_embeddings)

# Stack the collected vectors into a float32 matrix, one row per embedding
embedding_matrix = np.asarray(embeddings, dtype='float32')
embedding_dim = embedding_matrix.shape[1]  # number of components per embedding

# Build a flat L2-distance index sized for these vectors and load every row into it
# NOTE(review): L2 is kept as in the original; confirm it is the intended metric for these embeddings
index = faiss.IndexFlatL2(embedding_dim)
index.add(embedding_matrix)

# Persist the populated index to disk
faiss.write_index(index, "/Users/raghav/Downloads/embeddings.index")

print("FAISS index has been saved to embeddings.index")


FAISS index has been saved to embeddings.index


# 6. Uploading Embeddings & Context to Chroma DB

In [None]:
import chromadb
from chromadb.config import Settings
import pandas as pd
import numpy as np

# Initialize Chroma DB client
client = chromadb.Client(Settings())

# Create a collection for storing embeddings
collection = client.create_collection(name="apexon_context")

# Load the content rows; the CSV is expected to have "URL" and "Content" columns
df = pd.read_csv("/Users/raghav/Downloads/urls_and_contents.csv")

# Load the FAISS index that holds the stored vectors
import faiss
index = faiss.read_index("/Users/raghav/Downloads/embeddings.index")

# Get all embeddings from the index.
# reconstruct_n takes (first id, count) and returns the vectors as an array;
# the previous call passed a float64 buffer in the position of the count.
embeddings = index.reconstruct_n(0, index.ntotal)

# Row i of the CSV is paired with vector i of the index, so every CSV row
# needs a vector at the same position.
if len(df) > index.ntotal:
    raise ValueError(
        f"CSV has {len(df)} rows but the FAISS index only holds {index.ntotal} vectors"
    )

# Add data to the Chroma DB collection, one record per CSV row
for i, row in df.iterrows():
    embedding = embeddings[i].tolist()  # Convert to list for Chroma DB
    collection.add(
        embeddings=[embedding],
        metadatas=[{"URL": row['URL'], "Content": row['Content']}],
        ids=[str(i)]
    )

print("Data has been successfully stored in Chroma DB.")