# Apexon ChatBot(Success Stories)

# 1. Extracting Linked URLs
Webpage used : https://www.apexon.com/success-stories/

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

# Function to find all linked pages from the main page
def find_linked_pages(url):
    # Set up the Selenium WebDriver
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    driver.get(url)
    
    # Scroll down to the bottom of the page to load all content (if infinite scrolling is used)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    
    # Get the page source after it has been fully loaded
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    
    # Close the driver
    driver.quit()
    
    # Find all 'a' tags with href attributes
    links = soup.find_all('a', href=True)
    
    # Use a set to collect distinct URLs
    linked_pages = set()
    for link in links:
        href = link['href']
        # Only consider URLs that match the pattern
        if href.startswith('https://www.apexon.com/resources/case-studies/'):
            linked_pages.add(href)
    
    # Convert the set back to a list for ordered printing
    return list(linked_pages)

# Main URL to process
main_url = "https://www.apexon.com/success-stories/"

# Extract and print linked URLs
linked_urls = find_linked_pages(main_url)
x = 0
print("Distinct Linked URLs found:")
for url in linked_urls:
    print(url)
    x = x+1
print(f"Number of Linked URLs found:, {x}")

Distinct Linked URLs found:
https://www.apexon.com/resources/case-studies/improved-delivery-cycle-digital-health-proteus/
https://www.apexon.com/resources/case-studies/modern-data-infrastructure-provides-new-consumer-insights-for-global-alcoholic-beverage-company/
https://www.apexon.com/resources/case-studies/isos-ramps-digital-efforts-to-support-global-travelers/
https://www.apexon.com/resources/case-studies/marketing-supply-chain-firm-migrates-processes-to-drive-unprecedented-value/
https://www.apexon.com/resources/case-studies/global-payments-brand-accelerates-digital-innovation-market-advantage/
https://www.apexon.com/resources/case-studies/empowering-development-through-framework-agnostic-component-libraries/
https://www.apexon.com/resources/case-studies/multinational-tech-company-creates-a-single-repository-for-its-global-tax-needs/
https://www.apexon.com/resources/case-studies/elevating-ux-for-the-fathers-table-with-web-transformation/
https://www.apexon.com/resources/case-studi

# 2. Extracting Content of Linked URLs

In [2]:
import requests
from bs4 import BeautifulSoup

def extract_text_from_url(url):
    try:
        # Fetch the webpage
        response = requests.get(url)
        response.raise_for_status()  # Ensure the request was successful

        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')

        # Remove header and footer from the soup object if they exist
        header = soup.find('header')
        if header:
            header.decompose()  # Remove the header from the soup

        footer = soup.find('footer')
        if footer:
            footer.decompose()  # Remove the footer from the soup

        # Extract text from the remaining HTML
        page_text = soup.get_text(separator=' ', strip=True)
        
        return page_text

    except requests.RequestException as e:
        return f"Error fetching the webpage: {str(e)}"
    except Exception as e:
        return f"An error occurred: {str(e)}"

# Example usage
main_url = "https://www.apexon.com/success-stories/"

# List to store URLs and their extracted text
urls = []
texts = []

# Add list of URLs to process
all_urls = [] + linked_urls

# Iterate over each URL and extract the content
for url in all_urls:
    print(f"Processing URL: {url}")
    text = extract_text_from_url(url)
    urls.append(url)
    texts.append(text)

print("Content extraction completed.")


Processing URL: https://www.apexon.com/resources/case-studies/improved-delivery-cycle-digital-health-proteus/
Processing URL: https://www.apexon.com/resources/case-studies/modern-data-infrastructure-provides-new-consumer-insights-for-global-alcoholic-beverage-company/
Processing URL: https://www.apexon.com/resources/case-studies/isos-ramps-digital-efforts-to-support-global-travelers/
Processing URL: https://www.apexon.com/resources/case-studies/marketing-supply-chain-firm-migrates-processes-to-drive-unprecedented-value/
Processing URL: https://www.apexon.com/resources/case-studies/global-payments-brand-accelerates-digital-innovation-market-advantage/
Processing URL: https://www.apexon.com/resources/case-studies/empowering-development-through-framework-agnostic-component-libraries/
Processing URL: https://www.apexon.com/resources/case-studies/multinational-tech-company-creates-a-single-repository-for-its-global-tax-needs/
Processing URL: https://www.apexon.com/resources/case-studies/ele

# 3. Data Preprocessing 
Removing Common Phrases, Header and Footer

In [8]:
from difflib import SequenceMatcher

# Function to find common lines across all URLs
def find_common_phrases(texts):
    # Split each text into lines
    all_lines = [set(text.splitlines()) for text in texts]
    
    # Find the intersection of lines that are common across all texts
    common_phrases = set.intersection(*all_lines)
    
    return common_phrases

# Function to remove common lines from a given text
def remove_common_phrases(text, common_phrases):
    lines = text.splitlines()
    unique_lines = [line for line in lines if line.strip() and line not in common_phrases]
    return ' '.join(unique_lines)

# Find common phrases in the extracted content
common_phrases=find_common_phrases(texts)
# Remove common phrases from each text
cleaned_texts = [remove_common_phrases(text, common_phrases) for text in texts]

phrases_to_remove = [
    """This website uses cookies to offer you the best experience online. By continuing to use our website, you agree to the use of cookies. If you would like to know more about cookies and how to manage them please view our Privacy Policy & Cookies page. Share this page with a friend. Home Insights Success Story""",
    """More AddThis Share options Share to Facebook Share to Twitter Share to Email More AddThis Share options Share to Facebook Share to Twitter Share to Email"""
]

# Print cleaned texts (Optional)
for i, cleaned_text in enumerate(cleaned_texts):
    print(f"\nCleaned content for URL {urls[i]} :")
    for phrase in phrases_to_remove:
        cleaned_text = cleaned_text.replace(phrase, '')  # Remove the phrase
    cleaned_texts[i] = cleaned_text
    print(cleaned_text)



Cleaned content for URL https://www.apexon.com/resources/case-studies/improved-delivery-cycle-digital-health-proteus/ :
Improved Entire Delivery Lifecycle of Digital Health Company - Apexon  Proteus Maximizes First-Mover Edge in Digital Therapies with AWS Success Story HEALTHCARE Proteus Maximizes First-Mover Edge in Digital Therapies with AWS Apexon supports end-to-end development lifecycle for healthcare innovator And More Download Proteus is a digital health company that is revolutionizing the life sciences industry with groundbreaking solutions that manage and monitor medical care to improve patient outcomes. The company’s flagship product, Proteus Discover, combines an ingestible sensor, a small wearable sensor patch, a mobile application, and a provider portal. All connected, these elements unlock previously unattainable treatment insights for better clinical outcomes. Proteus was looking to accelerate its development lifecycle to speed FDA approval and maximize its first-to-mar

# 4. Saving the urls and cleaned content into .csv file

In [9]:
import csv

# Defining the filename to save the CSV
filename = "linked_urls_and_cleaned_content_.csv"

# Writing to CSV
with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.writer(csvfile)
    # Write the headers
    csvwriter.writerow(['URL', 'Cleaned Text'])
    
    # Write the data
    for url, cleaned_text in zip(urls, cleaned_texts):
        csvwriter.writerow([url, cleaned_text])

print(f"Data has been written to {filename}")

Data has been written to linked_urls_and_cleaned_content_.csv


--------------------------

# 5. Chunking

In [19]:
import os
import openai
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain_experimental.text_splitter import SemanticChunker

#Setting OpenAI API key as an environment variable
os.environ["OPENAI_API_KEY"] = 'sk-None-dvpc3JgnrJeJTsYUkTs9T3BlbkFJaLbQpougGlLcVMLDtRrI'

#### Trying Different Chunking Strategies-

Semantic Chunking-

In [20]:
openai.api_key = 'sk-None-dvpc3JgnrJeJTsYUkTs9T3BlbkFJaLbQpougGlLcVMLDtRrI'

# Sample content to chunk
content = """Improved Entire Delivery Lifecycle of Digital Health Company - Apexon  Proteus Maximizes First-Mover Edge in Digital Therapies with AWS Success Story HEALTHCARE Proteus Maximizes First-Mover Edge in Digital Therapies with AWS Apexon supports end-to-end development lifecycle for healthcare innovator And More Download Proteus is a digital health company that is revolutionizing the life sciences industry with groundbreaking solutions that manage and monitor medical care to improve patient outcomes. The company’s flagship product, Proteus Discover, combines an ingestible sensor, a small wearable sensor patch, a mobile application, and a provider portal. All connected, these elements unlock previously unattainable treatment insights for better clinical outcomes. Proteus was looking to accelerate its development lifecycle to speed FDA approval and maximize its first-to-market advantage. They had mastered the frontend clinical and diagnostic aspects of the solution but needed help on the backend software development and connectivity and interaction between the different solution elements. Serving 150M People Requiring new Forms of Therapy 540+ Patents Received 1,000+ System use Proteus Discover 99.1% Positive Detection Accuracy* 97.6% Overall Accuracy* * Based on more than 60 studies with more than 500 participants in over 10 years Apexon worked with Proteus (beginning in 2013) across the end-to-end lifecycle – from application development and software verification and validation to system integration and device expansion. Apexon not only designed a test lab in-house with Patch Test Apparatus (PTA), amplifier and shaker humanoid setup as well as a stimulation setup for patch and firmware testing. THE CUSTOMER JOURNEY 2013 2014 2015 2016 2017 2018 2019 2013 Proteus Legacy PL Viewer Systems, V&V & SI 2014 Development on Android platform 2015 Development on iOS platform Firmware integration 2016 Re-architecting product Portal automation 2017 Mobile automation Simulation & Hooks Cloud integration & HIPAA compliance 2018 Platform release & go-to-market MYCITE® deal signed 2019 Product engineering for mental health Discover & MYCITE® the Results Key Outcomes Faster cycle time Faster cycle time Worked with Proteus across the entire development journey to speed time-to-market. Predicted analysis Predicted analysis Shared predictive analysis of test case requirements enabling a high level of automation. Automated Automated 1000+ records. Testing requirements Testing requirements Automated over 65% of testing requirements using Selenium framework. Testing time Testing time 40% reduction in app testing time. Our methodology how we did it For Proteus, Apexon worked across all 3 stages of the digital lifecycle on multiple projects. Go Digital Accelerating the delivery of new digital initiatives with confidence Be digital Creating the infrastructure and foundation to scale digital initiatives Evolve Digital Leveraging data and analytics to continuously improve digital delivery processes Launch & Experiment Automate & Accelerate Be Intelligent & Autonomous Launch & Experiment Enable digital adoption in a quick and agile manner Apexon Exploration Pod worked with the Proteus team for ideation & blueprinting. The Blueprinting phase focused on user interviews, personas, analysis, brainstorming, scenario identification and assumption validation to translate ideas into MVP solution requirements and UI/UX design . Apexon also designed the communication mechanism between the patch, Mobile Apps & Backend. The team also designed & detailed solution architecture and defined the KPIs to track the project. Apexon team then set up a framework for capturing actionable project intelligence (matrices) aligned to the project KPIs. Apexon team used in-house accelerators such as IoT companion App framework & BLE Framework coupled with our methodologies to provide rapid prototyping and MVPs. Automate & Accelerate Build digital infrastructure and foundation for enterprises to scale Apexon Execution Pod consisting of Backend Squad, Mobile Development Squad & Web Development Squad started maturing the product as per the defined roadmap. The development team added new UI features, enhanced communication mechanisms, fixed defects and integrated with Google Analytics & Swrve SDK and New Relic to capture analytics data. The QA team worked on end-to-end system integration tests involving Patch, Cloud, App & Backend. Apexon also automated test scripting and maintenance during sprint-cycle. Apexon tested BLE and app performance on devices and created real-life test scenarios for the mobile app in use with medication intake in compliance with FDA protocols. Be Intelligent & Autonomous Leverage data engineering to make strategic decisions and get digital right every time As a part of continuous evolution, Apexon Execution Pod focused on helping Proteus grow by enabling other innovators to develop applications that could work with the Proteus digital pill & patch. In addition to app enhancements, the Execution Pod developed two SDKs: a BLE-based framework to receive, parse and store patch data in a database, and a Design Mobile Application UI. Apexon also enabled  the continuous tracking of KPIs via the continuous collection of select operational and business data from the project and the abiloity to generate insights from the collected data. The challenge A complex vision Proteus’ product vision was incredibly compelling but executing on that vision was also complex.  The company’s core competence was on the frontend – with breakthrough integrations of technology with medicine and in-patient care. However, they had several critical needs on the backend: Management of regulatory compliance requirements within software product development Short lead times for product launches in multiple geographies Time-to-market challenges that prevented proper discipline across the product lifecycle Bluetooth (BLE) connectivity across multiple devices Greater control of cost of technology infrastructure; desire to move to an on-demand model The ability to scale quickly with the increase in the number of connected medical devices and patient networks The Solution Solving one of the toughest digital engineering challenges Apexon worked with the company across the entire delivery lifecycle: Application Development Apexon enabled the mobile application to communicate continuously via BLE with the patch worn by the patient. Apexon also supported the company on several other development projects including the middleware for the core company application (both Android and iOS), as well as the web frontend portal for the standard patient application. The development team added new UI features, enhanced communication mechanisms, and fixed defects. It also added new features and improved existing features in the Patient & Provider App in a 2-week development sprint. The team worked on the Integration with Google Analytics & Swrve SDK to capture analytics data and with New Relic to capture Application Crush details. Software Verification & Validation Apexon created an end-to-end process to test and QA the entire service offering across the end-to-end application lifecycle. This included: Manual testing of the mobile app Automated testing and QA of the connection of the frontend, backend and middleware Real-life scenarios for testing the mobile app (Patch testing apparatus), physical motion (Step, Incline), medication intake, heartbeat, etc. in addition to generating FDA compliant protocols and documents Apexon also configured & broke BLE signals using Hoffman Box for the interruption testing and the app performance on devices – CPU, Memory usage as a part of Performance Testing . Systems Integration & Device Expansion Apexon set-up a test center to replicate how the systems would operate and communicate with each other in real life. This included end to end integration tests involving Patch, Cloud, App & Backend. Apexon automated test scripting and maintenance during sprint-cycle. The team was able to automate 600 test cases per platform daily for regression tests Cloud Migration Built a  ‘Gap-Analysis’ matrix of Proteus’ cloud requirements and recommended  AWS as the best choice. Leveraged AWS services  to enable end-to-end cloud setup, build and deployment in the Proteus production environments. AWS Lambda – to process medication and patient vitals;  Overall cost saving due to this architecture was around 25% AWS API Gateway and CloudWatch – as a proxy middleware service to accept HTTP traffic from IoT Gateway and collect data from patient sensor device. AWS DynamoDB Database – to build a scalable NoSQL database to store Patient data. AWS CloudTrail – to monitor infrastructure changes and API logs. Identify and Access management – to manage AWS users, groups and roles for various services. AWS Key Management – to maintain and manage encryption keys for data access protection for  HIPAA compliance. Storage S3, OLAP Redshift, and Data pipeline – to transform and store patient data for analytics. AWS VPC, Subnet, Security Groups – to protect the overall Proteus network  and  enable access protocols and procedure of AWS cloud resources. AWS Cognito, STS, SNS – to enable authentication, Security Token Service, and Simple notification service and ensure secure collaboration between physicians, patients, and other users. SERVICES USED: AWS Lambda AWS API Gateway AWS CloudWatch AWS DynamoDB Database AWS CloudTrail AWS Key Management Amazon Virtual Private Cloud AWS Cognito AWS STS AWS SNS Apexon also delivered several other critical capabilities for Proteus: Secure, continuous communication via BLE between the Patch and mobile device and backend systems. Simultaneous testing on various mobile devices. Extension of existing frameworks and tools to include more advanced functionalities. """

# OpenAI Embeddings Instance
embeddings = OpenAIEmbeddings()

# Initializing the SemanticChunker with embeddings
chunker = SemanticChunker(embeddings=embeddings)

# Chunk the EMS content
chunks = chunker.create_documents([content])
print("Chunks:")
for i, chunk in enumerate(chunks):
    print(f"Chunk {i+1}: {chunk.page_content}\n")

Chunks:
Chunk 1: Improved Entire Delivery Lifecycle of Digital Health Company - Apexon  Proteus Maximizes First-Mover Edge in Digital Therapies with AWS Success Story HEALTHCARE Proteus Maximizes First-Mover Edge in Digital Therapies with AWS Apexon supports end-to-end development lifecycle for healthcare innovator And More Download Proteus is a digital health company that is revolutionizing the life sciences industry with groundbreaking solutions that manage and monitor medical care to improve patient outcomes. The company’s flagship product, Proteus Discover, combines an ingestible sensor, a small wearable sensor patch, a mobile application, and a provider portal.

Chunk 2: All connected, these elements unlock previously unattainable treatment insights for better clinical outcomes. Proteus was looking to accelerate its development lifecycle to speed FDA approval and maximize its first-to-market advantage. They had mastered the frontend clinical and diagnostic aspects of the solution 

Recursive Chunking (works best)-

In [31]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# RecursiveCharacterTextSplitter with a chunk size of 800 characters and a overlap of 80
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=100
)

# Splitting the content into chunks
chunks = text_splitter.create_documents([content])

print("Chunks:")
for i, chunk in enumerate(chunks):
    print(f"Chunk {i+1}: {chunk.page_content}\n")


Chunks:
Chunk 1: Improved Entire Delivery Lifecycle of Digital Health Company - Apexon  Proteus Maximizes First-Mover Edge in Digital Therapies with AWS Success Story HEALTHCARE Proteus Maximizes First-Mover Edge in Digital Therapies with AWS Apexon supports end-to-end development lifecycle for healthcare innovator And More Download Proteus is a digital health company that is revolutionizing the life sciences industry with groundbreaking solutions that manage and monitor medical care to improve patient outcomes. The company’s flagship product, Proteus Discover, combines an ingestible sensor, a small wearable sensor patch, a mobile application, and a provider portal. All connected, these elements unlock previously unattainable treatment insights for better clinical outcomes. Proteus was looking to

Chunk 2: previously unattainable treatment insights for better clinical outcomes. Proteus was looking to accelerate its development lifecycle to speed FDA approval and maximize its first-to-m

Trying Semantic chunking on Recursive chunks-

In [28]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain_experimental.text_splitter import SemanticChunker

# Set up RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,  # Set character size
    chunk_overlap=80  # Overlap to preserve context
)

# Split the content into character-based chunks
character_chunks = text_splitter.split_text(ems_content)

# Initialize OpenAI embeddings for SemanticChunker
embeddings = OpenAIEmbeddings()

# Initialize the SemanticChunker with the embeddings
semantic_chunker = SemanticChunker(embeddings=embeddings)

# Apply SemanticChunker to further refine the character-based chunks
final_chunks = semantic_chunker.create_documents(character_chunks)

# Print the final chunks
print("Final Chunks:")
for i, chunk in enumerate(final_chunks):
    print(f"Chunk {i+1}: {chunk.page_content}\n")


Final Chunks:
Chunk 1: Improved Entire Delivery Lifecycle of Digital Health Company - Apexon  Proteus Maximizes First-Mover Edge in Digital Therapies with AWS Success Story HEALTHCARE Proteus Maximizes First-Mover Edge in Digital Therapies with AWS Apexon supports end-to-end development lifecycle for healthcare innovator And More Download Proteus is a digital health company that is revolutionizing the life sciences industry with groundbreaking solutions that manage and monitor medical care to improve patient outcomes. The company’s flagship product, Proteus Discover, combines an ingestible sensor, a small wearable sensor patch, a mobile application, and a provider portal.

Chunk 2: All connected, these elements unlock previously unattainable treatment insights for better clinical outcomes. Proteus was looking to

Chunk 3: treatment insights for better clinical outcomes. Proteus was looking to accelerate its development lifecycle to speed FDA approval and maximize its first-to-market ad

#### Actual Chunking Implementation-


In [38]:
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Step 1: Read the Excel file
file_path = '/Users/raghav/Downloads/linked_urls_and_cleaned_content_.csv'  # replace with the actual path to your Excel file
df = pd.read_csv(file_path)

# Step 2: Extract URL and Content columns
urls = df['URL']
contents = df['Cleaned Text']

# Step 3: Initialize the text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=100
)

# Step 4: Loop over each content, chunk it, and display the result
for i, content in enumerate(contents):
    print(f"URL: {urls[i]}")
    
    # Split the content into chunks
    chunks = text_splitter.create_documents([content])
    
    # Display the chunks
    for j, chunk in enumerate(chunks):
        print(f"Chunk {j+1}: {chunk.page_content}\n")

URL: https://www.apexon.com/resources/case-studies/improved-delivery-cycle-digital-health-proteus/
Chunk 1: Improved Entire Delivery Lifecycle of Digital Health Company - Apexon  Proteus Maximizes First-Mover Edge in Digital Therapies with AWS Success Story HEALTHCARE Proteus Maximizes First-Mover Edge in Digital Therapies with AWS Apexon supports end-to-end development lifecycle for healthcare innovator And More Download Proteus is a digital health company that is revolutionizing the life sciences industry with groundbreaking solutions that manage and monitor medical care to improve patient outcomes. The company’s flagship product, Proteus Discover, combines an ingestible sensor, a small wearable sensor patch, a mobile application, and a provider portal. All connected, these elements unlock previously unattainable treatment insights for better clinical outcomes. Proteus was looking to

Chunk 2: previously unattainable treatment insights for better clinical outcomes. Proteus was lookin

#### Indexing and Storing Chunks-

In [2]:
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Step 1: Read the CSV file
file_path = '/Users/raghav/Downloads/linked_urls_and_cleaned_content_.csv'  # replace with the actual path to your CSV file
df = pd.read_csv(file_path)

# Step 2: Extract URL and Content columns
urls = df['URL']
contents = df['Cleaned Text']

# Step 3: Initialize the text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=100
)

# Step 4: Prepare a list to store URL, chunked content, and chunk index
chunked_data = []

# Step 5: Loop over each content, chunk it, and store the result
for i, content in enumerate(contents):
    # Split the content into chunks
    chunks = text_splitter.create_documents([content])
    
    # Add each chunk and its corresponding URL to the chunked_data list
    for j, chunk in enumerate(chunks):
        chunked_data.append({
            'URL': urls[i],
            'Chunk_Index': j + 1,
            'Chunk_Content': chunk.page_content
        })

# Step 6: Create a DataFrame from the chunked_data list
chunked_df = pd.DataFrame(chunked_data)

# Step 7: Save the DataFrame to a CSV file
output_file_path = '/Users/raghav/Downloads/urls_and_chunks.csv'  # specify where you want to save the output file
chunked_df.to_csv(output_file_path, index=False)

print(f"Chunked content has been saved to {output_file_path}")


Chunked content has been saved to /Users/raghav/Downloads/urls_and_chunks.csv


# 6. Generate Embeddings-

In [12]:
import pandas as pd
import openai
import pickle

# Set your OpenAI API key here
openai.api_key = 'sk-None-dvpc3JgnrJeJTsYUkTs9T3BlbkFJaLbQpougGlLcVMLDtRrI'

# Step 1: Load the chunked data from the CSV file
chunked_file_path = '/Users/raghav/Downloads/chunked_content.csv'  # replace with your path
chunked_df = pd.read_csv(chunked_file_path)

# Step 2: Define a function to generate embeddings using the new OpenAI API
def generate_embedding(text):
    response = openai.embeddings.create(
        input=[text],  # The input needs to be a list
        model="text-embedding-3-small"
    )
    
    # Return the first embedding from the response
    embedding = response.data[0].embedding
    return embedding

# Step 3: Loop through each chunk and generate embeddings
embeddings = []
for index, row in chunked_df.iterrows():
    chunk_content = row['Chunk_Content']
    embedding = generate_embedding(chunk_content)
    
    # Store the embedding along with the URL and Chunk_Index
    embeddings.append({
        'URL': row['URL'],
        'Chunk_Index': row['Chunk_Index'],
        'Embedding': embedding
    })

# Step 4: Save the embeddings to a .index file using pickle (or any other format you prefer)
output_index_file = '/Users/raghav/Downloads/chunked_embeddings.index'
with open(output_index_file, 'wb') as f:
    pickle.dump(embeddings, f)

print(f"Embeddings have been saved to {output_index_file}")

Embeddings have been saved to /Users/raghav/Downloads/chunked_embeddings.index


Display the Embeddings-

In [5]:
import pickle

# Step 1: Load the .index file
output_index_file = '/Users/raghav/Downloads/chunked_embeddings.index'  # replace with your path

with open(output_index_file, 'rb') as f:
    loaded_embeddings = pickle.load(f)

# Step 2: Display the loaded embeddings
# You can loop through the embeddings and display the URL, Chunk Index, and a preview of the embedding
for i, embedding_data in enumerate(loaded_embeddings):
    print(f"Embedding {i+1}:")
    print(f"URL: {embedding_data['URL']}")
    print(f"Chunk Index: {embedding_data['Chunk_Index']}")
    
    # Display the first 5 dimensions of the embedding for brevity
    print(f"Embedding (first 10 dimensions): {embedding_data['Embedding'][:100]}\n")


Embedding 1:
URL: https://www.apexon.com/resources/case-studies/improved-delivery-cycle-digital-health-proteus/
Chunk Index: 1
Embedding (first 10 dimensions): [0.04887835308909416, -0.03453630581498146, 0.04626820981502533, 0.05764293670654297, 0.0015231554862111807, 0.01083896029740572, 0.0016674002399668097, 0.03917961195111275, 0.03354720026254654, -0.002385189523920417, 0.050609290599823, -0.06170926243066788, -0.012095949612557888, -0.06819340586662292, 0.0331350713968277, 0.030579879879951477, 0.02181529626250267, -0.049153104424476624, -0.0362672433257103, 0.02879399247467518, 0.033657100051641464, 0.030744731426239014, 0.013861230574548244, 0.007466381415724754, -0.012714141048491001, -0.009671264328062534, -0.050499387085437775, 0.0548679418861866, -0.014410734176635742, -0.006762329488992691, 0.031074432656168938, -0.02225489914417267, -0.02956329844892025, 0.01059168390929699, -0.009149236604571342, -0.0016433594282716513, -0.010255112312734127, 0.013091925531625748, 0.0088

# 7. Storing the embeddings to Chroma DB

In [2]:
import pickle
import chromadb
from chromadb.utils import embedding_functions

# Step 1: Load the embeddings list from the .index file
index_file_path = '/Users/raghav/Downloads/chunked_embeddings.index'  # replace with your path
with open(index_file_path, 'rb') as f:
    loaded_embeddings = pickle.load(f)

# Step 2: Initialize ChromaDB client and collection
client = chromadb.Client()
collection = client.create_collection(name="embeddings_collection")

# Step 3: Prepare embeddings and metadata for insertion into ChromaDB
# Each embedding has metadata (URL and Chunk_Index) and the embedding itself
for i, embedding_data in enumerate(loaded_embeddings):
    metadata = {
        "URL": embedding_data["URL"],
        "Chunk_Index": embedding_data["Chunk_Index"]
    }
    
    # Convert embedding to list (ChromaDB accepts embeddings in list format)
    embedding = embedding_data["Embedding"]
    
    # Add the embedding to the ChromaDB collection
    collection.add(
        ids=[f"embedding_{i}"],  # Unique ID for each embedding
        embeddings=[embedding],  # Embedding vector
        metadatas=[metadata]  # Associated metadata
    )

print("Embeddings and metadata have been stored in ChromaDB.")

Running Chroma using direct local API.
Using DuckDB in-memory for database. Data will be transient.
Embeddings and metadata have been stored in ChromaDB.
