In [None]:
%pip install requests beautifulsoup4 pandas langchain chromadb -q

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from langchain.embeddings import OpenAIEmbeddings
from chromadb.config import Settings
from chromadb.db import ChromaDB

# Step 2.1: Fetch SHL Product Catalog
def fetch_shl_catalog(url, timeout=30):
    """Download the page at ``url`` and parse it into a BeautifulSoup tree.

    Args:
        url: Address of the catalog page to fetch.
        timeout: Seconds to wait for the server. Without a timeout,
            ``requests`` may block indefinitely on an unresponsive host.

    Returns:
        The response body parsed with the built-in ``html.parser``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status rather than parsing an error page as if
    # it were the catalog.
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')

# Step 2.2: Extract Solution Links
def extract_solution_links(soup, base_url=None):
    """Collect the link targets of anchors inside ``.solution-card`` elements.

    Args:
        soup: Parsed page exposing ``select`` (e.g. a BeautifulSoup tree).
        base_url: Optional address of the page the links came from. When
            given, relative hrefs are resolved against it so the result can
            be fetched directly.

    Returns:
        list[str]: The hrefs in document order. Anchors with a missing or
        empty ``href`` are skipped.
    """
    from urllib.parse import urljoin

    candidates = (a.get('href') for a in soup.select('.solution-card a'))
    # Indexing a['href'] raised KeyError on anchors without the attribute;
    # such anchors carry no target, so they are dropped instead.
    hrefs = [href for href in candidates if href]
    if base_url is None:
        return hrefs
    return [urljoin(base_url, href) for href in hrefs]

# Step 2.3: Fetch and Parse Fact Sheets
def fetch_and_parse_fact_sheets(links, timeout=30):
    """Fetch each link and collect the text of its ``.fact-sheet`` element.

    Args:
        links: Iterable of absolute URLs to fetch.
        timeout: Seconds to wait for each response.

    Returns:
        dict: Maps each link to the text of the first ``.fact-sheet`` element
        on that page. Pages without such an element are reported and left out.

    Raises:
        requests.HTTPError: If a page answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    fact_sheets = {}
    for link in links:
        response = requests.get(link, timeout=timeout)
        response.raise_for_status()
        page = BeautifulSoup(response.content, 'html.parser')
        node = page.select_one('.fact-sheet')
        if node is None:
            # select_one returns None when nothing matches; reading .text on
            # it used to abort the whole scrape on the first such page.
            print(f'No .fact-sheet element found at {link}; skipping.')
            continue
        fact_sheets[link] = node.text
    return fact_sheets

# Step 2.4: Extract Test Metadata
def extract_test_metadata(soup):
    """Turn the rows of every HTML table on the page into metadata records.

    The first four columns of each table are read positionally as test code,
    assessment type, comma-separated target roles, and a free-text cell that
    is checked for the word "Adaptive".

    Args:
        soup: Parsed page exposing ``find_all`` (e.g. a BeautifulSoup tree).

    Returns:
        list[dict]: One record per table row with the keys ``test_code``,
        ``assessment_type``, ``target_roles`` (list of str) and
        ``IRT_adaptive`` (bool). Empty when the page has no usable tables.
    """
    from io import StringIO

    table_tags = soup.find_all('table')
    if not table_tags:
        # read_html raises ValueError when there is nothing to parse.
        return []

    # Wrap the markup in a file-like object: recent pandas versions deprecate
    # passing literal HTML strings to read_html.
    markup = StringIO('\n'.join(str(tag) for tag in table_tags))
    try:
        tables = pd.read_html(markup)
    except ValueError:
        # Raised when no parseable table is found in the markup.
        return []

    def _cell_text(value):
        # Empty cells come back as NaN, which supports neither .split nor `in`.
        return '' if pd.isna(value) else str(value)

    metadata = []
    for table in tables:
        if table.shape[1] < 4:
            # Not a metadata table: the four expected columns are missing.
            continue
        # Plain tuples give true positional access regardless of column labels.
        for row in table.itertuples(index=False, name=None):
            roles = _cell_text(row[2])
            metadata.append({
                "test_code": row[0],
                "assessment_type": row[1],
                "target_roles": roles.split(', ') if roles else [],
                "IRT_adaptive": "Adaptive" in _cell_text(row[3]),
            })
    return metadata

# Example usage: parse the catalog page once, then derive both the per-solution
# fact sheets (one extra request per link) and the table metadata from it.
url = "https://www.shl.com/solutions/products/product-catalog/"
soup = fetch_shl_catalog(url)
links = extract_solution_links(soup)
fact_sheets = fetch_and_parse_fact_sheets(links)
metadata = extract_test_metadata(soup)

# Show both results, one per line.
print(fact_sheets, metadata, sep="\n")

In [None]:
# Step 3.1: Initialize Vector Database
def init_vector_db():
    """Create a Chroma client with default settings and return a collection.

    Chroma's documented entry point is ``chromadb.Client``; records are
    written to and queried from *collections*, so the collection is what gets
    returned for later ``add``/``upsert``/``query`` calls.

    NOTE(review): the import cell's ``from chromadb.db import ChromaDB`` does
    not correspond to a documented Chroma class and will likely need to be
    removed for the notebook to run — confirm against the installed version.

    Returns:
        The ``shl_solutions`` collection (created on first use).
    """
    # Local import: the import cell only brings in chromadb.config.Settings.
    import chromadb

    settings = Settings()
    client = chromadb.Client(settings)
    # get_or_create keeps re-running this cell from failing on an existing name.
    return client.get_or_create_collection("shl_solutions")

# Step 3.2: Embed and Store SHL Solutions
def store_solutions_in_vector_db(db, solutions):
    """Embed solution names with OpenAI and write them to a Chroma collection.

    Args:
        db: Chroma collection, or any object exposing
            ``upsert(ids=..., embeddings=..., metadatas=...)``.
        solutions: Iterable of solution names (strings).

    Each name is stored once, keyed by the name itself, with the metadata
    ``{"solution": name}``. Nothing is embedded or written for empty input.
    """
    # The name doubles as the record id and ids must be unique within a call,
    # so drop repeats while keeping first-seen order.
    names = list(dict.fromkeys(solutions))
    if not names:
        return

    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
    # OpenAIEmbeddings exposes embed_documents / embed_query (there is no
    # `embed`); embed_documents handles the whole batch in one call.
    vectors = embeddings.embed_documents(names)
    # upsert rather than add, so re-running the cell overwrites instead of
    # colliding with ids that are already stored.
    db.upsert(
        ids=names,
        embeddings=vectors,
        metadatas=[{"solution": name} for name in names],
    )

# Example usage: open the store, then embed and save a placeholder list
db = init_vector_db()
solutions = [
    "Solution A",
    "Solution B",
]  # Replace with actual solution names
store_solutions_in_vector_db(db=db, solutions=solutions)


In [None]:
# Step 3.3: Store Fact Sheets as Documents
import json

def store_fact_sheets(fact_sheets, path='fact_sheets.json'):
    """Serialise the fact-sheet mapping to a JSON file.

    Args:
        fact_sheets: JSON-serialisable mapping (here: link -> fact-sheet text).
        path: Destination file; the default keeps the original file name.
    """
    # Explicit encoding so the file does not depend on the platform default.
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(fact_sheets, f)

# Example usage: persist the fact sheets scraped earlier in the notebook
store_fact_sheets(fact_sheets=fact_sheets)


In [None]:
# Step 3.4: Store Test Metadata
def store_test_metadata(metadata, path='test_metadata.csv'):
    """Write the test metadata records to a CSV file (without the index).

    Args:
        metadata: Records accepted by ``pd.DataFrame`` (here: a list of dicts).
        path: Destination file; the default keeps the original file name.
    """
    pd.DataFrame(metadata).to_csv(path, index=False)

# Example usage: persist the table metadata extracted earlier in the notebook
store_test_metadata(metadata=metadata)


In [4]:
import pandas as pd
import json

# Read the pre-processed catalog export (adjust the path if the file lives elsewhere)
file_path = "shl_pre.json"
with open(file_path, "r", encoding="utf-8") as f:
    data = json.loads(f.read())

# Build a DataFrame from the parsed records
df = pd.DataFrame(data)

# Preview the first rows
df.head()

Unnamed: 0,title,category,description,job_levels,languages,assessment_length,fact_sheet,tags
0,Claims/Operations Supervisor Solution,Pre-packaged Job Solution,Assessment solution designed for claims and op...,"[Supervisor, Manager]","[English, Hindi]",Short Form,,"[P, S, A, B]"
1,Contact Center Customer Service + 8.0,Pre-packaged Job Solution,Comprehensive assessment for customer service ...,"[Entry, Mid, Senior]",[English],Standard,,"[A, B, C, P, S]"
2,Contact Center Customer Service 8.0,Pre-packaged Job Solution,Assessment focused on customer service skills ...,"[Entry, Mid]",[English],Standard,,"[S, B, C, P]"
3,Contact Center Manager - Short Form,Pre-packaged Job Solution,Short form assessment for contact center manag...,"[Manager, Director]",[English],Short Form,,"[A, B, P, S]"
4,Contact Center Sales & Service + 8.0,Pre-packaged Job Solution,Assessment for sales and service roles in cont...,"[Entry, Mid, Senior]",[English],Standard,,"[P, B, C, A, S]"


In [None]:
from langchain_community.document_loaders import WebBaseLoader
import re

def _extract_description(page_text):
    """Normalise scraped page text and cut it at the word "description".

    The text is lower-cased, everything except ASCII letters, digits and
    whitespace is removed, and whitespace runs are collapsed to single spaces.

    Returns:
        The normalised text from the first standalone "description" onward
        (the word itself included), or None when the word does not occur.
    """
    text = page_text.lower()
    # Strip punctuation before collapsing whitespace so that removed symbols
    # do not leave double spaces behind.
    text = re.sub(r'[^a-z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    match = re.search(r'\bdescription\b', text)
    return text[match.start():] if match else None


# `df` must already carry a 'url' column; the preview of the frame loaded from
# shl_pre.json shows none, so the URL-building step has to be applied to it first.
for idx, page_url in df['url'].items():
    docs = WebBaseLoader(page_url).load()
    # LangChain Document objects expose the page text as `page_content`.
    page_text = docs[0].page_content if docs else ''
    description = _extract_description(page_text)
    if description is None:
        print('"description" not found in the text.')
    else:
        # Write through the frame: assigning into the row yielded by
        # iterrows() only changes a per-iteration copy.
        df.at[idx, 'description'] = description
    
    

In [5]:
# (rows, columns) of the catalog frame currently bound to `df`
df.shape

(99, 8)

In [16]:
# Peek at one catalog title: look up key 1 in the 'title' column and coerce it to str
m = str(df['title'][1])
print(m)

Contact Center Customer Service + 8.0


In [36]:
import pandas as pd

def _slugify_titles(titles):
    """Turn a Series of product titles into URL slugs.

    Titles are lower-cased and every run of non-word characters (spaces, '+',
    '.', ...) becomes a single hyphen; leading/trailing hyphens are removed.
    """
    return (
        titles.str.lower()
        # regex=True is spelled out because the default of str.replace differs
        # between pandas versions; left implicit, a character-class pattern
        # may be matched as a literal string and silently do nothing.
        .str.replace(r'\W+', '-', regex=True)
        .str.strip('-')
    )


# Sample DataFrame (replace this with your actual DataFrame)
df = pd.DataFrame({
    'title': ['Contact Center Customer Service + 8.0', 'Account Manager Solution', 'Product Catalog']
})

# Base URL
base_url = 'https://www.shl.com/solutions/products/product-catalog/'

# Create the 'url' column from the slugified titles.
# NOTE(review): confirm the slug format against a few live catalog URLs.
df['url'] = base_url + "view/" + _slugify_titles(df['title'])

# Print the DataFrame to check the result
print(df['url'].head())


0    https://www.shl.com/solutions/products/product...
1    https://www.shl.com/solutions/products/product...
2    https://www.shl.com/solutions/products/product...
Name: url, dtype: object
