In [2]:
import re
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer

# Sentence-embedding model shared by the helper functions and cells below.
# Constructing it here keeps this cell runnable on a fresh kernel.
MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(MODEL_NAME)

def extract_text_from_url(url: str) -> str:
    """Download a job-description page and return the text of its ``<p>`` tags.

    Only paragraph elements are read; text that lives in other tags (lists,
    headings, tables) is not included.

    Args:
        url: Address of the page to download.

    Returns:
        The non-empty paragraph texts joined by single spaces, or ``""`` when
        the request fails, the server answers with an HTTP error status, or
        parsing raises. Failures are printed rather than raised.
    """
    try:
        res = requests.get(url, timeout=5)
        # Without this check a 404/500 error page would be parsed and its
        # boilerplate returned as if it were the job description.
        res.raise_for_status()
        soup = BeautifulSoup(res.content, 'html.parser')
        # A separator is needed together with strip=True: otherwise text that
        # is split across inline tags is glued together
        # ("Senior <b>Manager</b>" -> "SeniorManager").
        paragraph_texts = (p.get_text(" ", strip=True) for p in soup.find_all("p"))
        return " ".join(chunk for chunk in paragraph_texts if chunk)
    except Exception as e:
        # Deliberate best-effort: callers receive "" instead of an exception.
        print(f"Error extracting from URL: {e}")
        return ""

def process_input(input_text: str) -> str:
    """Return the text to embed for a free-text query or a job-description URL.

    Input that starts with ``http://`` or ``https://`` (ignoring surrounding
    whitespace and letter case of the scheme) is treated as a URL and replaced
    by the text extracted from that page. Anything else is returned unchanged.

    Args:
        input_text: A search query, or the URL of a job-description page.

    Returns:
        The extracted page text for URLs, otherwise ``input_text`` as given.
    """
    # Pasted URLs often carry a leading space or trailing newline; without the
    # strip they would fail the match and be embedded as literal query text.
    candidate = input_text.strip()
    if re.match(r'https?://', candidate, flags=re.IGNORECASE):
        return extract_text_from_url(candidate)
    return input_text

def get_query_embedding(text: str):
    """Encode one piece of text with the shared sentence-embedding model.

    The text is wrapped in a one-element list before encoding, so the result
    is whatever ``model.encode`` returns for a batch of size one (the notebook
    output shows shape ``(1, 384)`` for the loaded model).
    """
    single_item_batch = [text]
    return model.encode(single_item_batch)
# Demo: run a plain-text query through process_input. It does not start with
# http(s)://, so it is returned as-is. `processed_text` is reused by a later cell.
query = "looking for a behavioral test for mid-level managers"
processed_text = process_input(query)
print("Processed Query Text:", processed_text)


Processed Query Text: looking for a behavioral test for mid-level managers


In [4]:
# Encode the query text produced by the previous cell; this cell depends on
# `processed_text` having been defined there.
embedding = get_query_embedding(processed_text)
print("Query embedding shape:", embedding.shape)

Query embedding shape: (1, 384)


In [10]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load your enriched catalog and precomputed embeddings
catalog_df = pd.read_csv("shl_enriched_catalog.csv")
embeddings = np.load("shl_embeddings.npy")  # assuming you saved them as .npy

# recommend() uses the position of an embedding row to look up the catalog row
# at the same position, so a length mismatch would return the wrong tests.
assert embeddings.shape[0] == len(catalog_df), (
    f"embeddings has {embeddings.shape[0]} rows but the catalog has "
    f"{len(catalog_df)}; regenerate the embeddings from this catalog"
)

def recommend(query_text: str, top_n: int = 5):
    """Return the catalog rows most similar to a query or job-description URL.

    The input is passed through ``process_input`` (which fetches page text for
    http(s) URLs), embedded, and compared by cosine similarity against the
    precomputed ``embeddings``; row ``i`` of ``embeddings`` is matched to row
    ``i`` of ``catalog_df``.

    Args:
        query_text: Free-text query, or the URL of a job-description page.
        top_n: Maximum number of rows to return. Must not be negative.

    Returns:
        A DataFrame with the columns "Test Name", "Link", "Remote Testing",
        "Adaptive/IRT", "duration", "Test Types" and "similarity" (rounded to
        three decimals), ordered from most to least similar with a fresh
        0-based index. The frame is empty when ``top_n`` is 0 or when there is
        no text to embed (blank query, or a URL whose extraction returned "").

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        # A negative value would otherwise slice off the last |top_n| entries
        # and return almost the entire catalog.
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    # Process input (query or JD URL)
    cleaned_text = process_input(query_text)

    if cleaned_text.strip():
        query_embedding = get_query_embedding(cleaned_text)
        # cosine_similarity returns one row per query; there is a single query.
        scores = cosine_similarity(query_embedding, embeddings)[0]
        # argsort is ascending, so reverse before taking the first top_n.
        top_indices = np.argsort(scores)[::-1][:top_n]
        top_scores = scores[top_indices]
    else:
        # Nothing to embed: ranking against an empty string would produce
        # arbitrary recommendations, so return no rows instead.
        top_indices = np.array([], dtype=int)
        top_scores = np.array([], dtype=float)

    results = catalog_df.iloc[top_indices].copy()
    results["similarity"] = np.round(top_scores, 3)

    # Keep only the columns intended for display
    results = results[[
        "Test Name", "Link", "Remote Testing", "Adaptive/IRT",
        "duration", "Test Types", "similarity"
    ]]

    return results.reset_index(drop=True)



In [12]:
# Demo: top-5 recommendations (the default top_n) for a plain-text query.
results_df = recommend("behavioral test for mid-level managers")
print(results_df)


                         Test Name  \
0            Human Resources (New)   
1                 Manager 8.0+ JFA   
2  Customer Service Phone Solution   
3                  Manager 8.0 JFA   
4           Time Management (U.S.)   

                                                Link Remote Testing  \
0  https://www.shl.com/solutions/products/product...            Yes   
1  https://www.shl.com/solutions/products/product...            Yes   
2  https://www.shl.com/solutions/products/product...            Yes   
3  https://www.shl.com/solutions/products/product...            Yes   
4  https://www.shl.com/solutions/products/product...            Yes   

  Adaptive/IRT    duration Test Types  similarity  
0           No   8 minutes          K       0.562  
1           No  44 minutes    B, K, P       0.522  
2           No  30 minutes    B, P, S       0.513  
3           No  26 minutes    B, K, P       0.510  
4          Yes  15 minutes          K       0.505  
