In [11]:
import sys
!{sys.executable} -m pip install pandas






[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [8]:
import pandas as pd

path = "Gen_AI Dataset.xlsx"

train_df = pd.read_excel(path, sheet_name='Train-Set')
test_df = pd.read_excel(path, sheet_name='Test-Set')

train_df.head(), test_df.head()


(                                               Query  \
 0  I am hiring for Java developers who can also c...   
 1  I am hiring for Java developers who can also c...   
 2  I am hiring for Java developers who can also c...   
 3  I am hiring for Java developers who can also c...   
 4  I am hiring for Java developers who can also c...   
 
                                       Assessment_url  
 0  https://www.shl.com/solutions/products/product...  
 1  https://www.shl.com/solutions/products/product...  
 2  https://www.shl.com/solutions/products/product...  
 3  https://www.shl.com/solutions/products/product...  
 4  https://www.shl.com/products/product-catalog/v...  ,
                                                Query
 0  Looking to hire mid-level professionals who ar...
 1  Job Description\n\n Join a community that is s...
 2  I am hiring for an analyst and wants applicati...
 3  I have a JD Job Description\n\n People Science...
 4  I am new looking for new graduates in my sale

In [7]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

catalog_url = "https://www.shl.com/solutions/products/product-catalog/"

response = requests.get(catalog_url)
soup = BeautifulSoup(response.text, "html.parser")

print("Page loaded!")


Page loaded!


In [9]:
# Find all product cards on the page
cards = soup.find_all("a", class_="product-card")

print("Total assessments found:", len(cards))


Total assessments found: 0


In [12]:
import requests
import pandas as pd

api_url = "https://www.shl.com/wp-json/shl/v1/products"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36"
}

resp = requests.get(api_url, headers=headers)

print("Status code:", resp.status_code)
print("Content preview:", resp.text[:200])  # just to inspect


Status code: 404
Content preview: <!DOCTYPE html>
<html class="no-js ss-errorpage  || js-html-tag" lang="en-US" data-locale="en_US" data-localeroot="/">
<head>
  
<meta charset="utf-8">
<title>Page not found | English (Global)</title>


In [13]:
# Extract unique assessment URLs from the training data
unique_assessments = train_df['Assessment_url'].unique()

print("Total unique assessments:", len(unique_assessments))

# Create basic assessment dataframe
assessments_df = pd.DataFrame({
    "url": unique_assessments
})

assessments_df.head()


Total unique assessments: 54


Unnamed: 0,url
0,https://www.shl.com/solutions/products/product...
1,https://www.shl.com/solutions/products/product...
2,https://www.shl.com/solutions/products/product...
3,https://www.shl.com/solutions/products/product...
4,https://www.shl.com/products/product-catalog/v...


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Simple text representation based on URL (baseline model)
assessment_texts = assessments_df['url']

# Create TF-IDF vectorizer
vectorizer = TfidfVectorizer(stop_words='english')

# Fit on all assessment URLs
assessment_vectors = vectorizer.fit_transform(assessment_texts)

assessment_vectors.shape


(54, 108)

In [18]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend_assessments(query, top_k=5):
    # Vectorize the input query
    query_vec = vectorizer.transform([query])
    
    # Compute cosine similarity with all assessments
    similarity_scores = cosine_similarity(query_vec, assessment_vectors).flatten()
    
    # Get top K indices
    top_indices = similarity_scores.argsort()[-top_k:][::-1]
    
    # Return top K assessment URLs with scores
    return assessments_df.iloc[top_indices].assign(score=similarity_scores[top_indices])


In [19]:
recommend_assessments("I want to hire a Java developer")


Unnamed: 0,url,score
2,https://www.shl.com/solutions/products/product...,0.715652
1,https://www.shl.com/solutions/products/product...,0.465186
3,https://www.shl.com/solutions/products/product...,0.436352
51,https://www.shl.com/solutions/products/product...,0.0
53,https://www.shl.com/solutions/products/product...,0.0


In [20]:
def clean_url(url):
    url = url.replace("https://www.shl.com/solutions/products/product-catalog/", "")
    url = url.replace("https://www.shl.com/products/product-catalog/", "")
    url = url.replace("-", " ")
    url = url.replace("/", " ")
    url = url.strip()
    return url

# Apply cleaning
assessments_df["clean_text"] = assessments_df["url"].apply(clean_url)

assessments_df.head()


Unnamed: 0,url,clean_text
0,https://www.shl.com/solutions/products/product...,view automata fix new
1,https://www.shl.com/solutions/products/product...,view core java entry level new
2,https://www.shl.com/solutions/products/product...,view java 8 new
3,https://www.shl.com/solutions/products/product...,view core java advanced level new
4,https://www.shl.com/products/product-catalog/v...,view interpersonal communications


In [22]:
# Use cleaned text for TF-IDF
assessment_texts = assessments_df['clean_text']

vectorizer = TfidfVectorizer(stop_words='english')

# Fit TF-IDF on cleaned text
assessment_vectors = vectorizer.fit_transform(assessment_texts)

assessment_vectors.shape


(54, 101)

In [23]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend_assessments_v2(query, top_k=5):
    # Vectorize user query using same TF-IDF model
    query_vec = vectorizer.transform([query])
    
    # Compute similarity
    similarity_scores = cosine_similarity(query_vec, assessment_vectors).flatten()
    
    # Get top results
    top_indices = similarity_scores.argsort()[-top_k:][::-1]
    
    # Return results with scores
    return assessments_df.iloc[top_indices][["url", "clean_text"]].assign(score=similarity_scores[top_indices])


In [24]:
recommend_assessments_v2("Looking for a core Java developer with strong problem solving skills")


Unnamed: 0,url,clean_text,score
1,https://www.shl.com/solutions/products/product...,view core java entry level new,0.572601
3,https://www.shl.com/solutions/products/product...,view core java advanced level new,0.532067
2,https://www.shl.com/solutions/products/product...,view java 8 new,0.459677
18,https://www.shl.com/solutions/products/product...,view global skills assessment,0.360101
53,https://www.shl.com/solutions/products/product...,view data warehousing concepts,0.0


In [25]:
results = []

for q in test_df["Query"]:
    recs = recommend_assessments_v2(q, top_k=10)
    urls = "|".join(recs["url"].tolist())  # combine top 10 into a single string
    results.append([q, urls])


In [26]:
final_df = pd.DataFrame(results, columns=["Query", "Recommended_Assessments"])
final_df.head()


Unnamed: 0,Query,Recommended_Assessments
0,Looking to hire mid-level professionals who ar...,https://www.shl.com/solutions/products/product...
1,Job Description\n\n Join a community that is s...,https://www.shl.com/solutions/products/product...
2,I am hiring for an analyst and wants applicati...,https://www.shl.com/products/product-catalog/v...
3,I have a JD Job Description\n\n People Science...,https://www.shl.com/solutions/products/product...
4,I am new looking for new graduates in my sales...,https://www.shl.com/solutions/products/product...


In [29]:
final_df.to_csv("SHL_Recommendations.csv", index=False)


In [30]:
import os

os.listdir()


['.ipynb_checkpoints',
 'Gen_AI Dataset.xlsx',
 'recommendation_engine.ipynb',
 'SHL_Recommendations.csv']

In [31]:
import pandas as pd
pd.read_csv("SHL_Recommendations.csv").head()


Unnamed: 0,Query,Recommended_Assessments
0,Looking to hire mid-level professionals who ar...,https://www.shl.com/solutions/products/product...
1,Job Description\n\n Join a community that is s...,https://www.shl.com/solutions/products/product...
2,I am hiring for an analyst and wants applicati...,https://www.shl.com/products/product-catalog/v...
3,I have a JD Job Description\n\n People Science...,https://www.shl.com/solutions/products/product...
4,I am new looking for new graduates in my sales...,https://www.shl.com/solutions/products/product...


In [32]:
import joblib

# Save vectorizer & vectors for the API
joblib.dump(vectorizer, "backend/vectorizer.pkl")
joblib.dump(assessment_vectors, "backend/vectors.pkl")


['backend/vectors.pkl']

In [33]:
assessments_df.to_csv("assessments_catalogue.csv", index=False)
