# === STABLE VERSION (DO NOT MODIFY) ===
# Verified Mean Recall@10 ≈ 0.28


In [21]:
import sys
import os

# Put the parent directory on the import path so that package imports rooted
# there (e.g. `from src...`) resolve when the notebook runs from a subfolder.
sys.path.append(os.path.abspath(".."))   # if notebook is inside src


## Load training data

In [22]:
import pandas as pd 
import os 

def load_train_data(path):
    """Read the Excel workbook at ``path`` and return it as a DataFrame.

    This is a thin wrapper around ``pd.read_excel`` called with its default
    options; whatever that call raises is propagated unchanged.
    """
    return pd.read_excel(path)

In [23]:
# Load the training spreadsheet; the preview shows `Query` / `Assessment_url` columns.
data = load_train_data('./Gen_AI Dataset.xlsx')
data.columns  # not rendered: only the last expression of a cell is displayed
data.head(5)

Unnamed: 0,Query,Assessment_url
0,I am hiring for Java developers who can also c...,https://www.shl.com/solutions/products/product...
1,I am hiring for Java developers who can also c...,https://www.shl.com/solutions/products/product...
2,I am hiring for Java developers who can also c...,https://www.shl.com/solutions/products/product...
3,I am hiring for Java developers who can also c...,https://www.shl.com/solutions/products/product...
4,I am hiring for Java developers who can also c...,https://www.shl.com/products/product-catalog/v...


In [24]:
from collections import defaultdict

def build_query_to_labels(df):
    """Group the relevant assessment URLs by query.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``"Query"`` and ``"Assessment_url"``, with
        one row per (query, URL) pair.

    Returns
    -------
    collections.defaultdict
        Maps each distinct query to the list of its URLs, in row order.
        Queries appear in order of first occurrence. Because the result is a
        ``defaultdict(list)``, looking up an unknown query inserts and returns
        an empty list instead of raising ``KeyError``.

    Raises
    ------
    KeyError
        If either required column is missing from ``df``.
    """
    query_to_labels = defaultdict(list)
    # Walk the two columns in lockstep instead of using iterrows(), which
    # materialises a Series object for every row.
    for query, url in zip(df["Query"], df["Assessment_url"]):
        query_to_labels[query].append(url)
    return query_to_labels


In [25]:
# Build the query -> relevant-URL mapping that the evaluation cells use as ground truth.
query_to_labels = build_query_to_labels(data)

# Print every query with its complete label list (verbose: one entry per distinct query).
for q,d in query_to_labels.items():
    print(f"Query:{q} \n Assessments: {d}")

Query:I am hiring for Java developers who can also collaborate effectively with my business teams. Looking for an assessment(s) that can be completed in 40 minutes. 
 Assessments: ['https://www.shl.com/solutions/products/product-catalog/view/automata-fix-new/', 'https://www.shl.com/solutions/products/product-catalog/view/core-java-entry-level-new/', 'https://www.shl.com/solutions/products/product-catalog/view/java-8-new/', 'https://www.shl.com/solutions/products/product-catalog/view/core-java-advanced-level-new/', 'https://www.shl.com/products/product-catalog/view/interpersonal-communications/']
Query:I want to hire new graduates for a sales role in my company, the budget is for about an hour for each test. Give me some options 
 Assessments: ['https://www.shl.com/solutions/products/product-catalog/view/entry-level-sales-7-1/', 'https://www.shl.com/solutions/products/product-catalog/view/entry-level-sales-sift-out-7-1/', 'https://www.shl.com/solutions/products/product-catalog/view/entr

In [58]:
from src.Indexing.Index import get_vector_store

# Obtain the vector store from the project's indexing module; later cells pass
# this object explicitly into the retrieval / evaluation helpers.
vector_store = get_vector_store()
def retrieve_top_k(vector_store, query, k=10, retrieve_k=30):
    """Fetch candidate documents for ``query`` from the vector store.

    Returns whatever ``vector_store.similarity_search(query, k=retrieve_k)``
    returns. Note that ``k`` is accepted but not used here: the number of
    candidates is governed solely by ``retrieve_k``, and callers trim the
    result to their own cut-off afterwards.
    """
    return vector_store.similarity_search(query, k=retrieve_k)



In [50]:
def rerank_by_query_overlap(query, docs):
    """Order ``docs`` by how many distinct query terms occur in their text.

    The query is lower-cased and split on whitespace into a set of terms.
    A document's score is the number of those terms found in its lower-cased
    ``page_content``. Matching is by substring, not by whole word, so the
    term "art" also counts for a text containing "smart".

    Returns a new list sorted by descending score; documents with equal
    scores keep their original relative order.
    """
    terms = set(query.lower().split())

    def overlap(doc):
        haystack = doc.page_content.lower()
        hits = 0
        for term in terms:
            if term in haystack:
                hits += 1
        return hits

    ranked = list(docs)
    # list.sort is stable even with reverse=True, so ties preserve input order.
    ranked.sort(key=overlap, reverse=True)
    return ranked


In [51]:
def normalize_url(url):
    """Canonicalise an assessment URL so differently spelled links compare equal.

    Trailing slashes are stripped and every occurrence of ``"/solutions"`` is
    removed, so ``.../solutions/products/...`` and ``.../products/...`` map to
    the same string.

    Parameters
    ----------
    url : str or None
        The URL to normalise.

    Returns
    -------
    str
        The normalised URL, or ``""`` when ``url`` is not a string.
    """
    if not isinstance(url, str):
        # Callers pass ``doc.metadata.get("url")``, which is None when the key
        # is absent; return an empty string instead of raising AttributeError.
        return ""
    return url.rstrip("/").replace("/solutions", "")


In [52]:
# Spot-check retrieval on the first query in the label mapping.
sample_query = list(query_to_labels.keys())[0]
predicted = retrieve_top_k(vector_store, sample_query)

# Show the first three retrieved documents next to the ground-truth URLs.
print("Predicted:", predicted[:3])
print("Relevant:", query_to_labels[sample_query])


Predicted: [Document(id='66bfb20d-d5e8-4312-ae7f-a6bdc0a3542a', metadata={'id': 'java_2_platform_enterprise_edition_14_fundamental', 'name': 'Java 2 Platform Enterprise Edition 1.4 Fundamental', 'url': 'https://www.shl.com/products/product-catalog/view/java-2-platform-enterprise-edition-1-4-fundamental/', 'test_type': ['Knowledge & Skills']}, page_content='Assessment name: Java 2 Platform Enterprise Edition 1.4 Fundamental. This is an SHL assessment designed for recruitment and hiring. It is used to screen job applicants and evaluate candidates. Measures skills related to Knowledge and Skills. Assessment category includes Knowledge & Skills. It on-site testing and uses standard testing methodology. Suitable for recruiters, hiring managers, and talent acquisition teams to assess candidate suitability for job roles.'), Document(id='fc9a6d97-81ed-4b0e-a660-b5958fd2d51b', metadata={'id': 'java_frameworks_new', 'name': 'Java Frameworks (New)', 'url': 'https://www.shl.com/products/product-ca

In [53]:
def recall_at_k(predicted, relevant, k=10):
    """Fraction of the relevant items that appear among the first ``k`` predictions.

    Both inputs are de-duplicated before counting. When ``relevant`` is empty
    the denominator falls back to 1, so the result is ``0.0`` rather than a
    division error.
    """
    top_k = set(predicted[:k])
    truth = set(relevant)
    hits = sum(1 for item in truth if item in top_k)
    denominator = len(truth) if truth else 1
    return hits / denominator

def mean_recall_at_k(vector_store, query_to_labels, k=10):
    """Mean Recall@k over all labelled queries, without reranking.

    For each query the candidates from ``retrieve_top_k`` are reduced to their
    normalised ``metadata["url"]`` values and compared with the normalised
    ground-truth URLs. Previously the raw document objects were handed to
    ``recall_at_k`` and compared against URL strings, which can never match.

    NOTE: a later cell defines another ``mean_recall_at_k`` (built on
    ``evaluate_query``, which also reranks); once that cell has run, it
    replaces this definition in the kernel.

    Parameters
    ----------
    vector_store : object
        Passed through to ``retrieve_top_k``.
    query_to_labels : mapping
        Query string -> iterable of relevant URLs.
    k : int
        Cut-off applied by ``recall_at_k``.

    Returns
    -------
    float
        Average of the per-query recall values.

    Raises
    ------
    ZeroDivisionError
        If ``query_to_labels`` is empty.
    """
    recalls = []

    for query, relevant_urls in query_to_labels.items():
        docs = retrieve_top_k(vector_store, query, k)
        # Compare like with like: URL strings on both sides, normalised the same way.
        # A document without a "url" entry contributes an empty string.
        predicted_urls = [
            normalize_url(doc.metadata.get("url") or "") for doc in docs
        ]
        normalized_relevant = [normalize_url(u) for u in relevant_urls]
        r = recall_at_k(predicted_urls, normalized_relevant, k)
        recalls.append(r)

    return sum(recalls) / len(recalls)



In [59]:
def evaluate_query(vector_store, query, relevant_urls, k=10, retrieve_k=40):
    """Recall@k for a single query after retrieval and overlap reranking.

    Steps: fetch ``retrieve_k`` candidates, rerank them with
    ``rerank_by_query_overlap``, keep the first ``k``, then compare their
    normalised URLs with the normalised ground-truth URLs.

    Parameters
    ----------
    vector_store : object
        Passed through to ``retrieve_top_k``.
    query : str
        The query text.
    relevant_urls : iterable of str
        Ground-truth URLs for ``query``.
    k : int
        Number of reranked documents that are scored.
    retrieve_k : int
        Size of the candidate pool fetched before reranking. Defaults to 40,
        the value that was previously hard-coded in the body.

    Returns
    -------
    float
        The value returned by ``recall_at_k`` for this query.
    """
    docs = retrieve_top_k(vector_store, query, k, retrieve_k=retrieve_k)
    docs = rerank_by_query_overlap(query, docs)
    docs = docs[:k]

    # A document without a "url" entry contributes an empty string instead of
    # passing None into normalize_url.
    predicted_urls = [
        normalize_url(doc.metadata.get("url") or "") for doc in docs
    ]
    relevant_urls = [
        normalize_url(u) for u in relevant_urls
    ]

    return recall_at_k(predicted_urls, relevant_urls, k)


In [55]:
def mean_recall_at_k(vector_store, query_to_labels, k=10):
    """Average of ``evaluate_query`` over every (query, relevant URLs) pair.

    This definition reuses the name of the function defined in the same cell
    as ``recall_at_k`` and therefore replaces it once this cell has run.
    Raises ``ZeroDivisionError`` when ``query_to_labels`` is empty.
    """
    per_query = [
        evaluate_query(vector_store, query, relevant_urls, k)
        for query, relevant_urls in query_to_labels.items()
    ]
    return sum(per_query) / len(per_query)



In [60]:
# Per-query Recall@10 breakdown, to see which queries the pipeline handles poorly.
for query, relevant_urls in query_to_labels.items():
    r = evaluate_query(vector_store, query, relevant_urls, k=10)
    print(f"Query: {query[:60]}...")  # only the first 60 characters of the query are shown
    print(f"Recall@10: {r}")
    print("-" * 50)  # visual separator between queries


Query: I am hiring for Java developers who can also collaborate eff...
Recall@10: 0.6
--------------------------------------------------
Query: I want to hire new graduates for a sales role in my company,...
Recall@10: 0.0
--------------------------------------------------
Query: I am looking for a COO for my company in China and I want to...
Recall@10: 0.5
--------------------------------------------------
Query: KEY RESPONSIBITILES:

Manage the sound-scape of the station ...
Recall@10: 0.4
--------------------------------------------------
Query: Content Writer required, expert in English and SEO....
Recall@10: 0.8
--------------------------------------------------
Query: Find me 1 hour long assesment for the below job at SHL
Job D...
Recall@10: 0.0
--------------------------------------------------
Query: ICICI Bank Assistant Admin, Experience required 0-2 years, t...
Recall@10: 0.0
--------------------------------------------------
Query: We're looking for a Marketing Manager who c