In [None]:
import pandas as pd
from langchain_community.vectorstores import Chroma


# Read the train and test CSVs from the working directory, in that order.
train_df, test_df = map(pd.read_csv, ("train_data.csv", "test_data.csv"))

# Re-open the vector DB that notebook 01 persisted to disk.
vector_db = Chroma(persist_directory="shl_vector_db")


In [None]:
from difflib import get_close_matches
# SHL catalogue export; the two lists below are row-aligned (same index -> same
# catalogue row), which is what lets a matched URL be mapped back to its name.
shl_df = pd.read_csv("shl_full_database.csv")

shl_urls, shl_names = (
    shl_df[column].tolist() for column in ("URL", "Assessment Name")
)

def match_name(url, url_list, name_list):
    """Return the name whose URL best matches ``url``, or ``None``.

    Parameters
    ----------
    url : str
        URL to look up. Any non-string value (e.g. the float NaN pandas uses
        for a missing cell, or ``None``) yields ``None`` instead of raising.
    url_list : list
        Candidate URLs. Non-string entries are ignored when fuzzy matching.
    name_list : list
        Names aligned by position with ``url_list``.

    Returns
    -------
    The entry of ``name_list`` at the position of the best-matching URL, or
    ``None`` when no candidate reaches the 0.6 similarity cutoff.
    """
    # difflib can only compare sequences; a NaN/None URL would raise TypeError.
    if not isinstance(url, str):
        return None

    # An identical URL is always difflib's top match (ratio 1.0), so answer it
    # directly instead of scoring every candidate.
    if url in url_list:
        return name_list[url_list.index(url)]

    # Drop non-string candidates (missing catalogue cells) before scoring.
    candidates = [candidate for candidate in url_list if isinstance(candidate, str)]
    match = get_close_matches(url, candidates, n=1, cutoff=0.6)
    if match:
        # Index into the ORIGINAL list so positions stay aligned with name_list.
        return name_list[url_list.index(match[0])]
    return None

# Resolve every labelled URL to a catalogue name; rows with no sufficiently
# close URL get None.
train_df["Assessment_name"] = train_df["Assessment_url"].apply(
    match_name, args=(shl_urls, shl_names)
)


In [None]:
def balanced_recommendations(results, final_k=10):
    """Pick up to ``final_k`` results, spread evenly over test types K, P and A.

    Parameters
    ----------
    results : list
        Retrieved items in ranked order. Each item must expose a ``metadata``
        dict; its ``"test_type"`` entry (default ``"K"`` when absent) selects
        the bucket. Items with any other type are only used as fill-ins.
    final_k : int, default 10
        Maximum number of items to return. Values <= 0 return an empty list.

    Returns
    -------
    list
        At most ``final_k`` items: first an equal share from each non-empty
        bucket (in K, P, A order, keeping the ranked order inside a bucket),
        then remaining results in ranked order until ``final_k`` is reached.
    """
    # A non-positive budget would otherwise turn into a negative slice below.
    if final_k <= 0:
        return []

    type_dict = {"K": [], "P": [], "A": []}
    for r in results:
        ttype = r.metadata.get("test_type", "K")
        if ttype in type_dict:
            type_dict[ttype].append(r)

    # Only buckets that actually received items share the budget.
    populated = [items for items in type_dict.values() if items]

    balanced = []
    if populated:
        # Guarded by `if populated`: with no recognised type there is nothing
        # to divide by, and the fill step below supplies all the output.
        per_type = max(final_k // len(populated), 1)
        for items in populated:
            balanced.extend(items[:per_type])

    # Top up with not-yet-selected results (including unrecognised types).
    if len(balanced) < final_k:
        remaining = [r for r in results if r not in balanced]
        balanced.extend(remaining[:final_k - len(balanced)])

    # per_type is floored at 1, so more buckets than final_k can overshoot.
    return balanced[:final_k]


In [None]:
K = 10  # number of recommendations scored per query
recall_scores = []

for query, group in train_df.groupby("Query"):
    # Every row of the group is one relevant assessment for this query.
    # Names are stripped the same way as the retrieved ones; rows whose URL
    # could not be matched (None/NaN) stay in the list and count as misses.
    true_names = [
        name.strip() if isinstance(name, str) else name
        for name in group["Assessment_name"].tolist()
    ]
    results = vector_db.similarity_search(query, k=20)
    balanced = balanced_recommendations(results, final_k=K)
    retrieved_names = [r.metadata.get("name", "").strip() for r in balanced]

    # Recall@K = relevant items retrieved / relevant items, per query.
    # (A plain any(...) here would be a hit-rate, not recall.)
    hits = sum(name in retrieved_names for name in true_names)
    score = hits / len(true_names)
    recall_scores.append(score)

# No queries at all -> report NaN rather than dividing by zero.
mean_recall_at_10 = (
    sum(recall_scores) / len(recall_scores) if recall_scores else float("nan")
)
print(f"Mean Recall@{K} (Balanced):", mean_recall_at_10)


C:\Users\Drishti Prakash\.cache\chroma\onnx_models\all-MiniLM-L6-v2\onnx.tar.gz: 100%|██████████████████████████████████████████████████████████████████████████████████████| 79.3M/79.3M [00:56<00:00, 1.47MiB/s]


Mean Recall@10 (Balanced): 0.5


In [None]:
# One output row per (query, recommended assessment URL) pair.
submission = []
for query in test_df["Query"]:
    results = vector_db.similarity_search(query, k=20)
    balanced = balanced_recommendations(results, final_k=10)
    submission.extend(
        {"Query": query, "Assessment_url": doc.metadata["url"]} for doc in balanced
    )

# Write the predictions without the DataFrame index column.
submission_df = pd.DataFrame(submission)
submission_df.to_csv("shl_test_predictions.csv", index=False)
print("Submission CSV created!")


Submission CSV created!
