In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file mounted under the Kaggle input directory.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/shl-assessment-catalog-2025/raw_assessments.json
/kaggle/input/data-3/raw_assessments.json
/kaggle/input/data-scrapped-2/raw_assessments.json
/kaggle/input/shl-train-test/Gen_AI Dataset.xlsx


In [2]:
!pip install langchain-community
!pip install langchain_huggingface
!pip install faiss-gpu
# Install the missing Vector Database and Community tools
!pip install faiss-cpu langchain-community

# Upgrade Google GenAI and Protobuf to fix the "MessageFactory" error
!pip install -U google-generativeai protobuf

Collecting langchain-community
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting langchain-core<2.0.0,>=1.0.1 (from langchain-community)
  Downloading langchain_core-1.2.2-py3-none-any.whl.metadata (3.7 kB)
Collecting langchain-classic<2.0.0,>=1.0.0 (from langchain-community)
  Downloading langchain_classic-1.0.0-py3-none-any.whl.metadata (3.9 kB)
Collecting langchain-text-splitters<2.0.0,>=1.0.0 (from langchain-classic<2.0.0,>=1.0.0->langchain-community)
  Downloading langchain_text_splitters-1.1.0-py3-none-any.whl.metadata (2.7 kB)
Collecting uuid-utils<1.0,>=0.12.0 (from langchain-core<2.0.0,>=1.0.1->langchain-community)
  Downloading uuid_utils-0.12.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.1 kB)
Downloading langchain_community-0.4.1-py3-none-any.whl (2.5 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m2.5/2.5 

In [3]:
# =============================================================================
# üèÜ SHL AI ASSESSMENT - FINAL SUBMISSION (POLISHED & ROBUST)
# =============================================================================

import os
import json
import re
import warnings
import pandas as pd
import google.generativeai as genai
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from kaggle_secrets import UserSecretsClient

warnings.filterwarnings("ignore")

# ---------------------------------------------------------
# 1. ROBUST SETUP & AUTHENTICATION
# ---------------------------------------------------------
print("üîß Initializing SHL Recommendation Engine...")

# `model` stays None unless one of the candidate Gemini models answers a probe
# request; the messages below call that state "Retrieval-Only mode".
model = None

# Robust Model Selection Strategy
try:
    user_secrets = UserSecretsClient()
    GEMINI_API_KEY = user_secrets.get_secret("GEMINI_API_KEY")
    if GEMINI_API_KEY:
        genai.configure(api_key=GEMINI_API_KEY)

        # Fallback list: Try newest first, then stable, then legacy
        candidate_models = ["models/gemini-2.5-flash", "models/gemini-1.5-flash", "models/gemini-1.0-pro"]

        for candidate in candidate_models:
            try:
                # Test if model is accessible
                m = genai.GenerativeModel(candidate)
                # Quick dummy generation to verify access
                m.generate_content("test")
            except Exception as probe_error:
                # Catch Exception rather than using a bare `except:` so that
                # KeyboardInterrupt/SystemExit still propagate, and report
                # why this candidate was rejected before trying the next one.
                print(f"‚ö†Ô∏è {candidate} unavailable: {probe_error}")
                continue
            else:
                model = m
                print(f"‚úÖ Authenticated with Gemini API ({candidate})")
                break

        if not model:
            print("‚ö†Ô∏è Could not connect to any Gemini model. Running in Retrieval-Only mode.")
    else:
        print("‚ö†Ô∏è GEMINI_API_KEY missing. Running in Retrieval-Only mode.")
except Exception as e:
    # Secret lookup or SDK configuration failed; `model` remains None.
    print(f"‚ö†Ô∏è Auth Warning: {e}")

# ---------------------------------------------------------
# 2. DATA INGESTION & PRE-COMPUTATION
# ---------------------------------------------------------
DATA_PATH = "/kaggle/input/data-3/raw_assessments.json"

# ---------------------------------------------------------
# 3. OPTIMIZED TOKENIZER (Regex Fix)
# ---------------------------------------------------------
def extract_skill_tokens(text):
    """
    Extract the set of skill-like tokens from free text.

    - Catches: c++, c#, node.js, html5, asp.net
    - Ignores: HR stop words and single-character tokens

    Args:
        text: Query or document text. Anything that is not a ``str`` (for
            example a NaN cell read from a spreadsheet) yields an empty set.

    Returns:
        set[str]: lower-cased tokens that are not stop words and are longer
        than one character.
    """
    if not isinstance(text, str):
        return set()
    text = text.lower()

    # Token grammar:
    # 1. Starts with a letter, followed by letters/digits (html5)
    # 2. May contain . + # between alphanumeric runs (node.js, asp.net)
    # 3. May end in a run of + or # (c++, c#). The previous pattern only
    #    allowed trailing '+', so "c#" collapsed to "c" and was then dropped
    #    by the length filter below.
    tokens = re.findall(r"\b[a-z][a-z0-9]*(?:[.+#][a-z0-9]+)*[+#]*", text)

    # NOTE: "pre-packaged" can never equal a token because the pattern above
    # does not match hyphens; the entry is kept unchanged for compatibility.
    hr_stopwords = {
        "hire", "hiring", "role", "junior", "senior", "developer", "engineer",
        "position", "candidate", "assessment", "test", "solution", "professional",
        "comprehensive", "pre-packaged", "manager", "team", "lead", "good", "at",
        "and", "or", "with", "for", "the", "a", "an", "of", "in", "to", "level",
        "skills", "experience", "knowledge", "proficient", "using", "working"
    }

    return set(t for t in tokens if t not in hr_stopwords and len(t) > 1)

# Load and Index
# Read the catalog with an explicit UTF-8 encoding so decoding does not depend
# on the platform's default locale.
with open(DATA_PATH, "r", encoding="utf-8") as f:
    raw_data = json.load(f)

# One Document per catalog entry: the text that gets embedded plus the
# metadata needed later for scoring and for the submission file.
documents = []
for item in raw_data:
    page_content = f"Title: {item.get('name')} | Type: {item.get('test_type')} | Desc: {item.get('description')}"

    # Tokenize once at index time so each search only needs a set intersection.
    doc_tokens = extract_skill_tokens(page_content)

    metadata = {
        "name": item.get("name"),
        "url": item.get("url"),
        "doc_tokens": list(doc_tokens)
    }
    documents.append(Document(page_content=page_content, metadata=metadata))

print(f"üß† Indexing {len(documents)} assessments...")
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector_db = FAISS.from_documents(documents, embeddings)
print("‚úÖ Index Ready.")

# ---------------------------------------------------------
# 4. HYBRID SEARCH (Normalized Scoring)
# ---------------------------------------------------------
def clean_query(text):
    """Lower-case ``text``, remove boilerplate hiring phrases and collapse whitespace."""
    boilerplate = (
        "i am hiring for", "looking to hire", "i need a", "we are looking for",
        "candidates who are", "proficient in", "good at", "skills in",
    )
    result = text.lower()
    for filler in boilerplate:
        result = result.replace(filler, "")
    words = result.split()
    return " ".join(words)

def hybrid_search(query, k=10):
    """
    Blend vector similarity with skill-token coverage and return the top ``k`` docs.

    ``k*3`` nearest neighbours are fetched from ``vector_db`` and each is
    scored as ``0.7 * 1/(1+distance) + 0.3 * coverage``, where coverage is the
    share of the query's skill tokens found in the document's pre-computed
    ``doc_tokens`` (capped at 1.0).
    """
    semantic_query = clean_query(query)
    wanted_skills = extract_skill_tokens(query)
    # Never divide by zero when the query yields no skill tokens.
    skill_count = max(len(wanted_skills), 1)

    # Semantic stage: over-fetch so the re-ranking has candidates to promote.
    neighbours = vector_db.similarity_search_with_score(semantic_query, k=k*3)

    def blended_score(candidate, dist):
        # Map the distance into (0, 1]; smaller distance -> larger score.
        closeness = 1 / (1 + dist)
        candidate_skills = set(candidate.metadata.get('doc_tokens', []))
        shared = len(wanted_skills.intersection(candidate_skills))
        coverage = min(1.0, shared / skill_count)
        # 70% semantic, 30% lexical.
        return (closeness * 0.7) + (coverage * 0.3)

    ranked = sorted(
        ((blended_score(candidate, dist), candidate) for candidate, dist in neighbours),
        key=lambda pair: pair[0],
        reverse=True,
    )
    return [candidate for _score, candidate in ranked[:k]]

# ---------------------------------------------------------
# 5. EXECUTION & VALIDATION
# ---------------------------------------------------------
def get_slug(url):
    """Return the lower-cased last path segment of ``url``; "" for non-string input."""
    if isinstance(url, str):
        trimmed = url.strip().rstrip('/')
        return trimmed.rsplit('/', 1)[-1].lower()
    return ""

def run_pipeline():
    """
    Score the engine on the labelled sheet, then write predictions for the test sheet.

    Sheet index 0 of the workbook is used to compute Recall@10: a row counts as
    a hit when the slug of its target URL is among the slugs of the top 10
    results. Sheet index 1 is used to write the top 5 URLs per query to
    ``submission.csv`` (columns ``Query`` and ``Assessment_url``).

    Returns:
        None. Results are printed and written to disk.
    """
    print("\n" + "="*50)
    print("üß™ Validating (Polished Engine)")
    print("="*50)

    excel_path = "/kaggle/input/shl-train-test/Gen_AI Dataset.xlsx"
    if not os.path.exists(excel_path):
        # Say why nothing happened instead of returning silently.
        print(f"‚ö†Ô∏è Dataset not found at {excel_path}; skipping validation and submission.")
        return

    # Validation
    train_df = pd.read_excel(excel_path, sheet_name=0)
    hits = 0
    q_col = [c for c in train_df.columns if "query" in c.lower()][0]
    ans_col = [c for c in train_df.columns if "url" in c.lower()][0]

    for _, row in train_df.iterrows():
        query = row[q_col]
        # A blank cell is read as NaN (a float); searching it would crash in
        # clean_query, so such a row is simply counted as a miss.
        if not isinstance(query, str):
            continue
        target_slug = get_slug(row[ans_col])

        results = hybrid_search(query, k=10)
        found_slugs = [get_slug(d.metadata['url']) for d in results]

        if target_slug in found_slugs:
            hits += 1

    # Guard the ratio so an empty sheet reports 0% instead of raising ZeroDivisionError.
    recall = hits / len(train_df) if len(train_df) else 0.0
    print(f"üèÜ SCORE (Recall@10): {recall:.1%}")

    # Submission Generation
    print("\nüìù Generating Submission.csv...")
    test_df = pd.read_excel(excel_path, sheet_name=1)
    submission_rows = []
    q_col = [c for c in test_df.columns if "query" in c.lower()][0]

    for _, row in test_df.iterrows():
        query = row[q_col]
        # Skip blank cells for the same reason as in the validation loop.
        if not isinstance(query, str):
            continue
        results = hybrid_search(query, k=5)
        for doc in results:
            submission_rows.append({"Query": query, "Assessment_url": doc.metadata['url']})

    pd.DataFrame(submission_rows).to_csv("submission.csv", index=False)
    print("‚úÖ Success! 'submission.csv' ready.")

if __name__ == "__main__":
    run_pipeline()


All support for the `google.generativeai` package has ended. It will no longer be receiving 
updates or bug fixes. Please switch to the `google.genai` package as soon as possible.
See README for more details:

https://github.com/google-gemini/deprecated-generative-ai-python/blob/main/README.md

  self.__wrapped__.exec_module(module)


üîß Initializing SHL Recommendation Engine...
‚úÖ Authenticated with Gemini API (models/gemini-2.5-flash)
üß† Indexing 510 assessments...


2025-12-18 08:29:32.039506: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766046572.229656      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766046572.290839      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

‚úÖ Index Ready.

üß™ Validating (Polished Engine)
üèÜ SCORE (Recall@10): 21.5%

üìù Generating Submission.csv...
‚úÖ Success! 'submission.csv' ready.


In [4]:
# =============================================================================
# üèÜ SHL AI INTERN ASSESSMENT - FINAL HYBRID PIPELINE ("GRANDMASTER" EDITION)
# =============================================================================

import os
import json
import re
import warnings
import pandas as pd
import google.generativeai as genai
from collections import Counter
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from kaggle_secrets import UserSecretsClient

warnings.filterwarnings("ignore")

# ---------------------------------------------------------
# 1. SETUP & AUTHENTICATION
# ---------------------------------------------------------
print("üîß Setting up environment...")

# Remains None when no key is available or the SDK setup raises.
model = None

try:
    user_secrets = UserSecretsClient()
    GEMINI_API_KEY = user_secrets.get_secret("GEMINI_API_KEY")
    if not GEMINI_API_KEY:
        print("‚ö†Ô∏è GEMINI_API_KEY secret is missing.")
    else:
        genai.configure(api_key=GEMINI_API_KEY)
        # Using the model found in your logs
        target_model = "models/gemini-2.5-flash"
        # Ask for JSON responses so the reply can be fed to json.loads.
        json_only = {"response_mime_type": "application/json"}
        model = genai.GenerativeModel(target_model, generation_config=json_only)
        print(f"‚úÖ Authenticated with Gemini API ({target_model}).")
except Exception as auth_error:
    print(f"‚ö†Ô∏è Auth Warning: {auth_error}")

# ---------------------------------------------------------
# 2. BUILD VECTOR DATABASE
# ---------------------------------------------------------
DATA_PATH = "/kaggle/input/data-3/raw_assessments.json"
print("\nüìÇ Loading SHL Catalog...")

# Explicit UTF-8 so decoding does not depend on the platform's default locale.
with open(DATA_PATH, "r", encoding="utf-8") as f:
    raw_data = json.load(f)

# One Document per catalog entry: embedded text plus metadata for later display.
documents = []
for item in raw_data:
    # Build the "Type" text defensively: a null `test_type` used to raise
    # TypeError in ', '.join(...), and a bare string was split into characters.
    test_types = item.get('test_type') or []
    if isinstance(test_types, list):
        type_str = ', '.join(map(str, test_types))
    else:
        type_str = str(test_types)

    page_content = f"""
    Title: {item.get('name', 'Unknown')}
    Type: {type_str}
    Description: {item.get('description', '')}
    """
    metadata = {
        "name": item.get("name"),
        "url": item.get("url"),
        "types": item.get("test_type"),
        "duration": item.get("duration"),
        "adaptive": item.get("adaptive_support", "No"),
        "remote": item.get("remote_support", "Yes")
    }
    documents.append(Document(page_content=page_content, metadata=metadata))

print(f"üß† Indexing {len(documents)} assessments...")
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector_db = FAISS.from_documents(documents, embeddings)
print("üöÄ Retrieval Engine Ready!")

# ---------------------------------------------------------
# 3. HELPERS (Cleaners)
# ---------------------------------------------------------
def clean_query(text):
    """Lower-case ``text``, remove boilerplate hiring phrases and collapse whitespace."""
    boilerplate = (
        "i am hiring for", "looking to hire", "i need a", "we are looking for",
        "candidates who are", "proficient in", "good at", "can also",
        "collaborate with", "skills in", "experience in", "based on the jd",
        "recommend some assessment", "applications to screen",
    )
    result = text.lower()
    for filler in boilerplate:
        result = result.replace(filler, "")
    words = result.split()
    return " ".join(words)

def get_slug(url):
    """Return the lower-cased last path segment of ``url``; "" for non-string input."""
    if isinstance(url, str):
        trimmed = url.strip().rstrip('/')
        return trimmed.rsplit('/', 1)[-1].lower()
    return ""

# ---------------------------------------------------------
# 4. NEW: HYBRID RE-RANKER (The "Grandmaster" Logic)
# ---------------------------------------------------------
def extract_skill_tokens(text):
    """Return the set of lower-cased word tokens in ``text`` that are not HR stop words.

    A token is a run of letters optionally followed by '+' or '#' characters
    (so 'c++' and 'c#' stay whole); one-character tokens are discarded.
    """
    ignored = frozenset((
        "hire", "hiring", "role", "junior", "senior", "developer", "engineer",
        "position", "candidate", "assessment", "test", "solution", "professional",
        "comprehensive", "pre-packaged", "manager", "team", "lead", "good", "at",
        "and", "or", "with", "for", "the", "a", "an", "of", "in", "to",
    ))
    lowered = text.lower()
    return {
        match.group(0)
        for match in re.finditer(r"[a-z]+[+#]*", lowered)
        if len(match.group(0)) > 1 and match.group(0) not in ignored
    }

def hybrid_search(query, k=10):
    """
    Fetch ``k*3`` nearest neighbours, then re-rank them by keyword overlap.

    Each candidate scores ``0.7 * 1/(1+distance)`` plus ``0.3`` for every skill
    token it shares with the raw query; the ``k`` best documents are returned.
    """
    semantic_query = clean_query(query)
    # Skills come from the raw query, not the cleaned one.
    wanted_skills = extract_skill_tokens(query)

    # Broad retrieval: a lower FAISS distance means a closer match.
    neighbours = vector_db.similarity_search_with_score(semantic_query, k=k*3)

    ranked = []
    for candidate, dist in neighbours:
        closeness = 1 / (1 + dist)
        shared = wanted_skills & extract_skill_tokens(candidate.page_content)
        # Unbounded lexical bonus: every shared token adds 0.3.
        ranked.append(((closeness * 0.7) + len(shared) * 0.3, candidate))

    ranked.sort(key=lambda pair: pair[0], reverse=True)
    return [candidate for _score, candidate in ranked[:k]]

# ---------------------------------------------------------
# 5. VALIDATION (Using Hybrid Search)
# ---------------------------------------------------------
def validate_engine():
    """
    Print Recall@10 of ``hybrid_search`` over the labelled sheet (sheet index 0).

    A row is a hit when the slug of its target URL appears among the slugs of
    the top 10 results. Returns None; returns early without output beyond the
    banner when the workbook does not exist.
    """
    print("\n" + "="*50)
    print("üß™ PHASE 1: Validation (Training Set)")
    print("="*50)
    
    excel_path = "/kaggle/input/shl-train-test/Gen_AI Dataset.xlsx"
    if not os.path.exists(excel_path): return

    train_df = pd.read_excel(excel_path, sheet_name=0)
    hits = 0
    # Use the first column whose header contains "query" / "url" (case-insensitive).
    # NOTE(review): raises IndexError if no such column exists.
    q_col = [c for c in train_df.columns if "query" in c.lower()][0]
    ans_col = [c for c in train_df.columns if "url" in c.lower()][0]
    
    for i, row in train_df.iterrows():
        query = row[q_col]
        target_slug = get_slug(row[ans_col])
        
        # USE HYBRID SEARCH HERE
        results = hybrid_search(query, k=10)
        found_slugs = [get_slug(d.metadata['url']) for d in results]
        
        if target_slug in found_slugs:
            hits += 1
            # Only the first two hits are echoed to keep the log short.
            if hits <= 2: print(f"   ‚úÖ Match! '{query[:20]}...' -> {target_slug}")
        else:
            # `i` is the DataFrame index label, so only misses on labels 0-2 are shown.
            if i < 3: # Debug first few misses
                print(f"   ‚ùå Missed: '{query[:20]}...' (Expected: {target_slug})")

    # NOTE(review): an empty sheet would make this division raise ZeroDivisionError.
    score = hits / len(train_df)
    print("-" * 40)
    print(f"üèÜ HYBRID VALIDATION SCORE (Recall@10): {score:.1%}")
    print("-" * 40)

# ---------------------------------------------------------
# 6. DEMO: AI CONSULTANT
# ---------------------------------------------------------
def ai_consultant(query):
    """
    Demo: retrieve 8 assessments for ``query`` and let Gemini summarise them.

    When ``model`` is unset, or the Gemini call / JSON parsing fails, the names
    of the first three retrieved assessments are printed instead.
    """
    print(f"\nüó£Ô∏è DEMO QUERY: '{query}'")

    # USE HYBRID SEARCH HERE
    docs = hybrid_search(query, k=8)

    bullet_lines = [f"- {d.metadata['name']} ({d.metadata['url']})" for d in docs]
    context = "\n".join(bullet_lines)

    def show_top_three():
        # Plain fallback listing used whenever no Gemini answer is available.
        for d in docs[:3]: print(f"- {d.metadata['name']}")

    if not model:
        show_top_three()
        return

    prompt = f"""
        You are an SHL Consultant. Query: {query}
        Assessments Found:
        {context}
        
        Return a JSON object with:
        "summary": "Why these fit (max 2 sentences)",
        "top_3": ["Name 1", "Name 2", "Name 3"]
        """
    try:
        response = model.generate_content(prompt)
        data = json.loads(response.text)
        print(f"\nü§ñ Gemini Says:\n{data['summary']}")
        print("Top Picks:", data['top_3'])
    except Exception as e:
        print(f"‚ö†Ô∏è Gemini Error: {e}")
        show_top_three()

# ---------------------------------------------------------
# 7. GENERATE SUBMISSION
# ---------------------------------------------------------
def generate_submission():
    """
    Write the top-5 recommended assessment URLs for every test query to CSV.

    Reads sheet index 1 of the dataset workbook, runs ``hybrid_search`` on the
    first column whose header contains "query", and writes one
    ``Query, Assessment_url`` row per recommendation to ``final_submission.csv``.

    Returns:
        None. The CSV is written to the current working directory.
    """
    print("\n" + "="*50)
    print("üìù PHASE 2: Generating Submission.csv")
    print("="*50)

    excel_path = "/kaggle/input/shl-train-test/Gen_AI Dataset.xlsx"
    # Single source of truth for the output name so the log cannot drift from
    # the file actually written.
    output_path = "final_submission.csv"
    test_df = pd.read_excel(excel_path, sheet_name=1)

    submission_rows = []
    q_col = [c for c in test_df.columns if "query" in c.lower()][0]

    for _, row in test_df.iterrows():
        query = row[q_col]

        # USE HYBRID SEARCH HERE (Top 5)
        results = hybrid_search(query, k=5)

        for doc in results:
            submission_rows.append({
                "Query": query,
                "Assessment_url": doc.metadata['url']
            })

    pd.DataFrame(submission_rows).to_csv(output_path, index=False)
    # The old message named 'submission.csv' although 'final_submission.csv' was written.
    print(f"‚úÖ Saved '{output_path}' ({len(submission_rows)} rows).")

# =========================================================
# RUN EVERYTHING
# =========================================================
if __name__ == "__main__":
    validate_engine()
    ai_consultant("I need a Python Team Lead who is good at mentoring.")
    generate_submission()

üîß Setting up environment...
‚úÖ Authenticated with Gemini API (models/gemini-2.5-flash).

üìÇ Loading SHL Catalog...
üß† Indexing 510 assessments...
üöÄ Retrieval Engine Ready!

üß™ PHASE 1: Validation (Training Set)
   ‚ùå Missed: 'I am hiring for Java...' (Expected: automata-fix-new)
   ‚úÖ Match! 'I am hiring for Java...' -> core-java-entry-level-new
   ‚úÖ Match! 'I am hiring for Java...' -> java-8-new
----------------------------------------
üèÜ HYBRID VALIDATION SCORE (Recall@10): 26.2%
----------------------------------------

üó£Ô∏è DEMO QUERY: 'I need a Python Team Lead who is good at mentoring.'

ü§ñ Gemini Says:
The Python assessment ensures technical expertise, while the Supervisor solutions thoroughly evaluate leadership, team management, and crucial mentoring skills vital for a Team Lead role.
Top Picks: ['Python (New)', 'Supervisor 7.0 Solution', 'Supervisor - Short Form']

üìù PHASE 2: Generating Submission.csv
‚úÖ Saved 'submission.csv' (45 rows).


In [5]:
# =============================================================================
# üèÜ SHL ASSESSMENT ENGINE - FINAL SUBMISSION SCRIPT
# =============================================================================

import os
import json
import re
import warnings
import pandas as pd
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

warnings.filterwarnings("ignore")

# ---------------------------------------------------------
# 1. CONFIGURATION
# ---------------------------------------------------------
# UPDATE THESE PATHS IF NEEDED
RAW_DATA_PATH = "/kaggle/input/data-3/raw_assessments.json"  # Path to your JSON
TEST_DATA_PATH = "/kaggle/input/shl-train-test/Gen_AI Dataset.xlsx" # Path to Excel

# ---------------------------------------------------------
# 2. HELPER FUNCTIONS
# ---------------------------------------------------------
def clean_query(text):
    """Lower-case ``text``, remove boilerplate hiring phrases and collapse whitespace.

    Non-string input yields an empty string.
    """
    if isinstance(text, str):
        boilerplate = (
            "i am hiring for", "looking to hire", "i need a", "we are looking for",
            "candidates who are", "proficient in", "good at", "can also",
            "collaborate with", "skills in", "experience in", "based on the jd",
            "recommend some assessment", "applications to screen",
        )
        result = text.lower()
        for filler in boilerplate:
            result = result.replace(filler, "")
        return " ".join(result.split())
    return ""

def extract_skill_tokens(text):
    """Return the set of lower-cased word tokens in ``text`` that are not stop words.

    A token is a run of letters optionally followed by '+' or '#' characters;
    one-character tokens are discarded. Non-string input yields an empty set.
    """
    if not isinstance(text, str):
        return set()

    ignored = frozenset((
        "hire", "hiring", "role", "junior", "senior", "developer", "engineer",
        "position", "candidate", "assessment", "test", "solution", "professional",
        "comprehensive", "pre-packaged", "manager", "team", "lead", "good", "at",
        "and", "or", "with", "for", "the", "a", "an", "of", "in", "to", "description",
        "job", "title", "type",
    ))
    kept = set()
    for match in re.finditer(r"[a-z]+[+#]*", text.lower()):
        word = match.group(0)
        if len(word) > 1 and word not in ignored:
            kept.add(word)
    return kept

# ---------------------------------------------------------
# 3. BUILD VECTOR DATABASE (WITH FIXES)
# ---------------------------------------------------------
print("üìÇ Loading SHL Catalog...")

try:
    with open(RAW_DATA_PATH, "r", encoding="utf-8") as f:
        raw_data = json.load(f)
except FileNotFoundError:
    print(f"‚ùå Error: Could not find {RAW_DATA_PATH}")
    # Re-raise instead of calling exit(): exit() targets the interpreter or
    # kernel rather than just this cell, whereas re-raising stops the cell
    # here with a clear traceback and leaves the session usable.
    raise

documents = []
skipped_count = 0

for item in raw_data:
    # `or` fallbacks also cover keys that are present but null, which would
    # otherwise raise TypeError in the membership tests below.
    name = item.get('name') or 'Unknown'
    test_types = item.get('test_type') or []

    # Filter 1: drop "Pre-packaged Job Solutions" entries entirely.
    if "Pre-packaged Job Solutions" in test_types:
        skipped_count += 1
        continue

    # Filter 2: drop entries named "...Solution..." unless their type mentions "Individual".
    if "Solution" in name and "Individual" not in str(test_types):
        skipped_count += 1
        continue

    # Text that gets embedded for semantic search.
    type_str = ", ".join(test_types) if isinstance(test_types, list) else str(test_types)
    page_content = f"Title: {name} | Type: {type_str} | Desc: {item.get('description', '')}"

    metadata = {
        "name": name,
        "url": item.get("url"),
        "test_type": test_types,
        # Tokenized once here so each search only needs a set intersection.
        "doc_tokens": list(extract_skill_tokens(page_content))
    }
    documents.append(Document(page_content=page_content, metadata=metadata))

print(f"‚úÖ Indexed {len(documents)} assessments (Skipped {skipped_count} invalid items).")

# Initialize Embeddings & Vector DB
print("üß† Building FAISS Index...")
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector_db = FAISS.from_documents(documents, embeddings)
print("üöÄ Retrieval Engine Ready!")

# ---------------------------------------------------------
# 4. HYBRID SEARCH ENGINE
# ---------------------------------------------------------
def hybrid_search(query, k=5):
    """
    Re-rank the 30 nearest neighbours of ``query`` by keyword overlap; return the top ``k``.

    Each candidate scores ``0.7 * 1/(1+distance)`` plus ``0.3`` for every skill
    token shared between the query and the document's stored ``doc_tokens``.
    """
    semantic_query = clean_query(query)
    wanted_skills = extract_skill_tokens(query)

    # Fixed-size candidate pool, independent of k.
    pool = vector_db.similarity_search_with_score(semantic_query, k=30)

    def blended_score(pair):
        candidate, dist = pair
        closeness = 1 / (1 + dist)
        shared = wanted_skills.intersection(set(candidate.metadata['doc_tokens']))
        return (closeness * 0.7) + len(shared) * 0.3

    # sorted() is stable, so ties keep their retrieval order.
    ordered = sorted(pool, key=blended_score, reverse=True)
    return [candidate for candidate, _dist in ordered[:k]]

# ---------------------------------------------------------
# 5. GENERATE SUBMISSION FILE
# ---------------------------------------------------------
# Top-level submission step: runs as soon as the cell executes. Any exception
# raised inside the try block is printed rather than propagated.
print("\nüìù Generating final_submission.csv...")

try:
    # Read the sheet at index 1 (the second sheet) of the workbook.
    # NOTE(review): presumably the unlabeled test set — confirm for your file.
    test_df = pd.read_excel(TEST_DATA_PATH, sheet_name=1) 
    
    # Use the first column whose header contains "query" (case-insensitive).
    q_cols = [c for c in test_df.columns if "query" in c.lower()]
    if not q_cols:
        raise ValueError("Could not find a 'Query' column in the Excel file.")
    q_col = q_cols[0]
    
    print(f"   -> Using column '{q_col}' as input.")

    # Accumulates one dict per (query, recommended URL) pair.
    submission_rows = []

    for index, row in test_df.iterrows():
        query_text = row[q_col]
        
        # Skip rows whose query cell is NaN or blank.
        if pd.isna(query_text) or str(query_text).strip() == "":
            continue
            
        # Top 5 recommendations for this query.
        results = hybrid_search(str(query_text), k=5)
        
        # One output row per recommendation; the original (unstringified)
        # query value is written to the "Query" column.
        for doc in results:
            submission_rows.append({
                "Query": query_text,
                "Assessment_url": doc.metadata['url']
            })

    # Build the frame in one go from the collected records.
    submission_df = pd.DataFrame(submission_rows)
    
    # Write without the index column so the header is exactly Query, Assessment_url.
    submission_df.to_csv("final_submission.csv", index=False)
    
    print(f"‚úÖ Success! Saved {len(submission_df)} rows to 'final_submission.csv'.")
    print("   -> Header Check: Query, Assessment_url")
    print(submission_df.head())

except Exception as e:
    # Best-effort: report the failure and let the rest of the cell continue.
    print(f"‚ùå Error generating submission: {e}")
# ---------------------------------------------------------
# 6. VALIDATION FUNCTION (Calculates Recall@10)
# ---------------------------------------------------------
def validate_engine():
    """
    Compute and print Recall@10 of ``hybrid_search`` over the labelled sheet.

    Sheet index 0 of ``TEST_DATA_PATH`` is read; the first columns whose
    headers contain "query" and "url" supply the query and the expected URL.
    A row is a hit when the expected URL and one of the top-10 retrieved URLs
    contain one another as substrings.

    Returns:
        None. Any exception is caught and printed rather than raised.
    """
    print("\n" + "="*50)
    print("üß™ PHASE 1: Validation (Calculating Recall@10)")
    print("="*50)

    try:
        # Load TRAINING Data (Sheet 0 contains the labeled data with answers)
        train_df = pd.read_excel(TEST_DATA_PATH, sheet_name=0)

        # Identify Columns
        q_cols = [c for c in train_df.columns if "query" in c.lower()]
        ans_cols = [c for c in train_df.columns if "url" in c.lower()]

        if not q_cols or not ans_cols:
            print("‚ùå Error: Could not find 'Query' or 'URL' columns in Sheet 0.")
            return

        q_col = q_cols[0]
        ans_col = ans_cols[0]

        print(f"   -> Testing {len(train_df)} labeled queries...")

        hits = 0

        for i, row in train_df.iterrows():
            # Stringify once so a non-string cell cannot break the slicing in
            # the miss message and abort the whole validation run.
            query = str(row[q_col])
            target_url = str(row[ans_col]).strip()

            # Run Search (Get Top 10)
            results = hybrid_search(query, k=10)

            # Normalise retrieved URLs and drop empty ones: "" is a substring
            # of every string and would otherwise always count as a hit.
            found_urls = [
                url for url in
                (str(d.metadata.get('url') or "").strip() for d in results)
                if url
            ]

            # Substring check in both directions tolerates slight URL variations.
            match = bool(target_url) and any(
                target_url in found or found in target_url for found in found_urls
            )

            if match:
                hits += 1
            elif i < 3:
                # Slug = last path segment whether or not the URL ends in "/"
                # (split('/')[-2] returned the parent segment for URLs without
                # a trailing slash).
                expected = target_url.rstrip('/').split('/')[-1]
                print(f"      ‚ùå Miss: {query[:30]}... (Expected: {expected})")

        # Guard the ratio so an empty sheet reports 0% instead of raising.
        recall_score = hits / len(train_df) if len(train_df) else 0.0
        print("-" * 40)
        print(f"üèÜ RECALL@10 SCORE: {recall_score:.1%}")
        print("-" * 40)

    except Exception as e:
        print(f"‚ùå Validation Error: {e}")

# =========================================================
# 7. EXECUTION BLOCK
# =========================================================
# Report Recall@10 on the labelled sheet.
validate_engine()

# Note: final_submission.csv was already written by section 5 above.
    

üìÇ Loading SHL Catalog...
‚úÖ Indexed 429 assessments (Skipped 81 invalid items).
üß† Building FAISS Index...
üöÄ Retrieval Engine Ready!

üìù Generating final_submission.csv...
   -> Using column 'Query' as input.
‚úÖ Success! Saved 45 rows to 'final_submission.csv'.
   -> Header Check: Query, Assessment_url
                                               Query  \
0  Looking to hire mid-level professionals who ar...   
1  Looking to hire mid-level professionals who ar...   
2  Looking to hire mid-level professionals who ar...   
3  Looking to hire mid-level professionals who ar...   
4  Looking to hire mid-level professionals who ar...   

                                      Assessment_url  
0  https://www.shl.com/products/product-catalog/v...  
1  https://www.shl.com/products/product-catalog/v...  
2  https://www.shl.com/products/product-catalog/v...  
3  https://www.shl.com/products/product-catalog/v...  
4  https://www.shl.com/products/product-catalog/v...  

üß™ PHASE 1: Va

In [6]:
# =============================================================================
# üèÜ SHL ASSESSMENT ENGINE - FINAL SUBMISSION SCRIPT
# =============================================================================

import os
import json
import re
import warnings
import pandas as pd
import google.generativeai as genai
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from kaggle_secrets import UserSecretsClient

warnings.filterwarnings("ignore")

# ---------------------------------------------------------
# 1. CONFIGURATION & AUTH
# ---------------------------------------------------------
# Kaggle input locations; update these if the datasets are mounted elsewhere.
RAW_DATA_PATH = "/kaggle/input/data-3/raw_assessments.json"
TEST_DATA_PATH = "/kaggle/input/shl-train-test/Gen_AI Dataset.xlsx"

print("üîß Setting up environment...")
# `model` stays None whenever Gemini cannot be configured, so later code can
# test `model is None` to decide whether AI summaries are available.
model = None
try:
    user_secrets = UserSecretsClient()
    GEMINI_API_KEY = user_secrets.get_secret("GEMINI_API_KEY")
    if GEMINI_API_KEY:
        genai.configure(api_key=GEMINI_API_KEY)
        model = genai.GenerativeModel("models/gemini-2.5-flash")
        print("‚úÖ Authenticated with Gemini API.")
    else:
        # An empty secret previously fell through without any message.
        print("‚ö†Ô∏è Gemini API Key not found (Skipping AI summaries, Core logic still works).")
except Exception as e:
    print("‚ö†Ô∏è Gemini API Key not found (Skipping AI summaries, Core logic still works).")
    # Surface the real cause (missing secret, network, bad model name, ...)
    # instead of assuming every failure means "key not found".
    print(f"   -> Reason: {type(e).__name__}: {e}")

# ---------------------------------------------------------
# 2. BUILD VECTOR DATABASE (WITH FILTERS)
# ---------------------------------------------------------
print("\nüìÇ Loading SHL Catalog...")

try:
    with open(RAW_DATA_PATH, "r", encoding="utf-8") as f:
        raw_data = json.load(f)
except FileNotFoundError:
    print(f"‚ùå Error: File not found at {RAW_DATA_PATH}")
    # Re-raise rather than call exit(): in a notebook exit() asks the kernel
    # to shut down, whereas an exception just stops this cell.
    raise

documents = []
skipped_count = 0

for item in raw_data:
    # `or` covers both a missing key and an explicit JSON null, which would
    # otherwise break the `in` checks below.
    name = item.get('name') or 'Unknown'
    test_types = item.get('test_type') or []

    # Filter 1: drop bundles tagged "Pre-packaged Job Solutions".
    if "Pre-packaged Job Solutions" in test_types:
        skipped_count += 1
        continue

    # Filter 2: drop items whose name contains "Solution" unless the test
    # type mentions "Individual".
    if "Solution" in name and "Individual" not in str(test_types):
        skipped_count += 1
        continue

    # Text that gets embedded: title, type list and description in one line.
    type_str = ", ".join(test_types) if isinstance(test_types, list) else str(test_types)
    page_content = f"Title: {name} | Type: {type_str} | Desc: {item.get('description') or ''}"

    metadata = {
        "name": name,
        "url": item.get("url"),
        "test_type": test_types,
        "description": item.get('description') or ''
    }
    documents.append(Document(page_content=page_content, metadata=metadata))

print(f"‚úÖ Indexed {len(documents)} assessments (Skipped {skipped_count} invalid items).")
print("üß† Building FAISS Index...")
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector_db = FAISS.from_documents(documents, embeddings)
print("üöÄ Retrieval Engine Ready!")

# ---------------------------------------------------------
# 3. HELPER FUNCTIONS
# ---------------------------------------------------------
def clean_query(text):
    """Lower-case ``text`` and strip boilerplate hiring phrases from it.

    Args:
        text: Raw user query. Anything that is not a ``str`` yields "".

    Returns:
        The lower-cased query with every phrase in the fluff list removed and
        runs of whitespace collapsed to single spaces.
    """
    if not isinstance(text, str):
        return ""
    fluff = [
        "i am hiring for", "looking to hire", "i need a", "we are looking for",
        "candidates who are", "proficient in", "good at", "can also",
        "collaborate with", "skills in", "experience in", "based on the jd",
        "recommend some assessment", "applications to screen"
    ]
    cleaned = text.lower()
    for phrase in fluff:
        # Match whole phrases only. Plain substring removal turned
        # "i need an accountant" into "n accountant" and "good attention"
        # into "tention". The optional trailing "s" keeps plural forms such
        # as "recommend some assessments" covered without leaving a stray "s".
        cleaned = re.sub(rf"\b{re.escape(phrase)}s?\b", " ", cleaned)
    return " ".join(cleaned.split())

def extract_skill_tokens(text):
    """Return the set of candidate skill keywords found in ``text``.

    Tokens are runs of ASCII letters optionally followed by ``+``/``#`` (so
    "c++" and "c#" survive); digits and hyphens act as separators. Stop words
    and single-character tokens are discarded.

    Args:
        text: Query or document text. Anything that is not a ``str`` yields
            an empty set.

    Returns:
        A ``set`` of lower-case tokens.
    """
    if not isinstance(text, str):
        return set()
    tokens = re.findall(r"[a-z]+[+#]*", text.lower())
    stop_words = {
        "hire", "hiring", "role", "junior", "senior", "developer", "engineer",
        "position", "candidate", "assessment", "test", "solution", "professional",
        # The tokenizer splits on "-", so "pre-packaged" arrives as "pre" and
        # "packaged"; the hyphenated stop word could never match.
        "comprehensive", "pre", "packaged", "manager", "team", "lead", "good", "at",
        "and", "or", "with", "for", "the", "a", "an", "of", "in", "to"
    }
    return {t for t in tokens if t not in stop_words and len(t) > 1}

def get_slug(url):
    """Return the lower-cased last path segment of ``url`` ("" for non-strings)."""
    if isinstance(url, str):
        trimmed = url.strip().rstrip('/')
        # Everything after the final "/" (or the whole string if there is none).
        _, _, tail = trimmed.rpartition('/')
        return tail.lower()
    return ""

# ---------------------------------------------------------
# 4. HYBRID SEARCH (GRANDMASTER EDITION)
# ---------------------------------------------------------
def hybrid_search(query, k=5, fetch_k=100, vector_weight=0.7, skill_bonus_per_match=0.3):
    """Retrieve assessments by vector similarity, then re-rank by keyword overlap.

    Args:
        query: Free-text hiring query.
        k: Number of documents to return.
        fetch_k: Size of the candidate pool pulled from the vector index
            before re-ranking. Raised to ``k`` when smaller, so asking for
            more results than the pool size is no longer silently truncated.
        vector_weight: Multiplier applied to the similarity score.
        skill_bonus_per_match: Score added for every token shared between the
            query and the document text.

    Returns:
        Up to ``k`` documents, best score first. Ties keep the order returned
        by the vector index because the sort is stable.
    """
    opt_query = clean_query(query)
    # Keywords come from the raw query, not the cleaned one.
    query_skills = extract_skill_tokens(query)

    raw_results = vector_db.similarity_search_with_score(opt_query, k=max(fetch_k, k))

    scored_candidates = []
    for doc, distance in raw_results:
        # Smaller distance means a higher score; a distance of 0 maps to 1.
        v_score = 1 / (1 + distance)

        doc_tokens = extract_skill_tokens(doc.page_content)
        overlap = len(query_skills.intersection(doc_tokens))
        skill_bonus = overlap * skill_bonus_per_match

        final_score = (v_score * vector_weight) + skill_bonus
        scored_candidates.append((final_score, doc))

    scored_candidates.sort(key=lambda pair: pair[0], reverse=True)
    return [doc for _, doc in scored_candidates[:k]]

# ---------------------------------------------------------
# 5. VALIDATION (SLUG MATCHING)
# ---------------------------------------------------------
def validate_engine():
    """Print the share of labelled rows whose expected slug is in the top 10.

    Reads sheet 0 of ``TEST_DATA_PATH``, picks the first column whose name
    contains "query" and the first containing "url", runs ``hybrid_search``
    for every row and counts exact slug matches. Every outcome is reported
    via ``print``; nothing is returned.
    """
    rule = "=" * 50
    print("\n" + rule)
    print("üß™ PHASE 1: Validation (Slug Matching)")
    print(rule)

    file_present = os.path.exists(TEST_DATA_PATH)
    if not file_present:
        print(f"‚ö†Ô∏è Validation skipped: File not found at {TEST_DATA_PATH}")
        return

    try:
        train_df = pd.read_excel(TEST_DATA_PATH, sheet_name=0)
        query_columns = list(filter(lambda col: "query" in col.lower(), train_df.columns))
        url_columns = list(filter(lambda col: "url" in col.lower(), train_df.columns))

        if not (query_columns and url_columns):
            print("‚ùå Validation Error: Column names mismatch.")
            return

        q_col = query_columns[0]
        ans_col = url_columns[0]
        hits = 0

        for i, row in train_df.iterrows():
            query = row[q_col]
            target_slug = get_slug(row[ans_col])

            # Slugs of the ten best recommendations for this query.
            top_docs = hybrid_search(str(query), k=10)
            retrieved = {get_slug(doc.metadata['url']) for doc in top_docs}

            if target_slug in retrieved:
                hits += 1
                continue
            if i < 3:
                # Only rows labelled 0-2 are echoed, to keep the output short.
                print(f"   ‚ùå Miss: '{str(query)[:20]}...' Expected: {target_slug}")

        divider = "-" * 40
        print(divider)
        print(f"üèÜ RECALL@10 SCORE: {(hits / len(train_df)):.1%}")
        print(divider)

    except Exception as e:
        print(f"‚ùå Validation Logic Error: {e}")

# ---------------------------------------------------------
# 6. GENERATE FINAL SUBMISSION CSV
# ---------------------------------------------------------
def generate_submission():
    """Write the top-5 recommendations per test query to ``final_submission-1.csv``.

    Reads sheet 1 of ``TEST_DATA_PATH``, uses the first column whose name
    contains "query", and emits one CSV row per (query, recommended URL)
    pair. Failures are reported via ``print`` rather than raised.
    """
    print("\n" + "="*50)
    print("üìù PHASE 2: Generating final_submission.csv")
    print("="*50)

    try:
        # Load Unlabeled Test Data (Sheet 1)
        test_df = pd.read_excel(TEST_DATA_PATH, sheet_name=1)
        q_cols = [c for c in test_df.columns if "query" in str(c).lower()]
        if not q_cols:
            # Previously surfaced as an opaque "list index out of range".
            raise ValueError("Query column not found in Sheet 1")
        q_col = q_cols[0]

        submission_rows = []

        for _, row in test_df.iterrows():
            query_text = row[q_col]

            # Skip blank or missing queries.
            if pd.isna(query_text) or str(query_text).strip() == "":
                continue

            # Get Top 5 Recommendations
            results = hybrid_search(str(query_text), k=5)

            for doc in results:
                submission_rows.append({
                    "Query": query_text,
                    "Assessment_url": doc.metadata['url']
                })

        # Explicit columns keep the header intact even when no rows were produced.
        output_filename = "final_submission-1.csv"
        df_out = pd.DataFrame(submission_rows, columns=["Query", "Assessment_url"])
        df_out.to_csv(output_filename, index=False)

        print(f"‚úÖ Success! Generated '{output_filename}' with {len(df_out)} rows.")
        # Report the columns actually written instead of a hard-coded claim.
        print(f"   -> Header verified: {', '.join(df_out.columns)}")
        print(df_out.head())

    except Exception as e:
        print(f"‚ùå Submission Generation Error: {e}")

# =========================================================
# MAIN EXECUTION
# =========================================================
if __name__ == "__main__":
    # Phase 1 prints the Recall@10 score for the labelled sheet; phase 2 then
    # writes the submission CSV for the unlabelled sheet.
    validate_engine()
    generate_submission()

üîß Setting up environment...
‚úÖ Authenticated with Gemini API.

üìÇ Loading SHL Catalog...
‚úÖ Indexed 429 assessments (Skipped 81 invalid items).
üß† Building FAISS Index...
üöÄ Retrieval Engine Ready!

üß™ PHASE 1: Validation (Slug Matching)
   ‚ùå Miss: 'I am hiring for Java...' Expected: automata-fix-new
----------------------------------------
üèÜ RECALL@10 SCORE: 23.1%
----------------------------------------

üìù PHASE 2: Generating final_submission.csv
‚úÖ Success! Generated 'final_submission-1.csv' with 45 rows.
   -> Header verified: Query, Assessment_url
                                               Query  \
0  Looking to hire mid-level professionals who ar...   
1  Looking to hire mid-level professionals who ar...   
2  Looking to hire mid-level professionals who ar...   
3  Looking to hire mid-level professionals who ar...   
4  Looking to hire mid-level professionals who ar...   

                                      Assessment_url  
0  https://www.shl.com/prod

In [7]:
# =============================================================================
# 🏆 SHL ASSESSMENT ENGINE - FINAL SUBMISSION SCRIPT
# =============================================================================

import os
import json
import re
import warnings
import pandas as pd
import google.generativeai as genai
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from kaggle_secrets import UserSecretsClient

warnings.filterwarnings("ignore")

# ---------------------------------------------------------
# 1. SETUP & AUTHENTICATION
# ---------------------------------------------------------
# Input locations on Kaggle.
RAW_DATA_PATH = "/kaggle/input/data-3/raw_assessments.json"
TEST_DATA_PATH = "/kaggle/input/shl-train-test/Gen_AI Dataset.xlsx"
print("üîß Setting up environment...")

# Remains None unless the Gemini client is configured successfully below.
model = None
try:
    user_secrets = UserSecretsClient()
    GEMINI_API_KEY = user_secrets.get_secret("GEMINI_API_KEY")
    if not GEMINI_API_KEY:
        print("‚ö†Ô∏è GEMINI_API_KEY secret is missing (AI summaries disabled).")
    else:
        genai.configure(api_key=GEMINI_API_KEY)
        model = genai.GenerativeModel("models/gemini-2.5-flash")
        print("‚úÖ Authenticated with Gemini API.")
except Exception as auth_error:
    # Any failure while fetching the secret or building the client is
    # reported and the run continues with `model` left as None.
    print(f"‚ö†Ô∏è Auth Warning: {auth_error}")

# ---------------------------------------------------------
# 2. BUILD VECTOR DATABASE (WITH CRITICAL FILTERS)
# ---------------------------------------------------------
print("\nüìÇ Loading SHL Catalog...")

try:
    with open(RAW_DATA_PATH, "r", encoding="utf-8") as f:
        raw_data = json.load(f)
except FileNotFoundError:
    print(f"‚ùå Error: File not found at {RAW_DATA_PATH}")
    # Re-raise rather than call exit(): in a notebook exit() asks the kernel
    # to shut down, whereas an exception just stops this cell.
    raise

documents = []
skipped_count = 0

for item in raw_data:
    # `or` covers both a missing key and an explicit JSON null, which would
    # otherwise break the `in` checks below.
    name = item.get('name') or 'Unknown'
    test_types = item.get('test_type') or []

    # Skip bundles tagged "Pre-packaged Job Solutions".
    if "Pre-packaged Job Solutions" in test_types:
        skipped_count += 1
        continue
    # Skip items whose name contains "Solution" unless the test type
    # mentions "Individual".
    if "Solution" in name and "Individual" not in str(test_types):
        skipped_count += 1
        continue

    # Text that gets embedded: title, type list and description in one line.
    type_str = ", ".join(test_types) if isinstance(test_types, list) else str(test_types)
    page_content = f"Title: {name} | Type: {type_str} | Desc: {item.get('description') or ''}"

    metadata = {
        "name": name,
        "url": item.get("url"),
        "test_type": test_types
    }
    documents.append(Document(page_content=page_content, metadata=metadata))

print(f"‚úÖ Indexed {len(documents)} assessments (Skipped {skipped_count} invalid items).")
print("üß† Building FAISS Index...")
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector_db = FAISS.from_documents(documents, embeddings)
print("üöÄ Retrieval Engine Ready!")

# ---------------------------------------------------------
# 3. HELPER FUNCTIONS
# ---------------------------------------------------------
def clean_query(text):
    """Normalise a hiring query before it is embedded.

    The text is lower-cased, stock phrases that carry no skill information
    are removed, and whitespace is collapsed.

    Args:
        text: Raw query; non-``str`` input returns "".

    Returns:
        The cleaned, lower-case query string.
    """
    if not isinstance(text, str):
        return ""
    fluff = [
        "i am hiring for", "looking to hire", "i need a", "we are looking for",
        "candidates who are", "proficient in", "good at", "can also",
        "collaborate with", "skills in", "experience in", "based on the jd",
        "recommend some assessment", "applications to screen"
    ]
    cleaned = text.lower()
    for phrase in fluff:
        # Anchor on word boundaries: substring replacement used to chew into
        # neighbouring words ("i need an analyst" -> "n analyst"). A trailing
        # optional "s" still removes plurals like "recommend some assessments".
        pattern = r"\b" + re.escape(phrase) + r"s?\b"
        cleaned = re.sub(pattern, " ", cleaned)
    return " ".join(cleaned.split())

def extract_skill_tokens(text):
    """Extract the distinct keyword tokens used for the overlap bonus.

    A token is one or more ASCII letters followed by any number of ``+`` or
    ``#`` characters. Stop words and one-character tokens are dropped.

    Args:
        text: Text to tokenise; non-``str`` input returns an empty set.

    Returns:
        Set of lower-case tokens.
    """
    if not isinstance(text, str):
        return set()
    text = text.lower()
    tokens = re.findall(r"[a-z]+[+#]*", text)
    stop_words = {
        "hire", "hiring", "role", "junior", "senior", "developer", "engineer",
        "position", "candidate", "assessment", "test", "solution", "professional",
        # "pre-packaged" is tokenised as "pre" + "packaged" (the regex has no
        # hyphen), so the two halves must be listed for the filter to work.
        "comprehensive", "pre", "packaged", "manager", "team", "lead", "good", "at",
        "and", "or", "with", "for", "the", "a", "an", "of", "in", "to"
    }
    return {tok for tok in tokens if len(tok) > 1 and tok not in stop_words}

def get_slug(url):
    """Lower-cased final path component of ``url``; "" when ``url`` is not a string."""
    if not isinstance(url, str):
        return ""
    # Surrounding whitespace and trailing slashes are ignored.
    segments = url.strip().rstrip('/').lower().split('/')
    return segments[-1]

# ---------------------------------------------------------
# 4. HYBRID SEARCH (IMPROVED RECALL)
# ---------------------------------------------------------
def hybrid_search(query, k=5, fetch_k=100, vector_weight=0.7, skill_bonus_per_match=0.3):
    """Vector retrieval followed by a keyword-overlap re-rank.

    Args:
        query: Free-text hiring query.
        k: Number of documents to return.
        fetch_k: How many nearest neighbours to pull from the index before
            re-ranking. Never lower than ``k``, so a large ``k`` is honoured
            instead of being capped at the pool size.
        vector_weight: Weight of the similarity component of the score.
        skill_bonus_per_match: Flat bonus added per token that the query and
            the document text have in common.

    Returns:
        At most ``k`` documents in descending score order (stable for ties).
    """
    opt_query = clean_query(query)
    # Tokens are taken from the original query text, not the cleaned one.
    query_skills = extract_skill_tokens(query)

    pool_size = max(fetch_k, k)
    raw_results = vector_db.similarity_search_with_score(opt_query, k=pool_size)

    scored_candidates = []
    for doc, distance in raw_results:
        # Turn the distance into a score that decreases as distance grows.
        v_score = 1 / (1 + distance)

        overlap = len(query_skills & extract_skill_tokens(doc.page_content))
        skill_bonus = overlap * skill_bonus_per_match

        scored_candidates.append(((v_score * vector_weight) + skill_bonus, doc))

    scored_candidates.sort(key=lambda pair: pair[0], reverse=True)
    return [doc for _, doc in scored_candidates[:k]]

# ---------------------------------------------------------
# 5. VALIDATION (SLUG MATCHING)
# ---------------------------------------------------------
def validate_engine():
    """Print the share of labelled rows whose expected slug is matched in the top 10.

    Reads sheet 0 of ``TEST_DATA_PATH``, uses the first column whose name
    contains "query" and the first containing "url", and counts a row as a
    hit when its expected slug and one of the ten recommended slugs contain
    one another. Every outcome is reported via ``print``; nothing is returned.
    """
    print("\n" + "="*50)
    print("üß™ PHASE 1: Validation (Slug Matching)")
    print("="*50)

    if not os.path.exists(TEST_DATA_PATH):
        print("‚ö†Ô∏è Validation skipped: Excel file not found.")
        return

    try:
        # Load TRAINING Sheet (Sheet 0)
        train_df = pd.read_excel(TEST_DATA_PATH, sheet_name=0)
        q_cols = [c for c in train_df.columns if "query" in str(c).lower()]
        ans_cols = [c for c in train_df.columns if "url" in str(c).lower()]

        if not q_cols or not ans_cols:
            print("‚ùå Validation Error: Columns mismatch.")
            return

        if train_df.empty:
            # Avoids reporting a bare "division by zero" further down.
            print("‚ö†Ô∏è Validation skipped: training sheet has no rows.")
            return

        q_col, ans_col = q_cols[0], ans_cols[0]
        hits = 0
        # The same query can appear on several rows (one per expected URL),
        # so search once per distinct query and reuse the slugs.
        slugs_by_query = {}

        for i, row in train_df.iterrows():
            query = row[q_col]
            # No str() here: a missing URL must become "" rather than the
            # literal slug "nan", which substring-matches e.g. "financial".
            target_slug = get_slug(row[ans_col])

            query_key = str(query)
            if query_key not in slugs_by_query:
                results = hybrid_search(query_key, k=10)
                # Drop empty slugs: "" is a substring of every string and
                # would make the fuzzy check below always succeed.
                slugs_by_query[query_key] = [
                    slug for slug in (get_slug(d.metadata['url']) for d in results) if slug
                ]
            found_slugs = slugs_by_query[query_key]

            # Fuzzy match: either slug may be a substring of the other.
            match = bool(target_slug) and any(
                target_slug in slug or slug in target_slug for slug in found_slugs
            )

            if match:
                hits += 1
            elif i < 3:
                print(f"   ‚ùå Miss: '{str(query)[:20]}...' Expected: {target_slug}")

        print("-" * 40)
        print(f"üèÜ RECALL@10 SCORE: {(hits / len(train_df)):.1%}")
        print("-" * 40)

    except Exception as e:
        print(f"‚ùå Validation Error: {e}")

# ---------------------------------------------------------
# 6. GENERATE FINAL SUBMISSION (TOP 10 RECOMMENDATIONS PER QUERY)
# ---------------------------------------------------------
def generate_submission():
    """Write the top-10 recommendations per test query to ``final_submission.csv``.

    Reads sheet 1 of ``TEST_DATA_PATH``, uses the first column whose name
    contains "query", and emits one CSV row per (query, recommended URL)
    pair with the columns ``Query`` and ``Assessment_url``. Failures are
    reported via ``print`` rather than raised.
    """
    print("\n" + "="*50)
    print("üìù PHASE 2: Generating final_submission.csv (Top 10)")
    print("="*50)

    try:
        # Load TEST Sheet (Sheet 1)
        test_df = pd.read_excel(TEST_DATA_PATH, sheet_name=1)

        # Dynamic Column Finding
        q_cols = [c for c in test_df.columns if "query" in str(c).lower()]
        if not q_cols:
            raise ValueError("Query column not found in Sheet 1")
        q_col = q_cols[0]

        submission_rows = []

        for _, row in test_df.iterrows():
            query_text = row[q_col]

            # Skip blank or missing queries.
            if pd.isna(query_text) or str(query_text).strip() == "":
                continue

            # Ten recommendations per query.
            results = hybrid_search(str(query_text), k=10)

            for doc in results:
                submission_rows.append({
                    "Query": query_text,
                    "Assessment_url": doc.metadata['url']
                })

        # Passing `columns` fixes the column order and keeps the header even
        # when no rows were produced; selecting the columns afterwards raised
        # KeyError on an empty frame, so no file was written at all.
        output_file = "final_submission.csv"
        df_out = pd.DataFrame(submission_rows, columns=["Query", "Assessment_url"])
        df_out.to_csv(output_file, index=False)

        print(f"‚úÖ Success! Generated '{output_file}' with {len(df_out)} rows.")
        print("   -> Preview:")
        print(df_out.head())

    except Exception as e:
        print(f"‚ùå Submission Error: {e}")

# =========================================================
# MAIN EXECUTION
# =========================================================
if __name__ == "__main__":
    # Print the validation score for the labelled sheet, then write the
    # top-10 submission CSV for the unlabelled sheet.
    validate_engine()
    generate_submission()

üîß Setting up environment...
‚úÖ Authenticated with Gemini API.

üìÇ Loading SHL Catalog...
‚úÖ Indexed 429 assessments (Skipped 81 invalid items).
üß† Building FAISS Index...
üöÄ Retrieval Engine Ready!

üß™ PHASE 1: Validation (Slug Matching)
   ‚ùå Miss: 'I am hiring for Java...' Expected: automata-fix-new
----------------------------------------
üèÜ RECALL@10 SCORE: 24.6%
----------------------------------------

üìù PHASE 2: Generating final_submission.csv (Top 10)
‚úÖ Success! Generated 'final_submission.csv' with 90 rows.
   -> Preview:
                                               Query  \
0  Looking to hire mid-level professionals who ar...   
1  Looking to hire mid-level professionals who ar...   
2  Looking to hire mid-level professionals who ar...   
3  Looking to hire mid-level professionals who ar...   
4  Looking to hire mid-level professionals who ar...   

                                      Assessment_url  
0  https://www.shl.com/products/product-catalog/v.