In [1]:
%pip install faiss-cpu




In [2]:
%pip install langchain_community

Note: you may need to restart the kernel to use updated packages.


In [3]:
%pip install groq

Note: you may need to restart the kernel to use updated packages.


In [4]:
%pip install langchain-groq

Note: you may need to restart the kernel to use updated packages.


In [None]:


from firecrawl import FirecrawlApp
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin
import time
import pandas as pd
import os  # Added for file existence check

# The Firecrawl API key is read from the environment rather than embedded in
# the notebook, where it would leak through version control and shared copies.
# Any key that was ever stored in this file should be treated as exposed and
# rotated.
firecrawl_api_key = os.environ.get("FIRECRAWL_API_KEY")
if not firecrawl_api_key:
    raise RuntimeError(
        "Set the FIRECRAWL_API_KEY environment variable before running the scraper."
    )

app = FirecrawlApp(api_key=firecrawl_api_key)
base_url = 'https://www.shl.com'  # Used to resolve relative product links
csv_filename = 'shl_product_catalog120_243.csv'  # Define CSV filename once

# Generate paginated URLs from start=12 to start=372 with step 12
start_values = range(12, 373, 12)  # 12, 24, 36,..., 372
urls = [f'https://www.shl.com/solutions/products/product-catalog/?start={start}&type=1&type=1' for start in start_values]

# Column names for the CSV; filled in from the first table encountered and then
# reused for every later page.
headers = None

# Crawl every catalogue page in turn. Rows collected from a page are written to
# the CSV before the next page starts, so an error on a later page does not
# discard pages that were already saved.
for page_num, url in enumerate(urls, 1):
    print(f"\n{'='*40}\nProcessing Page {page_num} ({url})\n{'='*40}")
    page_data = []  # Stores data for current page only
    
    try:
        # Scrape catalog page
        # NOTE(review): assumes scrape_url returns a mapping with an 'html'
        # key — confirm against the installed firecrawl version.
        catalog_result = app.scrape_url(url, params={'formats': ['html']})
        catalog_soup = BeautifulSoup(catalog_result['html'], 'html.parser')

        # Process all tables on the page
        for table in catalog_soup.find_all('table'):
            # Extract headers only once: the <th> texts of the first table seen,
            # followed by the four detail fields scraped from each product page.
            if headers is None:
                headers = [th.get_text(strip=True) for th in table.find_all('th')]
                headers += ['Description', 'Job Level', 'Language', 'Assessment Length']

            # Process table rows
            for row in table.find_all('tr')[1:]:  # Skip header row
                cols = row.find_all('td')
                try:
                    # Extract basic info. Cells are read by position; a row with
                    # fewer than four <td> cells raises IndexError, which the
                    # row-level except below reports and skips.
                    solution_link = cols[0].find('a')
                    solution_name = solution_link.get_text(strip=True) if solution_link else 'N/A'
                    solution_url = urljoin(base_url, solution_link['href']) if solution_link else ''
                    
                    # 'Yes' only when the cell's first <span> carries the '-yes' CSS class.
                    remote_testing = 'Yes' if cols[1].find('span') and '-yes' in cols[1].span.get('class', []) else 'No'
                    adaptive_irt = 'Yes' if cols[2].find('span') and '-yes' in cols[2].span.get('class', []) else 'No'
                    # Text of every <span> in the fourth cell, joined with spaces.
                    test_types = ' '.join([span.get_text(strip=True) for span in cols[3].find_all('span')])

                    # Initialize default details; these values are kept when the
                    # product page cannot be scraped or lacks the field.
                    details = {
                        'Description': 'Not found',
                        'Job Level': 'Not found',
                        'Language': 'Not found',
                        'Assessment Length': 'Not found'
                    }

                    # Scrape detailed product page
                    if solution_url:
                        try:
                            product_result = app.scrape_url(solution_url, params={'formats': ['html']})
                            product_soup = BeautifulSoup(product_result['html'], 'html.parser')
                            
                            # Each detail row pairs an <h4> label with a <p> value.
                            for div in product_soup.find_all('div', class_='product-catalogue-training-calendar__row'):
                                h4 = div.find('h4')
                                p_tag = div.find('p')
                                if h4 and p_tag:
                                    key = h4.get_text(strip=True)
                                    # Drop trailing commas/spaces left at the end of the value.
                                    value = p_tag.get_text(strip=True).rstrip(', ')
                                    
                                    # Match on a substring of the label, first match wins.
                                    if 'description' in key.lower():
                                        details['Description'] = value
                                    elif 'job level' in key.lower():
                                        details['Job Level'] = value
                                    elif 'language' in key.lower():
                                        details['Language'] = value
                                    elif 'assessment length' in key.lower():
                                        # Keep only the first run of digits; without
                                        # digits the 'Not found' default remains.
                                        if minutes := re.search(r'\d+', value):
                                            details['Assessment Length'] = f"{minutes.group()} minutes"
                            
                            # Pause 10 seconds after each successfully scraped product
                            # page (skipped when the scrape raises) — presumably to
                            # stay within the API's rate limits.
                            time.sleep(10)
                        except Exception as e:
                            # Best effort: report the failure and keep the defaults.
                            print(f"Error scraping {solution_url}: {str(e)}")

                    # Add row data to page_data (eight values per row; the first
                    # is stored as a Markdown link when a URL is available).
                    page_data.append([
                        f"[{solution_name}]({solution_url})" if solution_url else solution_name,
                        remote_testing,
                        adaptive_irt,
                        test_types,
                        details['Description'],
                        details['Job Level'],
                        details['Language'],
                        details['Assessment Length']
                    ])

                except Exception as e:
                    print(f"Error processing row: {str(e)}")
                    continue

        # Save page data immediately after processing
        if headers and page_data:
            # NOTE(review): this raises (and is reported by the page-level except
            # below) if `headers` does not contain exactly eight names.
            df_page = pd.DataFrame(page_data, columns=headers)
            
            # Write to CSV with appropriate mode: the header row is written only
            # when the file does not exist yet; an existing file, including one
            # left by a previous run, is appended to.
            if not os.path.isfile(csv_filename):
                df_page.to_csv(csv_filename, index=False)
                print(f"Created new CSV with {len(df_page)} records from page {page_num}")
            else:
                df_page.to_csv(csv_filename, mode='a', header=False, index=False)
                print(f"Appended {len(df_page)} records from page {page_num}")

    except Exception as e:
        # A failed page is reported and skipped; the crawl continues.
        print(f"Error processing page {url}: {str(e)}")
        continue

print("\nScraping completed. Final data saved in:", csv_filename)

In [None]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import pandas as pd

# ======================
# Precompute Embeddings
# ======================
# Module-level sentence-embedding model. create_faiss_index and
# recommend_with_faiss below both call `model.encode` on this object.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

def create_faiss_index(df: pd.DataFrame):
    """Embed the ``Skills_JobLevel`` column and load the vectors into a FAISS index.

    Args:
        df: Frame containing a ``Skills_JobLevel`` column of text to embed.

    Returns:
        A ``faiss.IndexFlatL2`` holding one float32 vector per row of ``df``,
        in row order.
    """
    texts = df['Skills_JobLevel'].tolist()

    # Encode with the module-level model; FAISS is fed float32 vectors.
    vectors = model.encode(texts, show_progress_bar=True).astype('float32')

    # Flat (exhaustive) L2 index sized to the embedding width.
    l2_index = faiss.IndexFlatL2(vectors.shape[1])
    l2_index.add(vectors)
    return l2_index

# ======================
# Enhanced Recommendation
# ======================
def recommend_with_faiss(input_skills: str, df: pd.DataFrame, index):
    """Return the ten rows of ``df`` closest to ``input_skills`` in a FAISS index.

    Args:
        input_skills: Free-text query, embedded with the module-level ``model``.
        df: Frame whose row positions correspond to the vectors in ``index``.
        index: FAISS index to search (built by ``create_faiss_index``).

    Returns:
        A copy of up to ten rows of ``df`` with an added ``similarity`` column,
        highest similarity first. The result is empty when the index is empty.
    """
    # Generate query embedding
    query_embedding = model.encode([input_skills]).astype('float32')

    # Never ask for more neighbours than the index holds: FAISS fills missing
    # hits with label -1, which `iloc` would silently read as "the last row".
    k = min(50, index.ntotal)
    if k == 0:
        empty = df.iloc[:0].copy()
        empty['similarity'] = pd.Series(dtype='float32')
        return empty

    # Search FAISS index
    distances, indices = index.search(query_embedding, k=k)

    # Defensive: discard any padding labels that are still returned.
    found = indices[0] >= 0
    row_positions = indices[0][found]
    row_distances = distances[0][found]

    # Get top matches from original DF
    results = df.iloc[row_positions].copy()
    # Convert distance to a score where larger means closer. The /4 scaling is
    # kept from the original; NOTE(review): it only bounds the score to [0, 1]
    # if the embeddings are unit length, and create_faiss_index does not
    # normalise them — the ranking is unaffected either way.
    results['similarity'] = 1 - (row_distances / 4)

    return results.nlargest(10, 'similarity')

import json
import os
from langchain_groq import ChatGroq
from langchain.schema import HumanMessage
from typing import List, Optional, Dict, Any

# The Groq API key must be supplied through the GROQ_API_KEY environment
# variable (this is the variable the original code populated) and is never
# stored in the notebook. Any key that was ever saved in this file should be
# treated as exposed and rotated.
if not os.environ.get("GROQ_API_KEY"):
    raise RuntimeError(
        "Set the GROQ_API_KEY environment variable before creating the Groq chat model."
    )

class Supervisor:
    """Named coordinator holding a prompt template and the shared chat model."""

    def __init__(self, supervisor_name: str, supervisor_prompt: str, model: Any):
        """Store the supervisor's name, its prompt template and the model object."""
        self.name, self.prompt_template, self.model = (
            supervisor_name,
            supervisor_prompt,
            model,
        )

    def format_prompt(self, team_members: List[str]) -> str:
        """Fill the template's ``{team_members}`` slot with a comma-separated roster."""
        roster = ", ".join(team_members)
        return self.prompt_template.format(team_members=roster)

class Worker:
    """Specialist that sends one prompt to the supervisor's model and tidies the reply.

    The worker's ``name`` selects the post-processing applied in
    ``clean_response``, so it must match one of the literals checked there to
    get anything other than the default clean-up.
    """

    def __init__(self, worker_name: str, worker_prompt: str, supervisor: "Supervisor", tools: Optional[List[Any]] = None):
        """Store the worker's name, prompt, owning supervisor and optional tools."""
        self.name = worker_name
        self.prompt_template = worker_prompt
        self.supervisor = supervisor
        self.tools = tools or []

    def clean_response(self, response: str) -> Any:
        """Reduce a raw model reply to the value this worker is meant to return.

        Args:
            response: Raw text returned by the model.

        Returns:
            A string for most workers; a list of ``'yes'``/``'no'`` strings for
            the "Testing Type Identifier" worker. An empty reply yields an empty
            string for the "Time Limit Identifier" worker instead of raising.
        """
        # Keep only what follows the last </think> tag (reasoning-model output).
        if '</think>' in response:
            response = response.split('</think>')[-1]

        # Remove markdown bold markers, then keep the text after the last colon
        # (drops prefixes such as "Output:"). Note this also truncates answers
        # that themselves contain a colon.
        response = response.replace('**', '').strip()
        response = response.split(':')[-1].strip()

        # Custom cleaning per worker type
        if self.name == 'TestTypeAnalyst':
            # Keep only upper-case letters and commas, e.g. "K, A" -> "K,A".
            return ''.join(c for c in response if c.isupper() or c == ',')
        elif self.name == 'Skill Extractor':
            # Strip list numbering ("1. Python" -> "Python") line by line.
            return '\n'.join(s.split('. ')[-1] for s in response.split('\n'))
        elif self.name == 'Time Limit Identifier':
            # First whitespace-separated token only; an empty reply previously
            # raised IndexError here.
            tokens = response.split()
            return tokens[0] if tokens else ''
        elif self.name == 'Testing Type Identifier':
            # "[yes,no]" -> ['yes', 'no']; anything unexpected becomes 'no'.
            response = response.strip('[]')
            parts = [part.strip().lower() for part in response.split(',')]
            return [part if part in ('yes', 'no') else 'no' for part in parts]

        # Default: first line, without surrounding double quotes.
        return response.split('\n')[0].strip('"').strip()

    def process_input(self, user_input: str) -> Any:
        """Send the worker prompt plus ``user_input`` to the model and clean the reply.

        Args:
            user_input: Text appended to the worker's prompt template.

        Returns:
            Whatever ``clean_response`` produces for this worker (a string, or
            a list for the "Testing Type Identifier" worker).
        """
        prompt = f"{self.prompt_template}\n\nUser Input: {user_input}"
        messages = [HumanMessage(content=prompt)]
        response = self.supervisor.model.invoke(messages)
        return self.clean_response(response.content)

# Initialize Groq Chat Model
# This single model instance is reached by every worker through
# `supervisor.model` (see Worker.process_input).
groq_model = ChatGroq(
    model_name="deepseek-r1-distill-llama-70b",
    temperature=0,
    streaming=True
)

# Initialize the Supervisor that owns the shared model. No worker list is
# passed here: the Supervisor stores only a name, a prompt template and the
# model, and each Worker holds a reference back to it.
supervisor = Supervisor(
    supervisor_name="AssessmentCoordinator",
    supervisor_prompt="You manage these specialists: {team_members}. Coordinate assessment creation workflow. Select next worker strategically. FINISH when complete.",
    model=groq_model
)

# Initialize 6 Specialist Workers
# Each worker pairs a fixed prompt with the shared supervisor. The worker_name
# values are matched verbatim by Worker.clean_response (and by extract_skills,
# which looks up 'Skill Extractor'), so renaming a worker changes how its
# replies are post-processed.
workers = [
    # Maps a request to test-type letter codes; reply is reduced to capitals and commas.
    Worker(
        worker_name="TestTypeAnalyst",
        worker_prompt='''You are an AI classifier that maps user inputs to test type codes from this taxonomy:

Test Types (Code: Description)

A: Ability & Aptitude (cognitive skills, problem-solving)

B: Biodata & Situational Judgement (past behavior, hypothetical scenarios)

C: Competencies (job-specific skills like leadership)

D: Development & 360 (growth feedback, multi-rater reviews)

E: Assessment Exercises (role-plays, case studies)

K: Knowledge & Skills (technical/domain expertise)

P: Personality & Behavior (traits, motivations)

S: Simulations (realistic job-task replicas)

Rules:

Return only the relevant letter codes (e.g., K, A,S).

Use commas for multiple matches (no spaces).

Prioritize specificity (e.g., "Python coding test" → K, not A).

Default to B for biographical/historical scenarios.

Examples:

Input: "Quiz on Java and cloud architecture" → K

Input: "Test how someone leads a team during a crisis" → C,S

Input: "Evaluate agreeableness and reaction to feedback" → P,D

Output Format:
Return only the letter code(s) as a comma-separated string (e.g., P or B,S).

''',
        supervisor=supervisor
    ),
    # Extracts hard and soft skills; list numbering is stripped from the reply.
    Worker(
        worker_name="Skill Extractor",
        worker_prompt='''You are a skill extractor for assessment design. Identify both hard and soft skills explicitly mentioned in the user’s input that are relevant to the test’s purpose.

Rules:
Focus: Extract hard skills (technical) and soft skills (non-technical):

✅ Hard Skills:

Tools: Python, SQL, AWS

Frameworks: TensorFlow, React

Domains: cybersecurity, CAD, data analysis

✅ Soft Skills:

communication, leadership, teamwork, problem-solving

🚫 Exclude:

Generic terms: "experience," "knowledge," "proficiency"

Job roles: "developer," "engineer"

Test Type Context: Use the test type code (A/B/C/D/E/K/P/S) to refine extraction:

Example: Test type K (Knowledge & Skills) → Prioritize hard skills like Python.

Example: Test type C (Competencies) → Include both hard skills (CAD) and soft skills (leadership).

Example: Test type P (Personality) → Extract only soft skills if mentioned (e.g., adaptability).

Normalization:

Standardize terms: JS → JavaScript, ML → machine learning.

Merge equivalents: CAD → Computer-Aided Design.

Output:

Return a comma-separated list (e.g., Python, leadership, CAD).

If no skills are found, return [].

Examples:
Input	Test Type	Output
“Test Python coding and teamwork.”	K	Python, teamwork
“Assess problem-solving and cloud architecture.”	A	problem-solving, cloud architecture
“Evaluate leadership and CAD proficiency.”	C	leadership, CAD
“Behavioral test focusing on communication.”	P	communication
“No skills mentioned.”	S	[]''',
        supervisor=supervisor
    ),
    # Classifies the target job level; uses the default reply clean-up.
    Worker(
        worker_name="Job Level Identifier",
        worker_prompt='''You are an AI assistant tasked with identifying the job level for which a test is intended. Given input that may include job titles, responsibilities, or descriptions, determine the most appropriate job level from the following list:

Director

Entry Level

Executive

Frontline Manager

General Population

Graduate

Manager

Mid-Professional

Professional

Professional Individual Contributor

Supervisor

Use contextual clues in the input to make an accurate classification. Respond only with the job level.
 ''',
        supervisor=supervisor
    ),
    # Detects requested natural languages; uses the default reply clean-up.
    Worker(
        worker_name="Language Preference Identifier",
        worker_prompt='''You are a language detector for assessments. Identify spoken (natural) languages (e.g., English, Mandarin, Spanish) explicitly mentioned in the user’s input.

Rules:

Focus:

Extract only natural languages (e.g., "French", "Japanese").

Ignore programming languages (Python, Java), tools (SQL), or frameworks (React).

Defaults:

Return English if no spoken language is mentioned.

For multi-language requests (e.g., "English and Spanish"), return a comma-separated list: English, Spanish.

Output:

Use full language names (e.g., "German" not "Deutsch").

Case-insensitive (e.g., "spanish" → Spanish).

Examples:

Input: "Test must be in Portuguese." → Output: Portuguese

Input: "Python coding test with instructions in Arabic." → Output: Arabic

Input: "Math exam for Spanish-speaking students." → Output: Spanish

Input: "Timed Java assessment." → Output: English

Respond only with the language name(s). No explanations.

'''
,
        supervisor=supervisor
    ),
    # Extracts explicit durations; only the first token of the reply is kept.
    Worker(
        worker_name="Time Limit Identifier",
        worker_prompt='''You are an AI that extracts explicit test durations from user input.

Rules
Extract:

Return exact phrases with a number + time unit (e.g., 90 minutes, 2.5 hrs, no more than 45 mins).

Include comparative phrasing (e.g., under 1 hour, at least 20 minutes).

Ignore:

Deadlines (e.g., submit by Friday).

Experience durations (e.g., 5 years of experience).

Vague terms (e.g., timed test, time-sensitive).

Output:

For valid durations: Return them as a comma-separated list (e.g., 1 hour, 30 mins).

For no valid durations: Return no time specified.

Examples
Input	Output
"Complete the test in 45 mins."	45 mins
"Section A: 1 hour; Section B: 30 mins."	1 hour, 30 mins
"Timed exam with no duration mentioned."	no time specified
"Submit by 5 PM and allow up to 2 hrs."	2 hrs
"Requires 3+ years of experience."	no time specified
Strict Constraints
Never return explanations, formatting, or placeholders.

Only return extracted durations or no time specified.

''',
        supervisor=supervisor
    ),
    # Flags remote / adaptive testing; reply is parsed into a list of 'yes'/'no'.
    Worker(
        worker_name="Testing Type Identifier",
        worker_prompt='''You are an AI classifier that detects mentions of remote testing or adaptive testing/IRT in user inputs and returns a structured response.

Rules
Detection Logic:

Remote Testing: yes if the exact phrase "remote testing" is present.

Adaptive Testing: yes if "adaptive testing" or "IRT" (case-insensitive) is present.

Default to no for missing terms.

Output Format:

Return [yes,yes] if both terms are present.

Return [yes,no] if only remote testing is mentioned.

Return [no,yes] if only adaptive testing/IRT is mentioned.

Return [no,no] if neither is mentioned.

Constraints:

NO explanations, NO deviations from the format.

Exact matches only (e.g., "remote" ≠ "remote testing").

Examples
Input	Output
"Conduct remote testing with IRT."	[yes,yes]
"Use adaptive testing."	[no,yes]
"Remote testing required."	[yes,no]
"Timed onsite exam."	[no,no]
Command:
Return ONLY the structured list ([yes,yes], [no,yes], etc.). No other text!''',
        supervisor=supervisor
    )
]



# Load the scraped catalogue that the processing steps below enrich.
# NOTE(review): the scraper earlier in this file writes to
# 'shl_product_catalog120_243.csv', not to this path — confirm that this file
# is the intended (presumably merged) input.
df=pd.read_csv('/content/shl_product_catalog.csv')

# ======================
# Data Processing Functions
# ======================

def extract_skills(test_name: str, description: str, test_type: str) -> str:
    """Ask the 'Skill Extractor' worker for the skills covered by one test.

    Args:
        test_name: Value sent under the "Individual Test Solutions:" label.
        description: Value sent under the "Description:" label.
        test_type: Value sent under the "Test Type:" label.

    Returns:
        The worker's cleaned reply.

    Raises:
        StopIteration: If ``workers`` contains no worker named 'Skill Extractor'.
    """
    # Look the worker up by name among the module-level workers.
    extractor = next(filter(lambda w: w.name == 'Skill Extractor', workers))

    # Three labelled lines, in the order the original prompt used.
    request = "\n".join((
        f"Individual Test Solutions: {test_name}",
        f"Description: {description}",
        f"Test Type: {test_type}",
    ))
    return extractor.process_input(request)

def combine_skills_joblevel(skills: str, job_level: str) -> str:
    """Combine skills and job level into one text field.

    Args:
        skills: Raw skills string from extraction; a literal ``[]`` marks
            "no skills" and is removed.
        job_level: Job level from the original data. ``None`` and NaN (what
            pandas yields for an empty CSV cell) are treated as missing.

    Returns:
        ``"<skills> , <job level>"`` when both parts are present, otherwise
        whichever part is present, or an empty string when neither is.
    """
    def _as_text(value) -> str:
        # NaN is the only float that differs from itself; map it and None to ''
        # so they never reach the output as the text "nan" / "None".
        if value is None or (isinstance(value, float) and value != value):
            return ''
        return str(value)

    cleaned_skills = _as_text(skills).replace('[]', '').strip()
    level = _as_text(job_level)
    if cleaned_skills and level:
        return f"{cleaned_skills} , {level}"
    return cleaned_skills or level

# ======================
# Main Data Processing
# ======================

if __name__ == "__main__":
    # Work on a copy. This block originally overwrote `df` and dropped the
    # 'Description' and 'Job Level' columns, but the second processing block
    # later in this file reads those columns from `df` again and would fail
    # with a KeyError. Leaving `df` untouched keeps that block runnable.
    preview_df = df.copy()

    # ---- Skill Extraction ----
    # One call to the Skill Extractor worker (an LLM request) per row.
    preview_df['Skills'] = preview_df.apply(
        lambda row: extract_skills(
            test_name=row['Individual Test Solutions'],
            description=row['Description'],
            test_type=row['Test Type']
        ),
        axis=1
    )

    # ---- Skill-Job Level Combination ----
    preview_df['Skills_JobLevel'] = preview_df.apply(
        lambda row: combine_skills_joblevel(
            skills=row['Skills'],
            job_level=row['Job Level']
        ),
        axis=1
    )

    # ---- Data Cleaning ----
    # Convert assessment length to numeric: first run of digits, 0 when absent.
    # The raw-string pattern avoids the invalid '\d' escape warning, and
    # astype(str) keeps `.str` usable even if the column was parsed as numbers.
    preview_df['Assessment Length'] = (
        pd.to_numeric(
            preview_df['Assessment Length'].astype(str).str.extract(r'(\d+)', expand=False),
            errors='coerce'
        )
        .fillna(0)
        .astype(int)
    )

    # ---- Column Management ----
    # Remove intermediate/original columns
    preview_df = preview_df.drop(columns=['Description', 'Job Level', 'Skills'])

    # Define final column order
    ordered_columns = [
        'Individual Test Solutions',
        'Remote Testing',
        'Adaptive/IRT',
        'Test Type',
        'Skills_JobLevel',
        'Language',
        'Assessment Length'
    ]

    # Reorganize dataframe
    preview_df = preview_df[ordered_columns]

    # ======================
    # Final Output
    # ======================
    print("\nProcessed DataFrame Structure:")
    print(preview_df.head(3).to_markdown(index=False))

# ======================
# Data Processing Functions
# ======================

def extract_skills(test_name: str, description: str, test_type: str) -> str:
    """Ask the 'Skill Extractor' worker for the skills covered by one test.

    The three arguments are sent to the worker as labelled lines
    ("Individual Test Solutions:", "Description:", "Test Type:") and the
    worker's cleaned reply is returned. Raises StopIteration when ``workers``
    has no worker named 'Skill Extractor'.
    """
    # Look the worker up by name among the module-level workers.
    extractor = next(filter(lambda w: w.name == 'Skill Extractor', workers))

    request = "\n".join((
        f"Individual Test Solutions: {test_name}",
        f"Description: {description}",
        f"Test Type: {test_type}",
    ))
    return extractor.process_input(request)

def combine_skills_joblevel(skills: str, job_level: str) -> str:
    """Combine skills and job level into one text field.

    Args:
        skills: Raw skills string from extraction; a literal ``[]`` marks
            "no skills" and is removed.
        job_level: Job level from the original data. ``None`` and NaN (what
            pandas yields for an empty CSV cell) are treated as missing.

    Returns:
        ``"<skills> , <job level>"`` when both parts are present, otherwise
        whichever part is present, or an empty string when neither is.
    """
    def _as_text(value) -> str:
        # NaN is the only float that differs from itself; map it and None to ''
        # so they never reach the output as the text "nan" / "None".
        if value is None or (isinstance(value, float) and value != value):
            return ''
        return str(value)

    cleaned_skills = _as_text(skills).replace('[]', '').strip()
    level = _as_text(job_level)
    if cleaned_skills and level:
        return f"{cleaned_skills} , {level}"
    return cleaned_skills or level

# ======================
# Main Data Processing
# ======================

if __name__ == "__main__":
    # Every step below is written so it can run on either the raw catalogue or
    # a frame that was already processed: the earlier processing block in this
    # file replaces `df` with its processed form (no 'Description' or
    # 'Job Level' columns), which previously made this block fail with a
    # KeyError.
    if 'Skills_JobLevel' not in df.columns:
        # ---- Skill Extraction ----
        # One call to the Skill Extractor worker (an LLM request) per row.
        df['Skills'] = df.apply(
            lambda row: extract_skills(
                test_name=row['Individual Test Solutions'],
                description=row['Description'],
                test_type=row['Test Type']
            ),
            axis=1
        )

        # ---- Skill-Job Level Combination ----
        df['Skills_JobLevel'] = df.apply(
            lambda row: combine_skills_joblevel(
                skills=row['Skills'],
                job_level=row['Job Level']
            ),
            axis=1
        )

    # ---- Data Cleaning ----
    # First run of digits as an integer, 0 when absent. astype(str) lets this
    # work on an already-numeric column too, and the raw-string pattern avoids
    # the invalid '\d' escape warning.
    df['Assessment Length'] = (
        pd.to_numeric(
            df['Assessment Length'].astype(str).str.extract(r'(\d+)', expand=False),
            errors='coerce'
        )
        .fillna(0)
        .astype(int)
    )

    # ---- Column Management ----
    # errors='ignore' tolerates columns that an earlier run already removed.
    df = df.drop(columns=['Description', 'Job Level', 'Skills'], errors='ignore')
    ordered_columns = [
        'Individual Test Solutions',
        'Remote Testing',
        'Adaptive/IRT',
        'Test Type',
        'Skills_JobLevel',
        'Language',
        'Assessment Length'
    ]
    df = df[ordered_columns]

    # ======================
    # Save Processed Data
    # ======================
    output_filename = "processed_assessments.csv"
    df.to_csv(output_filename, index=False)
    print(f"\n✅ Successfully saved processed data to {output_filename}")
    print(f"📊 Total records saved: {len(df):,}")

    # Optional: Show sample output
    print("\nSample of saved data:")
    print(df.head(3).to_markdown(index=False))

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# ======================
# Precompute Embeddings
# ======================
# Re-creates the module-level embedding model; an identical assignment already
# appears earlier in this file, so this rebinds `model` to a fresh instance.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

def create_faiss_index(df: pd.DataFrame):
    """Embed the ``Skills_JobLevel`` column and load the vectors into a FAISS index.

    Args:
        df: Frame containing a ``Skills_JobLevel`` column of text to embed.

    Returns:
        A ``faiss.IndexFlatL2`` holding one float32 vector per row of ``df``,
        in row order.
    """
    texts = df['Skills_JobLevel'].tolist()

    # Encode with the module-level model; FAISS is fed float32 vectors.
    vectors = model.encode(texts, show_progress_bar=True).astype('float32')

    # Flat (exhaustive) L2 index sized to the embedding width.
    l2_index = faiss.IndexFlatL2(vectors.shape[1])
    l2_index.add(vectors)
    return l2_index

# ======================
# Enhanced Recommendation
# ======================
def recommend_with_faiss(input_skills: str, df: pd.DataFrame, index):
    """Return the ten rows of ``df`` closest to ``input_skills`` in a FAISS index.

    Args:
        input_skills: Free-text query, embedded with the module-level ``model``.
        df: Frame whose row positions correspond to the vectors in ``index``.
        index: FAISS index to search (built by ``create_faiss_index``).

    Returns:
        A copy of up to ten rows of ``df`` with an added ``similarity`` column,
        highest similarity first. The result is empty when the index is empty.
    """
    # Generate query embedding
    query_embedding = model.encode([input_skills]).astype('float32')

    # Never ask for more neighbours than the index holds: FAISS fills missing
    # hits with label -1, which `iloc` would silently read as "the last row".
    k = min(50, index.ntotal)
    if k == 0:
        empty = df.iloc[:0].copy()
        empty['similarity'] = pd.Series(dtype='float32')
        return empty

    # Search FAISS index
    distances, indices = index.search(query_embedding, k=k)

    # Defensive: discard any padding labels that are still returned.
    found = indices[0] >= 0
    row_positions = indices[0][found]
    row_distances = distances[0][found]

    # Get top matches from original DF
    results = df.iloc[row_positions].copy()
    # Convert distance to a score where larger means closer. The /4 scaling is
    # kept from the original; NOTE(review): it only bounds the score to [0, 1]
    # if the embeddings are unit length, and create_faiss_index does not
    # normalise them — the ranking is unaffected either way.
    results['similarity'] = 1 - (row_distances / 4)

    return results.nlargest(10, 'similarity')

# preprocessing.py
import faiss
import pandas as pd
from sentence_transformers import SentenceTransformer

def create_and_save_index(
    csv_path: str = "/content/processed_assessments.csv",
    index_path: str = "precomputed_faiss_index.bin",
    metadata_path: str = "metadata.parquet",
    model_name: str = 'paraphrase-MiniLM-L6-v2',
):
    """Embed the processed catalogue and persist a cosine-similarity FAISS index.

    Called without arguments this behaves as before; the parameters let the
    same code run outside the ``/content`` directory or with another model.

    Args:
        csv_path: Processed catalogue CSV; must contain ``Skills_JobLevel``.
        index_path: Destination file for the serialized FAISS index.
        metadata_path: Destination parquet file for the catalogue rows.
        model_name: SentenceTransformer model used to embed the text.

    Raises:
        ValueError: If the CSV contains no rows to index.
    """
    # Load processed data
    df = pd.read_csv(csv_path)

    # read_csv turns empty cells into NaN, which the encoder cannot tokenize;
    # embed such rows as empty strings instead of crashing.
    texts = df['Skills_JobLevel'].fillna('').astype(str).tolist()
    if not texts:
        raise ValueError(f"No rows to index in {csv_path}")

    # Generate embeddings
    model = SentenceTransformer(model_name)
    embeddings = model.encode(texts, show_progress_bar=True)
    embeddings = embeddings.astype('float32')

    # Inner product over L2-normalised vectors equals cosine similarity, so
    # query vectors must be normalised the same way before searching.
    faiss.normalize_L2(embeddings)
    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)

    # Save index and metadata (row order matches the vector order in the index)
    faiss.write_index(index, index_path)
    df.to_parquet(metadata_path)  # Faster read than CSV



# Build and persist the index as soon as this code runs: reads the processed
# CSV and writes the FAISS index file and the parquet metadata file.
create_and_save_index()