# Data Processing

In [1]:
!pip install pandas scikit-learn gradio


Collecting gradio
  Downloading gradio-5.23.3-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.8.0 (from gradio)
  Downloading gradio_client-1.8.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.4-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6 

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Load the assessment catalogue into `df` and preview the first rows.
# NOTE(review): the path is an absolute, environment-specific location
# ("/content/..."); it must be adjusted when the notebook runs elsewhere.
df = pd.read_csv("/content/shl_assesment_Final.csv")  # Use your filename
df.head()

Unnamed: 0,Assessment Name,URL,Remote Testing,Adaptive/IRT,Test Type,Duration,description,job_levels,languages,assessment_length
0,Account Manager Solution,https://www.shl.com/solutions/products/product...,Yes,Yes,"C, P, A, B",,The Account Manager solution is an assessment ...,"Mid-Professional,","English (USA),",Approximate Completion Time in minutes = 49
1,Administrative Professional - Short Form,https://www.shl.com/solutions/products/product...,Yes,Yes,"A, K, P",,The Administrative Professional solution is fo...,"Entry-Level,","English (USA),",Approximate Completion Time in minutes = 36
2,Agency Manager Solution,https://www.shl.com/solutions/products/product...,Yes,Yes,"A, B, P, S",,The Agency Manager solution is for mid-level s...,"Front Line Manager, Manager, Supervisor,","English (USA),",Approximate Completion Time in minutes = 51
3,Apprentice + 8.0 Job Focused Assessment,https://www.shl.com/solutions/products/product...,Yes,No,"B, P",,The Apprentice + 8.0 Job-Focused Assessment is...,"General Population, Graduate, Entry-Level,","English International, German,",Approximate Completion Time in minutes = 30
4,Apprentice 8.0 Job Focused Assessment,https://www.shl.com/solutions/products/product...,Yes,No,"B, P",,The Apprentice 8.0 Job-Focused Assessment is a...,"Entry-Level, General Population, Graduate,","English International, German, French,",Approximate Completion Time in minutes = 20


In [4]:
# Drop the 'Duration' column (it is empty in the rows previewed above).
# errors='ignore' keeps this cell idempotent: re-running it after the column
# has already been removed no longer raises KeyError.
df = df.drop(columns=['Duration'], errors='ignore')


In [5]:
df.columns


Index(['Assessment Name', 'URL', 'Remote Testing', 'Adaptive/IRT', 'Test Type',
       'description', 'job_levels', 'languages', 'assessment_length'],
      dtype='object')

In [6]:
# Concatenate the searchable text fields into one string per assessment.
# Every field is blanked with fillna('') first: a NaN would otherwise turn the
# whole concatenation into NaN, and `astype(str)` alone would leak the literal
# token "nan" into the text for a missing 'assessment_length'.
df["document"] = (
    df["Assessment Name"].fillna('') + " " +
    df["description"].fillna('') + " " +
    df["job_levels"].fillna('') + " " +
    df["Test Type"].fillna('') + " " +
    df["assessment_length"].fillna('').astype(str)
)

In [7]:
df['document'][0]

'Account Manager Solution The Account Manager solution is an assessment used for job candidates applying to mid-level leadership positions that tend to manage the day-to-day operations and activities of client accounts. Sample tasks for these jobs include, but are not limited to: communicating with clients about project status, developing and maintaining project plans, coordinating internally with appropriate project personnel, and ensuring client expectations are being met. Potential job titles that use this solution are: Account Executive, Account Manager, and Senior Account Manager. There are multiple configurations of this solution available. Mid-Professional, C, P, A, B Approximate Completion Time in minutes = 49'

In [8]:
import re

def clean_text(text):
    """Normalise free text for lexical matching.

    Lower-cases ``text``, turns every character that is not ``a-z``, ``0-9``
    or whitespace into a space, then squeezes whitespace runs down to a single
    space and trims both ends.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    lowered = text.lower()
    # Anything outside [a-z0-9] and whitespace becomes a separator.
    alnum_only = re.sub(r"[^a-z0-9\s]", " ", lowered)
    # Collapse the separator runs created above (and any original ones).
    collapsed = re.sub(r"\s+", " ", alnum_only)
    return collapsed.strip()

# Apply to your data
# NOTE(review): a later cell assigns df["document_cleaned"] again without
# using the text produced here, so this cleaned version is overwritten before
# the vectorizer is fitted.
df["document_cleaned"] = df["document"].apply(clean_text)


In [9]:
# Lookup table from single-letter test-type code to its descriptive name.
# Read by replace_codes_in_text to expand codes such as "C, P, A, B"
# (the format seen in the 'Test Type' column) into words.
code_mapping = {
    'A': 'Ability & Aptitude',
    'B': 'Biodata & Situational Judgement',
    'C': 'Competencies',
    'D': 'Development & 360',
    'E': 'Assessment Exercises',
    'K': 'Knowledge & Skills',
    'P': 'Personality & Behavior',
    'S': 'Simulations'
}




In [10]:
import re

def replace_codes_in_text(text, mapping=None):
    """Expand standalone test-type codes in ``text`` to their full names.

    All codes are substituted in a single regex pass, so the text inserted for
    one code is never re-scanned and rewritten by another code. Codes are
    escaped before being placed in the pattern, and the replacement is inserted
    literally (not interpreted as a regex replacement template).

    Matching is case-sensitive and uses word boundaries, so ``C`` inside
    ``Client`` is left alone. Note that any isolated capital letter that is a
    code (for example the article "A") is also a match, so callers should pass
    only text in which standalone codes are really codes.

    Args:
        text: The string to process.
        mapping: Optional ``{code: full_name}`` dictionary. When omitted, the
            notebook-level ``code_mapping`` is used.

    Returns:
        ``text`` with every whole-word code replaced by its full name.
    """
    if mapping is None:
        mapping = code_mapping
    if not mapping:
        # An empty alternation would match at every word boundary.
        return text
    # \b ensures we only match whole codes (like 'C', not part of 'Client').
    pattern = r'\b(?:' + '|'.join(re.escape(code) for code in mapping) + r')\b'
    return re.sub(pattern, lambda found: mapping[found.group(0)], text)

# Build the text that is vectorised/embedded, with test-type codes spelled out.
# Codes are expanded only in the 'Test Type' column: running
# replace_codes_in_text over the whole document would also rewrite any
# standalone capital A/B/C/D/E/K/P/S in the assessment name or description
# (for example the article "A" at the start of a sentence).
test_type_expanded = df["Test Type"].fillna('').apply(replace_codes_in_text)
df["document_cleaned"] = (
    df["Assessment Name"].fillna('') + " " +
    df["description"].fillna('') + " " +
    df["job_levels"].fillna('') + " " +
    test_type_expanded + " " +
    df["assessment_length"].fillna('').astype(str)
)


In [11]:
df['document_cleaned'][0]

'Account Manager Solution The Account Manager solution is an assessment used for job candidates applying to mid-level leadership positions that tend to manage the day-to-day operations and activities of client accounts. Sample tasks for these jobs include, but are not limited to: communicating with clients about project status, developing and maintaining project plans, coordinating internally with appropriate project personnel, and ensuring client expectations are being met. Potential job titles that use this solution are: Account Executive, Account Manager, and Senior Account Manager. There are multiple configurations of this solution available. Mid-Professional, Competencies, Personality & Behavior, Ability & Aptitude, Biodata & Situational Judgement Approximate Completion Time in minutes = 49'

# Using TF-IDF and cosine similarity

## Only 10% of the data is sampled for embedding, because embedding every row takes too much time!

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fit the vectorizer on assessment documents
# (English stop words are removed). `doc_vectors` holds one row per row of
# `df`, in the same order; recommend_assessments depends on that alignment
# when it indexes `df` by position, so refit after adding/removing/reordering rows.
vectorizer = TfidfVectorizer(stop_words="english")
doc_vectors = vectorizer.fit_transform(df["document_cleaned"])


In [13]:
df.columns

Index(['Assessment Name', 'URL', 'Remote Testing', 'Adaptive/IRT', 'Test Type',
       'description', 'job_levels', 'languages', 'assessment_length',
       'document', 'document_cleaned'],
      dtype='object')

In [14]:
# prompt: in assessment length, which is a string, extract only the integer part and add the next word just after the integer

import re

def extract_integer_and_next_word(text):
    """Normalise an assessment-length string to ``"<N> minutes"``.

    The first run of digits in ``text`` is taken as the duration. The previous
    pattern ``(\\d+)\\s*(\\w+)`` required a word character *after* the number,
    so for strings ending in the number it only matched by splitting the
    digits (``"49"`` -> ``"4"`` + ``"9"``), left single-digit values
    unconverted, and produced ``"30minutes minutes"`` for ``"30 minutes"``.

    Args:
        text: Raw assessment-length string, e.g.
            ``"Approximate Completion Time in minutes = 49"``.

    Returns:
        ``"<N> minutes"`` when ``text`` contains an integer, otherwise
        ``text`` unchanged.
    """
    match = re.search(r"\d+", text)
    if match is None:
        return text  # Return original text if no match
    return f"{match.group(0)} minutes"

# Apply the function to the 'assessment_length' column
# astype(str) turns a missing value into the string "nan", which contains no
# digits and is therefore passed through unchanged by the function.
df['assessment_length_modified'] = df['assessment_length'].astype(str).apply(extract_integer_and_next_word)


In [15]:
df[['assessment_length_modified','assessment_length']]

Unnamed: 0,assessment_length_modified,assessment_length
0,49 minutes,Approximate Completion Time in minutes = 49
1,36 minutes,Approximate Completion Time in minutes = 36
2,51 minutes,Approximate Completion Time in minutes = 51
3,30 minutes,Approximate Completion Time in minutes = 30
4,20 minutes,Approximate Completion Time in minutes = 20
...,...,...
136,16 minutes,Approximate Completion Time in minutes = 16
137,20 minutes,Approximate Completion Time in minutes = 20
138,20 minutes,Approximate Completion Time in minutes = 20
139,20 minutes,Approximate Completion Time in minutes = 20


In [16]:
def recommend_assessments(query, top_k=10):
    """Rank assessments against ``query`` by TF-IDF cosine similarity.

    Uses the notebook-level ``vectorizer``, ``doc_vectors`` and ``df``; the
    query is vectorised as given (no cleaning is applied here).

    Args:
        query: Free-text description of the hiring need.
        top_k: Maximum number of rows to return.

    Returns:
        A DataFrame with the ``top_k`` best-scoring assessments, best first,
        restricted to the display columns and re-indexed from 0.
    """
    display_columns = [
        'Assessment Name', 'URL', 'Remote Testing', 'Adaptive/IRT',
        'Test Type', 'assessment_length_modified',
    ]

    query_tfidf = vectorizer.transform([query])
    scores = cosine_similarity(query_tfidf, doc_vectors).flatten()

    # argsort is ascending; reverse it so the highest score comes first.
    ranked_positions = scores.argsort()[::-1]
    best_positions = ranked_positions[:top_k]

    best_rows = df.iloc[best_positions]
    return best_rows[display_columns].reset_index(drop=True)


In [19]:
query="Looking to hire mid-level professionals who are proficient in Python, SQL and Java Script. Need an assessment package that can test all skills with max duration of 60 minutes"

In [20]:
  # Lower-case the query and strip punctuation before vectorising it.
  query = clean_text(query)

  results = recommend_assessments(query)
  # NOTE(review): ranking is purely lexical (TF-IDF cosine similarity); the
  # "max duration of 60 minutes" constraint in the query is not enforced —
  # the output below includes 63-minute assessments.
  results

Unnamed: 0,Assessment Name,URL,Remote Testing,Adaptive/IRT,Test Type,assessment_length_modified
0,Supervisor 7.1 (Americas),https://www.shl.com/solutions/products/product...,Yes,No,"B, C, P",29 minutes
1,Manager + 7.1 (Americas),https://www.shl.com/solutions/products/product...,Yes,Yes,"B, C, P",63 minutes
2,Contact Center Customer Service + 8.0,https://www.shl.com/solutions/products/product...,Yes,No,"A, B, C, P, S",41 minutes
3,Entry Level Customer Service 7.1 (Americas),https://www.shl.com/solutions/products/product...,Yes,No,"C, P",19 minutes
4,Entry Level Customer Service 7.1 (International),https://www.shl.com/solutions/products/product...,Yes,No,"C, P",19 minutes
5,Contact Center Sales & Service + 8.0,https://www.shl.com/solutions/products/product...,Yes,No,"P, B, C, A, S",41 minutes
6,Supervisor 7.1 (International),https://www.shl.com/solutions/products/product...,Yes,No,"P, C, B",29 minutes
7,Manager + 7.1 (International),https://www.shl.com/solutions/products/product...,Yes,Yes,"B, C, P",63 minutes
8,Entry Level Customer Service 7.1 (South Africa),https://www.shl.com/solutions/products/product...,Yes,No,"C, P",19 minutes
9,Apprentice 8.0 Job Focused Assessment,https://www.shl.com/solutions/products/product...,Yes,No,"B, P",20 minutes


In [25]:
!pip install mistralai

Collecting mistralai
  Downloading mistralai-1.6.0-py3-none-any.whl.metadata (30 kB)
Collecting eval-type-backport>=0.2.0 (from mistralai)
  Downloading eval_type_backport-0.2.2-py3-none-any.whl.metadata (2.2 kB)
Downloading mistralai-1.6.0-py3-none-any.whl (288 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m288.7/288.7 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading eval_type_backport-0.2.2-py3-none-any.whl (5.8 kB)
Installing collected packages: eval-type-backport, mistralai
Successfully installed eval-type-backport-0.2.2 mistralai-1.6.0


# Using Mistral AI

In [28]:
import requests
import time

In [35]:
import time
import os
# Import mistralai within the function
def get_embedding(text, max_retries=3, retry_delay=5):
    """Return the ``mistral-embed`` embedding vector for ``text``.

    The API key is read from the ``MISTRAL_API_KEY`` environment variable
    rather than being hard-coded in the notebook.

    Args:
        text: The string to embed.
        max_retries: How many attempts are allowed when the API answers with
            HTTP 429 (rate limit).
        retry_delay: Seconds to wait after a 429 before the next attempt.

    Returns:
        The embedding of ``text`` (``data[0].embedding`` of the API response).

    Raises:
        RuntimeError: If ``MISTRAL_API_KEY`` is unset or empty.
        SDKError: For any API error other than HTTP 429.
        Exception: If every attempt was rate-limited.
    """
    # SECURITY: a literal API key used to be committed on this line. Treat
    # that key as leaked and revoke it; supply the new one via the environment.
    api_key = os.environ.get("MISTRAL_API_KEY")
    if not api_key:
        raise RuntimeError(
            "MISTRAL_API_KEY environment variable is not set; "
            "export it before calling get_embedding()."
        )

    # Imported inside the function, after the key check.
    from mistralai import Mistral, SDKError

    model = "mistral-embed"
    client = Mistral(api_key=api_key)

    for _ in range(max_retries):
        try:
            # Add a delay between requests to avoid exceeding rate limits
            time.sleep(3)

            response = client.embeddings.create(
                model=model,
                inputs=[text],
            )
        except SDKError as e:
            if e.status_code != 429:
                raise  # Re-raise other errors
            # Rate limit error: wait, then use up one attempt.
            print(f"Rate limit exceeded. Retrying in {retry_delay} seconds...")
            time.sleep(retry_delay)
        else:
            return response.data[0].embedding

    raise Exception(f"Failed to get embedding after {max_retries} retries.")

In [42]:
# prompt: df["embedding"] = df["document_cleaned"].apply(get_embedding) — convert only 10% of the rows to embeddings as a test

import pandas as pd
# Assuming df and get_embedding are defined as in the previous code

# Calculate the number of rows for the 10% sample
sample_size = int(0.1 * len(df))

# Sample of the DataFrame. A fixed random_state makes the chosen subset — and
# therefore the embeddings and recommendations built from it — reproducible
# across kernel restarts.
df_sample = df.sample(n=sample_size, random_state=42)




In [43]:
# One get_embedding call (one API request) per sampled row. get_embedding
# sleeps 3 seconds before every request, so this cell takes at least
# 3 s x len(df_sample) to run.
df_sample["embedding"] = df_sample["document_cleaned"].apply(get_embedding)

In [44]:
df_sample.columns

Index(['Assessment Name', 'URL', 'Remote Testing', 'Adaptive/IRT', 'Test Type',
       'description', 'job_levels', 'languages', 'document_cleaned',
       'assessment_length_modified', 'embedding'],
      dtype='object')

In [46]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Function to recommend assessments based on a user query
def recommend_assessments_api(query,top_k=3):
    """Rank the sampled assessments against ``query`` using API embeddings.

    Embeds ``query`` with ``get_embedding`` and compares it, by cosine
    similarity, with the vectors stored in ``df_sample["embedding"]``.

    Args:
        query: Free-text description of the hiring need.
        top_k: Maximum number of rows to return.

    Returns:
        A DataFrame with the ``top_k`` most similar rows of ``df_sample``,
        best first, restricted to the output columns and re-indexed from 0.
    """
    output_columns = ['Assessment Name', 'URL', 'Test Type',
        'assessment_length_modified']

    query_vec = get_embedding(query)
    doc_matrix = list(df_sample["embedding"])

    # Single query row -> take row 0 of the (1, n_docs) similarity matrix.
    scores = cosine_similarity([query_vec], doc_matrix)[0]

    # Ascending argsort reversed gives best-first positions.
    best_positions = np.argsort(scores)[::-1][:top_k]

    matches = df_sample.iloc[best_positions]
    return matches[output_columns].reset_index(drop=True)

# Example usage
# Calling recommend_assessments_api embeds the query through get_embedding,
# so this cell needs API access; top_k is left at its default of 3.
user_query = "Looking for a mid-level Java developer who can collaborate with business teams"
recommendations = recommend_assessments_api(user_query,)
print(recommendations)


                                     Assessment Name  \
0   Professional/Individual Contributor - Short Form   
1                           Manager + 7.1 (Americas)   
2  General Entry Level - All Industries 7.1(Ameri...   

                                                 URL   Test Type  \
0  https://www.shl.com/solutions/products/product...  A, B, P, S   
1  https://www.shl.com/solutions/products/product...     B, C, P   
2  https://www.shl.com/solutions/products/product...           B   

  assessment_length_modified  
0                 44 minutes  
1                 63 minutes  
2                 19 minutes  


In [47]:
recommendations['URL']

Unnamed: 0,URL
0,https://www.shl.com/solutions/products/product...
1,https://www.shl.com/solutions/products/product...
2,https://www.shl.com/solutions/products/product...


In [39]:
# Drop the intermediate 'document' column and the raw 'assessment_length'
# column before export. errors='ignore' keeps the cell idempotent: running it
# again after the columns are gone no longer raises KeyError.
df = df.drop(columns=['document', 'assessment_length'], errors='ignore')


In [40]:
# prompt: save df as csv file
# Writes `df` to 'updated_dataframe.csv' (relative path); index=False leaves
# the row index out of the file.
df.to_csv('updated_dataframe.csv', index=False)
