In [20]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import ast

# Load the cleaned assessment catalogue.
df = pd.read_csv("final_cleaned.csv")

# Ensure 'Description' column is string. Missing values are blanked first so
# they are not converted into the literal text "nan" and then embedded as if
# that were a real description.
df["Description"] = df["Description"].fillna("").astype(str)

# Initialize Sentence-BERT model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode every description in a single batched call instead of invoking
# model.encode() once per row; the result has one row per description.
description_vectors = model.encode(df["Description"].tolist())

# Store each vector as a plain Python list so it is written to the CSV as a
# list literal.
df["Embeddings"] = description_vectors.tolist()

# Save the updated CSV
df.to_csv("final_with_embeddings.csv", index=False)

print("✅ Sentence-BERT embeddings added and saved as 'final_with_embeddings.csv'.")

✅ Sentence-BERT embeddings added and saved as 'final_with_embeddings.csv'.


In [1]:
# Use the %pip magic (not !pip) so the package is installed into the
# environment the running kernel imports from.
%pip install sentence-transformers

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

Index(['URL', 'Description', 'Job Levels', 'Assessment Name', 'Remote_Testing',
       'adaptive', 'Assessment Length (min)', 'Embeddings'],
      dtype='object')

In [21]:
import pandas as pd
# Read from and write to the same path so the de-duplicated data replaces the
# input file regardless of the kernel's current working directory.
EMBEDDINGS_CSV = "/content/final_combined_with_embeddings.csv"

df = pd.read_csv(EMBEDDINGS_CSV)
print(df.shape)  # size before removing duplicates

# Keep only the first row seen for each URL.
df = df.drop_duplicates(subset=["URL"])
df.to_csv(EMBEDDINGS_CSV, index=False)
df.shape  # size after removing duplicates (shown as the cell output)

(101, 8)


(101, 8)

In [9]:
# Use the %pip magic (not !pip) so these packages are installed into the
# environment the running kernel imports from.
%pip install faiss-cpu sentence-transformers openai pandas numpy


Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [22]:
import pandas as pd
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

import ast  # needed only here, to parse the stored embedding text safely

# Load CSV
df = pd.read_csv("/content/final_combined_with_embeddings.csv")

# Convert embeddings from strings to numpy arrays. Each cell is expected to
# hold a list literal; ast.literal_eval accepts only Python literals, so
# unlike eval() it cannot execute code that ends up inside the CSV.
df["Embeddings"] = df["Embeddings"].apply(ast.literal_eval).apply(np.array)

# Stack the per-row vectors into one (rows, dim) matrix. FAISS operates on
# float32 data, so cast explicitly instead of relying on the float64 arrays
# numpy builds from Python floats.
embedding_matrix = np.vstack(df["Embeddings"].values).astype(np.float32)

# Build FAISS index
index = faiss.IndexFlatL2(embedding_matrix.shape[1])  # L2 Distance
index.add(embedding_matrix)

print("✅ FAISS index created with", embedding_matrix.shape[0], "entries.")


✅ FAISS index created with 101 entries.


In [23]:
def find_similar_assessments(query, top_k=5):
    """Return the catalogue rows whose embeddings are closest to ``query``.

    Relies on the notebook-level ``index`` (FAISS index) and ``df`` (the
    DataFrame whose row order matches the vectors added to ``index``).

    Args:
        query: Free-text description of the hiring need.
        top_k: Maximum number of rows to return.

    Returns:
        A DataFrame holding the display columns of the matching rows, in the
        order returned by the index search. Fewer than ``top_k`` rows are
        returned when the index holds fewer entries; the frame is empty when
        ``top_k`` is below 1 or the index is empty.
    """
    columns = [
        "Assessment Name",
        "Job Levels",
        "Assessment Length (min)",
        "Remote_Testing",
        "adaptive",
    ]

    # Load the encoder once and keep it on the function object: constructing
    # a SentenceTransformer on every call is slow, and the notebook-level
    # name `model` cannot be trusted because other cells rebind it.
    encoder = getattr(find_similar_assessments, "_encoder", None)
    if encoder is None:
        encoder = SentenceTransformer("all-MiniLM-L6-v2")
        find_similar_assessments._encoder = encoder

    # Never ask for more neighbours than the index contains.
    k = min(top_k, index.ntotal)
    if k < 1:
        return df.iloc[0:0][columns]

    query_embedding = encoder.encode(query).reshape(1, -1)

    # Search in FAISS index
    _, indices = index.search(query_embedding, k)

    # FAISS marks "no result" slots with -1; drop them so iloc does not
    # silently return the last row of df as a match.
    hits = indices[0]
    hits = hits[hits >= 0]

    # Get results
    return df.iloc[hits][columns]

# Example Query
# Runs one free-text hiring request through find_similar_assessments (default
# top_k) and prints the returned rows as a plain-text table.
query = "I am hiring for Java developers who can also collaborate effectively with my business teams. Looking for an assessment(s) that can be completed in 40 minutes."
similar_assessments = find_similar_assessments(query)
print(similar_assessments)


                                      Assessment Name  \
78                        Professional + 7.0 Solution   
80                      Professional + 7.1 (Americas)   
82                 Professional + 7.1 (International)   
83                               Professional 8.0 JFA   
37  Java 2 Platform Enterprise Edition 1.4 Fundame...   

                                           Job Levels  \
78  Mid-Professional, Professional Individual Cont...   
80  General Population, Mid-Professional, Professi...   
82  General Population, Mid-Professional, Professi...   
83               Professional Individual Contributor,   
37  Entry-Level, Mid-Professional, Professional In...   

    Assessment Length (min)  Remote_Testing  adaptive  
78                     51.0               1         1  
80                     56.0               1         1  
82                     56.0               1         1  
83                     16.0               1         0  
37                     30.0       

In [13]:
# Display the (rows, columns) of whichever DataFrame is currently bound to
# the notebook-level name `df`.
df.shape

(22493, 8)

In [4]:
import pandas as pd
import torch
from transformers import AutoModel, AutoTokenizer

# Load DistilBERT (768-dim)
# The tokenizer and model are loaded from the same checkpoint name so they
# stay paired.
# NOTE: this rebinds the notebook-level name `model`, which an earlier cell
# assigns to a SentenceTransformer instance.
MODEL_NAME = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Function to generate embeddings


def get_embedding(text):
    """Embed ``text`` with the notebook-level ``tokenizer`` and ``model``.

    The text is tokenized (with truncation), run through the model with
    gradient tracking disabled, and the last-layer hidden state at sequence
    position 0 is returned as a flat numpy array.
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        model_output = model(**encoded)

    # Take the vector at the first sequence position for every batch item.
    first_position = model_output.last_hidden_state[:, 0, :]
    return first_position.numpy().flatten()  # 768-dim


# Load CSV and Recompute Embeddings
df = pd.read_csv("final_combined_with_embeddings.csv")

# read_csv yields NaN (a float) for empty cells, which cannot be tokenized as
# text. Blank those out and force str before embedding, without modifying the
# stored Description column.
descriptions = df["Description"].fillna("").astype(str)
df["Embeddings"] = descriptions.apply(lambda x: get_embedding(x).tolist())

# Save Updated CSV
# NOTE(review): this overwrites the input file in place. The search cell
# encodes queries with "all-MiniLM-L6-v2", while the vectors written here come
# from get_embedding; confirm the query encoder is switched to match before
# rebuilding the FAISS index from this file.
df.to_csv("final_combined_with_embeddings.csv", index=False)
print("✅ Embeddings updated to 768 dimensions!")

ImportError: tokenizers>=0.21,<0.22 is required for a normal functioning of this module, but found tokenizers==0.19.1.
Try: `pip install transformers -U` or `pip install -e '.[dev]'` if you're working with git main

In [2]:
# Use the %pip magic (not !pip) so the pinned version is installed into the
# environment the running kernel imports from. If transformers was already
# imported in this session, restart the kernel afterwards so the newly
# installed version is the one that gets loaded.
%pip install transformers==4.41.2

Collecting transformers==4.41.2
  Using cached transformers-4.41.2-py3-none-any.whl.metadata (43 kB)
Collecting huggingface-hub<1.0,>=0.23.0 (from transformers==4.41.2)
  Using cached huggingface_hub-0.30.1-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers==4.41.2)
  Downloading tokenizers-0.19.1-cp312-none-win_amd64.whl.metadata (6.9 kB)
Downloading transformers-4.41.2-py3-none-any.whl (9.1 MB)
   ---------------------------------------- 0.0/9.1 MB ? eta -:--:--
   --------- ------------------------------ 2.1/9.1 MB 11.8 MB/s eta 0:00:01
   ------------------- -------------------- 4.5/9.1 MB 11.7 MB/s eta 0:00:01
   -------------------------- ------------- 6.0/9.1 MB 10.0 MB/s eta 0:00:01
   --------------------------------- ------ 7.6/9.1 MB 9.4 MB/s eta 0:00:01
   ---------------------------------------- 9.1/9.1 MB 9.6 MB/s eta 0:00:00
Using cached huggingface_hub-0.30.1-py3-none-any.whl (481 kB)
Downloading tokenizers-0.19.1-cp312-none-win_amd64