In [21]:
import json
import math

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer, util

In [22]:
# Free-text description of the role. It is embedded later in the notebook and
# every candidate profile is scored against it by cosine similarity.
JOB_DESCRIPTION = """
Looking for a Machine Learning Engineer with experience in Python, PyTorch/TensorFlow,
NLP/transformers, and cloud deployment (AWS/Azure). Familiarity with MLOps is a plus.
"""
TOP_N = 5  # number of candidates to select

In [3]:
# Load the pretrained sentence-embedding model that encodes the job description
# and the candidate texts further down.
# NOTE(review): this cell's execution count is lower than the import cell's, so it was
# run in an earlier pass; confirm the notebook survives Restart Kernel -> Run All.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [23]:
# Read the raw form submissions. The encoding is pinned to UTF-8 because the
# records contain non-ASCII text (e.g. "Maceió") and open()'s default text
# encoding depends on the platform locale.
with open("form-submissions.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [24]:
# Unwrap the records when the payload is a dict carrying them under "submissions";
# any other payload is passed through unchanged.
data = data["submissions"] if isinstance(data, dict) and "submissions" in data else data

In [25]:
# Flatten the JSON records into a table. Nested dict keys become dotted column
# names (e.g. "education.highest_level"); list-valued fields stay as Python lists.
df = pd.json_normalize(data)

In [26]:
# Preview the first rows of the flattened frame.
df.head()

Unnamed: 0,name,email,phone,location,submitted_at,work_availability,work_experiences,skills,annual_salary_expectation.full-time,education.highest_level,education.degrees
0,Clever Monkey,clever-monkey@example.com,5582981474204,Maceió,2025-01-28 09:02:16.000000,"[full-time, part-time]","[{'company': 'StarLab Digital Ventures', 'role...","[Data Analysis, Docker, Microservices]",$117548,Bachelor's Degree,"[{'degree': 'Bachelor's Degree', 'subject': 'C..."
1,Noble Flamingo,noble-flamingo@example.com,12156688210,Philadelphia,2025-01-26 07:40:39.000000,"[full-time, part-time]",[{'company': 'Intellectual Asset Management Gr...,[],$112253,Juris Doctor (J.D),"[{'degree': 'Master's Degree', 'subject': 'Acc..."
2,Noble Antelope,noble-antelope@example.com,8801993762548,Bangladesh,2025-01-28 07:29:47.000000,"[full-time, part-time]","[{'company': 'Red.Digital', 'roleName': 'Softw...","[Laravel, Next JS, React, React Native, Redux,...",$63556,Bachelor's Degree,"[{'degree': 'Bachelor's Degree', 'subject': 'C..."
3,Unique Platypus,unique-platypus@example.com,55118120974,Brazil,2025-01-28 23:29:29.000000,"[full-time, part-time]","[{'company': 'Autônomo', 'roleName': 'Backend ...","[Amazon Web Services, Python, Flask, Agile, RE...",$143487,Bachelor's Degree,"[{'degree': 'Bachelor's Degree', 'subject': 'I..."
4,Clever Jellyfish,clever-jellyfish@example.com,54169018596,Argentina,2025-01-29 03:45:30.000000,"[full-time, part-time]","[{'company': 'Ituran de Argentina', 'roleName'...","[JavaScript, Arduino, Amazon Web Services, Azu...",$120152,Bachelor's Degree,"[{'degree': 'Associate's Degree', 'subject': '..."


In [27]:
# Count missing values per column before any filling.
df.isna().sum()

name                                     0
email                                    0
phone                                  160
location                                 0
submitted_at                             0
work_availability                        0
work_experiences                         0
skills                                   0
annual_salary_expectation.full-time      0
education.highest_level                  0
education.degrees                        0
dtype: int64

In [28]:
# Replace every missing value with an empty string (rebinding instead of inplace=True).
df = df.fillna('')

In [29]:
# Re-check the per-column missing-value counts after filling.
df.isna().sum()

name                                   0
email                                  0
phone                                  0
location                               0
submitted_at                           0
work_availability                      0
work_experiences                       0
skills                                 0
annual_salary_expectation.full-time    0
education.highest_level                0
education.degrees                      0
dtype: int64

In [30]:
# Keep only the columns used downstream (this drops "submitted_at").
# .copy() makes the subset an independent frame, so the later column assignment
# (df["combined_text"] = ...) does not raise SettingWithCopyWarning.
df = df[
    [
        'name',
        'email',
        'phone',
        'location',
        'work_availability',
        'work_experiences',
        'skills',
        'annual_salary_expectation.full-time',
        'education.highest_level',
        'education.degrees',
    ]
].copy()

In [31]:
# Preview the frame after narrowing it to the profile columns.
df.head()

Unnamed: 0,name,email,phone,location,work_availability,work_experiences,skills,annual_salary_expectation.full-time,education.highest_level,education.degrees
0,Clever Monkey,clever-monkey@example.com,5582981474204,Maceió,"[full-time, part-time]","[{'company': 'StarLab Digital Ventures', 'role...","[Data Analysis, Docker, Microservices]",$117548,Bachelor's Degree,"[{'degree': 'Bachelor's Degree', 'subject': 'C..."
1,Noble Flamingo,noble-flamingo@example.com,12156688210,Philadelphia,"[full-time, part-time]",[{'company': 'Intellectual Asset Management Gr...,[],$112253,Juris Doctor (J.D),"[{'degree': 'Master's Degree', 'subject': 'Acc..."
2,Noble Antelope,noble-antelope@example.com,8801993762548,Bangladesh,"[full-time, part-time]","[{'company': 'Red.Digital', 'roleName': 'Softw...","[Laravel, Next JS, React, React Native, Redux,...",$63556,Bachelor's Degree,"[{'degree': 'Bachelor's Degree', 'subject': 'C..."
3,Unique Platypus,unique-platypus@example.com,55118120974,Brazil,"[full-time, part-time]","[{'company': 'Autônomo', 'roleName': 'Backend ...","[Amazon Web Services, Python, Flask, Agile, RE...",$143487,Bachelor's Degree,"[{'degree': 'Bachelor's Degree', 'subject': 'I..."
4,Clever Jellyfish,clever-jellyfish@example.com,54169018596,Argentina,"[full-time, part-time]","[{'company': 'Ituran de Argentina', 'roleName'...","[JavaScript, Arduino, Amazon Web Services, Azu...",$120152,Bachelor's Degree,"[{'degree': 'Associate's Degree', 'subject': '..."


In [32]:
# Name fragments of the columns that describe a candidate's profile.
keywords = [
    'location',
    'work_availability',
    'work_experiences',
    'skills',
    'annual_salary_expectation.full-time',
    'education.highest_level',
    'education.degrees',
]

In [33]:
# Pick every column whose lower-cased name contains at least one keyword fragment.
relevant_cols = [
    column
    for column in df.columns
    if any(fragment in column.lower() for fragment in keywords)
]


In [34]:
# Build one text blob per candidate: stringify every selected column and join the
# values with single spaces. List/dict cells are rendered as their Python repr here.
df["combined_text"] = df[relevant_cols].astype(str).agg(" ".join, axis=1)


In [35]:
# NOTE(review): this statement repeats the preceding cell verbatim. It is idempotent,
# so the result is unchanged, but the duplicate cell can be dropped.
df["combined_text"] = df[relevant_cols].astype(str).agg(" ".join, axis=1)


In [36]:
# Keep the identity columns plus the combined profile text.
# .copy() makes the subset an independent frame, so the later in-place column
# writes (text cleaning, similarity score) do not raise SettingWithCopyWarning.
df = df[['name', 'email', 'phone', 'combined_text']].copy()

In [37]:
# Preview the identity columns next to the raw combined text.
df.head()

Unnamed: 0,name,email,phone,combined_text
0,Clever Monkey,clever-monkey@example.com,5582981474204,"Maceió ['full-time', 'part-time'] [{'company':..."
1,Noble Flamingo,noble-flamingo@example.com,12156688210,"Philadelphia ['full-time', 'part-time'] [{'com..."
2,Noble Antelope,noble-antelope@example.com,8801993762548,"Bangladesh ['full-time', 'part-time'] [{'compa..."
3,Unique Platypus,unique-platypus@example.com,55118120974,"Brazil ['full-time', 'part-time'] [{'company':..."
4,Clever Jellyfish,clever-jellyfish@example.com,54169018596,"Argentina ['full-time', 'part-time'] [{'compan..."


In [38]:
# Squash every run of whitespace into one space and trim both ends.
df['combined_text'] = (
    df['combined_text']
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

In [43]:
# Normalize the combined text before embedding: strip the list/dict punctuation
# left over from stringifying nested cells, then tidy whitespace and case.
df["combined_text"] = (
    df["combined_text"]
    .astype(str)  # Guarantee string values before using the .str accessor
    .str.replace(r"[\[\]\{\}]", " ", regex=True)  # Turn [ ] { } into spaces so neighbouring tokens stay separated
    .str.replace(r"[\'\"]", "", regex=True)       # Delete quotes outright (apostrophes inside words vanish too)
    .str.replace(r"\s+", " ", regex=True)         # Collapse the whitespace runs created by the steps above
    .str.strip()                                  # Remove leading/trailing spaces
    .str.lower()                                  # Lower-case everything
)


In [44]:
# Preview the cleaned, lower-cased combined text.
df.head()

Unnamed: 0,name,email,phone,combined_text
0,Clever Monkey,clever-monkey@example.com,5582981474204,"maceió full-time, part-time company: starlab d..."
1,Noble Flamingo,noble-flamingo@example.com,12156688210,"philadelphia full-time, part-time company: int..."
2,Noble Antelope,noble-antelope@example.com,8801993762548,"bangladesh full-time, part-time company: red.d..."
3,Unique Platypus,unique-platypus@example.com,55118120974,"brazil full-time, part-time company: autônomo,..."
4,Clever Jellyfish,clever-jellyfish@example.com,54169018596,"argentina full-time, part-time company: ituran..."


In [45]:
# Embed every candidate profile and the lower-cased job description as tensors.
candidate_embeddings = model.encode(
    df["combined_text"].tolist(),
    convert_to_tensor=True,
)
job_embedding = model.encode(
    JOB_DESCRIPTION.lower(),
    convert_to_tensor=True,
)

In [46]:
# Compute cosine similarity between each candidate embedding and the job embedding.
# The result is moved to the CPU, converted to NumPy and flattened to 1-D so it can
# be stored as a column (assumes one score per candidate row -- TODO confirm shape).
similarities = util.cos_sim(candidate_embeddings, job_embedding).cpu().numpy().flatten()
df["ai_similarity_score"] = similarities

In [47]:
# Preview the frame with the similarity score attached.
df.head()

Unnamed: 0,name,email,phone,combined_text,ai_similarity_score
0,Clever Monkey,clever-monkey@example.com,5582981474204,"maceió full-time, part-time company: starlab d...",0.304486
1,Noble Flamingo,noble-flamingo@example.com,12156688210,"philadelphia full-time, part-time company: int...",0.111979
2,Noble Antelope,noble-antelope@example.com,8801993762548,"bangladesh full-time, part-time company: red.d...",0.286586
3,Unique Platypus,unique-platypus@example.com,55118120974,"brazil full-time, part-time company: autônomo,...",0.303429
4,Clever Jellyfish,clever-jellyfish@example.com,54169018596,"argentina full-time, part-time company: ituran...",0.243392


In [51]:
# ----- HEURISTIC BOOSTS -----
def _has_value(value):
    """Return True when ``value`` is present: not None, not NaN, and not blank text."""
    if value is None:
        return False
    if isinstance(value, float) and math.isnan(value):
        return False
    return bool(str(value).strip())


def heuristic_score(row):
    """Return an additive bonus for one candidate row.

    Two signals are looked up by case-insensitive substring match on the row's
    own labels:

    * Portfolio: +0.1 when any column whose name contains "portfolio",
      "github" or "website" holds a present, non-blank value.
    * Experience: +0.02 per year, capped at 10 years, read from the first
      column whose name contains "year" or "experience" and whose value
      parses as a finite number.

    Parameters
    ----------
    row : pandas.Series
        A single row, e.g. as supplied by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    float
        The bonus to add to the similarity score.

    Notes
    -----
    NOTE(review): where this is applied in the notebook the frame appears to hold
    only name, email, phone, combined_text and ai_similarity_score. None of those
    names match either pattern, so the bonus would be 0.0 for every row -- verify
    that this is intended.
    """
    # Read the labels from the row itself rather than from a global frame, so the
    # function has no hidden notebook state and works on any Series.
    labels = list(row.index)
    portfolio_cols = [
        c for c in labels
        if any(x in str(c).lower() for x in ("portfolio", "github", "website"))
    ]
    years_cols = [
        c for c in labels
        if "year" in str(c).lower() or "experience" in str(c).lower()
    ]

    # None/NaN cells are treated as missing; str(nan) would otherwise look non-empty.
    has_portfolio = any(_has_value(row[c]) for c in portfolio_cols)

    years_exp = 0.0
    for c in years_cols:
        try:
            parsed = float(row[c])
        except (TypeError, ValueError):
            # Non-numeric cell (text, list, None, ...): try the next candidate column.
            continue
        # Skip NaN/inf so a missing value cannot turn the whole score into NaN.
        if math.isfinite(parsed):
            years_exp = parsed
            break

    return (1 if has_portfolio else 0) * 0.1 + min(years_exp, 10) * 0.02

# Evaluate the heuristic once per row (axis=1 passes each row as a Series) and store the bonus.
df["heuristic_boost"] = df.apply(heuristic_score, axis=1)

In [54]:
# Final ranking score: embedding similarity plus the heuristic bonus.
df["final_score"] = df["ai_similarity_score"].add(df["heuristic_boost"])

In [57]:
# Rank by final score, highest first, and keep the best TOP_N rows.
top_candidates = df.sort_values(by="final_score", ascending=False).head(TOP_N)

# Use an explicit "id" column when the frame has one; otherwise fall back to the row index.
selected_ids = (
    top_candidates["id"] if "id" in top_candidates.columns else top_candidates.index
).tolist()

print("Top candidate IDs to hire:", selected_ids)

Top candidate IDs to hire: [804, 699, 172, 431, 672]


In [59]:
# Rank by final score, highest first, and keep the best TOP_N rows.
top_candidates = df.sort_values(by="final_score", ascending=False).head(TOP_N)

# There is no 'id' column, so the row index doubles as the candidate ID.
selected_candidates = top_candidates[["name"]].assign(id=top_candidates.index)

# Pair each ID with its name as plain (ID, Name) tuples.
selected_list = list(zip(selected_candidates["id"], selected_candidates["name"]))

print("Top candidates (ID, Name):", selected_list)


Top candidates (ID, Name): [(804, 'Mighty'), (699, 'Enthusiastic Urchin'), (172, 'Calm Penguin'), (431, 'Courageous Ostrich'), (672, 'Playful Ostrich')]
