In [13]:
import json, pickle
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import pickle
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
# Name of the sentence-transformers model used for every embedding in this notebook.
MODEL_NAME = "all-mpnet-base-v2"
# Instantiate the encoder once; later cells reuse `model` for both the career texts and the query.
model = SentenceTransformer(MODEL_NAME)

In [15]:

# Read the raw career records that the embedding step below iterates over.
with open('../data/career_data.json', mode='r', encoding='utf-8') as career_file:
    careers = json.load(career_file)


In [16]:
# Build two parallel lists: job_titles[i] labels the text in texts_to_embed[i].
job_titles = []
texts_to_embed = []

for career in tqdm(careers):
    # `or` fallbacks also cover keys that are present but null/empty in the JSON;
    # `.get(key, default)` alone would hand back None and break the join below
    # (or embed the literal text "None").
    job_title = career.get("job_title") or "Unknown Job"
    description = career.get("desc") or ""
    # NOTE(review): "Tagline" is capitalised unlike the other keys — confirm against career_data.json.
    tagline = career.get("Tagline") or ""
    interest_tags = " ".join(career.get("interest_tags") or [])

    # Join only the non-empty fields so a missing field does not leave a stray ". ."
    # in the text sent to the encoder. When every field is present this yields
    # exactly "<title>. <tagline>. <description>. <tags>", as before.
    parts = [job_title, tagline, description, interest_tags]
    combined_text = ". ".join(part for part in parts if part)

    job_titles.append(job_title)
    texts_to_embed.append(combined_text)


100%|██████████| 200/200 [00:00<00:00, 198593.94it/s]


In [17]:
# Encode every career text in a single call. convert_to_tensor=False asks for a
# non-tensor result, which the later cells pickle and wrap with NumPy.
embeddings = model.encode(
    texts_to_embed,
    convert_to_tensor=False,
    show_progress_bar=True,
)

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

In [18]:
# Persist titles and vectors in one payload so the two stay paired when reloaded.
embedding_store = {
    "job_titles": job_titles,
    "embeddings": embeddings,
}
with open('../data/embeddings.pkl', 'wb') as pkl_out:
    pickle.dump(embedding_store, pkl_out)

### Testing: reload the saved embeddings and run a sample query

In [19]:
# Read the saved payload back from disk.
# pickle can execute arbitrary code while loading — only open files this notebook produced.
with open('../data/embeddings.pkl', 'rb') as pkl_in:
    data = pickle.load(pkl_in)

In [20]:
# Unpack the reloaded payload; this rebinds `job_titles` to the list read back from the pickle.
job_titles = data['job_titles']
# np.array(...) ensures an ndarray is passed to cosine_similarity.
# Presumably shape (n_careers, embedding_dim) — TODO confirm.
career_embeddings = np.array(data['embeddings'])


In [21]:
# Sample query and how many matches to show.
query = "Interest in Web app, HTML, CSS"
top_n = 15

# Embed the free-text query with the same model used for the career texts.
query_embeddings = model.encode(query)

# cosine_similarity works on 2-D inputs, so the single query vector is wrapped in a list;
# row 0 of the result holds the query's score against each career.
sims = cosine_similarity([query_embeddings], career_embeddings)[0]

# argsort is ascending: reverse it for best-first order, then keep the first top_n positions.
descending_order = np.argsort(sims)[::-1]
top_indices = descending_order[:top_n]

for idx in top_indices:
    print(f'{job_titles[idx]}: {sims[idx]:.4f}')


Web Designer: 0.4774
Front End Developer: 0.4741
Full Stack Developer: 0.4219
Back End Developer: 0.3815
UI Developer: 0.3407
Mobile App Developer: 0.2666
AR/VR Developer: 0.2644
Graphic Designer: 0.2490
Blockchain Developer: 0.2418
Motion Graphics Designer: 0.2408
API Developer: 0.2362
Computer Vision Engineer: 0.2315
Digital Marketing Specialist: 0.2285
SEO/SEM Specialist: 0.2254
EdTech Content Developer: 0.2236
