In [18]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer


In [20]:
resumes = pd.read_csv("combined_resume_cleaned.csv")
jobs = pd.read_csv("combined_jobs_cleaned.csv")
courses = pd.read_csv("coursera_cleaned.csv")


In [21]:
print("Resume columns:", resumes.columns.tolist())
print("Job columns:", jobs.columns.tolist())
print("Course columns:", courses.columns.tolist())


Resume columns: ['career_label', 'skills', 'experience_years', 'education_level']
Job columns: ['career_label', 'skills']
Course columns: ['course_title', 'course_organization', 'course_certificate_type', 'course_rating', 'course_difficulty']


In [22]:
resumes['bert_text'] = ''

if 'skills' in resumes.columns:
    resumes['bert_text'] = resumes['bert_text'] + ' skills ' + resumes['skills'].astype(str)

if 'experience_years' in resumes.columns:
    resumes['bert_text'] = (
        resumes['bert_text'] +
        ' experience ' +
        resumes['experience_years'].astype(str) +
        ' years'
    )

if 'education_level' in resumes.columns:
    resumes['bert_text'] = resumes['bert_text'] + ' education ' + resumes['education_level'].astype(str)

resumes['bert_text'] = resumes['bert_text'].fillna('').str.strip()


In [23]:
jobs['bert_text'] = ''

if 'skills' in jobs.columns:
    jobs['bert_text'] = jobs['bert_text'] + ' skills ' + jobs['skills'].astype(str)

if 'experience_years' in jobs.columns:
    jobs['bert_text'] = (
        jobs['bert_text'] +
        ' experience ' +
        jobs['experience_years'].astype(str) +
        ' years'
    )

jobs['bert_text'] = jobs['bert_text'].fillna('').str.strip()


In [33]:
courses['bert_text'] = (
    ' title ' + courses['course_title'].astype(str) +
    ' organization ' + courses['course_organization'].astype(str) +
    ' certificate ' + courses['course_certificate_type'].astype(str) +
    ' difficulty ' + courses['course_difficulty'].astype(str) +
    ' rating ' + courses['course_rating'].astype(str)
)

courses['bert_text'] = courses['bert_text'].fillna('').str.lower().str.strip()


In [34]:
print(resumes['bert_text'].sample(3).tolist())
print(jobs['bert_text'].sample(3).tolist())
print(courses['bert_text'].sample(3).tolist())


['skills java, sql, excel, management experience 5 years education degree', 'skills python, sql, machine learning, deep learning, aws, docker, tensorflow, pytorch experience 0 years education foundation', 'skills python, aws, docker experience 0 years education degree']
['skills communication', 'skills communication', 'skills nan']
['title strategic sales management organization fundação instituto de administração certificate specialization difficulty intermediate rating 4.5', 'title adhd: everyday strategies for elementary students organization university at buffalo certificate course difficulty beginner rating 4.7', 'title social norms, social change i organization university of pennsylvania certificate course difficulty beginner rating 4.6']


In [35]:
bert_model = SentenceTransformer("all-MiniLM-L6-v2")


In [36]:
resume_embeddings = bert_model.encode(
    resumes['bert_text'].tolist(),
    show_progress_bar=True
)


Batches: 100%|██████████| 348/348 [00:05<00:00, 68.75it/s]


In [37]:
job_embeddings = bert_model.encode(
    jobs['bert_text'].tolist(),
    show_progress_bar=True
)


Batches: 100%|██████████| 480/480 [00:05<00:00, 91.97it/s] 


In [38]:
course_embeddings = bert_model.encode(
    courses['bert_text'].tolist(),
    show_progress_bar=True
)


Batches: 100%|██████████| 28/28 [00:00<00:00, 34.93it/s]


In [39]:
print("Resume embeddings shape:", resume_embeddings.shape)
print("Job embeddings shape:", job_embeddings.shape)
print("Course embeddings shape:", course_embeddings.shape)


Resume embeddings shape: (11136, 384)
Job embeddings shape: (15352, 384)
Course embeddings shape: (874, 384)


In [41]:
np.save("resume_embeddings.npy", resume_embeddings)
np.save("job_embeddings.npy", job_embeddings)
np.save("course_embeddings.npy", course_embeddings)
