In [18]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import os
import json

In [19]:
# Tunable inputs for the data-loading step.
JOBS_CSV_PATH = "./data/job_offer/job_descriptions.csv"
SAMPLE_SIZE = 1000
# Fixed seed so the sampled subset (and the index built from it) is the same
# on every Restart & Run All.
SAMPLE_SEED = 42

columns_to_keep = [
    "Job Id",
    "Job Title",
    "Job Description",
    "skills",
    "Responsibilities",
    "Company",
    "location"
]


def load_job_sample(csv_path, columns, n, seed):
    """Read `columns` from the CSV at `csv_path` and return a random sample.

    Parameters
    ----------
    csv_path : str
        Path of the job-offer CSV file.
    columns : list of str
        Columns to read; the result has exactly these columns, in this order.
    n : int
        Maximum number of rows to sample. If the file has fewer rows, all of
        them are returned (in shuffled order).
    seed : int
        Seed passed to `DataFrame.sample` so the draw is reproducible.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, keeping their original row labels.
    """
    # usecols avoids parsing columns that would be dropped right afterwards.
    jobs = pd.read_csv(csv_path, usecols=columns)
    # sample() raises ValueError when asked for more rows than exist, so cap n.
    sampled = jobs.sample(n=min(n, len(jobs)), random_state=seed)
    # usecols keeps the file's column order; re-select to get the requested one.
    return sampled[columns]


df = load_job_sample(JOBS_CSV_PATH, columns_to_keep, SAMPLE_SIZE, SAMPLE_SEED)

In [20]:
def combine_job_text(row):
    """Join the descriptive fields of one job row into a single string.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys "Job Title", "Job Description", "skills",
        "Responsibilities", "Company" and "location".

    Returns
    -------
    str
        The non-missing, non-blank field values (whitespace-stripped), in the
        order listed above, separated by single spaces. Empty string if every
        field is missing or blank.
    """
    fields = (
        "Job Title",
        "Job Description",
        "skills",
        "Responsibilities",
        "Company",
        "location",
    )
    parts = []
    for field in fields:
        value = row[field]
        # Detect missing values on the raw value: str() would turn None / pd.NA
        # into the text "None" / "<NA>" and pollute the combined string.
        if value is None or (not isinstance(value, str) and pd.isna(value)):
            continue
        text = str(value).strip()
        # The literal "nan" is still dropped, as the previous string-based
        # filter did.
        if text and text != "nan":
            parts.append(text)
    return " ".join(parts)

# Build one text per job (row-wise apply) and store it as a new column; this
# column is what gets embedded further down.
df["combined_text"] = df.apply(combine_job_text, axis=1)

In [21]:
# Tunable inputs/outputs for the indexing step.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
INDEX_PATH = "./data/jobs_index.faiss"
MAPPING_PATH = "./data/jobs_index_mapping.json"

model = SentenceTransformer(EMBEDDING_MODEL_NAME)
job_texts = df["combined_text"].tolist()
if not job_texts:
    raise ValueError("No job texts to embed: `df` is empty.")

job_embeddings = model.encode(job_texts, convert_to_numpy=True, show_progress_bar=True)

# L2-normalise so that the inner product computed by IndexFlatIP equals cosine
# similarity. A zero-norm row would otherwise be divided by zero and turn into
# NaNs; leaving it as an all-zero vector is harmless (it scores 0 everywhere).
norms = np.linalg.norm(job_embeddings, axis=1, keepdims=True)
norms[norms == 0] = 1.0
# Make the float32 / C-contiguous layout explicit before handing the array to FAISS.
job_embeddings = np.ascontiguousarray(job_embeddings / norms, dtype=np.float32)

dim = job_embeddings.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(job_embeddings)

# Make sure the output directories exist before writing either artefact.
for output_path in (INDEX_PATH, MAPPING_PATH):
    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)

faiss.write_index(index, INDEX_PATH)

# Position i in the index corresponds to the i-th row of `df`, so the mapping
# is simply row position -> "Job Id".
job_ids = df["Job Id"].tolist()
jobs_mapping = dict(enumerate(job_ids))
if len(jobs_mapping) != index.ntotal:
    raise RuntimeError(
        f"Index holds {index.ntotal} vectors but mapping has {len(jobs_mapping)} ids."
    )

# Note: JSON object keys are always strings, so the integer positions are
# written as "0", "1", ... and must be converted back when the file is loaded.
with open(MAPPING_PATH, "w", encoding="utf-8") as f:
    json.dump(jobs_mapping, f, indent=4)

Batches: 100%|██████████| 32/32 [00:09<00:00,  3.31it/s]
