In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the job postings and keep a reproducible random subset for indexing.
SAMPLE_SIZE = 10000  # upper bound on the number of postings to embed
RANDOM_SEED = 42     # fixed so a fresh kernel run selects the same rows

df = pd.read_csv("./data/job_offer/job_descriptions.csv")
# Cap n at the row count: DataFrame.sample raises ValueError when n is larger
# than the frame (sampling without replacement).
df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=RANDOM_SEED)

fields_to_combine = ["Job Id", "Job Title", "Job Description", "skills", "Responsibilities", "Company", "location"]
# Build one text per posting by casting each listed field to str and joining
# them with single spaces. Note: missing values become the literal text "nan".
df["combined_text"] = df[fields_to_combine].astype(str).agg(" ".join, axis=1)

In [3]:
# Embed every combined job text. The vectors are L2-normalised, so the
# inner-product index built below ranks results by cosine similarity.
model = SentenceTransformer("all-MiniLM-L6-v2")
job_texts = df["combined_text"].tolist()
job_embeddings = model.encode(
    job_texts,
    convert_to_numpy=True,
    show_progress_bar=True,
    normalize_embeddings=True,
)

# Exact (brute-force) inner-product index sized to the embedding width,
# filled with all vectors and persisted to disk.
embedding_dim = job_embeddings.shape[1]
faiss_index = faiss.IndexFlatIP(embedding_dim)
faiss_index.add(job_embeddings)
faiss.write_index(faiss_index, "./data/jobs_index.faiss")

# Map each row position in the index back to its "Job Id". The order matches
# the order in which the texts were encoded. JSON object keys are strings, so
# the integer positions are written out as strings.
job_id_mapping = dict(enumerate(df["Job Id"].tolist()))
with open("./data/jobs_index_mapping.json", "w", encoding="utf-8") as mapping_file:
    json.dump(job_id_mapping, mapping_file, indent=4)

Batches: 100%|██████████| 313/313 [02:43<00:00,  1.92it/s]
