In [1]:
import os
import json
import numpy as np
import faiss
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Build a cosine-similarity FAISS index over the extracted resume texts.
#
# Every readable, non-empty .txt file in `input_folder` is embedded with the
# sentence-transformer model, L2-normalised, and added to an inner-product
# index (inner product of unit vectors == cosine similarity). The list of file
# names written to `mapping_path` is row-aligned with the index: entry i is the
# file whose vector has FAISS id i.
#
# NOTE(review): the model truncates inputs longer than its max sequence length,
# so long resumes are presumably only partially embedded -- confirm whether
# chunking is needed.
model = SentenceTransformer("all-MiniLM-L6-v2")

input_folder = "./data/resume_extract_text"
index_path = "./data/resume_index.faiss"
mapping_path = "./data/resume_index_mapping.json"

# os.listdir() returns entries in arbitrary order; sorting makes the row ids
# reproducible across runs and machines.
txt_files = sorted(f for f in os.listdir(input_folder) if f.endswith(".txt"))

batch_size = 32
embeddings_list = []
indexed_files = []  # file names, in exactly the order their vectors are added

for start in tqdm(range(0, len(txt_files), batch_size), desc="Encoding resumes"):
    batch_files = txt_files[start:start + batch_size]
    texts = []
    for file in batch_files:
        path = os.path.join(input_folder, file)
        try:
            with open(path, "r", encoding="utf-8") as f:
                text = f.read()
        except (OSError, UnicodeError) as e:
            # Skip the file instead of indexing the embedding of "": that
            # placeholder is a real vector and would surface as a bogus hit.
            print(f"Error reading {file}: {e}")
            continue
        if not text.strip():
            print(f"Skipping empty file: {file}")
            continue
        texts.append(text)
        indexed_files.append(file)

    if not texts:
        # Every file in this batch was skipped; nothing to encode.
        continue

    batch_embeddings = model.encode(texts, convert_to_numpy=True, batch_size=batch_size)
    norms = np.linalg.norm(batch_embeddings, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # leave an all-zero vector as zeros instead of NaN
    embeddings_list.append(batch_embeddings / norms)

if not embeddings_list:
    raise ValueError(
        f"No readable, non-empty .txt files found in {input_folder!r}; nothing to index."
    )

# FAISS expects a C-contiguous float32 matrix.
all_embeddings = np.ascontiguousarray(np.vstack(embeddings_list), dtype=np.float32)
assert all_embeddings.shape[0] == len(indexed_files), "index rows and mapping are misaligned"

faiss_index = faiss.IndexFlatIP(all_embeddings.shape[1])
faiss_index.add(all_embeddings)
faiss.write_index(faiss_index, index_path)

# Rebind so that txt_files[i] keeps naming the file stored at FAISS row i even
# when some files were skipped above.
txt_files = indexed_files

with open(mapping_path, "w", encoding="utf-8") as f:
    json.dump(txt_files, f, ensure_ascii=False, indent=4)

Encoding resumes: 100%|██████████| 63/63 [01:57<00:00,  1.86s/it]
