# Batch Resume Processing Example
This notebook demonstrates how to process multiple resumes, extract entities, compute embeddings, and save results for use in the Streamlit app.

In [None]:
# Install dependencies
!pip install sentence-transformers hnswlib pdfminer.six pandas tqdm spacy
import spacy
try:
    nlp = spacy.load('en_core_web_sm')
except:
    import os
    os.system('python -m spacy download en_core_web_sm')
    nlp = spacy.load('en_core_web_sm')

In [None]:
# Upload resumes (PDF or TXT) through the Colab file-upload widget.
from google.colab import files

uploaded = files.upload()
# The keys of the returned mapping are taken as the uploaded file names.
resume_files = [name for name in uploaded]
print('Uploaded:', resume_files)

In [None]:
# Extract text from resumes: build a {file name: full text} mapping.
from pdfminer.high_level import extract_text


def _read_resume(path):
    """Return the text of one resume file.

    Files whose name ends in '.pdf' (case-insensitive) are parsed with
    pdfminer; everything else is read as UTF-8 text, with undecodable
    bytes dropped.
    """
    if not path.lower().endswith('.pdf'):
        with open(path, 'r', encoding='utf-8', errors='ignore') as handle:
            return handle.read()
    return extract_text(path)


resume_texts = {path: _read_resume(path) for path in resume_files}
# Preview only the first 200 characters of each resume.
print({name: body[:200] for name, body in resume_texts.items()})

In [None]:
# Entity extraction: run the project's extractor on each resume's text.
import pandas as pd
from src.entity_extraction import extract_entities

entities = {}
for fname, text in resume_texts.items():
    entities[fname] = extract_entities(text)

# orient='index' makes each resume file name a row label.
df_entities = pd.DataFrame.from_dict(entities, orient='index')
df_entities.to_csv('resume_entities.csv')
print(df_entities)

In [None]:
# Compute embeddings: one vector per resume, saved together with the file
# names so each row can be matched back to its resume after loading.
from sentence_transformers import SentenceTransformer
import numpy as np

if not resume_texts:
    # Fail early with a clear message instead of an opaque stacking error.
    raise ValueError('No resume texts to embed; run the upload and extraction cells first.')

model = SentenceTransformer('all-MiniLM-L6-v2')

# Fix the row order explicitly: row i of `embeddings` belongs to filenames[i].
filenames = list(resume_texts)

# Encoding the whole list in one call lets the model batch the inputs and
# returns a 2-D array (one row per text), so no manual stacking is needed.
# NOTE(review): the model truncates inputs longer than its maximum sequence
# length, so very long resumes are presumably only partially embedded — confirm
# whether chunking is needed for this use case.
embeddings = model.encode([resume_texts[name] for name in filenames], convert_to_numpy=True)

# 'embeddings' keeps its original key; 'filenames' is an additional array.
np.savez_compressed(
    'resume_embeddings.npz',
    embeddings=embeddings,
    filenames=np.array(filenames),
)
print('Saved resume_embeddings.npz')

In [None]:
# Download results: hand each generated artifact to the browser via the
# Colab `files` helper imported in the upload cell.
for result_path in ('resume_entities.csv', 'resume_embeddings.npz'):
    files.download(result_path)