# Resume vs Job Description Classifier Training


This notebook loads the data, preprocesses the text, builds sentence embeddings using `all-MiniLM-L6-v2`, trains a Logistic Regression classifier, evaluates its metrics, and exports the model artifact.


In [None]:
import re
import string
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split



In [None]:
# Input dataset and output model locations. Both are relative paths, so they
# resolve against the current working directory when the notebook runs.
DATA_PATH = Path('../data/sample_dataset.csv')
MODEL_PATH = Path('../models/resume_classifier.pkl')

# Load the dataset. Later cells read the 'resume_text', 'job_description'
# and 'label' columns from this frame.
df = pd.read_csv(DATA_PATH)
# Trailing expression: in a notebook this displays the first rows as a preview.
df.head()

In [None]:
# Small hand-picked stopword list; matching tokens are dropped by clean_text.
STOPWORDS = {
    'a', 'an', 'the', 'and', 'or', 'to', 'for', 'with', 'in', 'on', 'at', 'of', 'is', 'are'
}

# Deletion table for ASCII punctuation. Built once at definition time because
# clean_text is applied to every row; rebuilding it per call is wasted work.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text: str) -> str:
    """Normalize *text* for embedding.

    The text is lowercased, every character in ``string.punctuation`` is
    deleted, the result is split on whitespace, and tokens found in
    ``STOPWORDS`` are dropped. The remaining tokens are joined with single
    spaces.

    Punctuation is deleted rather than replaced by a space, so words joined
    only by punctuation are merged (``'machine-learning'`` becomes
    ``'machinelearning'``). Non-ASCII punctuation is left untouched.

    Args:
        text: The raw string to normalize.

    Returns:
        The cleaned string; empty if nothing survives the filtering.
    """
    without_punct = text.lower().translate(_PUNCT_TABLE)
    # str.split() with no separator already collapses whitespace runs and
    # ignores leading/trailing whitespace, so no separate regex pass is needed.
    return ' '.join(
        token for token in without_punct.split() if token not in STOPWORDS
    )

# Build cleaned copies of both text columns.
# Missing values are replaced with '' first: a bare astype(str) would turn
# NaN into the literal text 'nan', which would then be embedded as if it
# were a real word from the resume or job description.
df['resume_clean'] = df['resume_text'].fillna('').astype(str).apply(clean_text)
df['job_clean'] = df['job_description'].fillna('').astype(str).apply(clean_text)


In [None]:
# Load the pretrained sentence-embedding model by name.
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Encode both cleaned text columns with the same model, resumes first.
resume_embeddings, job_embeddings = (
    embedder.encode(df[column].tolist())
    for column in ('resume_clean', 'job_clean')
)

# Feature matrix: the two embedding arrays stacked horizontally, resume part
# first, then job part. The target is the 'label' column.
X = np.hstack((resume_embeddings, job_embeddings))
y = df['label'].values

# Hold out 20% of the rows for evaluation. The split is stratified on y and
# uses a fixed random_state so it is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [None]:
# Fit the classifier on the training split (fit() returns the estimator itself).
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predict on the held-out split and report the evaluation metrics.
y_pred = clf.predict(X_test)

print('Accuracy:', accuracy_score(y_test, y_pred))
# These three scorers share the same call shape; zero_division=0 makes an
# undefined score come out as 0 instead of triggering a warning.
for metric_label, metric_fn in (
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('F1 Score:', f1_score),
):
    print(metric_label, metric_fn(y_test, y_pred, zero_division=0))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))


In [None]:
# Make sure the output directory exists; creates missing parents and does
# nothing if the directory is already there.
MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)
# Persist the fitted classifier only. The sentence-embedding model and the
# clean_text preprocessing are not part of this artifact, so whatever loads
# it must reproduce those steps itself.
joblib.dump(clf, MODEL_PATH)
# Report the absolute location of the saved file.
print(f'Model saved to: {MODEL_PATH.resolve()}')
