# Recruitment Dataset of Job Applicants

### Data Collection & Cleaning

In [21]:
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# --------------------------------------------------
# 1. Load Dataset (robust encoding handling)
# --------------------------------------------------
DATA_PATH = "dataset/recruitment-dataset.csv"

# Candidate codecs, strictest first. "latin-1" maps every possible byte to a
# character and therefore never raises UnicodeDecodeError, so it has to be the
# last resort; placed before "cp1252" it would make that fallback unreachable.
ENCODINGS = ("utf-8", "cp1252", "latin-1")

for enc in ENCODINGS:
    try:
        df = pd.read_csv(DATA_PATH, encoding=enc)
    except UnicodeDecodeError:
        continue  # wrong codec for this file; try the next candidate
    print(f"Loaded with encoding: {enc}")
    break
else:
    # Reached only when no candidate decoded the file. Fail here with a clear
    # message instead of a NameError on `df` further down.
    raise RuntimeError(
        f"Could not decode {DATA_PATH!r} with any of: {', '.join(ENCODINGS)}"
    )

# "Unnamed: 0" is presumably the index column written by an earlier
# DataFrame.to_csv() call (TODO confirm); tolerate files saved without it.
df.drop(columns=["Unnamed: 0"], inplace=True, errors="ignore")

# Remove every row that has a missing value in any remaining column.
# NOTE(review): this runs before the feature-selection step, so rows that are
# only incomplete in columns discarded later are removed as well.
df.dropna(inplace=True)

# --------------------------------------------------
# 2. Feature Selection
# --------------------------------------------------
# Columns excluded from modelling; they are removed from `df` in place.
DROP_COLS = ["YearsCode", "Age", "MentalHealth", "Accessibility", "Gender"]
df.drop(DROP_COLS, axis="columns", inplace=True)

# Label column: cast to int for the classifier; everything else is a feature.
TARGET = "Employed"
y = df[TARGET].astype(int)
X = df.drop(TARGET, axis="columns")

# --------------------------------------------------
# 3. Train / Valid / Test Split (60/20/20)
# --------------------------------------------------
# Stage 1: keep 60% of the rows for training and set 40% aside, stratified on
# the target so the class ratio is preserved.
first_split = dict(test_size=0.4, stratify=y, random_state=42)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, **first_split)

# Stage 2: cut the held-out 40% in half -> 20% validation, 20% test.
# NOTE(review): X_valid / y_valid are not used by the code visible in this cell.
second_split = dict(test_size=0.5, stratify=y_temp, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp, **second_split
)

# --------------------------------------------------
# 4. Column Types
# --------------------------------------------------
# Free-text column; it gets its own vectorizer instead of one-hot encoding.
TEXT_COL = "HaveWorkedWith"

# Numeric features: only int64 / float64 columns are picked up.
NUM_COLS = list(X.select_dtypes(include=["int64", "float64"]).columns)

# Categorical features: object / category columns, minus the text column.
CAT_COLS = list(X.select_dtypes(include=["object", "category"]).columns)
# .index() raises ValueError when TEXT_COL is not among them, so a renamed or
# re-typed text column is noticed here rather than silently ignored.
del CAT_COLS[CAT_COLS.index(TEXT_COL)]

# --------------------------------------------------
# 5. Preprocessing Pipeline
# --------------------------------------------------
# One (name, transformer, columns) triple per feature family.
numeric_step = ("num", StandardScaler(), NUM_COLS)

# Categories not seen during fit are ignored at transform time, not an error.
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), CAT_COLS)

# TEXT_COL is passed as a bare string (not a list) so the vectorizer receives
# a 1-D sequence of documents; the vocabulary is capped at 300 terms.
text_step = ("txt", TfidfVectorizer(max_features=300), TEXT_COL)

preprocessor = ColumnTransformer([numeric_step, categorical_step, text_step])

# --------------------------------------------------
# 6. Model Pipeline
# --------------------------------------------------
# Logistic regression with class re-weighting and a raised iteration limit.
log_reg = LogisticRegression(class_weight="balanced", max_iter=1000)

# Preprocessing and classifier live in one estimator, so the transformers are
# fitted on exactly the data handed to fit() below.
model = Pipeline([("preprocessor", preprocessor), ("classifier", log_reg)])

# --------------------------------------------------
# 7. Train Model
# --------------------------------------------------
# Fit on the training split only.
model.fit(X=X_train, y=y_train)

# --------------------------------------------------
# 8. Evaluation
# --------------------------------------------------
# Score the held-out test split.
# Column 1 of predict_proba is the probability of the second class in
# model.classes_; it is the score fed to ROC-AUC.
y_proba = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
test_roc_auc = roc_auc_score(y_test, y_proba)
per_class_report = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("ROC-AUC :", test_roc_auc)
print(per_class_report)

# --------------------------------------------------
# 9. Save Model
# --------------------------------------------------
# Persist the whole fitted pipeline (preprocessing + classifier) in one file.
MODEL_PATH = "hr_recruitment_model.pkl"
joblib.dump(model, MODEL_PATH)
print(f"Model saved as {MODEL_PATH}")

Loaded with encoding: utf-8
Accuracy: 0.9713896457765667
ROC-AUC : 0.9951370462176836
              precision    recall  f1-score   support

           0       0.97      0.97      0.97      6802
           1       0.98      0.97      0.97      7878

    accuracy                           0.97     14680
   macro avg       0.97      0.97      0.97     14680
weighted avg       0.97      0.97      0.97     14680

Model saved as hr_recruitment_model.pkl
