In [11]:
# Ensure imbalanced-learn (imblearn) is installed in the notebook kernel environment
import sys, subprocess
subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'imbalanced-learn'])
print('imbalanced-learn installed into', sys.executable)

imbalanced-learn installed into c:\Users\ILYESS\Desktop\training\.venv\Scripts\python.exe


In [6]:
# Ensure matplotlib is installed in the notebook kernel environment
import sys, subprocess
subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'matplotlib'])
print('matplotlib installed into', sys.executable)

matplotlib installed into c:\Users\ILYESS\Desktop\training\.venv\Scripts\python.exe


In [None]:
# XGBoost Model for Remote Job Prediction
# This notebook trains an XGBoost classifier to predict remote job postings.
# It uses advanced text processing (TF-IDF, hashing), feature engineering, and handles class imbalance.
# The model is tuned, evaluated, and saved for deployment.

import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from time import time

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, FeatureHasher
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import (accuracy_score, roc_auc_score, f1_score,
                             classification_report, precision_recall_curve, auc)
from imblearn.over_sampling import RandomOverSampler
from scipy.sparse import hstack, csr_matrix
from xgboost import XGBClassifier
import joblib
import warnings
warnings.filterwarnings("ignore")

# LOAD DATA
# Read the job postings dataset
df = pd.read_csv("prepared_jobs_dataset.csv")

# Fill missing values in text columns
df["skill_text"] = df["skill_text"].fillna("")
df["job_title_short"] = df["job_title_short"].fillna("")
df["company_name"] = df["company_name"].fillna("")
df["CountryName"] = df["CountryName"].fillna("Unknown")

# Combine text fields into raw text
df["raw_text"] = (
    df["job_title_short"].astype(str) + " " +
    df["company_name"].astype(str) + " " +
    df["skill_text"].astype(str)
 )

# Clean text: lowercase, remove non-alphanumeric, normalize spaces
def clean(t):
    t = str(t).lower()
    t = re.sub(r"[^a-z0-9\\s]", " ", t)
    return re.sub(r"\\s+", " ", t).strip()

df["clean_text"] = df["raw_text"].apply(clean)

# NUMERIC FEATURES
# Create additional numeric features for better modeling
df["num_skills"] = df["skill_text"].apply(lambda x: len(x.split()))
df["title_len"] = df["job_title_short"].apply(lambda x: len(x.split()))
df["unique_words"] = df["clean_text"].apply(lambda x: len(set(x.split())))
df["avg_word_len"] = df["clean_text"].apply(lambda x: np.mean([len(w) for w in x.split()]) if len(x.split()) else 0)

# TARGET
# Define target variable: remote flag
if "remote_flag" not in df.columns:
    df["remote_flag"] = df["job_work_from_home"].apply(lambda x: 1 if x == 1 else 0)

y = df["remote_flag"].values

# TF-IDF
# Vectorize cleaned text with TF-IDF
tfidf = TfidfVectorizer(
    max_features=15000,
    ngram_range=(1,3),
    min_df=3,
    max_df=0.9
)
X_text = tfidf.fit_transform(df["clean_text"])

# COUNTRY OHE
# One-hot encode country
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
X_country = ohe.fit_transform(df[["CountryName"]])

# HASHING
# Use feature hashing for company and title to reduce dimensionality
fh_company = FeatureHasher(n_features=256, input_type="dict")
X_company = fh_company.transform(df["company_name"].apply(lambda s: {"company": str(s)}))

fh_title = FeatureHasher(n_features=128, input_type="dict")
X_title = fh_title.transform(df["job_title_short"].apply(lambda s: {"title": str(s)}))

# NUMERIC SCALING
# Scale numeric features
scaler = StandardScaler()
num_arr = scaler.fit_transform(df[["num_skills","title_len","unique_words","avg_word_len"]])
X_num = csr_matrix(num_arr)

# COMBINE FEATURES
# Horizontally stack all feature matrices
X = hstack([X_text, X_country, X_company, X_title, X_num], format="csr")

# BALANCE CLASSES
# Oversample minority class to balance
ros = RandomOverSampler(random_state=42)
X_res, y_res = ros.fit_resample(X, y)

# SPLITS
# Split into train/validation/test sets
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X_res, y_res, test_size=0.15, stratify=y_res, random_state=42
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.15, stratify=y_train_val, random_state=42
)

# FINAL MODEL
# Define XGBoost parameters (pre-tuned)
final_params = {
    "n_estimators": 200,
    "max_depth": 6,
    "learning_rate": 0.05,
    "subsample": 0.7,
    "colsample_bytree": 0.7,
    "reg_alpha": 0.1,
    "reg_lambda": 1.0,
    "gamma": 0,
    "tree_method": "hist",
    "n_jobs": 1,
    "eval_metric": "logloss",
    "use_label_encoder": False,
    "max_bin": 256
}

# Print simple diagnostics before training
try:
    print('X_train type:', type(X_train))
    print('X_train shape:', X_train.shape)
    if hasattr(X_train, 'data'):
        print('X_train data bytes:', X_train.data.nbytes)
        print('X_train indices bytes:', X_train.indices.nbytes)
        print('X_train indptr bytes:', X_train.indptr.nbytes)
    else:
        try:
            print('X_train nbytes:', X_train.nbytes)
        except Exception:
            pass
except Exception:
    pass

# TRAIN FINAL MODEL (with basic error handling)
# Train XGBoost with early stopping
final_model = XGBClassifier(**final_params)
try:
    final_model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=True
    )
except Exception as e:
    import traceback
    traceback.print_exc()
    print('\nTraining failed with exception: ', str(e))
    print('\nSuggestions to fix:')
    print('- Reduce TF-IDF `max_features` (e.g., 5000-8000)')
    print('- Reduce `n_estimators` (e.g., 50-200) and/or `max_depth` (e.g., 4-6)')
    print("- Set `n_jobs` to 1 (already set) or to a smaller value if you used -1")
    print('- Train on a subset of the data (df.sample(frac=0.2)) to iterate quickly')
    print('- Consider not using RandomOverSampler if it densifies the matrix (it may convert sparse to dense)')
    raise

joblib.dump(final_model, "xgb_remote_final.joblib")


# THRESHOLD TUNING
# Tune decision threshold for better F1 score
val_probs = final_model.predict_proba(X_val)[:,1]

best_f1, best_t = -1, 0.5
for t in np.linspace(0.1, 0.9, 41):
    f1v = f1_score(y_val, (val_probs >= t).astype(int))
    if f1v > best_f1:
        best_f1, best_t = f1v, t

test_probs = final_model.predict_proba(X_test)[:,1]
test_preds = (test_probs >= best_t).astype(int)

print("\nFinal Threshold:", best_t)
print("Accuracy:", accuracy_score(y_test, test_preds))
print("ROC AUC:", roc_auc_score(y_test, test_probs))
print(classification_report(y_test, test_preds))


# FEATURE IMPORTANCE
# Plot top feature importances
tfidf_names = tfidf.get_feature_names_out()
country_names = ohe.get_feature_names_out(["CountryName"])
company_names = np.array([f"company_{i}" for i in range(X_company.shape[1])])
title_names = np.array([f"title_{i}" for i in range(X_title.shape[1])])
num_names = np.array(["num_skills","title_len","unique_words","avg_word_len"])

feature_names = np.concatenate([tfidf_names, country_names, company_names, title_names, num_names])
importances = final_model.feature_importances_

idx = np.argsort(importances)[-25:]
plt.figure(figsize=(10,8))
plt.barh(range(25), importances[idx])
plt.yticks(range(25), feature_names[idx])
plt.title("Top 25 Feature Importances")
plt.tight_layout()
plt.show()

print("DONE âœ”")

X_train type: <class 'scipy.sparse._csr.csr_matrix'>
X_train shape: (755065, 15547)
X_train data bytes: 154149896
X_train indices bytes: 77074948
X_train indptr bytes: 3020264


In [None]:
# -------------------------
# SAVE ARTIFACTS FOR APP
# -------------------------
# Save preprocessing artifacts and alternative model exports for deployment
joblib.dump(tfidf, "tfidf.joblib")
joblib.dump(ohe, "ohe.joblib")
joblib.dump(scaler, "scaler.joblib")
# Save XGBoost booster as JSON (compatible with xgboost.Booster)
final_model.get_booster().save_model("remote_job_model.json")
# Also save the sklearn-wrapped estimator for convenience
joblib.dump(final_model, "remote_job_model.joblib")