In [6]:
# TensorFlow will be skipped — using scikit-learn MLP instead (no heavy installs required)
import sys, subprocess
print('Python executable:', sys.executable)
print('Python version:', sys.version)
try:
    pip_ver = subprocess.check_output([sys.executable, '-m', 'pip', '--version'], text=True).strip()
    print('pip:', pip_ver)
except Exception as e:
    print('Could not query pip version:', e)

print('\nUsing scikit-learn MLPClassifier for ANN-style modelling (no TensorFlow required).')

Python executable: c:\Python314\python.exe
Python version: 3.14.0 (tags/v3.14.0:ebf955d, Oct  7 2025, 10:15:03) [MSC v.1944 64 bit (AMD64)]
pip: pip 25.3 from C:\Users\ILYESS\AppData\Roaming\Python\Python314\site-packages\pip (python 3.14)

Using scikit-learn MLPClassifier for ANN-style modelling (no TensorFlow required).
pip: pip 25.3 from C:\Users\ILYESS\AppData\Roaming\Python\Python314\site-packages\pip (python 3.14)

Using scikit-learn MLPClassifier for ANN-style modelling (no TensorFlow required).


In [None]:
# ANN (MLP) Model for Predicting Remote Job Postings
# This notebook trains a Multi-Layer Perceptron (MLP) classifier to predict if a job posting is remote.
# It uses TF-IDF for text vectorization, SVD for dimensionality reduction, one-hot encoding for countries,
# and oversampling to handle class imbalance. The model is evaluated and saved for later use.

# 1) IMPORTS
# Import necessary libraries for data processing, machine learning, and evaluation
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, f1_score
from imblearn.over_sampling import RandomOverSampler
from sklearn.neural_network import MLPClassifier
import joblib
import warnings
warnings.filterwarnings('ignore')

# 2) LOAD & PREPARE DATA
# Load the dataset and prepare features and target
df = pd.read_csv('prepared_jobs_dataset.csv')

# Fill missing values in text columns
df['skill_text'] = df['skill_text'].fillna('')
df['job_title_short'] = df['job_title_short'].fillna('')
df['company_name'] = df['company_name'].fillna('')
df['CountryName'] = df['CountryName'].fillna('Unknown')

# Combine text fields into a single clean text for vectorization
df['clean_text'] = (
    df['job_title_short'].astype(str) + ' ' +
    df['company_name'].astype(str) + ' ' +
    df['skill_text'].astype(str)
 )

# Create numeric features: number of skills and title length
df['num_skills'] = df['skill_text'].apply(lambda x: len(x.split()))
df['title_len'] = df['job_title_short'].apply(lambda x: len(x.split()))

# Define target: remote flag (1 for remote, 0 otherwise)
y = df['remote_flag'] if 'remote_flag' in df.columns else df['job_work_from_home'].apply(lambda x: 1 if x==1 else 0)

# 3) TF-IDF + SVD (dimensionality reduction)
# Vectorize text using TF-IDF and reduce dimensions with SVD for efficiency
print('Vectorizing text...')
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_text = tfidf.fit_transform(df['clean_text'])

print('Reducing TF-IDF with SVD...')
svd = TruncatedSVD(n_components=300, random_state=42)
X_text_reduced = svd.fit_transform(X_text)   # now dense (important for ANN/MLP)

# 4) ONE-HOT COUNTRY
# Encode country as one-hot vectors
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_country = ohe.fit_transform(df[['CountryName']])

# 5) NUMERIC FEATURES (scaled)
# Scale numeric features
scaler = StandardScaler()
X_numeric = scaler.fit_transform(df[['num_skills','title_len']])

# 6) FINAL FEATURE MATRIX (dense)
# Combine all features into a dense matrix
X = np.hstack([X_text_reduced, X_country, X_numeric])
print('Final ANN/MLP feature shape:', X.shape)

# 7) BALANCE WITH OVERSAMPLING (keeps dense arrays)
# Use oversampling to balance classes since remote jobs are minority
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)

# 8) TRAIN-TEST SPLIT
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42
 )

# 9) BUILD & TRAIN scikit-learn MLP (CPU-friendly)
# Define and train the MLP classifier with early stopping
mlp = MLPClassifier(
    hidden_layer_sizes=(512,256,128),  # Three hidden layers
    activation='relu',
    solver='adam',
    batch_size=256,
    max_iter=50,
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=5,
    random_state=42,
    verbose=True
)

print('Training MLPClassifier...')
mlp.fit(X_train, y_train)

# Save model and preprocessing artifacts for later use
joblib.dump(mlp, 'mlp_remote_model.joblib')
joblib.dump(tfidf, 'tfidf_for_mlp.joblib')
joblib.dump(svd, 'svd_for_mlp.joblib')
joblib.dump(ohe, 'ohe_for_mlp.joblib')
joblib.dump(scaler, 'scaler_for_mlp.joblib')

# 10) EVALUATION
# Evaluate the model on test data
y_prob = mlp.predict_proba(X_test)[:,1]
y_pred = (y_prob >= 0.5).astype(int)

print('\nAccuracy:', accuracy_score(y_test, y_pred))
print('\nROC AUC:', roc_auc_score(y_test, y_prob))
print('\nClassification Report:')
print(classification_report(y_test, y_pred))

print('\nDONE ✔ — model and preprocessing saved as joblib files.')

Vectorizing text...
Reducing TF-IDF with SVD...
Reducing TF-IDF with SVD...
Final ANN/MLP feature shape: (576257, 461)
Final ANN/MLP feature shape: (576257, 461)
Training MLPClassifier...
Training MLPClassifier...
Iteration 1, loss = 0.56152374
Iteration 1, loss = 0.56152374
Validation score: 0.743296
Validation score: 0.743296
Iteration 2, loss = 0.44030333
Iteration 2, loss = 0.44030333
Validation score: 0.811616
Validation score: 0.811616
Iteration 3, loss = 0.33613373
Iteration 3, loss = 0.33613373
Validation score: 0.850441
Validation score: 0.850441
Iteration 4, loss = 0.26644291
Iteration 4, loss = 0.26644291
Validation score: 0.878214
Validation score: 0.878214
Iteration 5, loss = 0.22389876
Iteration 5, loss = 0.22389876
Validation score: 0.887484
Validation score: 0.887484
Iteration 6, loss = 0.19488067
Iteration 6, loss = 0.19488067
Validation score: 0.898464
Validation score: 0.898464
Iteration 7, loss = 0.17451412
Iteration 7, loss = 0.17451412
Validation score: 0.902973
V

In [None]:
# Prediction Function for Remote Job Model
# This cell loads the saved MLP model and preprocessing artifacts,
# defines a prediction function for new job postings, and demonstrates usage.

import joblib
import numpy as np
import pandas as pd

# 1) LOAD SAVED MODELS & PROCESSORS
# Load the trained MLP model and all preprocessing transformers
mlp = joblib.load("mlp_remote_model.joblib")
tfidf = joblib.load("tfidf_for_mlp.joblib")
svd = joblib.load("svd_for_mlp.joblib")
ohe = joblib.load("ohe_for_mlp.joblib")
scaler = joblib.load("scaler_for_mlp.joblib")

# 2) PREDICTION FUNCTION
# Function to predict if a job is remote based on input features
def predict_remote(job_title, company_name, skills_text, country):
    """
    Predict whether a job is remote (1) or not (0)
    Args:
        job_title: Short job title string
        company_name: Company name string
        skills_text: Skills description string
        country: Country name string
    Returns:
        pred: Binary prediction (0 or 1)
        prob: Probability of being remote
    """
    # ---- Build input record ----
    # Create a DataFrame for the input
    df = pd.DataFrame([{
        "job_title_short": job_title if job_title else "",
        "company_name": company_name if company_name else "",
        "skill_text": skills_text if skills_text else "",
        "CountryName": country if country else "Unknown"
    }])

    # ---- Build clean_text ----
    # Combine text fields as done in training
    df["clean_text"] = (
        df["job_title_short"] + " " +
        df["company_name"] + " " +
        df["skill_text"]
    )

    # ---- Numeric features ----
    # Compute numeric features
    df["num_skills"] = df["skill_text"].apply(lambda x: len(x.split()))
    df["title_len"] = df["job_title_short"].apply(lambda x: len(x.split()))

    # PREPROCESSING PIPELINE
    # Apply the same preprocessing steps as in training

    # 1. TF-IDF → SVD
    # Vectorize text and reduce dimensions
    X_text = tfidf.transform(df["clean_text"])
    X_text_red = svd.transform(X_text)

    # 2. One-hot encode country (ensure correct feature alignment)
    # Encode country
    X_country = ohe.transform(df[["CountryName"]])

    # 3. Scale numeric
    # Scale numeric features
    X_numeric = scaler.transform(df[["num_skills", "title_len"]])

    # Combine all features
    # Stack features horizontally
    X = np.hstack([X_text_red, X_country, X_numeric])

    # 3) MODEL PREDICTION
    # Get prediction and probability
    prob = mlp.predict_proba(X)[0, 1]
    pred = int(prob >= 0.5)

    return pred, float(prob)

# 4) EXAMPLE USAGE
# Demonstrate the prediction function with sample data
pred, prob = predict_remote(
    job_title="Senior Python Engineer",
    company_name="Google",
    skills_text="python backend API docker kubernetes tensorflow",
    country="USA"
)

print("Prediction:", pred)
print("Remote probability:", prob)

Prediction: 0
Remote probability: 0.14110957027466797
