In [1]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Load data
# Read the job-postings CSV (path is relative to the notebook's working directory)
raw_csv_path = '../data/fake_job_postings.csv'
df = pd.read_csv(raw_csv_path)

In [None]:
# NOTE: this cell rebinds `df` and removes the 'fraudulent' column from it, so it
# cannot be re-run on its own -- re-run the load cell first.

# Drop duplicates
# Compare rows on every column except 'job_id'. Presumably 'job_id' is a unique
# per-row identifier (TODO confirm); if so, including it in the comparison means no
# two rows can ever be equal and the de-duplication silently does nothing, which
# lets repeated postings end up on both sides of the later train/test split.
content_columns = [col for col in df.columns if col != 'job_id']
df = df.drop_duplicates(subset=content_columns)
# Target variable (taken after de-duplication so it stays index-aligned with df)
y = df['fraudulent']
# Drop columns not useful for prediction
df = df.drop(columns=['job_id', 'fraudulent'])
# Fill missing text columns with empty string
text_columns = ['title', 'company_profile', 'description', 'requirements', 'benefits']
df[text_columns] = df[text_columns].fillna("")
# Add a new feature: description word count
# (whitespace-separated tokens; an empty description counts as 0 words)
df['description_word_count'] = df['description'].str.split().str.len()
# Fill missing categorical columns with 'Unknown'
cat_columns = ['employment_type', 'required_experience', 'required_education', 'industry', 'function']
df[cat_columns] = df[cat_columns].fillna('Unknown')

In [None]:
# --------------------------
# Define Transformers
# --------------------------
# TF-IDF for the job description; it gets the largest vocabulary cap of the text fields
text_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
# One-hot encoder for the categorical fields; handle_unknown='ignore' so categories
# that were not seen during fit do not raise at transform time
cat_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Vocabulary cap for each of the remaining free-text columns, listed in the order
# their TF-IDF features appear in the output. Each transformer is named after its column.
secondary_text_features = {
    'title': 1000,
    'requirements': 1000,
    'company_profile': 500,
    'benefits': 500,
}

column_steps = [('desc', text_vectorizer, 'description')]
column_steps += [
    (column, TfidfVectorizer(max_features=vocab_size, stop_words='english'), column)
    for column, vocab_size in secondary_text_features.items()
]
column_steps += [
    ('cat', cat_encoder, cat_columns),
    # The word count is forwarded unchanged (no scaling is applied)
    ('num', 'passthrough', ['description_word_count']),
]

# Preprocessing Pipeline
# Columns not named in any step fall under ColumnTransformer's default `remainder` setting.
preprocessor = ColumnTransformer(transformers=column_steps)

In [None]:

# Split the data
# Hold out 20% of the rows for testing. Stratifying on y keeps the target's class
# ratio the same in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Fit transformer on train and transform both
# The preprocessor is fitted on the training split only, so whatever it learns
# (TF-IDF vocabularies, one-hot categories) comes from training rows alone.
X_train_processed = preprocessor.fit_transform(X_train)
# The test split is only transformed with the already-fitted preprocessor; it is never refitted.
X_test_processed = preprocessor.transform(X_test)

In [None]:
# Output locations (relative to the notebook's working directory)
preprocessor_path = Path('../models/preprocessor.pkl')
train_path = Path('../data/X_train_y_train.pkl')
test_path = Path('../data/X_test_y_test.pkl')

# joblib.dump does not create missing parent folders, so make sure they exist first.
# Otherwise a checkout without '../models' fails here, after all the fitting work is done.
for out_path in (preprocessor_path, train_path, test_path):
    out_path.parent.mkdir(parents=True, exist_ok=True)

# Save preprocessing objects for future inference
joblib.dump(preprocessor, preprocessor_path)
# Save processed data (optional, for offline use)
# Each file holds a (features, target) tuple for one split.
joblib.dump((X_train_processed, y_train), train_path)
joblib.dump((X_test_processed, y_test), test_path)
print("✅ Preprocessing complete. Saved: TF-IDF + Encoders + Split datasets.")

✅ Preprocessing complete. Saved: TF-IDF + Encoders + Split datasets.
