In [14]:
!pip install pandas openpyxl scikit-learn imbalanced-learn shap xgboost
print("✅ Libraries installed successfully.")

✅ Libraries installed successfully.


In [15]:
import pandas as pd

# Location of the job-postings CSV that the rest of the notebook works on
file_name = "/kaggle/input/fakejobposting2/fake_job_postings.csv"

# Read the whole file into the working DataFrame
df = pd.read_csv(file_name)

# Confirm the load and preview the first five rows
print(f"\n✅ Successfully loaded: {file_name}", df.head(), sep="\n")


✅ Successfully loaded: /kaggle/input/fakejobposting2/fake_job_postings.csv
   job_id                                      title            location  \
0       1                           Marketing Intern    US, NY, New York   
1       2  Customer Service - Cloud Video Production      NZ, , Auckland   
2       3    Commissioning Machinery Assistant (CMA)       US, IA, Wever   
3       4          Account Executive - Washington DC  US, DC, Washington   
4       5                        Bill Review Manager  US, FL, Fort Worth   

  department salary_range                                    company_profile  \
0  Marketing          NaN  We're Food52, and we've created a groundbreaki...   
1    Success          NaN  90 Seconds, the worlds Cloud Video Production ...   
2        NaN          NaN  Valor Services provides Workforce Solutions th...   
3      Sales          NaN  Our passion for improving quality of life thro...   
4        NaN          NaN  SpotSource Solutions LLC is a Global Hum

In [16]:
print("--- Phase 1: Feature Engineering ---")

# a. Create "missingness" features
# A value counts as missing when it is NaN *or* an empty string. Step (b)
# below overwrites NaNs in several of these columns with '', so checking only
# isnull() would produce all-zero flags if this cell were executed a second
# time. Treating '' as missing keeps the cell idempotent.
for col in ['company_profile', 'requirements', 'benefits', 'salary_range']:
    df[f'is_{col}_missing'] = (df[col].isnull() | df[col].eq('')).astype(int)

# b. Combine all text fields into one for analysis
text_columns = ['title', 'location', 'department', 'company_profile', 
                'description', 'requirements', 'benefits', 'function']

# Fill NaNs with empty strings so that ' '.join never receives a float NaN
df[text_columns] = df[text_columns].fillna('')

# Row-wise concatenation of the text columns, separated by single spaces
df['text_combined'] = df[text_columns].apply(' '.join, axis=1)

print("✅ New features created:")
print(df[['is_company_profile_missing', 'is_salary_range_missing', 'text_combined']].head())

--- Phase 1: Feature Engineering ---
✅ New features created:
   is_company_profile_missing  is_salary_range_missing  \
0                           0                        1   
1                           0                        1   
2                           0                        1   
3                           0                        1   
4                           0                        1   

                                       text_combined  
0  Marketing Intern US, NY, New York Marketing We...  
1  Customer Service - Cloud Video Production NZ, ...  
2  Commissioning Machinery Assistant (CMA) US, IA...  
3  Account Executive - Washington DC US, DC, Wash...  
4  Bill Review Manager US, FL, Fort Worth  SpotSo...  


In [17]:
def clean_text(text):
    """
    Normalise a raw text string into a space-separated sequence of word tokens.

    Steps:
    1. Lowercases
    2. Removes HTML tags (including tags that span several lines)
    3. Replaces punctuation, digits and other non-letter characters with spaces
    4. Tokenizes with `word_tokenize`
    5. Removes stopwords and tokens of two characters or fewer

    This function looks up `re`, `word_tokenize` and `stop_words` as globals
    at call time, so they must be defined before the first call.

    Args:
        text: The raw string to clean. Non-string values (e.g. None or a
            float NaN from a missing cell) are treated as empty text.

    Returns:
        The cleaned tokens joined by single spaces; '' for non-string input.
    """
    # Missing cells reach us as None/NaN rather than str; there is nothing to clean
    if not isinstance(text, str):
        return ''
    text = text.lower()  # Lowercase
    # `[^>]*` (unlike `.*?`) also matches newlines, so multi-line tags are removed
    text = re.sub(r'<[^>]*>', ' ', text)  # Remove HTML tags
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)  # Remove punctuation/numbers
    tokens = word_tokenize(text)  # Tokenize
    # Remove stop words and very short tokens
    cleaned_tokens = [word for word in tokens if word not in stop_words and len(word) > 2]
    return ' '.join(cleaned_tokens)

print("✅ `clean_text` function defined.")

✅ `clean_text` function defined.


In [18]:
import nltk

# Fetch the 'punkt_tab' NLTK package
nltk.download(info_or_id='punkt_tab')

[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [19]:
# --- Standard Libraries ---
import pandas as pd
import numpy as np
import re
import warnings

# --- NLTK for Text Processing ---
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# --- Scikit-learn (sklearn) for Modeling ---
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report
from scipy import sparse

# --- XGBoost ---
from xgboost import XGBClassifier

# --- Imbalanced-learn (optional) for SMOTE ---
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

# --- Setup ---
# Silence all warnings so the cell outputs stay uncluttered
warnings.filterwarnings(action='ignore')

# Fetch the NLTK resources without printing download logs
nltk.download(info_or_id='stopwords', quiet=True)
nltk.download(info_or_id='punkt', quiet=True)

# English stop-word list as a set
stop_words = {*stopwords.words('english')}

print("✅ All libraries imported and NLTK data downloaded. Ready for XGBoost.")

✅ All libraries imported and NLTK data downloaded. Ready for XGBoost.


In [20]:

print("Cleaning all text data... (This may take a moment)")

# Run clean_text over every combined-text entry and store the result in a new column
df['text_cleaned'] = df['text_combined'].apply(func=clean_text)

# Report completion and show raw vs. cleaned text for the first rows
print(
    "✅ Text cleaning complete.",
    df[['text_combined', 'text_cleaned']].head(),
    sep="\n",
)

Cleaning all text data... (This may take a moment)
✅ Text cleaning complete.
                                       text_combined  \
0  Marketing Intern US, NY, New York Marketing We...   
1  Customer Service - Cloud Video Production NZ, ...   
2  Commissioning Machinery Assistant (CMA) US, IA...   
3  Account Executive - Washington DC US, DC, Wash...   
4  Bill Review Manager US, FL, Fort Worth  SpotSo...   

                                        text_cleaned  
0  marketing intern new york marketing food creat...  
1  customer service cloud video production auckla...  
2  commissioning machinery assistant cma wever va...  
3  account executive washington washington sales ...  
4  bill review manager fort worth spotsource solu...  


In [21]:
print("--- Phase 2: Defining Features & Target ---")

# Label column to predict
target = 'fraudulent'

# Model inputs, grouped by the preprocessing each group receives later
binary_features = [
    'telecommuting',
    'has_company_logo',
    'has_questions',
    'is_company_profile_missing',
    'is_requirements_missing',
    'is_benefits_missing',
    'is_salary_range_missing',
]
categorical_features = [
    'employment_type',
    'required_experience',
    'required_education',
    'industry',
]
text_feature = 'text_cleaned'  # the cleaned text column

# Give NaNs in the categorical columns an explicit 'Missing' level
df[categorical_features] = df[categorical_features].fillna('Missing')

# Feature matrix (binary, then categorical, then text) and target vector
X = df[[*binary_features, *categorical_features, text_feature]]
y = df[target]

print(
    "✅ X (features) and y (target) are defined.",
    f"Shape of X: {X.shape}",
    f"Shape of y: {y.shape}",
    sep="\n",
)

--- Phase 2: Defining Features & Target ---
✅ X (features) and y (target) are defined.
Shape of X: (17880, 12)
Shape of y: (17880,)


In [22]:
# Hold out 30% of the rows for testing. Stratifying on y keeps the fraud rate
# the same in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.3,
)

# Summarise partition sizes and class balance
print(
    f"Training set shape: {X_train.shape}",
    f"Testing set shape: {X_test.shape}",
    f"Training target distribution:\n{y_train.value_counts(normalize=True)}",
    f"Testing target distribution:\n{y_test.value_counts(normalize=True)}",
    sep="\n",
)

Training set shape: (12516, 12)
Testing set shape: (5364, 12)
Training target distribution:
fraudulent
0    0.951582
1    0.048418
Name: proportion, dtype: float64
Testing target distribution:
fraudulent
0    0.951529
1    0.048471
Name: proportion, dtype: float64


In [23]:
# This pipeline applies different transforms to different columns
preprocessor = ColumnTransformer(
    transformers=[
        # (name, transformer, columns_to_apply_to)
        ('text', TfidfVectorizer(max_features=5000, stop_words='english'), text_feature),
        ('categorical', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('binary', 'passthrough', binary_features) # 'passthrough' leaves these columns as-is
    ],
    remainder='drop' # Drop any columns not specified
)

print("✅ Preprocessing ColumnTransformer created.")

✅ Preprocessing ColumnTransformer created.


In [24]:
# Oversampler applied to the training data before fitting
smote = SMOTE(random_state=42)

# XGBoost classifier: fixed seed, n_jobs=-1, log-loss as evaluation metric
xgb_model = XGBClassifier(
    eval_metric='logloss',
    n_jobs=-1,
    random_state=42,
)

# --- Manual Pipeline Steps ---

# Step 1: fit the ColumnTransformer (`preprocessor`, built in the cell above)
# on the training data and transform it.
print("Applying preprocessor to training data...")
X_train_processed = preprocessor.fit_transform(X_train, y_train)
print(f"Training data transformed. Shape: {X_train_processed.shape}")

# Step 2: resample the transformed training data with SMOTE. The test set is
# not resampled.
print("Applying SMOTE to balance classes...")
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_processed, y_train)
print(
    f"Data resampled. New shape: {X_train_resampled.shape}",
    f"Resampled target distribution:\n{pd.Series(y_train_resampled).value_counts(normalize=True)}",
    sep="\n",
)

# Step 3: fit the classifier on the resampled training data
print("Training the XGBoost model...")
xgb_model.fit(X_train_resampled, y_train_resampled)

print("✅ Model training complete.")

Applying preprocessor to training data...
Training data transformed. Shape: (12516, 5165)
Applying SMOTE to balance classes...
Data resampled. New shape: (23820, 5165)
Resampled target distribution:
fraudulent
0    0.5
1    0.5
Name: proportion, dtype: float64
Training the XGBoost model...
✅ Model training complete.


In [25]:
# Transform the held-out data with the preprocessor fitted on the training
# set (transform only, no re-fitting).
print("Applying preprocessor to test data...")
X_test_processed = preprocessor.transform(X_test)
print(f"Test data transformed. Shape: {X_test_processed.shape}")

# Predict labels for the test set with the trained classifier
y_pred = xgb_model.predict(X_test_processed)

# Per-class report. The "Fake (1)" row is the one to watch:
#   precision = share of postings flagged as fake that really are fake
#   recall    = share of truly fake postings that were flagged
print(
    "\n--- Classification Report ---",
    classification_report(
        y_true=y_test,
        y_pred=y_pred,
        target_names=['Real (0)', 'Fake (1)'],
    ),
    sep="\n",
)

Applying preprocessor to test data...
Test data transformed. Shape: (5364, 5165)

--- Classification Report ---
              precision    recall  f1-score   support

    Real (0)       0.99      1.00      0.99      5104
    Fake (1)       0.90      0.82      0.86       260

    accuracy                           0.99      5364
   macro avg       0.95      0.91      0.93      5364
weighted avg       0.99      0.99      0.99      5364

