In [None]:
# Import the required modules
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import nltk
from nltk.corpus import stopwords
from imblearn.over_sampling import SMOTE
nltk.download("stopwords")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\scmou\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\scmou\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# exploring fake job postings
fake_job_df = pd.read_csv("fake_job_postings.csv")
fake_job_df.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [2]:
import re
import nltk
from nltk.corpus import stopwords
nltk.download("stopwords")

# Define custom stopwords (keeping fraud-related words)
custom_stopwords = set(stopwords.words("english")) - {"urgent", "immediate", "hiring", "work", "money", "apply"}

def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # Remove special characters & numbers
    cleaned_text = " ".join(word for word in text.split() if word not in custom_stopwords)
    return cleaned_text


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\scmou\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:

fake_job_df.fillna("", inplace=True)

# Combine text columns
fake_job_df["text"] = fake_job_df["title"] + " " + fake_job_df["company_profile"] + " " + fake_job_df["description"] + " " + fake_job_df["requirements"]

# Clean text
fake_job_df["text_cleaned"] = fake_job_df["text"].apply(clean_text)

# Add fraud-related features
fake_job_df["word_count"] = fake_job_df["text_cleaned"].apply(lambda x: len(x.split()))
fake_job_df["char_count"] = fake_job_df["text_cleaned"].apply(lambda x: len(x))
fake_job_df["contains_money_terms"] = fake_job_df["text_cleaned"].apply(lambda x: 1 if any(word in x for word in ["earn", "cash", "money", "pay"]) else 0)


In [12]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV


vectorizer = TfidfVectorizer(max_features=5000)  # Limit to top 5000 words
X = vectorizer.fit_transform(fake_job_df["text_cleaned"])

X = X.toarray()  # Convert sparse matrix to array
y = fake_job_df["fraudulent"]  # Target variable (1 = fake, 0 = real)

# Split into testing and training sets, limit test to 20%, stratify y for imbalance 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4, stratify=y) 

# Apply SMOTE + Random Undersampling
smote = SMOTE(sampling_strategy=0.6, random_state=42)  # Increase synthetic minority class
undersample = RandomUnderSampler(sampling_strategy=0.7, random_state=42)  # Reduce but not too much

resampling_pipeline = Pipeline([
    ("smote", smote),
    ("under", undersample)
])

X_train_resampled, y_train_resampled = resampling_pipeline.fit_resample(X_train, y_train)





In [16]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Apply SMOTE separately before GridSearchCV
smote = SMOTE(sampling_strategy=0.6, random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Define parameter grid for tuning
param_grid = {
    "max_depth": [3, 5, 7], 
    "min_samples_split": [2, 5, 10], 
    "min_samples_leaf": [1, 2, 4]
}

# Initialize classifier
dt = DecisionTreeClassifier(random_state=4)

# Perform Grid Search
grid_search = GridSearchCV(dt, param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train_resampled, y_train_resampled)  # Fit the model on resampled data




Fitting 3 folds for each of 27 candidates, totalling 81 fits


In [17]:


# Save best model
import joblib
joblib.dump(grid_search.best_estimator_, "optimized_fake_job_detector.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

['tfidf_vectorizer.pkl']

In [9]:
import joblib

# Load optimized model & vectorizer
model = joblib.load("optimized_fake_job_detector.pkl")
vectorizer = joblib.load("tfidf_vectorizer.pkl")

def predict_fraud(job_text):
    job_text = clean_text(job_text)  
    job_vectorized = vectorizer.transform([job_text]).toarray()  
    prediction = model.predict(job_vectorized)
    probability = model.predict_proba(job_vectorized)[:, 1]  # Get probability of being fake

    result = "Fake Job Posting" if prediction[0] == 1 else "Real Job Posting"
    return f"{result} (Fraud Probability: {probability[0]:.2f})"

new_job =  "Remote Software Developer Needed – No Experience, High Pay! We are a top technology company looking for software engineers to work remotely on exciting projects! Looking for passionate individuals to work with global clients. No experience required—we provide full training! You will need to set up an account using your government ID for security verification."
print(predict_fraud(new_job))

Fake Job Posting (Fraud Probability: 0.67)
