In [1]:
import pandas as pd
import numpy as np 


In [2]:
df = pd.read_csv("data/fake_job_posting_cleaned.csv")

In [3]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(sparse_output=False)
employment_encoded = ohe.fit_transform(df[['employment_type']])
emp_df = pd.DataFrame(employment_encoded, columns=ohe.get_feature_names_out(['employment_type']))
df = pd.concat([df.drop(['employment_type'], axis=1), emp_df], axis=1)

In [4]:
experience_order = {
    "unknown": 0,
    "internship": 1,
    "entry level": 2,
    "associate": 3,
    "mid-senior level": 4,
    "director": 5,
    "executive": 6,
    "not applicable": 0
}
df['required_experience_encoded'] = df['required_experience'].map(experience_order)

In [5]:
education_order = {
    "unknown": 0,
    "some high school coursework": 1,
    "high school or equivalent": 2,
    "vocational - hs diploma": 3,
    "vocational": 4,
    "some college coursework completed": 5,
    "associate degree": 6,
    "bachelor's degree": 7,
    "vocational - degree": 8,
    "master's degree": 9,
    "professional": 10,
    "doctorate": 11,
    "certification": 12
}

df['required_education_encoded'] = df['required_education'].map(education_order)

In [6]:
df = df.drop([ 'required_experience', 'required_education'], axis=1)

In [7]:
df['full_text']=df['full_text'].str.lower()

In [8]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     D:\Data_Science\NLP_for_ml\venv\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = " ".join(text.split())
    # Remove stopwords
    text = " ".join([w for w in text.split() if w.lower() not in stop_words])
    return text

df['full_text'] = df['full_text'].apply(clean_text)

In [10]:
from nltk.stem import WordNetLemmatizer

In [11]:
lemmatizer=WordNetLemmatizer()

In [12]:
def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

In [13]:
df['full_text']=df['full_text'].apply(lambda x:lemmatize_words(x))

In [14]:
df.drop(columns=['job_id'], inplace=True)

In [31]:
X = df.drop('fraudulent', axis=1)
y = df['fraudulent'] 

In [32]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,
                                              test_size=0.20)

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf=tfidf.fit_transform(X_train['full_text']).toarray()
X_test_tfidf=tfidf.transform(X_test['full_text']).toarray()

In [34]:

text_cols = ['full_text'] 

X_train_other = X_train.drop(columns=text_cols).values
X_test_other = X_test.drop(columns=text_cols).values

In [35]:
X_train_final = np.hstack([X_train_tfidf, X_train_other])
X_test_final = np.hstack([X_test_tfidf, X_test_other])

In [36]:
from imblearn.over_sampling import SMOTE

# ✅ Apply SMOTE only on training data
smote = SMOTE(random_state=42)

X_train_res, y_train_res = smote.fit_resample(X_train_final, y_train)

## Why is used SMOTE
- If you miss fraud (low recall) → real people may get scammed → high risk, costly.

- If you raise a few false alarms (lower precision) → a legitimate job gets flagged for manual review → inconvenience, but not deadly.

- So in this domain, recall is more important than precision.
It’s better to flag 100 jobs and catch 80 frauds (with some false positives) than flag only 70 frauds but miss 30 scammers.

In [37]:
X_train_res.shape

(27214, 5010)

In [38]:
y_train_res.value_counts()

fraudulent
0    13607
1    13607
Name: count, dtype: int64

In [39]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import LinearSVC
import xgboost as xgb
from sklearn.naive_bayes import GaussianNB
from scipy.sparse import issparse

In [61]:
models = {
    'GaussianNB': GaussianNB(),
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
    'AdaBoost': AdaBoostClassifier(n_estimators=100, random_state=42),
    'LinearSVC': LinearSVC(max_iter=1000),
    'XGBoost': xgb.XGBClassifier(
        n_estimators=200,
        max_depth=6,
        learning_rate=0.1,
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=42
    )
}

In [None]:
results = []

for name, model in models.items():
    # Fit the model
    model.fit(X_train_res, y_train_res)
    y_pred = model.predict(X_test_final)

    # Calculate metrics
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    # Save results
    results.append({
        'Model': name,
        'Accuracy': acc,
        'F1-score': f1,
        'Precision': precision,
        'Recall': recall
    })

# Display results
results_df = pd.DataFrame(results)
print(results_df.sort_values(by='F1-score', ascending=False))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


                Model  Accuracy  F1-score  Precision    Recall
5             XGBoost  0.987136  0.864706   0.930380  0.807692
4           LinearSVC  0.983781  0.837989   0.852273  0.824176
2        RandomForest  0.982662  0.797386   0.983871  0.670330
1  LogisticRegression  0.973434  0.775414   0.680498  0.901099
0          GaussianNB  0.964206  0.670103   0.631068  0.714286
3            AdaBoost  0.938199  0.564103   0.440000  0.785714


## lets do hyperparameter tunning on Xgboost

In [73]:
from sklearn.model_selection import RandomizedSearchCV

In [74]:
xgb_clf = xgb.XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    use_label_encoder=False,
    random_state=42,
    n_jobs=-1
)

In [75]:
param_dist = {
    "n_estimators": [100, 200, 300, 400],
    "max_depth": [3, 5, 7, 9],
    "learning_rate": [0.01, 0.05, 0.1, 0.2],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "gamma": [0, 0.1, 0.2, 0.3],
    "reg_lambda": [1, 1.5, 2.0]
}


In [76]:
random_search = RandomizedSearchCV(
    estimator=xgb_clf,
    param_distributions=param_dist,
    n_iter=20,              # number of random combos to try
    scoring="f1",           # optimize for F1 because imbalanced dataset
    cv=3,                   # 3-fold cross-validation
    verbose=2,
    random_state=42,
    n_jobs=-1
)

In [77]:
random_search.fit(X_train_res, y_train_res)

print("Best Parameters:", random_search.best_params_)
print("Best F1 Score:", random_search.best_score_)

# Evaluate on test set
best_xgb = random_search.best_estimator_
y_pred = best_xgb.predict(X_test_final)

print("\nTest Set Performance:")
print("Accuracy :", accuracy_score(y_test, y_pred))
print("F1-score :", f1_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall   :", recall_score(y_test, y_pred))

Fitting 3 folds for each of 20 candidates, totalling 60 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best Parameters: {'subsample': 1.0, 'reg_lambda': 1.5, 'n_estimators': 400, 'max_depth': 9, 'learning_rate': 0.05, 'gamma': 0.1, 'colsample_bytree': 0.6}
Best F1 Score: 0.9951022995422297

Test Set Performance:
Accuracy : 0.9874161073825504
F1-score : 0.8664688427299704
Precision: 0.9419354838709677
Recall   : 0.8021978021978022


In [40]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

xgb_best = xgb.XGBClassifier(
    subsample=1.0,
    reg_lambda=1.5,
    n_estimators=400,
    max_depth=9,
    learning_rate=0.05,
    gamma=0.1,
    colsample_bytree=0.6,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)


xgb_best.fit(X_train_res, y_train_res)

y_pred = xgb_best.predict(X_test_final)

print("Test Set Performance:")
print("Accuracy :", accuracy_score(y_test, y_pred))
print("F1-score :", f1_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall   :", recall_score(y_test, y_pred))


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Test Set Performance:
Accuracy : 0.9885346756152126
F1-score : 0.8714733542319749
Precision: 0.9266666666666666
Recall   : 0.8224852071005917


## lets export the model 

In [41]:
import joblib

# Save the trained model
joblib.dump(xgb_best, "xgb_fakejob_model.pkl")
print(" Model saved as xgb_fakejob_model.pkl")

 Model saved as xgb_fakejob_model.pkl
