# Naïve Bayes Classification Model

In [16]:
# importing modules
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [17]:
# getting original text data
base_df = pd.read_csv("fake_job_postings.csv")
y = base_df["fraudulent"]
X = base_df["description"].fillna("") + " " + base_df["company_profile"].fillna("") + " " + base_df["requirements"].fillna(" ")
X.head()

0    Food52, a fast-growing, James Beard Award-winn...
1    Organised - Focused - Vibrant - Awesome!Do you...
2    Our client, located in Houston, is actively se...
3    THE COMPANY: ESRI – Environmental Systems Rese...
4    JOB TITLE: Itemization Review ManagerLOCATION:...
dtype: object

In [18]:
# splitting data
X_train, X_test, y_train, y_test = train_test_split(X, y)

## Vectorizing the Data with TfiDF

In [21]:
# vectorizing the data
vectorizer = TfidfVectorizer(stop_words="english")  # Remove common words
X_train_tfidf = vectorizer.fit_transform(X_train)  # Fit and transform training data
X_test_tfidf = vectorizer.transform(X_test)  # Transform test data

In [22]:
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

y_pred = nb_model.predict(X_test_tfidf)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      1.00      0.98      4263
           1       1.00      0.00      0.01       207

    accuracy                           0.95      4470
   macro avg       0.98      0.50      0.49      4470
weighted avg       0.96      0.95      0.93      4470



### TfiDF with Resampling - Oversampling

In [29]:
# oversampling the fraudulent columns
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(sampling_strategy='auto', random_state=42)
X_train_tfidf_oversampled, y_train_oversampled = ros.fit_resample(X_train_tfidf, y_train)

In [31]:
# building the model with the oversampling
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf_oversampled, y_train_oversampled)
y_pred = nb_model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.96      0.98      4263
           1       0.52      0.86      0.65       207

    accuracy                           0.96      4470
   macro avg       0.76      0.91      0.81      4470
weighted avg       0.97      0.96      0.96      4470



### TfiDF with Resampling - Undersampling

In [33]:
# undersampling the unfraudulent columns
from imblearn.under_sampling import RandomUnderSampler

# undersampling
rus = RandomUnderSampler(sampling_strategy='auto', random_state=42)
X_train_tfidf_undersampled, y_train_undersampled = rus.fit_resample(X_train_tfidf, y_train)

# building the model with the undersampling
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf_undersampled, y_train_undersampled)
y_pred = nb_model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.91      0.95      4263
           1       0.31      0.88      0.46       207

    accuracy                           0.90      4470
   macro avg       0.65      0.89      0.70      4470
weighted avg       0.96      0.90      0.92      4470



### TfiDF with Resampling - Balanced Sampling

In [34]:
# balancing the samples of both
from imblearn.combine import SMOTEENN

# balancing
smoteen = SMOTEENN(random_state=42)
X_train_tfidf_smoteen, y_train_smoteen = smoteen.fit_resample(X_train_tfidf, y_train)

# building the model with balanced sampling
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf_smoteen, y_train_smoteen)
y_pred = nb_model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.89      0.94      4263
           1       0.30      0.94      0.45       207

    accuracy                           0.89      4470
   macro avg       0.65      0.92      0.70      4470
weighted avg       0.96      0.89      0.92      4470



In [None]:
# checking the train accuracy to test for overfitting
print(classification_report(y_train_smoteen, nb_model.predict(X_train_tfidf_smoteen)))

              precision    recall  f1-score   support

           0       0.99      0.97      0.98      9041
           1       0.98      1.00      0.99     12751

    accuracy                           0.99     21792
   macro avg       0.99      0.98      0.99     21792
weighted avg       0.99      0.99      0.99     21792



## Vectorizing the Data with Count

In [25]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_features=1000, stop_words="english")
X_train_CV = vectorizer.fit_transform(X_train)
X_train_CV = pd.DataFrame(X_train_CV.toarray(), columns=vectorizer.get_feature_names_out())

X_test_CV = vectorizer.transform(X_test)
X_train_CV.head()

Unnamed: 0,000,10,100,12,1500,18,20,200,30,40,...,world,worldwide,write,writing,written,year,years,york,young,και
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,1,1,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,1,0,0
2,6,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [26]:
# training the model
nb_model = MultinomialNB()
nb_model.fit(X_train_CV, y_train)

y_pred = nb_model.predict(X_test_CV)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.86      0.92      4263
           1       0.22      0.80      0.35       207

    accuracy                           0.86      4470
   macro avg       0.61      0.83      0.64      4470
weighted avg       0.95      0.86      0.90      4470





## Testing the Model with Some Fresh Data