# Naïve Bayes Classification Model

In [5]:
# importing modules
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [6]:
# getting original text data
base_df = pd.read_csv("fake_job_postings.csv")
y = base_df["fraudulent"]
X = base_df["description"].fillna("") + " " + base_df["company_profile"].fillna("") + " " + base_df["requirements"].fillna(" ")
X.head()

0    Food52, a fast-growing, James Beard Award-winn...
1    Organised - Focused - Vibrant - Awesome!Do you...
2    Our client, located in Houston, is actively se...
3    THE COMPANY: ESRI – Environmental Systems Rese...
4    JOB TITLE: Itemization Review ManagerLOCATION:...
dtype: object

In [7]:
# splitting data
X_train, X_test, y_train, y_test = train_test_split(X, y)

## Vectorizing the Data with TfiDF

In [8]:
# vectorizing the data
vectorizer = TfidfVectorizer(stop_words="english")  # Remove common words
X_train_tfidf = vectorizer.fit_transform(X_train)  # Fit and transform training data
X_test_tfidf = vectorizer.transform(X_test)  # Transform test data

In [9]:
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

y_pred = nb_model.predict(X_test_tfidf)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97      4216
           1       0.00      0.00      0.00       254

    accuracy                           0.94      4470
   macro avg       0.47      0.50      0.49      4470
weighted avg       0.89      0.94      0.92      4470



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### TfiDF with Resampling - Oversampling

In [10]:
# oversampling the fraudulent columns
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(sampling_strategy='auto', random_state=42)
X_train_tfidf_oversampled, y_train_oversampled = ros.fit_resample(X_train_tfidf, y_train)

In [11]:
# building the model with the oversampling
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf_oversampled, y_train_oversampled)
y_pred = nb_model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.97      0.98      4216
           1       0.62      0.88      0.72       254

    accuracy                           0.96      4470
   macro avg       0.80      0.92      0.85      4470
weighted avg       0.97      0.96      0.97      4470



### TfiDF with Resampling - Undersampling

In [12]:
# undersampling the unfraudulent columns
from imblearn.under_sampling import RandomUnderSampler

# undersampling
rus = RandomUnderSampler(sampling_strategy='auto', random_state=42)
X_train_tfidf_undersampled, y_train_undersampled = rus.fit_resample(X_train_tfidf, y_train)

# building the model with the undersampling
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf_undersampled, y_train_undersampled)
y_pred = nb_model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.91      0.95      4216
           1       0.38      0.88      0.53       254

    accuracy                           0.91      4470
   macro avg       0.68      0.90      0.74      4470
weighted avg       0.96      0.91      0.93      4470



### TfiDF with Resampling - Balanced Sampling

In [13]:
# balancing the samples of both
from imblearn.combine import SMOTEENN

# balancing
smoteen = SMOTEENN(random_state=42)
X_train_tfidf_smoteen, y_train_smoteen = smoteen.fit_resample(X_train_tfidf, y_train)

# building the model with balanced sampling
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf_smoteen, y_train_smoteen)
y_pred = nb_model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.90      0.95      4216
           1       0.37      0.94      0.53       254

    accuracy                           0.90      4470
   macro avg       0.68      0.92      0.74      4470
weighted avg       0.96      0.90      0.92      4470



In [14]:
# checking the train accuracy to test for overfitting
print(classification_report(y_train_smoteen, nb_model.predict(X_train_tfidf_smoteen)))

              precision    recall  f1-score   support

           0       1.00      0.97      0.98      9001
           1       0.98      1.00      0.99     12798

    accuracy                           0.99     21799
   macro avg       0.99      0.98      0.99     21799
weighted avg       0.99      0.99      0.99     21799



## Vectorizing the Data with Count

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_features=1000, stop_words="english")
X_train_CV = vectorizer.fit_transform(X_train)
X_train_CV = pd.DataFrame(X_train_CV.toarray(), columns=vectorizer.get_feature_names_out())

X_test_CV = vectorizer.transform(X_test)
X_train_CV.head()

Unnamed: 0,000,10,100,12,1500,18,20,200,30,40,...,works,world,worldwide,write,writing,written,year,years,york,young
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,2,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,1,1,0,0,1,...,0,1,0,0,0,0,0,0,0,0
3,2,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,1,0,0
4,1,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,2,2,0,0


In [16]:
# training the model
nb_model = MultinomialNB()
nb_model.fit(X_train_CV, y_train)

y_pred = nb_model.predict(X_test_CV)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.88      0.93      4216
           1       0.27      0.74      0.40       254

    accuracy                           0.87      4470
   macro avg       0.63      0.81      0.66      4470
weighted avg       0.94      0.87      0.90      4470





# The Final Naïve Bayes Model

### Building It


In [17]:
from sklearn.naive_bayes import ComplementNB
import preprocessing as prep

# final model parameters received through model_builder.py
X_train, X_test, y_train, y_test = prep.custom_train_test_split(num_words=5000)
final_model = ComplementNB(alpha=0.01, norm=False)
final_model.fit(X_train, y_train)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Asa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Asa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Pickling the Outputs

In [18]:
import pickle
with open("pickles/bayes_model.pkl", "wb") as file:
    pickle.dump(final_model, file)
with open ("pickles/bayes_vectorizer.pkl", "wb") as file:
    pickle.dump(prep.vectorizer, file)

In [19]:
# pickling the accuracy and f1_score for fraudulent
from sklearn.metrics import accuracy_score, f1_score

y_pred = final_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred) 
f1 = f1_score(y_test, y_pred, pos_label=1)

# printing the outputs
print(f"Model Accuracy: {accuracy * 100:.2f}%") 
print(f"Model f1 of fraudulent: {f1 * 100:.2f}%") 

# save the accuracy and f1 to a .pkl file 
with open("pickles/bayes_accuracy.pkl", "wb") as file:
    pickle.dump(accuracy, file)
with open ("pickles/bayes_f1_score.pkl", "wb") as file:
    pickle.dump(f1, file)

Model Accuracy: 95.53%
Model f1 of fraudulent: 63.37%


### Testing It
(The process runs independent of previous processes if pickle was complete)

In [20]:
# opening up the pickled files
import pickle
import preprocessing as prep

with open("pickles/bayes_vectorizer.pkl", "rb") as file:
    prep.vectorizer = pickle.load(file)

with open("pickles/bayes_model.pkl", "rb") as file:
    nb_bayes_model = pickle.load(file)

In [21]:
# classification test matrix
from sklearn.metrics import classification_report
print(classification_report(y_test, nb_bayes_model.predict(X_test)))

              precision    recall  f1-score   support

           0       0.99      0.96      0.98      4259
           1       0.52      0.82      0.63       211

    accuracy                           0.96      4470
   macro avg       0.75      0.89      0.80      4470
weighted avg       0.97      0.96      0.96      4470



In [22]:
text = "Work From Home! Easy Data Entry – No Experience Needed! We are a growing company looking for individuals to help with simple tasks. No prior experience needed. Work remotely and earn up to $5,000 per month! Earn BIG MONEY from home by completing simple data entry tasks. Flexible hours, no experience required! Just sign up, provide your details, and start earning immediately. Spots are limited!"
prediction = nb_bayes_model.predict(prep.preprocess(text=text))
prediction



array([1])