In [5]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import hstack
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,accuracy_score,precision_score,f1_score,recall_score


In [6]:
df=pd.read_csv(r"C:\Users\adorn\OneDrive\Pictures\Documents\fake_job\fake_job_postings.csv")


In [7]:
df['text'] = (df["title"].astype(str)+ " " +  df["description"].astype(str)+ " " + df["requirements"].astype(str))


In [8]:

cat_cols = ['employment_type', 'required_experience', 'required_education', 'industry', 'function']
bin_cols = ['telecommuting', 'has_company_logo', 'has_questions']



In [9]:
df[cat_cols] = df[cat_cols].fillna('Unknown')

In [10]:
onehot=OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first')
res=onehot.fit_transform(df[cat_cols])  

# Convert to DataFrame with column names
res = pd.DataFrame(res,columns=onehot.get_feature_names_out(cat_cols),index=df.index)
#  binary and encoded categorical features
X_structured = pd.concat([df[bin_cols].reset_index(drop=True),
                          res.reset_index(drop=True)], axis=1)

In [11]:
tfidf = TfidfVectorizer(max_features=5000, stop_words='english', ngram_range=(1,2), min_df=2)
X_text = tfidf.fit_transform(df['text'])

In [12]:
from scipy.sparse import hstack
# X_final = hstack([X_structured.values, X_text])
X_final = hstack([X_structured.to_numpy(), X_text])

y = df['fraudulent']

#Final feature matrix


In [13]:
# Structured feature names (binary + one-hot)
structured_feature_names = list(X_structured.columns)
structured_feature_names



['telecommuting',
 'has_company_logo',
 'has_questions',
 'employment_type_Full-time',
 'employment_type_Other',
 'employment_type_Part-time',
 'employment_type_Temporary',
 'employment_type_Unknown',
 'required_experience_Director',
 'required_experience_Entry level',
 'required_experience_Executive',
 'required_experience_Internship',
 'required_experience_Mid-Senior level',
 'required_experience_Not Applicable',
 'required_experience_Unknown',
 "required_education_Bachelor's Degree",
 'required_education_Certification',
 'required_education_Doctorate',
 'required_education_High School or equivalent',
 "required_education_Master's Degree",
 'required_education_Professional',
 'required_education_Some College Coursework Completed',
 'required_education_Some High School Coursework',
 'required_education_Unknown',
 'required_education_Unspecified',
 'required_education_Vocational',
 'required_education_Vocational - Degree',
 'required_education_Vocational - HS Diploma',
 'industry_Airli

In [14]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_s, y_s = smote.fit_resample(X_final, y)


In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_s, y_s,test_size=0.2, random_state=42)


MODELING

In [16]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)


In [17]:
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score

y_pred = model.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(f1_score(y_test, y_pred))

print(classification_report(y_test, y_pred))


0.9975022039377021
0.9988238753307851
0.9961876832844575
0.9975040375862575
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3396
           1       1.00      1.00      1.00      3410

    accuracy                           1.00      6806
   macro avg       1.00      1.00      1.00      6806
weighted avg       1.00      1.00      1.00      6806



In [18]:
import pickle

# Save model
with open('fraud_model.pkl', 'wb') as f:
    pickle.dump(model, f)

# Save encoder
with open('encoder.pkl', 'wb') as f:
    pickle.dump(onehot,f)

# Save TF-IDF
with open('tfidf.pkl', 'wb') as f:
    pickle.dump(tfidf, f)


In [19]:
y_pred

array([0, 0, 0, ..., 0, 0, 0])