In [42]:
# Import the required modules
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [43]:
# exploring fake job postings
fake_job_df = pd.read_csv("fake_job_postings.csv")
fake_job_df.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [44]:
fake_job_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17880 entries, 0 to 17879
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   job_id               17880 non-null  int64 
 1   title                17880 non-null  object
 2   location             17534 non-null  object
 3   department           6333 non-null   object
 4   salary_range         2868 non-null   object
 5   company_profile      14572 non-null  object
 6   description          17879 non-null  object
 7   requirements         15184 non-null  object
 8   benefits             10668 non-null  object
 9   telecommuting        17880 non-null  int64 
 10  has_company_logo     17880 non-null  int64 
 11  has_questions        17880 non-null  int64 
 12  employment_type      14409 non-null  object
 13  required_experience  10830 non-null  object
 14  required_education   9775 non-null   object
 15  industry             12977 non-null  object
 16  func

In [45]:
# getting unique categories
fake_job_df.nunique()

job_id                 17880
title                  11231
location                3105
department              1337
salary_range             874
company_profile         1709
description            14801
requirements           11967
benefits                6204
telecommuting              2
has_company_logo           2
has_questions              2
employment_type            5
required_experience        7
required_education        13
industry                 131
function                  37
fraudulent                 2
dtype: int64

In [46]:
#  checking for percent nulls
fake_job_df.isnull().mean()

job_id                 0.000000
title                  0.000000
location               0.019351
department             0.645805
salary_range           0.839597
company_profile        0.185011
description            0.000056
requirements           0.150783
benefits               0.403356
telecommuting          0.000000
has_company_logo       0.000000
has_questions          0.000000
employment_type        0.194128
required_experience    0.394295
required_education     0.453300
industry               0.274217
function               0.361018
fraudulent             0.000000
dtype: float64

In [47]:
# checking balance of the dataset, dataset is poorly balanced
fake_job_df["fraudulent"].value_counts()

fraudulent
0    17014
1      866
Name: count, dtype: int64

In [48]:
# checking results alfter removing all nulls -> dropping all nulls isn't possible
fake_job_df_no_nulls = fake_job_df.dropna()
fake_job_df_no_nulls.info()

<class 'pandas.core.frame.DataFrame'>
Index: 774 entries, 6 to 17865
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   job_id               774 non-null    int64 
 1   title                774 non-null    object
 2   location             774 non-null    object
 3   department           774 non-null    object
 4   salary_range         774 non-null    object
 5   company_profile      774 non-null    object
 6   description          774 non-null    object
 7   requirements         774 non-null    object
 8   benefits             774 non-null    object
 9   telecommuting        774 non-null    int64 
 10  has_company_logo     774 non-null    int64 
 11  has_questions        774 non-null    int64 
 12  employment_type      774 non-null    object
 13  required_experience  774 non-null    object
 14  required_education   774 non-null    object
 15  industry             774 non-null    object
 16  function   

In [49]:
# removing all rows except the rows with 100% filled out rate
fake_job_df.dropna(axis=1).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17880 entries, 0 to 17879
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   job_id            17880 non-null  int64 
 1   title             17880 non-null  object
 2   telecommuting     17880 non-null  int64 
 3   has_company_logo  17880 non-null  int64 
 4   has_questions     17880 non-null  int64 
 5   fraudulent        17880 non-null  int64 
dtypes: int64(5), object(1)
memory usage: 838.3+ KB


In [50]:
# testing: vectorizing the description column
from sklearn.feature_extraction.text import CountVectorizer


vectorizer = CountVectorizer(max_features=500)  # Top 5000 words
X = vectorizer.fit_transform(fake_job_df["description"].fillna(""))
X = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

X.head()

Unnamed: 0,12,200,30,ability,able,about,account,accounts,achieve,across,...,work,working,world,would,writing,written,year,years,you,your
0,0,0,0,0,0,0,0,0,0,0,...,2,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,1,1,0,0,0,...,1,1,2,0,0,0,0,0,2,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,9,1,1,0,...,1,0,2,0,0,0,0,0,3,2
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [51]:
# getting y
y = fake_job_df["fraudulent"]
y.head()

0    0
1    0
2    0
3    0
4    0
Name: fraudulent, dtype: int64

In [52]:
# getting train test split
X_train, X_test, y_train, y_test = train_test_split(X, y)
X_train.head()

Unnamed: 0,12,200,30,ability,able,about,account,accounts,achieve,across,...,work,working,world,would,writing,written,year,years,you,your
10723,0,0,0,0,0,1,1,0,0,1,...,1,0,0,0,0,0,0,0,0,0
9769,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,2,0
6045,0,0,0,0,0,2,0,0,0,0,...,2,0,0,0,0,0,0,0,6,1
15199,0,0,0,0,0,0,0,5,0,0,...,0,0,0,0,0,0,0,0,0,0
4378,0,0,0,0,1,0,0,0,0,0,...,3,0,0,0,0,0,0,0,0,0


In [53]:
lrm = LogisticRegression(random_state=4, max_iter=500)
lrm = lrm.fit(X_train, y_train)

In [54]:
# Validate the model
print(f"Training Data Score: {lrm.score(X_train, y_train)}")
print(f"Testing Data Score: {lrm.score(X_test, y_test)}")

Training Data Score: 0.9750186428038777
Testing Data Score: 0.9572706935123042


In [55]:
print(classification_report(y_test, lrm.predict(X_test)))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      4227
           1       0.67      0.42      0.52       243

    accuracy                           0.96      4470
   macro avg       0.82      0.70      0.75      4470
weighted avg       0.95      0.96      0.95      4470



In [None]:
# improving the attributes of the dataset
vectorizer = CountVectorizer(max_features=500)  # Top 5000 words
X = vectorizer.fit_transform(fake_job_df["description"].fillna("") + 
                             fake_job_df["company_profile"].fillna("") +
                             fake_job_df["requirements"].fillna(""))
X = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

X.head()