# Prepare Data


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("/content/fake_job_postings.csv", index_col='job_id')
df.head(3)

Unnamed: 0_level_0,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
job_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0


1. `title` : object (job title)
2. `location` : object (job location)
3. `department` : object (department within the company)
4. `salary_range` : object (range of salary for the job)
5. `company_profile` : object (description of the company)
6. `description` : object (job description)
7. `requirements` : object (job requirements)
8. `benefits` : object (job benefits)
9. `telecommuting` : int64 (indicator for whether the job offers telecommuting)
10. `has_company_logo` : int64 (indicator for whether the company has a logo)
11. `has_questions` : int64 (indicator for whether the job application has questions)
12. `employment_type` : object (type of employment)
13. `required_experience` : object (required experience for the job)
14. `required_education` : object (required education for the job)
15. `industry` : object (industry of the company)
16. `function` : object (job function)
17. `fraudulent` : int64 (indicator for whether the job posting is fraudulent)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17880 entries, 1 to 17880
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   title                17880 non-null  object
 1   location             17534 non-null  object
 2   department           6333 non-null   object
 3   salary_range         2868 non-null   object
 4   company_profile      14572 non-null  object
 5   description          17879 non-null  object
 6   requirements         15185 non-null  object
 7   benefits             10670 non-null  object
 8   telecommuting        17880 non-null  int64 
 9   has_company_logo     17880 non-null  int64 
 10  has_questions        17880 non-null  int64 
 11  employment_type      14409 non-null  object
 12  required_experience  10830 non-null  object
 13  required_education   9775 non-null   object
 14  industry             12977 non-null  object
 15  function             11425 non-null  object
 16  frau

## Work on `description` only

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [5]:
# Feature Selection 
df = df[['description', 'fraudulent']]

In [6]:
# Drop rows with missing description or fraudulent values
df = df.dropna(subset=['description', 'fraudulent'])  
df.isna().sum().sum()

0

In [7]:
# features & Target split
X = df['description']
y = df['fraudulent']
X.shape , y.shape

((17879,), (17879,))

### Using `CountVectorizer`

* The `CountVectorizer` class from `scikit-learn's` `feature_extraction`.
* text module is used to `convert a collection of text` documents into a `matrix of token counts`. 
* It provides a simple way to preprocess text data and represent it as numerical features that can be used in machine learning models.

**Here's a high-level overview of how CountVectorizer works:**


---
1. `Tokenization`: The CountVectorizer `first tokenizes the text data` by `breaking it into individual words or tokens`. 
  * It `removes punctuation and special characters` and `splits the text into a sequence of tokens based on whitespace`.

2. `Vocabulary` Building: It then builds a vocabulary, which is a `mapping of each unique token in the corpus to a unique integer index`. 
  * This vocabulary is constructed `based on the training data`.

3. `Encoding` : Once the vocabulary is built, the CountVectorizer `encodes each document as a vector of token counts`. 
  * For each document, it `counts the occurrence of each token` from the vocabulary and stores it in the corresponding position in the vector.

4. `Sparse Matrix` : The resulting matrix is typically sparse since most documents only contain a subset of the available tokens. 
  * Therefore, the CountVectorizer `outputs a sparse matrix representation` to efficiently handle large feature spaces.



In [8]:
#Feature Vectorization 
vectorizer = CountVectorizer(stop_words='english')
X_vec = vectorizer.fit_transform(X)
print(X_vec.shape)

(17879, 61934)


In [9]:
# print any value
print(X_vec[5])

  (0, 20480)	1
  (0, 23364)	1
  (0, 5512)	2
  (0, 59139)	1
  (0, 59321)	1
  (0, 48779)	1
  (0, 52596)	4
  (0, 57607)	1
  (0, 51517)	1
  (0, 34914)	2
  (0, 21355)	1
  (0, 12968)	3
  (0, 47451)	1
  (0, 48448)	1
  (0, 1592)	1
  (0, 42453)	1
  (0, 7692)	1
  (0, 52267)	1
  (0, 51135)	2
  (0, 9732)	1
  (0, 23378)	1
  (0, 39012)	2
  (0, 35611)	1
  (0, 24213)	5
  (0, 42046)	1
  :	:
  (0, 55640)	1
  (0, 39921)	1
  (0, 32309)	1
  (0, 48408)	1
  (0, 1523)	1
  (0, 37486)	1
  (0, 42758)	1
  (0, 11924)	1
  (0, 2406)	1
  (0, 7655)	1
  (0, 29682)	1
  (0, 6509)	1
  (0, 41600)	2
  (0, 11165)	1
  (0, 15985)	1
  (0, 5876)	1
  (0, 25492)	1
  (0, 6905)	1
  (0, 38358)	1
  (0, 36425)	1
  (0, 29589)	1
  (0, 2602)	1
  (0, 1931)	1
  (0, 18760)	1
  (0, 17933)	1


### Split

In [10]:
# train & test split 
X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42)

In [11]:
print(f'X_train : {X_train.shape}')
print(f'X_test : {X_test.shape}')
print(f'y_train : {y_train.shape}')
print(f'y_test : {y_test.shape}')

X_train : (14303, 61934)
X_test : (3576, 61934)
y_train : (14303,)
y_test : (3576,)


### logistic Regression 

In [12]:
#Model Training
model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
#Model Evaluation
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9793064876957495


### custom test

In [16]:
# create a sample test
description = """
We are seeking a highly motivated and skilled Software Engineer to join our team. The ideal candidate should have a strong background in software development and a passion for building innovative solutions. As a Software Engineer, you will be responsible for designing and implementing software applications, collaborating with cross-functional teams, and ensuring the quality and performance of the software products. The role requires proficiency in programming languages such as Python, Java, or C++, along with experience in web development frameworks and databases. If you are eager to contribute to cutting-edge projects and work in a dynamic environment, we would love to hear from you. Apply now to join our team and make a significant impact!
"""

In [19]:
# vectorize the description
vec_sample = vectorizer.transform([description])
print(vec_sample)

  (0, 3872)	1
  (0, 3994)	1
  (0, 5615)	1
  (0, 7576)	1
  (0, 8266)	1
  (0, 10310)	1
  (0, 11914)	1
  (0, 12711)	1
  (0, 13158)	1
  (0, 13406)	1
  (0, 14728)	1
  (0, 15068)	2
  (0, 16877)	1
  (0, 16911)	1
  (0, 17083)	1
  (0, 18148)	2
  (0, 18384)	1
  (0, 18516)	1
  (0, 19756)	1
  (0, 21806)	1
  (0, 22066)	1
  (0, 24013)	1
  (0, 24230)	1
  (0, 24929)	1
  (0, 25222)	1
  :	:
  (0, 28623)	1
  (0, 29875)	1
  (0, 30280)	1
  (0, 33068)	1
  (0, 36974)	1
  (0, 37388)	1
  (0, 40605)	1
  (0, 40790)	1
  (0, 40932)	1
  (0, 41111)	1
  (0, 41914)	1
  (0, 42046)	1
  (0, 44393)	1
  (0, 45083)	1
  (0, 45820)	1
  (0, 47187)	1
  (0, 48166)	1
  (0, 48441)	1
  (0, 48952)	5
  (0, 49099)	1
  (0, 50858)	1
  (0, 52596)	2
  (0, 52807)	1
  (0, 58644)	1
  (0, 59321)	1


In [21]:
# get the prediction of the vec_sample
predictions = model.predict(vec_sample)
print(predictions)

[0]


In [22]:
# Output the predictions
if predictions[0] == 1:
    print("fraudulent.")
else:
    print("honest.")

honest.


## work on many Features 

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the dataset
df = pd.read_csv('fake_job_postings.csv')
df.head(2)

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0


In [3]:
# handling missing values in description or fraudulent
df = df.dropna(subset=['description', 'fraudulent'])

In [4]:
# try to get only features have nulls < 4000
feat = (df.isna().sum() < 4000).index [1:-1]
list(feat)

['title',
 'location',
 'department',
 'salary_range',
 'company_profile',
 'description',
 'requirements',
 'benefits',
 'telecommuting',
 'has_company_logo',
 'has_questions',
 'employment_type',
 'required_experience',
 'required_education',
 'industry',
 'function']

In [5]:
# check remainning samples after dropna from selected feat
df[feat].dropna().shape # data becom 774 sample from 18000

(774, 16)

In [6]:
# Select the columns of interest
columns_of_interest = ['company_profile', 'description', 'requirements']

In [21]:
# Extract the relevant data
df_selected = df[columns_of_interest + ['fraudulent']].copy()
df_selected.shape

(17879, 4)

In [22]:
# Fill missing values with empty string
df_selected.fillna('', inplace=True)  

### using CountVectorizer

In [23]:
# Apply CountVectorizer with stop words removal
vectorizer = CountVectorizer(stop_words='english')
X_sparse = vectorizer.fit_transform(df_selected['company_profile'] + ' ' +
                                    df_selected['description'] + ' ' +
                                    df_selected['requirements'])

In [24]:
print(X_sparse.shape)
print(X_sparse[0])

(17879, 96661)
  (0, 33409)	7
  (0, 89934)	1
  (0, 18917)	1
  (0, 36207)	1
  (0, 8456)	2
  (0, 92067)	2
  (0, 18247)	4
  (0, 75155)	1
  (0, 80528)	3
  (0, 17363)	2
  (0, 13192)	1
  (0, 37867)	3
  (0, 18254)	2
  (0, 52480)	1
  (0, 59350)	1
  (0, 26121)	2
  (0, 11556)	1
  (0, 27714)	1
  (0, 82155)	2
  (0, 33282)	1
  (0, 89486)	1
  (0, 83036)	1
  (0, 53094)	4
  (0, 9825)	1
  (0, 91232)	1
  :	:
  (0, 49215)	1
  (0, 86480)	1
  (0, 31641)	1
  (0, 59227)	1
  (0, 62552)	1
  (0, 77013)	1
  (0, 15480)	1
  (0, 25352)	1
  (0, 33808)	1
  (0, 59130)	1
  (0, 59300)	1
  (0, 53306)	1
  (0, 36167)	1
  (0, 71724)	1
  (0, 16236)	1
  (0, 24330)	1
  (0, 74691)	1
  (0, 15710)	1
  (0, 66678)	1
  (0, 92504)	2
  (0, 78644)	1
  (0, 29790)	1
  (0, 91575)	1
  (0, 46526)	1
  (0, 38104)	1


In [54]:
# try to convert sparse to dense array but this caused ram crach 
"""# Convert the sparse matrix to a dense matrix
X_dense = X_sparse.toarray()"""

In [63]:
"""print(X_dense.shape)
X_dense[0]"""

(17879, 96661)


array([0, 0, 0, ..., 0, 0, 0])

### Split

In [25]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_sparse, df['fraudulent'], test_size=0.2, random_state=42)


In [26]:
print(f'X_train : {X_train.shape}')
print(f'X_test : {X_test.shape}')
print(f'y_train : {y_train.shape}')
print(f'y_test : {y_test.shape}')

X_train : (14303, 96661)
X_test : (3576, 96661)
y_train : (14303,)
y_test : (3576,)


### Logistic Regression

In [27]:
# Create a classification model (e.g., Logistic Regression)
model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [28]:
# Evaluate the model
accuracy = model.score(X_test, y_test)
print("Accuracy:", accuracy)

Accuracy: 0.9807046979865772


### custom test 

In [29]:
# Sample test data
sample_test = {
    'company_profile': "We are a leading company in the tech industry.",
    'description': "We are looking for a skilled software engineer with experience in Python.",
    'requirements': "Candidates must have a Bachelor's degree in Computer Science."
}


In [30]:
# vectorize the test sample
tes_vec = vectorizer.transform([sample_test['company_profile'] + ' ' +
                                    sample_test['description'] + ' ' +
                                    sample_test['requirements']])

In [31]:
# predict the test sample
pred = model.predict(tes_vec)
pred

array([0])

In [32]:
# Output the predictions
if pred[0] == 1:
    print("fraudulent.")
else:
    print("honest.")

honest.
