# Data Observation and Segmentation

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import re
import os
import copy

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [7]:
# Mount Google Drive at /content/drive (Colab only); the dataset is read from
# /content/drive/MyDrive further down.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
def summary_dataframe(data):
    """Summarise missing values and cardinality for every column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` with the columns:

        * ``column_name`` - name of the inspected column
        * ``null_count``  - number of null entries (``isnull()``)
        * ``null_rate``   - ``null_count`` divided by the number of rows
          (NaN when ``data`` has no rows)
        * ``elements``    - number of distinct values, where all nulls
          together count as one value
    """
    null_counts = data.isnull().sum()
    row_count = len(data)

    summary_df = pd.DataFrame({"column_name": null_counts.index, "null_count": null_counts.values})

    # A rate is undefined for a zero-row frame; avoid dividing by zero.
    summary_df["null_rate"] = null_counts.values / row_count if row_count else float("nan")

    # nunique(dropna=False) treats all NaN as a single value. Building a set
    # from to_list() counted every NaN of a numeric column separately, because
    # each one becomes a distinct float object that never compares equal.
    summary_df["elements"] = data.nunique(dropna=False).values
    return summary_df

In [4]:
# Name of the label column; used below to count the rows per class.
TARGET_COLUMN = "fraudulent"

In [9]:
# Load the raw job-postings training set from the mounted Drive folder.
job_postings_df = pd.read_csv("/content/drive/MyDrive/Fake-JD-Detector/job_postings_training_set.csv")

## Source Data

In [10]:
# Preview the first rows of the raw data.
job_postings_df.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [11]:
# Per-column null count, null rate and distinct-value count of the raw data.
summary_dataframe(job_postings_df)

Unnamed: 0,column_name,null_count,null_rate,elements
0,job_id,0,0.0,17828
1,title,0,0.0,11188
2,location,345,0.019352,3098
3,department,11514,0.645838,1338
4,salary_range,14966,0.839466,875
5,company_profile,3290,0.184541,1708
6,description,1,5.6e-05,14678
7,requirements,2688,0.150774,11882
8,benefits,7184,0.402962,6132
9,telecommuting,0,0.0,2


In [12]:
# Descriptive statistics of the numeric columns of the raw data.
job_postings_df.describe()

Unnamed: 0,job_id,telecommuting,has_company_logo,has_questions,fraudulent
count,17828.0,17828.0,17828.0,17828.0,17828.0
mean,8914.5,0.042966,0.795434,0.491979,0.048351
std,5146.644635,0.202786,0.403395,0.49995,0.214513
min,1.0,0.0,0.0,0.0,0.0
25%,4457.75,0.0,1.0,0.0,0.0
50%,8914.5,0.0,1.0,0.0,0.0
75%,13371.25,0.0,1.0,1.0,0.0
max,17828.0,1.0,1.0,1.0,1.0


In [13]:
# Number of rows per value of the target label (class balance).
job_postings_df[TARGET_COLUMN].value_counts().to_frame()

Unnamed: 0,fraudulent
0,16966
1,862


## References That Inspired This Work

* [Real/Fake Job Posting Prediction](https://www.kaggle.com/datasets/shivamb/real-or-fake-fake-jobposting-prediction)

In [14]:
# Work on an independent deep copy so the raw frame stays untouched while
# derived columns are added.
job_postings_extend_df = job_postings_df.copy(deep=True)

In [15]:
# Preview the first rows of the working copy.
job_postings_extend_df.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [16]:
# Extend columns

def _parse_salary_range(salary_range):
    """Parse a "<low>-<up>" salary string into a pair of integer bounds.

    Every character other than digits and "-" is stripped first. Returns
    ``(-1, -1)`` when what remains is not exactly two non-empty digit groups
    separated by a single "-" (for example an empty string).
    """
    parts = re.sub(r"[^0-9\-]+", "", salary_range).split("-")
    if len(parts) == 2 and parts[0] and parts[1]:
        return int(parts[0]), int(parts[1])
    return -1, -1


# Replace NaN with zero-length strings so that every text column can be
# handled with plain string operations (concatenation, regex, length).
job_postings_extend_df = job_postings_extend_df.fillna("")

# One 0/1 indicator per column that has missing values in the raw data.
# Taken from the raw frame's isnull() rather than len(x) on the filled frame,
# so it also works for non-string columns.
null_counts = job_postings_df.isnull().sum()
for column, null_count in null_counts.items():
    if null_count > 0:
        job_postings_extend_df[column + "_is_null"] = job_postings_df[column].isnull().astype(int)

# Split "country, state, city" into three columns. `n` has to be passed by
# keyword: passing it positionally raises a FutureWarning on pandas 1.x and is
# rejected by pandas 2.x. Locations with fewer than three parts get "" for the
# missing parts, consistent with the NaN -> "" normalisation above.
job_postings_extend_df[["location_country", "location_state", "location_city"]] = (
    job_postings_extend_df["location"].str.split(", ", n=2, expand=True).fillna("")
)

# Numeric salary bounds and their spread; -1 / -1 / 0 mark an unparsable or
# missing salary range.
salary_bounds = [_parse_salary_range(salary_range) for salary_range in job_postings_extend_df["salary_range"]]
salary_range_lows = [low for low, _ in salary_bounds]
salary_range_ups = [up for _, up in salary_bounds]
salary_range_diff = [up - low for low, up in salary_bounds]

job_postings_extend_df["salary_range_low"] = salary_range_lows
job_postings_extend_df["salary_range_up"] = salary_range_ups
job_postings_extend_df["salary_range_diff"] = salary_range_diff

# Single free-text field built from all descriptive columns.
job_postings_extend_df["posting_text"] = (
    job_postings_extend_df["title"]+" "+
    job_postings_extend_df["location"]+" "+
    job_postings_extend_df["company_profile"]+" "+
    job_postings_extend_df["description"]+" "+
    job_postings_extend_df["requirements"]+" "+
    job_postings_extend_df["benefits"]
)

# Keep ASCII letters and digits only. Each run of other characters (including
# whitespace) collapses to one space, so no separate whitespace pass is needed.
job_postings_extend_df["posting_text_cleaned"] = (
    job_postings_extend_df["posting_text"]
    .str.replace(r"[^a-zA-Z0-9]+", " ", regex=True)
    .str.strip()
)

# Character length of each text field.
text_columns = ["company_profile", "description", "requirements", "benefits", "posting_text", "posting_text_cleaned"]
for text_column in text_columns:
    job_postings_extend_df[text_column+"_len"] = job_postings_extend_df[text_column].str.len()

  job_postings_extend_df[["location_country", "location_state", "location_city"]] = job_postings_extend_df["location"].str.split(", ",  2, expand=True)


In [17]:
# Per-column null count, null rate and distinct-value count of the extended frame.
summary_dataframe(job_postings_extend_df)

Unnamed: 0,column_name,null_count,null_rate,elements
0,job_id,0,0.0,17828
1,title,0,0.0,11188
2,location,0,0.0,3098
3,department,0,0.0,1338
4,salary_range,0,0.0,875
5,company_profile,0,0.0,1708
6,description,0,0.0,14678
7,requirements,0,0.0,11882
8,benefits,0,0.0,6132
9,telecommuting,0,0.0,2


In [18]:
# Descriptive statistics of the numeric columns of the extended frame.
job_postings_extend_df.describe()

Unnamed: 0,job_id,telecommuting,has_company_logo,has_questions,fraudulent,location_is_null,department_is_null,salary_range_is_null,company_profile_is_null,description_is_null,...,function_is_null,salary_range_low,salary_range_up,salary_range_diff,company_profile_len,description_len,requirements_len,benefits_len,posting_text_len,posting_text_cleaned_len
count,17828.0,17828.0,17828.0,17828.0,17828.0,17828.0,17828.0,17828.0,17828.0,17828.0,...,17828.0,17828.0,17828.0,17828.0,17828.0,17828.0,17828.0,17828.0,17828.0,17828.0
mean,8914.5,0.042966,0.795434,0.491979,0.048351,0.019352,0.645838,0.839466,0.184541,5.6e-05,...,0.360837,82146.86,129471.2,47324.29,621.079594,1218.000224,590.106181,208.92551,2686.940767,2612.368073
std,5146.644635,0.202786,0.403395,0.49995,0.214513,0.137761,0.478272,0.367111,0.387936,0.007489,...,0.480257,7065713.0,10803640.0,3750398.0,567.299039,892.607405,613.566109,337.022023,1464.791575,1423.692278
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,27.0,23.0
25%,4457.75,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,-1.0,-1.0,0.0,139.0,607.0,146.0,0.0,1612.0,1570.0
50%,8914.5,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,-1.0,-1.0,0.0,570.0,1017.0,467.0,45.0,2540.5,2473.5
75%,13371.25,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,...,1.0,-1.0,-1.0,0.0,879.0,1587.0,820.0,294.25,3490.25,3404.0
max,17828.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,800000000.0,1200000000.0,400000000.0,6178.0,14881.0,10864.0,4427.0,14933.0,14611.0


## Split Data into Train and Test Sets

In [19]:
from sklearn.model_selection import train_test_split

# Split parameters and output location, named so they are easy to find and tune.
SPLIT_OUTPUT_DIR = "/content/drive/MyDrive/Fake-JD-Detector/splitted_text_data"
TEST_SIZE = 0.20
SPLIT_SEED = 777

# Hold out TEST_SIZE of the postings. Stratifying on the label keeps the share
# of fraudulent postings the same in both parts; the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    job_postings_extend_df.posting_text_cleaned,
    job_postings_extend_df.fraudulent,
    test_size=TEST_SIZE,
    stratify=job_postings_extend_df.fraudulent,
    random_state=SPLIT_SEED,
)

train_df = pd.DataFrame({"text": X_train, "fraudulent": y_train})
test_df = pd.DataFrame({"text": X_test, "fraudulent": y_test})

# Every posting must end up in exactly one of the two parts.
assert len(train_df) + len(test_df) == len(job_postings_extend_df)

# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists before writing.
os.makedirs(SPLIT_OUTPUT_DIR, exist_ok=True)
train_df.to_csv(os.path.join(SPLIT_OUTPUT_DIR, "train.csv"), index=False)
# The held-out part is stored under the name "verify.csv".
test_df.to_csv(os.path.join(SPLIT_OUTPUT_DIR, "verify.csv"), index=False)