# Data Observation and Segmentation

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import re
import os
import copy

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
def summary_dataframe(data):
    """Build a per-column summary of missing values and distinct values.

    Parameters
    ----------
    data : pd.DataFrame
        Frame to summarise.

    Returns
    -------
    pd.DataFrame
        One row per column of ``data`` with the fields ``column_name``,
        ``null_count`` (number of missing entries), ``null_rate``
        (``null_count`` divided by the number of rows) and ``elements``
        (number of distinct values, all missing entries counting as one
        value).
    """
    null_counts = data.isnull().sum()
    summary_df = pd.DataFrame({"column_name":null_counts.index, "null_count":null_counts.values})
    summary_df["null_rate"] = null_counts.values/len(data)
    # nunique(dropna=False) counts NaN exactly once. Building a Python set from
    # to_list() over-counted float columns: every NaN becomes its own float
    # object and NaN != NaN, so each missing entry was a separate "element".
    summary_df["elements"] = [data[column_name].nunique(dropna=False) for column_name in data.columns]
    return summary_df

In [4]:
# Name of the label column used as the prediction target.
TARGET_COLUMN = "fraudulent"

In [5]:
# Load the raw job-postings table; the relative path means the CSV must sit in
# the notebook's working directory.
job_postings_df = pd.read_csv("fake-job-posting-prediction.csv")

## Source Data

In [6]:
# Preview the first rows of the raw data.
job_postings_df.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [7]:
# Null counts, null rates and distinct-value counts for every raw column.
summary_dataframe(job_postings_df)

Unnamed: 0,column_name,null_count,null_rate,elements
0,job_id,0,0.0,17880
1,title,0,0.0,11231
2,location,346,0.019351,3106
3,department,11547,0.645805,1338
4,salary_range,15012,0.839597,875
5,company_profile,3308,0.185011,1710
6,description,1,5.6e-05,14802
7,requirements,2695,0.150727,11969
8,benefits,7210,0.403244,6206
9,telecommuting,0,0.0,2


In [8]:
# Descriptive statistics of the numeric columns in the raw data.
job_postings_df.describe()

Unnamed: 0,job_id,telecommuting,has_company_logo,has_questions,fraudulent
count,17880.0,17880.0,17880.0,17880.0,17880.0
mean,8940.5,0.042897,0.795302,0.491723,0.048434
std,5161.655742,0.202631,0.403492,0.499945,0.214688
min,1.0,0.0,0.0,0.0,0.0
25%,4470.75,0.0,1.0,0.0,0.0
50%,8940.5,0.0,1.0,0.0,0.0
75%,13410.25,0.0,1.0,1.0,0.0
max,17880.0,1.0,1.0,1.0,1.0


In [9]:
# Number of postings per label value, to show the class balance.
job_postings_df[TARGET_COLUMN].value_counts().to_frame()

Unnamed: 0,fraudulent
0,17014
1,866


## References That Inspired Me

* [Real/Fake Job Posting Prediction](https://www.kaggle.com/datasets/shivamb/real-or-fake-fake-jobposting-prediction)

In [10]:
# Work on a deep copy so the feature columns added below never touch the raw frame.
job_postings_extend_df = copy.deepcopy(job_postings_df)

In [11]:
# Preview the working copy before any columns are added.
job_postings_extend_df.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [12]:
# Extend columns

# Replace NaN with empty strings so that missing text is easy to detect and the
# string operations further down work on every row.
job_postings_extend_df = job_postings_extend_df.fillna("")

# Add a 0/1 "<column>_is_null" flag for every column that has missing values in
# the raw data. The flag is taken from the raw null mask instead of len() of
# the filled values, so it also works for non-string columns (len() raises on
# numbers).
null_counts = job_postings_df.isnull().sum()
for column, null_count in null_counts.items():
    if null_count > 0:
        job_postings_extend_df[column+"_is_null"] = job_postings_df[column].isnull().astype("int64")

# Split "country, state, city" locations into three columns (at most two
# splits, so any further ", " stays inside the city part). `n` is passed by
# keyword: since pandas 2.0 every argument of str.split after `pat` is
# keyword-only and the positional form raises TypeError.
job_postings_extend_df[["location_country", "location_state", "location_city"]] = job_postings_extend_df["location"].str.split(", ", n=2, expand=True)

# Derive integer salary bounds from the free-text salary range; -1 stands in
# for a missing or unparseable bound.
def _salary_bounds(raw_range):
    """Return ``(low, up)`` as ints, or ``(-1, -1)`` if no clean pair is found.

    Everything except digits and dashes is stripped first; the remainder must
    split on "-" into exactly two non-empty digit groups.
    """
    if len(raw_range) == 0:
        return -1, -1
    pieces = re.sub(r"[^0-9\-]+", "", raw_range).split("-")
    if len(pieces) != 2 or len(pieces[0]) == 0 or len(pieces[1]) == 0:
        return -1, -1
    return int(pieces[0]), int(pieces[1])

salary_bounds = [_salary_bounds(raw_range) for raw_range in job_postings_extend_df["salary_range"].tolist()]
salary_range_lows = [low for low, up in salary_bounds]
salary_range_ups = [up for low, up in salary_bounds]
salary_range_diff = [up - low for low, up in salary_bounds]

job_postings_extend_df["salary_range_low"] = salary_range_lows
job_postings_extend_df["salary_range_up"] = salary_range_ups
job_postings_extend_df["salary_range_diff"] = salary_range_diff

# Join the free-text fields, separated by single spaces, into one document per
# posting.
posting_text = job_postings_extend_df["title"]
for part_column in ["location", "company_profile", "description", "requirements", "benefits"]:
    posting_text = posting_text + " " + job_postings_extend_df[part_column]
job_postings_extend_df["posting_text"] = posting_text

# Cleaned variant: every run of characters other than ASCII letters and digits
# becomes a space, whitespace runs are collapsed, and the ends are trimmed.
def _clean_posting_text(text):
    """Return `text` reduced to ASCII alphanumeric tokens separated by single spaces."""
    alnum_only = re.sub(r"[^a-zA-Z0-9]+"," ", text)
    return re.sub(r"[\s]+", " ", alnum_only).strip()

job_postings_extend_df["posting_text_cleaned"] = job_postings_extend_df["posting_text"].apply(_clean_posting_text)

# Character count of each text field, stored as "<column>_len".
text_columns = ["company_profile", "description", "requirements", "benefits", "posting_text", "posting_text_cleaned"]
for name in text_columns:
    job_postings_extend_df[f"{name}_len"] = job_postings_extend_df[name].apply(len)

In [13]:
# Same per-column summary, now for the extended frame.
summary_dataframe(job_postings_extend_df)

Unnamed: 0,column_name,null_count,null_rate,elements
0,job_id,0,0.0,17880
1,title,0,0.0,11231
2,location,0,0.0,3106
3,department,0,0.0,1338
4,salary_range,0,0.0,875
5,company_profile,0,0.0,1710
6,description,0,0.0,14802
7,requirements,0,0.0,11969
8,benefits,0,0.0,6206
9,telecommuting,0,0.0,2


In [14]:
# Descriptive statistics of the numeric columns, including the derived features.
job_postings_extend_df.describe()

Unnamed: 0,job_id,telecommuting,has_company_logo,has_questions,fraudulent,location_is_null,department_is_null,salary_range_is_null,company_profile_is_null,description_is_null,...,function_is_null,salary_range_low,salary_range_up,salary_range_diff,company_profile_len,description_len,requirements_len,benefits_len,posting_text_len,posting_text_cleaned_len
count,17880.0,17880.0,17880.0,17880.0,17880.0,17880.0,17880.0,17880.0,17880.0,17880.0,...,17880.0,17880.0,17880.0,17880.0,17880.0,17880.0,17880.0,17880.0,17880.0,17880.0
mean,8940.5,0.042897,0.795302,0.491723,0.048434,0.019351,0.645805,0.839597,0.185011,5.6e-05,...,0.361018,81920.59,129110.8,47190.18,620.901454,1218.004418,590.132438,208.896141,2686.778635,2610.953412
std,5161.655742,0.202631,0.403492,0.499945,0.214688,0.13776,0.478282,0.366989,0.388317,0.007479,...,0.480309,7055431.0,10787920.0,3744941.0,567.4541,894.82862,613.191056,337.077082,1465.829239,1423.076097
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,27.0,23.0
25%,4470.75,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,-1.0,-1.0,0.0,138.0,607.0,146.0,0.0,1612.0,1569.0
50%,8940.5,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,-1.0,-1.0,0.0,570.0,1017.0,467.0,45.0,2539.0,2472.0
75%,13410.25,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,...,1.0,-1.0,-1.0,0.0,879.0,1586.0,820.0,294.0,3491.0,3403.0
max,17880.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,800000000.0,1200000000.0,400000000.0,6178.0,14907.0,10864.0,4429.0,14959.0,14611.0


## Split Data into Train and Test Sets

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the postings for testing. The split is stratified on the
# label and uses a fixed random_state so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    job_postings_extend_df.posting_text_cleaned,
    job_postings_extend_df.fraudulent,
    test_size = 0.20,
    stratify=job_postings_extend_df.fraudulent,
    random_state=777
)

train_df = pd.DataFrame({"text": X_train, "fraudulent": y_train})
test_df = pd.DataFrame({"text": X_test, "fraudulent": y_test})

# to_csv does not create missing directories, so make sure the output folder
# exists before writing; otherwise a fresh checkout fails here.
os.makedirs("splitted_text_data", exist_ok=True)
train_df.to_csv("splitted_text_data/train.csv", index=False)
test_df.to_csv("splitted_text_data/test.csv", index=False)