# Dealing with Missing Values

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Placeholder strings that should be parsed as missing (NaN) while loading.
missing_values = [
    "n/a",
    "nan",
    "None",
    "Not Specified",
    "Unspecified",
]

# Load the raw job postings, converting the placeholders above to NaN.
df = pd.read_csv('fake_job_postings.csv', na_values=missing_values)

In [2]:
# Display a copy of `df` with the placeholder strings mapped to NaN.
# The result is not assigned, so `df` itself is left unchanged by this cell.
df.replace(dict.fromkeys(["n/a", "None", "Not Specified", "Unspecified"], np.nan))

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17875,17876,Account Director - Distribution,"CA, ON, Toronto",Sales,,Vend is looking for some awesome new talent to...,Just in case this is the first time you’ve vis...,To ace this role you:Will eat comprehensive St...,What can you expect from us?We have an open cu...,0,1,1,Full-time,Mid-Senior level,,Computer Software,Sales,0
17876,17877,Payroll Accountant,"US, PA, Philadelphia",Accounting,,WebLinc is the e-commerce platform and service...,The Payroll Accountant will focus primarily on...,- B.A. or B.S. in Accounting- Desire to have f...,Health &amp; WellnessMedical planPrescription ...,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Internet,Accounting/Auditing,0
17877,17878,Project Cost Control Staff Engineer - Cost Con...,"US, TX, Houston",,,We Provide Full Time Permanent Positions for m...,Experienced Project Cost Control Staff Enginee...,At least 12 years professional experience.Abil...,,0,0,0,Full-time,,,,,0
17878,17879,Graphic Designer,"NG, LA, Lagos",,,,Nemsia Studios is looking for an experienced v...,1. Must be fluent in the latest versions of Co...,Competitive salary (compensation will be based...,0,0,1,Contract,Not Applicable,Professional,Graphic Design,Design,0


In [3]:
# Number of missing entries in each column.
df.isna().sum()

job_id                     0
title                      0
location                 346
department             11547
salary_range           15012
company_profile         3308
description                1
requirements            2696
benefits                7212
telecommuting              0
has_company_logo           0
has_questions              0
employment_type         3471
required_experience     7050
required_education      9502
industry                4903
function                6455
fraudulent                 0
dtype: int64

In [4]:
# Drop the identifier column and the three columns with the highest
# missing-value counts in the summary above.
df_new = df.drop(
    ['job_id', 'department', 'salary_range', 'required_education'],
    axis='columns',
)

In [5]:
# Print the non-null count and dtype of every column kept in `df_new`.
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17880 entries, 0 to 17879
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   title                17880 non-null  object
 1   location             17534 non-null  object
 2   company_profile      14572 non-null  object
 3   description          17879 non-null  object
 4   requirements         15184 non-null  object
 5   benefits             10668 non-null  object
 6   telecommuting        17880 non-null  int64 
 7   has_company_logo     17880 non-null  int64 
 8   has_questions        17880 non-null  int64 
 9   employment_type      14409 non-null  object
 10  required_experience  10830 non-null  object
 11  industry             12977 non-null  object
 12  function             11425 non-null  object
 13  fraudulent           17880 non-null  int64 
dtypes: int64(4), object(10)
memory usage: 1.9+ MB


In [6]:
def _fill_most_frequent(column):
    """Fill the missing entries of `column` with its most frequent value.

    The fill value is the first entry of ``column.value_counts()``, i.e. the
    most frequent non-null value. A column with no non-null values has no
    such entry; it is returned unchanged instead of raising ``IndexError``.
    """
    counts = column.value_counts()
    if counts.empty:
        return column
    return column.fillna(counts.index[0])


# Impute every column independently with its own most frequent value.
df_most_freq_imputed = df_new.apply(_fill_most_frequent)
df_most_freq_imputed.head()

Unnamed: 0,title,location,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,industry,function,fraudulent
0,Marketing Intern,"US, NY, New York","We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,See job description,0,1,0,Other,Internship,Information Technology and Services,Marketing,0
1,Customer Service - Cloud Video Production,"NZ, , Auckland","90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,Marketing and Advertising,Customer Service,0
2,Commissioning Machinery Assistant (CMA),"US, IA, Wever",Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,See job description,0,1,0,Full-time,Mid-Senior level,Information Technology and Services,Information Technology,0
3,Account Executive - Washington DC,"US, DC, Washington",Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Computer Software,Sales,0
4,Bill Review Manager,"US, FL, Fort Worth",SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Hospital & Health Care,Health Care Provider,0


# Text Cleaning

In [7]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import time
import string

In [8]:
# Work on a copy so the imputed frame itself is never modified.
df_imputed = df_most_freq_imputed.copy()

# Free-text columns that go through the cleaning pipeline.
df_text = df_imputed.loc[
    :, ['title', 'company_profile', 'description', 'requirements', 'benefits']
]

# Frame that will receive the cleaned `c_<column>` columns.
df_text_cleaned = df_text.copy()

def text_cleaning(i):
    """Clean the i-th text column and store it as ``c_<column>``.

    The column name is taken from position `i` of the module-level
    ``df_text``; the text is read from, and the result written to, the
    module-level ``df_text_cleaned``.

    Steps, in order: replace punctuation with spaces, remove digits,
    tokenize, lowercase, drop English stopwords, lemmatize, and join the
    remaining tokens back into a single space-separated string.

    Parameters
    ----------
    i : int
        Positional index of the column in ``df_text``.
    """
    import re

    global df_text_cleaned

    # Get the column's name
    col = df_text.columns[i]

    # Escape the punctuation before placing it in a character class:
    # unescaped, the `\]` pair inside string.punctuation is read as an
    # escaped bracket, so backslashes would never be matched.
    punctuation_pattern = '[{}]'.format(re.escape(string.punctuation))

    # A set gives constant-time membership tests for every token.
    english_stopwords = frozenset(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    def _normalize(text):
        # Tokenize, lowercase, drop stopwords, lemmatize, re-join.
        tokens = (word.lower() for word in word_tokenize(text))
        return ' '.join(
            lemmatizer.lemmatize(word)
            for word in tokens
            if word not in english_stopwords
        )

    cleaned = (
        df_text_cleaned[col]
        # Remove special characters
        .str.replace(punctuation_pattern, ' ', regex=True)
        # Remove numbers
        .str.replace(r'\d+', '', regex=True)
        .apply(_normalize)
    )

    # Write to the shared frame once, after all per-column work is done on a
    # local Series, instead of rewriting the shared column at every step.
    df_text_cleaned['c_' + col] = cleaned

## Use `Pool.map` from `multiprocessing.pool` (thread-based `ThreadPool`)

In [9]:
from multiprocessing.pool import ThreadPool as Pool

def initpool(df):
    """Pool initializer: bind `df` to the module-level ``df_text_cleaned``.

    The pool calls this in each worker with the frame passed through
    ``initargs``, so the workers write to that object.
    """
    globals()['df_text_cleaned'] = df

if __name__ == '__main__':
    n_cols = len(df_text.columns)
    start = time.time()

    # Working copy handed to the workers through `initpool`.
    # NOTE(review): this rebinds the notebook-level name `df`.
    df = df_text.copy()

    # The `with` block releases the pool's worker threads on exit; `map`
    # blocks until every column is processed, so no work is cut short.
    with Pool(initializer=initpool, initargs=(df,)) as p:
        p.map(text_cleaning, range(n_cols))
    end = time.time()

    # Kept inside the guard: `start` and `end` only exist when it ran.
    print('Running time with Pool.map() function: {:.4f} s'.format(end-start))

Running time with Pool.map() function: 159.3800 s


In [10]:
# Print the columns of `df_text_cleaned` with their non-null counts and dtypes.
df_text_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17880 entries, 0 to 17879
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   title              17880 non-null  object
 1   company_profile    17880 non-null  object
 2   description        17880 non-null  object
 3   requirements       17880 non-null  object
 4   benefits           17880 non-null  object
 5   c_title            17880 non-null  object
 6   c_benefits         17880 non-null  object
 7   c_company_profile  17880 non-null  object
 8   c_requirements     17880 non-null  object
 9   c_description      17880 non-null  object
dtypes: object(10)
memory usage: 1.4+ MB


In [11]:
# Carry the non-text columns over from the imputed frame.
for carried_col in [
    "location",
    "telecommuting",
    "has_company_logo",
    "has_questions",
    "employment_type",
    "required_experience",
    "industry",
    "function",
    "fraudulent",
]:
    df_text_cleaned[carried_col] = df_imputed[carried_col]

# The first five columns are the raw text columns; drop them so only the
# cleaned `c_*` columns and the carried-over columns remain.
df = df_text_cleaned.drop(columns=df_text_cleaned.columns[:5])

# index=False: do not write the row index. Otherwise it is read back as an
# extra "Unnamed: 0" column and ends up among the features.
df.to_csv('fake_job_postings_most_freq_text_cleaned.csv', index=False)

# Oversampling for Imbalanced Dataset

In [12]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

# Load the cleaned data. If the file was saved with its row index, pandas
# reads that index back as an "Unnamed: 0" column; drop it so it is not used
# as a feature. errors='ignore' makes this a no-op when the column is absent.
df = (
    pd.read_csv('fake_job_postings_most_freq_text_cleaned.csv')
    .drop(columns=['Unnamed: 0'], errors='ignore')
)

X = df.drop('fraudulent', axis=1)
y = df['fraudulent']

# Split the dataset into training data and validation data in the ratio of 8:2
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Use random oversampling; a fixed random_state makes the resampled rows
# reproducible across runs, matching the seeded split above.
oversample = RandomOverSampler(sampling_strategy='minority', random_state=42)

# Fit and apply the transform on the training data only, so the held-out
# split keeps its original class distribution.
df_X, df_y = oversample.fit_resample(X_train, y_train)

print(Counter(df_y))

Counter({0: 13619, 1: 13619})
