In [1]:
# Basic Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot
sb.set() # set the default Seaborn style for graphics

In [2]:
# Load the raw job-postings dataset; the relative path means the CSV is read
# from the notebook's current working directory.
jobPostings = pd.read_csv('fake_job_postings.csv')
# jobPostings.head()

---
# Extracting `salary_range`

The cleaned salary range is stored in the columns `salary_lower_limit` and `salary_upper_limit`.

For a value like "40000-59000", the corresponding lower and upper limits are 40000 and 59000.

Missing values (NaN) are stored as (0, 0).

For a value with just one number, like "40000", the lower limit is stored as 0 and the upper limit as 40000.

In [3]:
def extract_salary_range(t):
    """Parse a raw ``salary_range`` value into ``(lower_limit, upper_limit)``.

    Parameters
    ----------
    t : str or missing value
        Raw cell value such as ``"40000-59000"`` or ``"40000"``.

    Returns
    -------
    tuple of (int, int)
        * ``"40000-59000"`` -> ``(40000, 59000)``
        * ``"40000"`` -> ``(0, 40000)`` (a lone number is the upper limit)
        * anything that is not a string, or whose pieces are not integers
          (NaN/None, empty text, text with letters such as ``"Oct-20"``,
          a dangling ``"40000-"``) -> ``(0, 0)``
    """
    # NaN/None can arrive when the column has not been filled; treat as missing
    # instead of failing on a string method.
    if not isinstance(t, str):
        return (0, 0)

    parts = t.split('-')
    try:
        if len(parts) == 1:
            return (0, int(parts[0]))
        # Only the first two fields are used, matching the "lower-upper" format.
        return (int(parts[0]), int(parts[1]))
    except ValueError:
        # A non-numeric piece means the value is not a usable salary range.
        return (0, 0)

In [4]:
# Replace missing salary ranges with the string '0' so every entry is a string;
# extract_salary_range turns '0' into (0, 0).
jobPostings['salary_range'] = jobPostings['salary_range'].fillna('0')

In [5]:
# Parse each salary_range entry once. Iterating over the column's values
# (rather than looking rows up by label with [i]) keeps this correct even when
# the DataFrame's index is not the default 0..n-1.
salary_bounds = [extract_salary_range(raw) for raw in jobPostings['salary_range']]
lower = [bounds[0] for bounds in salary_bounds]
upper = [bounds[1] for bounds in salary_bounds]

# Lists are assigned positionally, so they line up with the rows as iterated.
jobPostings["salary_lower_limit"] = lower
jobPostings["salary_upper_limit"] = upper

In [6]:
# Spot-check the parsed limits side by side with the original salary_range text.
jobPostings[["salary_range", "salary_lower_limit", "salary_upper_limit"]].head(50)

Unnamed: 0,salary_range,salary_lower_limit,salary_upper_limit
0,0,0,0
1,0,0,0
2,0,0,0
3,0,0,0
4,0,0,0
5,0,0,0
6,20000-28000,20000,28000
7,0,0,0
8,0,0,0
9,0,0,0


---

# Cleaning `function` using one-hot encoding

Columns can be accessed by using `function_type_is_<function name>`.

For example, to check whether a job has the function 'Marketing', use `jobPostings["function_type_is_Marketing"]`, which is a boolean column.

In [7]:
# Give missing job functions an explicit label, then store the column as a
# pandas categorical.
jobPostings["function"] = (
    jobPostings["function"]
    .fillna('Not specified')
    .astype("category")
)

In [8]:
# get_dummies replaces the encoded column, so keep the original labels to
# re-attach afterwards.
temp = jobPostings["function"]

# Drop indicator columns left over from a previous run of this cell; otherwise
# re-executing it would append a second, duplicate set of function_type_is_* columns.
stale_dummies = [c for c in jobPostings.columns if str(c).startswith("function_type_is_")]
jobPostings = jobPostings.drop(columns=stale_dummies)

# generate binary values using get_dummies
jobPostings = pd.get_dummies(jobPostings, columns=["function"], prefix=["function_type_is"], dtype="bool")
# Re-attach the original categorical column (it ends up as the last column).
jobPostings = jobPostings.join(temp)

In [9]:
# List every column's dtype, including the columns added in the cells above.
jobPostings.dtypes

job_id                                        int64
title                                        object
location                                     object
department                                   object
salary_range                                 object
company_profile                              object
description                                  object
requirements                                 object
benefits                                     object
telecommuting                                 int64
has_company_logo                              int64
has_questions                                 int64
employment_type                              object
required_experience                          object
required_education                           object
industry                                     object
fraudulent                                    int64
salary_lower_limit                            int64
salary_upper_limit                            int64
function_typ

---

# Extract country codes from `location` to column `country_code`

'NS' means the location was not specified; it is not an actual country code.

In [10]:
# Fill missing locations with the placeholder "NS" so that taking the first two
# characters of the location yields "NS" for them.
jobPostings["location"] = jobPostings["location"].fillna("NS") # Not specified

In [11]:
def extract_country_code(t):
    """Return the two-character country code at the start of a location string.

    Parameters
    ----------
    t : str or missing value
        Raw location text, e.g. ``"US, NY, New York"``.

    Returns
    -------
    str
        The first two characters of ``t`` after surrounding whitespace is
        removed, or ``"NS"`` (not specified) when ``t`` is not a string
        (NaN/None) or is blank.
    """
    # NaN/None cannot be sliced; report them with the "not specified" marker.
    if not isinstance(t, str):
        return "NS"

    # Leading whitespace would otherwise end up inside the two-character code.
    text = t.strip()
    if not text:
        return "NS"
    return text[:2]

In [12]:
# Build the country codes by iterating over the location values directly;
# label lookups with [i] only work when the index is the default 0..n-1.
code = [extract_country_code(location) for location in jobPostings["location"]]

# The list is assigned positionally, then stored as a categorical column.
jobPostings["country_code"] = code
jobPostings["country_code"] = jobPostings["country_code"].astype("category")

In [13]:
# List every column's dtype again, now including country_code.
jobPostings.dtypes

job_id                                        int64
title                                        object
location                                     object
department                                   object
salary_range                                 object
company_profile                              object
description                                  object
requirements                                 object
benefits                                     object
telecommuting                                 int64
has_company_logo                              int64
has_questions                                 int64
employment_type                              object
required_experience                          object
required_education                           object
industry                                     object
fraudulent                                    int64
salary_lower_limit                            int64
salary_upper_limit                            int64
function_typ