# Read Database
Read and clean the data from the database.

## Load dependencies

In [1]:
import pandas as pd
import numpy as np

## Load database and set dtypes

In [2]:
# Columns parsed as categoricals; every other listed column is parsed with the
# pandas "string" dtype.
categorical_columns = {
    "telecommuting",
    "has_company_logo",
    "has_questions",
    "fraudulent",
}

# Columns that receive an explicit dtype, in the order they are declared.
csv_columns = [
    "title",
    "location",
    "department",
    "salary_range",
    "company_profile",
    "description",
    "requirements",
    "benefits",
    "telecommuting",
    "has_company_logo",
    "has_questions",
    "employment_type",
    "required_experience",
    "required_education",
    "industry",
    "function",
    "fraudulent",
]

dtypes = {
    name: "category" if name in categorical_columns else "string"
    for name in csv_columns
}

# The frame is indexed by the `job_id` column of the CSV.
data = pd.read_csv(
    "../src/dataset/fake_job_postings.csv",
    index_col="job_id",
    dtype=dtypes,
)


## Set categories

In [3]:
# Readable labels for the flag columns. The labels are applied positionally:
# the first label replaces the column's first existing category, the second
# label replaces the second.
# NOTE(review): this presumes each flag column was parsed with exactly two
# categories, ordered so that the "no" value comes first — confirm against the CSV.
categories = {
    "telecommuting": ["no", "yes"],
    "has_company_logo": ["no", "yes"],
    "has_questions": ["no", "yes"],
    "fraudulent": ["no", "yes"]
}

for column, column_categories in categories.items():
    # Assigning to `.cat.categories` was deprecated in pandas 1.5 and removed in
    # pandas 2.0. `rename_categories` is the supported replacement; given a
    # list it renames by position, exactly as the old setter did, and raises
    # if the number of labels does not match the number of categories.
    data[column] = data[column].cat.rename_categories(column_categories)


## Fill empty values

In [4]:
# Columns whose missing entries are replaced with a placeholder; columns not
# listed here are left untouched.
text_columns = [
    "title",
    "location",
    "department",
    "salary_range",
    "company_profile",
    "description",
    "requirements",
    "benefits",
    "employment_type",
    "required_experience",
    "required_education",
    "industry",
    "function",
]

# Column name -> replacement value; every listed column uses the same one.
fillna = dict.fromkeys(text_columns, "Missing")

for name, placeholder in fillna.items():
    data[name] = data[name].fillna(placeholder)


## Separate the `fraudulent` column

In [10]:
# Set the `fraudulent` column aside as its own Series, then rebind `data` to a
# frame without it.
split_off_column = "fraudulent"
fraudulent = data[split_off_column]

data = data.drop(columns=split_off_column)


## Display

In [11]:
# Bare last expression: the notebook renders the frame as the cell output.
# At this point `fraudulent` has already been removed from `data`.
data

Unnamed: 0_level_0,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function
job_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,Marketing Intern,"US, NY, New York",Marketing,Missing,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,Missing,no,yes,no,Other,Internship,Missing,Missing,Marketing
2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,Missing,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,no,yes,no,Full-time,Not Applicable,Missing,Marketing and Advertising,Customer Service
3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",Missing,Missing,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,Missing,no,yes,no,Missing,Missing,Missing,Missing,Missing
4,Account Executive - Washington DC,"US, DC, Washington",Sales,Missing,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,no,yes,no,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales
5,Bill Review Manager,"US, FL, Fort Worth",Missing,Missing,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,no,yes,yes,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17876,Account Director - Distribution,"CA, ON, Toronto",Sales,Missing,Vend is looking for some awesome new talent to...,Just in case this is the first time you’ve vis...,To ace this role you:Will eat comprehensive St...,What can you expect from us?We have an open cu...,no,yes,yes,Full-time,Mid-Senior level,Missing,Computer Software,Sales
17877,Payroll Accountant,"US, PA, Philadelphia",Accounting,Missing,WebLinc is the e-commerce platform and service...,The Payroll Accountant will focus primarily on...,- B.A. or B.S. in Accounting- Desire to have f...,Health &amp; WellnessMedical planPrescription ...,no,yes,yes,Full-time,Mid-Senior level,Bachelor's Degree,Internet,Accounting/Auditing
17878,Project Cost Control Staff Engineer - Cost Con...,"US, TX, Houston",Missing,Missing,We Provide Full Time Permanent Positions for m...,Experienced Project Cost Control Staff Enginee...,At least 12 years professional experience.Abil...,Missing,no,no,no,Full-time,Missing,Missing,Missing,Missing
17879,Graphic Designer,"NG, LA, Lagos",Missing,Missing,Missing,Nemsia Studios is looking for an experienced v...,1. Must be fluent in the latest versions of Co...,Competitive salary (compensation will be based...,no,no,yes,Contract,Not Applicable,Professional,Graphic Design,Design
