Define how the raw job-posting data is loaded, cleaned, and saved.

In [1]:
import csv
import os
import pathlib
import re
import shutil

import numpy as np
import pandas as pd

In [2]:
#CSV column indices

TITLE_INDEX = 1
LOCATION_INDEX=2
DEPARTMENT_INDEX=3
SALARY_INDEX=4
COMPANY_PROFILE_INDEX=5
DESCRIPTION_INDEX=6
REQUIREMENTS_INDEX=7
BENEFITS_INDEX=8
TELECOMMUTING_INDEX=9
HAS_LOGO_INDEX = 10
HAS_QUESTIONS_INDEX= 11
EMPLOYMENT_TYPE_INDEX=12
REQUIRED_EXPERIENCE_INDEX=13
REQUIRED_EDUCATION_INDEX=14
INDUSTRY_INDEX=15
FUNCTION_INDEX=16
FRAUDULENT_INDEX=17

#Dataframe labels

TITLE_LABEL="title"
LOCATION_LABEL="location"
DEPARTMENT_LABEL="department"
COMPANY_PROFILE_LABEL="company_profile"
DESCRIPTION_LABEL="description"
REQUIREMENTS_LABEL="requirements"
BENEFITS_LABEL="benefits"

MIN_SALARY_LABEL="min_salary"
MAX_SALARY_LABEL="max_salary"
SALARY_RANGE_LABEL="salary_range"
SALARY_MIDPT_LABEL="salary_midpt"

EMPLOYMENT_TYPE_LABEL="employment_type"
REQUIRED_EXPERIENCE_LABEL="required_experience"
REQUIRED_EDUCATION_LABEL="required_education"
INDUSTRY_LABEL="industry"
FUNCTION_LABEL="function"

TELECOMMUTING_LABEL="telecommuting"
HAS_LOGO_LABEL = "has_logo"
HAS_QUESTIONS_LABEL= "has_questions"

FRAUDULENT_LABEL="IS_FRAUDULENT"

#salary processing constants

SALARY_RANGE_REGEX = re.compile(r"\d+-\d+")
SALARY_VAL_REGEX= re.compile(r"\d+")


#text preprocessing hyperparameters

baselineStopwords = [] #TODO
NONALPHANUMERIC_REGEX= re.compile(r"[^A-Za-z0-9^,!.\/'+-=]")


MULT_WHITESPACE_REGEX= re.compile(r"\s{2,}")




In [3]:

def findOrAdd(givenList, givenVal):
    ind = -1
    for i, currVal in enumerate(givenList):
        if currVal == givenVal:
            ind= i
            break
    if ind == -1:
        givenList.append(givenVal)
        ind= len(givenList) -1

    return ind

#classes for handling summary data from the data loading/preprocessing
class CategoriesSummary:
    def __init__(self):
        self.employmentTypeVals = []
        self.requiredExperienceVals = []
        self.requiredEducationVals = []
        self.industryVals = []
        self.functionVals = []

    def findOrAddEmploymentType(self, currEmploymentType):
        return findOrAdd(self.employmentTypeVals, currEmploymentType)
    def findOrAddRequiredExperience(self, currRequiredExperience):
        return findOrAdd(self.requiredExperienceVals, currRequiredExperience)
    def findOrAddRequiredEducation(self, currRequiredEducation):
        return findOrAdd(self.requiredEducationVals, currRequiredEducation)
    def findOrAddIndustry(self, currIndustry):
        return findOrAdd(self.industryVals, currIndustry)
    def findOrAddFunction(self, currFunction):
        return findOrAdd(self.functionVals, currFunction)

    def saveToFiles(self, dirPath):
        employmentTypeOptionsFilePath = os.path.join(dirPath, "employment_type_options.txt")
        with open(employmentTypeOptionsFilePath) as employmentTypeOptionsFile:
            employmentTypeOptionsFile.writelines(self.employmentTypeVals)
        
        requiredExperienceOptionsFilePath = os.path.join(dirPath, "required_experience_options.txt")
        with open(requiredExperienceOptionsFilePath) as requiredExperienceOptionsFile:
            requiredExperienceOptionsFile.writelines(self.requiredExperienceVals)
        
        requiredEducationOptionsFilePath = os.path.join(dirPath, "required_education_options.txt")
        with open(requiredEducationOptionsFilePath) as requiredEducationOptionsFile:
            requiredEducationOptionsFile.writelines(self.requiredEducationVals)
        
        industryOptionsFilePath = os.path.join(dirPath, "industry_options.txt")
        with open(industryOptionsFilePath) as industryOptionsFile:
            industryOptionsFile.writelines(self.industryVals)
        
        functionOptionsFilePath = os.path.join(dirPath, "function_options.txt")
        with open(functionOptionsFilePath) as functionOptionsFile:
            functionOptionsFile.writelines(self.functionVals)
        
        
        


class TextAttributeSummaries:
    def __init__(self):
        self.cumulTitlesText = ""
        self.cumulLocationsText = ""
        self.cumulDepartmentsText = ""
        self.cumulCompanyProfilesText = ""
        self.cumulDescriptionsText = ""
        self.cumulRequirementsText = ""
        self.cumulBenefitsText = ""

    def addTitle(self, currTitle):
        self.cumulTitlesText += currTitle + " "
    def addLocation(self, currLocation):
        self.cumulLocationsText += currLocation + " "
    def addDepartment(self, currDepartment):
        self.cumulDepartmentsText += currDepartment + " "
    def addCompanyProfile(self, currCompanyProfile):
        self.cumulCompanyProfilesText += currCompanyProfile + " "
    def addDescription(self, currDescription):
        self.cumulDescriptionsText += currDescription + " "
    def addRequirements(self, currRequirements):
        self.cumulRequirementsText += currRequirements + " "
    def addBenefits(self, currBenefits):
        self.cumulBenefitsText += currBenefits + " "

    def saveToFile(self, dirPath):
        textAttributeSummariesFilePath = os.path.join(dirPath, "text_attribute_summaries.txt")
        with open(textAttributeSummariesFilePath) as textAttributeSummariesFile:
            textAttributeSummariesFile.write("title: %s\n" % self.cumulTitlesText)
            textAttributeSummariesFile.write("location: %s\n" % self.cumulLocationsText)
            textAttributeSummariesFile.write("department: %s\n" % self.cumulDepartmentsText)
            textAttributeSummariesFile.write("company profile: %s\n" % self.cumulCompanyProfilesText)
            textAttributeSummariesFile.write("description: %s\n" % self.cumulDescriptionsText)
            textAttributeSummariesFile.write("requirements: %s\n" % self.cumulRequirementsText)
            textAttributeSummariesFile.write("benefits: %s\n" % self.cumulBenefitsText)





In [4]:
#preprocessing functions

# based on
#  https://www.kaggle.com/lystdo/lstm-with-word2vec-embeddings
def cleanText(rawText, stopwordsList= None, stemmer= None):
    processedText = ""

    processedText= rawText.lower()


    processedText = re.sub(NONALPHANUMERIC_REGEX, " ", processedText)

    # TODO review these- with some I'm not sure
    #  whether they work as written (raw string vs escape characters?) or whether I want them
    processedText = re.sub(r"what's", "what is ", processedText)
    processedText = re.sub(r"\'s", " ", processedText)
    processedText = re.sub(r"\'ve", " have ", processedText)
    processedText = re.sub(r"can't", "cannot ", processedText)
    processedText = re.sub(r"n't", " not ", processedText)
    processedText = re.sub(r"i'm", "i am ", processedText)
    processedText = re.sub(r"\'re", " are ", processedText)
    processedText = re.sub(r"\'d", " would ", processedText)
    processedText = re.sub(r"\'ll", " will ", processedText)
    processedText = re.sub(r",", " ", processedText)
    processedText = re.sub(r"\.", " ", processedText)
    processedText = re.sub(r"!", " ! ", processedText)
    processedText = re.sub(r"\/", " ", processedText)
    processedText = re.sub(r"\^", " ^ ", processedText)
    processedText = re.sub(r"\+", " + ", processedText)
    processedText = re.sub(r"\-", " - ", processedText)
    processedText = re.sub(r"\=", " = ", processedText)
    processedText = re.sub(r"'", " ", processedText)
    processedText = re.sub(r"(\d+)(k)", r"\g<1>000", processedText)
    processedText = re.sub(r":", " : ", processedText)
    processedText = re.sub(r" e g ", " eg ", processedText)
    processedText = re.sub(r" b g ", " bg ", processedText)
    processedText = re.sub(r" u s ", " american ", processedText)
    processedText = re.sub(r"\0s", "0", processedText)
    processedText = re.sub(r" 9 11 ", "911", processedText)
    processedText = re.sub(r"e - mail", "email", processedText)
    processedText = re.sub(r"j k", "jk", processedText)

    processedText = re.sub(MULT_WHITESPACE_REGEX, " ", processedText)

    #todo does this need to go before the regex?
    if stopwordsList is not None:
        textArr = processedText.split()
        textArr = [currWord for currWord in textArr if not currWord in stopwordsList]
        processedText = " ".join(textArr)

    if stemmer is not None:
        textArr = processedText.split()
        stemmedWords = [stemmer.stem(currWord) for currWord in textArr]
        processedText = " ".join(stemmedWords)

    return processedText

def processJobListing(rawDataRow, categorySummariesObj, textAttributeSummariesObj):
    processedListing = {}


    #copy the boolean values
    processedListing[TELECOMMUTING_LABEL] = rawDataRow[TELECOMMUTING_INDEX]
    processedListing[HAS_LOGO_LABEL] = rawDataRow[HAS_LOGO_INDEX]
    processedListing[HAS_QUESTIONS_LABEL] = rawDataRow[HAS_QUESTIONS_INDEX]
    processedListing[FRAUDULENT_LABEL] = rawDataRow[FRAUDULENT_INDEX]

    #one-hot encode the categorical attributes
    currEmploymentType = rawDataRow[EMPLOYMENT_TYPE_INDEX]
    currEmploymentTypeInd = categorySummariesObj.findOrAddEmploymentType(currEmploymentType)
    processedListing[EMPLOYMENT_TYPE_LABEL] = currEmploymentTypeInd
    
    currRequiredExperience = rawDataRow[REQUIRED_EXPERIENCE_INDEX]
    currRequiredExperienceInd = categorySummariesObj.findOrAddRequiredExperience(currRequiredExperience)
    processedListing[REQUIRED_EXPERIENCE_LABEL] = currRequiredExperienceInd
    
    currRequiredEducation = rawDataRow[REQUIRED_EDUCATION_INDEX]
    currRequiredEducationInd = categorySummariesObj.findOrAddRequiredEducation(currRequiredEducation)
    processedListing[REQUIRED_EDUCATION_LABEL] = currRequiredEducationInd
    
    currIndustry = rawDataRow[INDUSTRY_INDEX]
    currIndustryInd = categorySummariesObj.findOrAddIndustry(currIndustry)
    processedListing[INDUSTRY_LABEL] = currIndustryInd
    
    currFunction = rawDataRow[FUNCTION_INDEX]
    currFunctionInd = categorySummariesObj.findOrAddFunction(currFunction)
    processedListing[FUNCTION_LABEL] = currFunctionInd

    #process salary attribute, eliminating invalid salary entries
    currSalaryText = rawDataRow[SALARY_INDEX]

    minSalaryVal = -1
    maxSalaryVal = -1
    salaryRange= -1
    salaryMidpt = -1

    if SALARY_RANGE_REGEX.match(currSalaryText):
        salaryStrs = currSalaryText.split("-")
        minSalaryStr= salaryStrs[0]
        maxSalaryStr= salaryStrs[1]

        minSalaryVal = float(minSalaryStr)
        maxSalaryVal = float(maxSalaryStr)
        salaryRange = maxSalaryVal - minSalaryVal
        salaryMidpt = (maxSalaryVal + minSalaryVal)/2
    elif SALARY_VAL_REGEX.match(currSalaryText):
        minSalaryVal = float(currSalaryText)
        maxSalaryVal = minSalaryVal
        salaryRange = 0
        salaryMidpt= minSalaryVal
    else:
        pass #use default invalid values

    processedListing[MIN_SALARY_LABEL] = minSalaryVal
    processedListing[MAX_SALARY_LABEL] = maxSalaryVal
    processedListing[SALARY_RANGE_LABEL] = salaryRange
    processedListing[SALARY_MIDPT_LABEL] = salaryMidpt

    #basic processing of text attributes
    titleVal = rawDataRow[TITLE_INDEX]
    cleanedTitleVal= cleanText(titleVal, baselineStopwords)
    processedListing[TITLE_LABEL] = cleanedTitleVal
    textAttributeSummariesObj.addTitle(cleanedTitleVal)

    locationVal = rawDataRow[LOCATION_INDEX]
    cleanedLocationVal =  cleanText(locationVal, baselineStopwords)
    processedListing[LOCATION_LABEL] = cleanedLocationVal
    textAttributeSummariesObj.addLocation(cleanedLocationVal)

    departmentVal = rawDataRow[DEPARTMENT_INDEX]
    cleanedDepartmentVal = cleanText(departmentVal, baselineStopwords)
    processedListing[DEPARTMENT_LABEL] = cleanedDepartmentVal
    textAttributeSummariesObj.addDepartment(cleanedDepartmentVal)

    companyProfileVal = rawDataRow[COMPANY_PROFILE_INDEX]
    cleanedCompanyProfileVal= cleanText(companyProfileVal, baselineStopwords)
    processedListing[COMPANY_PROFILE_LABEL] = cleanedCompanyProfileVal
    textAttributeSummariesObj.addCompanyProfile(cleanedCompanyProfileVal)

    descriptionVal = rawDataRow[DESCRIPTION_INDEX]
    cleanedDescriptionVal = cleanText(descriptionVal, baselineStopwords)
    processedListing[DESCRIPTION_LABEL] = cleanedDescriptionVal
    textAttributeSummariesObj.addDescription(cleanedDescriptionVal)

    requirementsVal = rawDataRow[REQUIREMENTS_INDEX]
    cleanedRequirementsVal = cleanText(requirementsVal, baselineStopwords)
    processedListing[REQUIREMENTS_LABEL] = cleanedRequirementsVal
    textAttributeSummariesObj.addRequirements(cleanedRequirementsVal)

    benefitsVal = rawDataRow[BENEFITS_INDEX]
    cleanedBenefitsVal = cleanText(benefitsVal, baselineStopwords)
    processedListing[BENEFITS_LABEL] = cleanedBenefitsVal
    textAttributeSummariesObj.addBenefits(cleanedBenefitsVal)

    return processedListing


def loadData(fpath):
    allCategories = CategoriesSummary()
    allTextAttributes = TextAttributeSummaries()
    processedData= []

    with open(fpath) as raw_csv:
        dataReader = csv.reader(raw_csv)
        yield next(dataReader) # eliminate header row

        #preprocesses the data as it's loaded
        for row in dataReader:
            processedRow = processJobListing(row, allCategories, allTextAttributes)
            processedData.append(processedRow)

    return processedData, allCategories, allTextAttributes

In [5]:

currDirStr = os.getcwd()
currDir = pathlib.Path(currDirStr)
projectDir = currDir.parent

DATA_PATH = os.path.join(projectDir, "data")
RAW_DATA_PATH=os.path.join(DATA_PATH, "raw")
PROCESSED_DATA_PATH= os.path.join(DATA_PATH, "processed")

PROCESSED_FILE_PREFIX= "cleaned_"

datasetDirName ="kaggle_fake_job_postings"
datasetDirPath = os.path.join(PROCESSED_DATA_PATH, datasetDirName)
print(list(datasetDirPath))
print(datasetDirPath)
if os.path.exists(datasetDirPath):
    print("overwriting directory at path ", datasetDirPath)
    os.rmdir(datasetDirPath)
os.mkdir(datasetDirPath)

testInd = findOrAdd(list(datasetDirPath), "\\")
print(testInd)


rawFname = "fake_job_postings.csv"
rawFpath = os.path.join(RAW_DATA_PATH, rawFname)

['C', ':', '\\', 'U', 's', 'e', 'r', 's', '\\', 's', 's', 'i', 'l', 'i', '\\', 'P', 'y', 'c', 'h', 'a', 'r', 'm', 'P', 'r', 'o', 'j', 'e', 'c', 't', 's', '\\', 'J', 'o', 'b', 'P', 'o', 's', 't', 'i', 'n', 'g', 'F', 'r', 'a', 'u', 'd', 'D', 'e', 't', 'e', 'c', 't', 'i', 'o', 'n', '\\', 'd', 'a', 't', 'a', '\\', 'p', 'r', 'o', 'c', 'e', 's', 's', 'e', 'd', '\\', 'k', 'a', 'g', 'g', 'l', 'e', '_', 'f', 'a', 'k', 'e', '_', 'j', 'o', 'b', '_', 'p', 'o', 's', 't', 'i', 'n', 'g', 's']
C:\Users\ssili\PycharmProjects\JobPostingFraudDetection\data\processed\kaggle_fake_job_postings
overwriting directory at path  C:\Users\ssili\PycharmProjects\JobPostingFraudDetection\data\processed\kaggle_fake_job_postings
2


In [6]:
# Load and preprocess every row of the raw CSV. loadData's return statement
# hands back the list of per-listing dicts together with the category summary
# and text summary objects that were filled in along the way.
cleanedData, categorySummaries, textAttributeSummaries = loadData(rawFpath)
# One dataframe row per listing, built from the list of processed dicts.
cleanedDataDf = pd.DataFrame(cleanedData)

UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 4370: character maps to <undefined>

In [None]:
cleanedDataPath = os.path.join(datasetDirPath, PROCESSED_FILE_PREFIX + rawFname)
dataSaveResult = cleanedDataDf.to_csv( cleanedDataPath)
if dataSaveResult is not None:
    print("saving dataframe failed with a message (about csv format?): ", dataSaveResult)

categorySummaries.saveToFiles(datasetDirPath)
textAttributeSummaries.saveToFile(datasetDirPath)



