In [1]:

import tensorflow as tf
import tensorflow.keras.preprocessing as preproc
from tensorflow.keras import layers
from tensorflow.keras import Model
from tensorflow.keras import metrics
from tensorflow.keras import callbacks
import pandas as pd
import numpy as np
import sklearn as sk

from DataNamesReference import *
from LstmTraining import getNonEmptyLines
import tensorflow.keras.regularizers as regs

import absl.logging as abslogging

# Enable TensorFlow's INFO-level log messages.
# tf.get_logger() returns a standard-library `logging` logger, whose INFO level is 20.
# absl.logging.INFO is 0, which the standard logger interprets as NOTSET (inherit the
# parent's level), so passing it here never actually enabled INFO output.
tf.get_logger().setLevel("INFO")


import os

# os.environ["TF_CPP_MIN_VLOG_LEVEL"] = "1"



# Keyword arguments shared by every pd.read_csv call in this notebook:
# empty strings are kept as-is instead of becoming NaN, the first CSV column is used
# as the index, and column dtypes come from COLUMN_DATA_TYPES.
CSV_READ_ARGS = dict(
    keep_default_na=False,
    index_col=0,
    dtype=COLUMN_DATA_TYPES,
)

# --- Sequence lengths -------------------------------------------------------
# Maximum number of tokens kept per text field; the "+ 1" reserves one position
# for the start token that is prepended to every text.
MAX_TITLE_LEN = 8+1
MAX_LOCATION_LEN = 5+1
MAX_DEPARTMENT_LEN = 3+1
MAX_COMPANY_PROFILE_LEN = 200+1
MAX_DESCRIPTION_LEN = 300+1
MAX_REQUIREMENTS_LEN = 200+1
MAX_BENEFITS_LEN = 125+1

# Marker prepended to every text field before tokenization.
# It has to survive the Keras Tokenizer's default preprocessing as exactly ONE word.
# The previous value "<TEXT_STARTS_AFTER_THIS>" did not: the default `filters` remove
# "<", ">" and "_", so it was split into the four ordinary words
# "text", "starts", "after", "this". Those consumed four sequence positions instead of
# the single one budgeted above (e.g. the whole 4-token department sequence) and
# collided with real vocabulary.
# NOTE: changing this value changes the token indices, so previously saved weights
# no longer match and the model must be retrained.
START_TOKEN = "textstartsafterthis"

# Sequences are padded and truncated at the end, so the start token always stays first.
PADDING_TYPE = "post"
TRUNCATING_TYPE = "post"

# --- Vocabulary sizes -------------------------------------------------------
# Not eliminating any of the words: <distinct word count> + 1 for the start token
# + 1 more (presumably for index 0, which the tokenizer never assigns and which is
# used as the padding value - TODO confirm the word counts against the summary files).
TITLE_VOCAB_SIZE = 4708 + 1 + 1
LOCATION_VOCAB_SIZE = 2335 + 1 + 1
DEPARTMENT_VOCAB_SIZE = 1060 + 1 + 1
COMPANY_PROFILE_VOCAB_SIZE = 13527 + 1 + 1
DESCRIPTION_VOCAB_SIZE = 33470 + 1 + 1
REQUIREMENTS_VOCAB_SIZE = 25259 + 1 + 1
BENEFITS_VOCAB_SIZE = 11717 + 1 + 1

# --- Categorical feature cardinalities (width of each one-hot vector) -------
NUM_EMPLOYMENT_TYPE_OPTIONS = 6
NUM_REQUIRED_EXPERIENCE_OPTIONS = 8
NUM_REQUIRED_EDUCATION_OPTIONS = 14
NUM_INDUSTRY_OPTIONS = 132
NUM_FUNCTION_OPTIONS = 38

# --- Network sizes ----------------------------------------------------------
TEXT_EMBED_DIM = 125 # todo try boosting up to 200 or 300

LSTM_SIZE = 300

# --- Dropout rates ----------------------------------------------------------
BASE_LSTM_DROPOUT = 0.05
BASE_DENSE_DROPOUT = 0.05
BASE_CATEG_DROPOUT = 0.05

# --- L2 regularization strengths --------------------------------------------
BASE_TEXT_EMBED_LAMBDA = 0.001
BASE_CATEG_EMBED_LAMBDA = 0.001
BASE_DENSE_LAMBDA = 0.001
BASE_LSTM_LAMBDA = 0.001

# Width unit for the fully connected layers (the dense stack uses 2x, 1x and 0.5x of it).
BASE_DENSE_SIZE = 150

DENSE_ACTIVATION = "relu"






In [2]:
#do final preprocessing on each text attrib
def convertTitlesToPaddedSequences(dataDf):
    """Convert the title column of ``dataDf`` into padded word-index sequences.

    A new tokenizer is fitted on every call, using the lines of the titles summary
    file plus the start token. Each title is cast to ``str``, prefixed with the start
    token, mapped to word indices and padded/truncated to ``MAX_TITLE_LEN``.

    :param dataDf: DataFrame containing the ``TITLE_LABEL`` column
    :return: the result of ``pad_sequences`` - one row per row of ``dataDf``
    """
    # Vocabulary: summary-file lines plus the start token.
    vocabularyTexts = getNonEmptyLines(TITLES_SUMMARY_FILE_PATH)
    vocabularyTexts.append(START_TOKEN)
    tokenizer = preproc.text.Tokenizer(num_words=TITLE_VOCAB_SIZE)
    tokenizer.fit_on_texts(vocabularyTexts)

    # Every title starts with the start token.
    prefixedTitles = [START_TOKEN + " " + title for title in dataDf[TITLE_LABEL].astype(str)]

    return preproc.sequence.pad_sequences(
        tokenizer.texts_to_sequences(prefixedTitles),
        maxlen=MAX_TITLE_LEN,
        padding=PADDING_TYPE,
        truncating=TRUNCATING_TYPE,
    )

def convertLocationsToPaddedSequences(dataDf):
    """Convert the location column of ``dataDf`` into padded word-index sequences.

    A new tokenizer is fitted on every call, using the lines of the locations summary
    file plus the start token. Each location is cast to ``str``, prefixed with the
    start token, mapped to word indices and padded/truncated to ``MAX_LOCATION_LEN``.

    :param dataDf: DataFrame containing the ``LOCATION_LABEL`` column
    :return: the result of ``pad_sequences`` - one row per row of ``dataDf``
    """
    # Vocabulary: summary-file lines plus the start token.
    vocabularyTexts = getNonEmptyLines(LOCATIONS_SUMMARY_FILE_PATH)
    vocabularyTexts.append(START_TOKEN)
    tokenizer = preproc.text.Tokenizer(num_words=LOCATION_VOCAB_SIZE)
    tokenizer.fit_on_texts(vocabularyTexts)

    # Every location starts with the start token.
    prefixedLocations = [START_TOKEN + " " + location for location in dataDf[LOCATION_LABEL].astype(str)]

    return preproc.sequence.pad_sequences(
        tokenizer.texts_to_sequences(prefixedLocations),
        maxlen=MAX_LOCATION_LEN,
        padding=PADDING_TYPE,
        truncating=TRUNCATING_TYPE,
    )

def convertDepartmentsToPaddedSequences(dataDf):
    """Convert the department column of ``dataDf`` into padded word-index sequences.

    A new tokenizer is fitted on every call, using the lines of the departments summary
    file plus the start token. Each department is cast to ``str``, prefixed with the
    start token, mapped to word indices and padded/truncated to ``MAX_DEPARTMENT_LEN``.

    :param dataDf: DataFrame containing the ``DEPARTMENT_LABEL`` column
    :return: the result of ``pad_sequences`` - one row per row of ``dataDf``
    """
    # Vocabulary: summary-file lines plus the start token.
    vocabularyTexts = getNonEmptyLines(DEPARTMENTS_SUMMARY_FILE_PATH)
    vocabularyTexts.append(START_TOKEN)
    tokenizer = preproc.text.Tokenizer(num_words=DEPARTMENT_VOCAB_SIZE)
    tokenizer.fit_on_texts(vocabularyTexts)

    # Every department starts with the start token.
    prefixedDepartments = [START_TOKEN + " " + department for department in dataDf[DEPARTMENT_LABEL].astype(str)]

    return preproc.sequence.pad_sequences(
        tokenizer.texts_to_sequences(prefixedDepartments),
        maxlen=MAX_DEPARTMENT_LEN,
        padding=PADDING_TYPE,
        truncating=TRUNCATING_TYPE,
    )

def convertCompanyProfilesToPaddedSequences(dataDf):
    """Convert the company-profile column of ``dataDf`` into padded word-index sequences.

    A new tokenizer is fitted on every call, using the lines of the company-profiles
    summary file plus the start token. Each profile is cast to ``str``, prefixed with
    the start token, mapped to word indices and padded/truncated to
    ``MAX_COMPANY_PROFILE_LEN``.

    :param dataDf: DataFrame containing the ``COMPANY_PROFILE_LABEL`` column
    :return: the result of ``pad_sequences`` - one row per row of ``dataDf``
    """
    # Vocabulary: summary-file lines plus the start token.
    vocabularyTexts = getNonEmptyLines(COMPANY_PROFILES_SUMMARY_FILE_PATH)
    vocabularyTexts.append(START_TOKEN)
    tokenizer = preproc.text.Tokenizer(num_words=COMPANY_PROFILE_VOCAB_SIZE)
    tokenizer.fit_on_texts(vocabularyTexts)

    # Every company profile starts with the start token.
    prefixedProfiles = [START_TOKEN + " " + profile for profile in dataDf[COMPANY_PROFILE_LABEL].astype(str)]

    return preproc.sequence.pad_sequences(
        tokenizer.texts_to_sequences(prefixedProfiles),
        maxlen=MAX_COMPANY_PROFILE_LEN,
        padding=PADDING_TYPE,
        truncating=TRUNCATING_TYPE,
    )

def convertDescriptionsToPaddedSequences(dataDf):
    """Convert the description column of ``dataDf`` into padded word-index sequences.

    A new tokenizer is fitted on every call, using the lines of the descriptions summary
    file plus the start token. Each description is cast to ``str``, prefixed with the
    start token, mapped to word indices and padded/truncated to ``MAX_DESCRIPTION_LEN``.

    :param dataDf: DataFrame containing the ``DESCRIPTION_LABEL`` column
    :return: the result of ``pad_sequences`` - one row per row of ``dataDf``
    """
    # Vocabulary: summary-file lines plus the start token.
    vocabularyTexts = getNonEmptyLines(DESCRIPTIONS_SUMMARY_FILE_PATH)
    vocabularyTexts.append(START_TOKEN)
    tokenizer = preproc.text.Tokenizer(num_words=DESCRIPTION_VOCAB_SIZE)
    tokenizer.fit_on_texts(vocabularyTexts)

    # Every description starts with the start token.
    prefixedDescriptions = [START_TOKEN + " " + description for description in dataDf[DESCRIPTION_LABEL].astype(str)]

    return preproc.sequence.pad_sequences(
        tokenizer.texts_to_sequences(prefixedDescriptions),
        maxlen=MAX_DESCRIPTION_LEN,
        padding=PADDING_TYPE,
        truncating=TRUNCATING_TYPE,
    )

def convertRequirementsToPaddedSequences(dataDf):
    """Convert the requirements column of ``dataDf`` into padded word-index sequences.

    A new tokenizer is fitted on every call, using the lines of the requirements summary
    file plus the start token. Each requirements text is cast to ``str``, prefixed with
    the start token, mapped to word indices and padded/truncated to
    ``MAX_REQUIREMENTS_LEN``.

    :param dataDf: DataFrame containing the ``REQUIREMENTS_LABEL`` column
    :return: the result of ``pad_sequences`` - one row per row of ``dataDf``
    """
    # Vocabulary: summary-file lines plus the start token.
    vocabularyTexts = getNonEmptyLines(REQUIREMENTS_SUMMARY_FILE_PATH)
    vocabularyTexts.append(START_TOKEN)
    tokenizer = preproc.text.Tokenizer(num_words=REQUIREMENTS_VOCAB_SIZE)
    tokenizer.fit_on_texts(vocabularyTexts)

    # Every requirements text starts with the start token.
    prefixedRequirements = [START_TOKEN + " " + requirements for requirements in dataDf[REQUIREMENTS_LABEL].astype(str)]

    return preproc.sequence.pad_sequences(
        tokenizer.texts_to_sequences(prefixedRequirements),
        maxlen=MAX_REQUIREMENTS_LEN,
        padding=PADDING_TYPE,
        truncating=TRUNCATING_TYPE,
    )

def convertBenefitsToPaddedSequences(dataDf):
    """Convert the benefits column of ``dataDf`` into padded word-index sequences.

    A new tokenizer is fitted on every call, using the lines of the benefits summary
    file plus the start token. Each benefits text is cast to ``str``, prefixed with the
    start token, mapped to word indices and padded/truncated to ``MAX_BENEFITS_LEN``.

    :param dataDf: DataFrame containing the ``BENEFITS_LABEL`` column
    :return: the result of ``pad_sequences`` - one row per row of ``dataDf``
    """
    # Vocabulary: summary-file lines plus the start token.
    vocabularyTexts = getNonEmptyLines(BENEFITS_SUMMARY_FILE_PATH)
    vocabularyTexts.append(START_TOKEN)
    tokenizer = preproc.text.Tokenizer(num_words=BENEFITS_VOCAB_SIZE)
    tokenizer.fit_on_texts(vocabularyTexts)

    # Every benefits text starts with the start token.
    prefixedBenefits = [START_TOKEN + " " + benefits for benefits in dataDf[BENEFITS_LABEL].astype(str)]

    return preproc.sequence.pad_sequences(
        tokenizer.texts_to_sequences(prefixedBenefits),
        maxlen=MAX_BENEFITS_LEN,
        padding=PADDING_TYPE,
        truncating=TRUNCATING_TYPE,
    )


def normalizeSalaryData(dataDf):
    """Standardize the four salary columns and flag the rows whose salary is missing.

    A row counts as "salary missing" when its ``SALARY_MIDPT_LABEL`` value equals -1.
    Missing rows are excluded from the per-column mean/std computation (they are set
    to NaN before scaling) and are filled with 0 afterwards in every salary column.

    :param dataDf: DataFrame containing the ``MIN_SALARY_LABEL``, ``MAX_SALARY_LABEL``,
        ``SALARY_MIDPT_LABEL`` and ``SALARY_RANGE_LABEL`` columns; it is not modified
    :return: ``(salaryData, salaryMissingSentinel)`` where ``salaryData`` is a float
        array of shape ``(numCases, 4)`` with columns in the order min, max, midpoint,
        range, and ``salaryMissingSentinel`` is a float32 array of shape
        ``(numCases, 1)`` holding 1 for rows with a missing salary and 0 otherwise
    """
    # Imported here because a bare `import sklearn` does not load the
    # `preprocessing` submodule, so `sk.preprocessing` is not guaranteed to exist.
    from sklearn.preprocessing import scale

    numCases = dataDf.shape[0]

    # Positional boolean mask. The DataFrame index holds labels read from the CSV,
    # which need not be 0..numCases-1, so index labels must not be used as positions.
    salaryIsMissing = (dataDf[SALARY_MIDPT_LABEL] == -1).to_numpy()

    # One 0/1 flag per row; previously the list of missing indices was reshaped to
    # (numCases, 1), which fails unless every single row is missing.
    salaryMissingSentinel = salaryIsMissing.astype("float32").reshape(numCases, 1)

    salaryColumns = [MIN_SALARY_LABEL, MAX_SALARY_LABEL, SALARY_MIDPT_LABEL, SALARY_RANGE_LABEL]

    # Nothing to scale for an empty frame.
    if numCases == 0:
        return np.zeros((0, len(salaryColumns)), dtype="float64"), salaryMissingSentinel

    # Explicit float copy: NaN cannot be stored in integer arrays, and writing into a
    # view returned by to_numpy() would silently alter the caller's DataFrame.
    salaryData = dataDf[salaryColumns].to_numpy(dtype="float64", copy=True)

    # Hide the missing rows from the mean/std computation.
    salaryData[salaryIsMissing, :] = np.nan
    salaryData = scale(salaryData)

    # Neutral value for missing rows in ALL four columns (the old slice 0:3 skipped the
    # salary-range column and left NaN in it).
    salaryData[salaryIsMissing, :] = 0

    return salaryData, salaryMissingSentinel




In [3]:
# Load the training split; the shared read options come from CSV_READ_ARGS.
trainDataDf = pd.read_csv(filepath_or_buffer=TRAIN_DATA_PATH, **CSV_READ_ARGS)

In [4]:
# Turn every free-text column of the training split into padded word-index sequences.
# The converters are called in the same order as the target names are listed.
(trainTitleSequences,
 trainLocationSequences,
 trainDepartmentSequences,
 trainCompanyProfileSequences,
 trainDescriptionSequences,
 trainRequirementsSequences,
 trainBenefitsSequences) = [
    convertColumn(trainDataDf)
    for convertColumn in (
        convertTitlesToPaddedSequences,
        convertLocationsToPaddedSequences,
        convertDepartmentsToPaddedSequences,
        convertCompanyProfilesToPaddedSequences,
        convertDescriptionsToPaddedSequences,
        convertRequirementsToPaddedSequences,
        convertBenefitsToPaddedSequences,
    )
]

In [5]:
# Class balance of the training split: fraction of rows labelled fraudulent.
numTrainExamples = trainDataDf.shape[0]
posTrainExamples = trainDataDf.loc[trainDataDf[FRAUDULENT_LABEL] == True]
numPosTrainExamples = posTrainExamples.shape[0]
trainPosFrac = numPosTrainExamples/numTrainExamples

# Binary flags as float32 column vectors of shape (numTrainExamples, 1).
trainColumnShape = (numTrainExamples, 1)
trainTelecommuting = trainDataDf[TELECOMMUTING_LABEL].astype("float32").to_numpy().reshape(trainColumnShape)
trainHasLogo = trainDataDf[HAS_LOGO_LABEL].astype("float32").to_numpy().reshape(trainColumnShape)
trainHasQuestions = trainDataDf[HAS_QUESTIONS_LABEL].astype("float32").to_numpy().reshape(trainColumnShape)

# Target: a single int32 column (1 = fraudulent). A two-column [not-fraud, fraud]
# encoding of the labels was tried earlier and is disabled.
trainLabels = trainDataDf[FRAUDULENT_LABEL].astype("int32").to_numpy().reshape(trainColumnShape)

In [6]:
# Row positions 0..numTrainExamples-1 (kept defined for positional indexing).
trainExamplesIndexArray = np.arange(numTrainExamples)

# One-hot encode each categorical column by picking rows of an identity matrix:
# row k of np.eye(n) is the one-hot vector for category code k.
# Assumes the columns hold integer category codes below the option count -
# TODO confirm against COLUMN_DATA_TYPES.
employmentTypesArray = trainDataDf[EMPLOYMENT_TYPE_LABEL].to_numpy()
employmentTypesTrainVectors = np.eye(NUM_EMPLOYMENT_TYPE_OPTIONS, dtype="float32")[employmentTypesArray]

requiredExperienceArray = trainDataDf[REQUIRED_EXPERIENCE_LABEL].to_numpy()
requiredExperienceTrainVectors = np.eye(NUM_REQUIRED_EXPERIENCE_OPTIONS, dtype="float32")[requiredExperienceArray]

requiredEducationArray = trainDataDf[REQUIRED_EDUCATION_LABEL].to_numpy()
requiredEducationTrainVectors = np.eye(NUM_REQUIRED_EDUCATION_OPTIONS, dtype="float32")[requiredEducationArray]

industryArray = trainDataDf[INDUSTRY_LABEL].to_numpy()
industryTrainVectors = np.eye(NUM_INDUSTRY_OPTIONS, dtype="float32")[industryArray]

functionArray = trainDataDf[FUNCTION_LABEL].to_numpy()
functionTrainVectors = np.eye(NUM_FUNCTION_OPTIONS, dtype="float32")[functionArray]

# Standardized salary columns plus the missing-salary indicator.
trainSalaryData, trainSalaryMissingSentinel = normalizeSalaryData(trainDataDf)

In [7]:
# Load the validation split with the same read options as the training split.
validDataDf = pd.read_csv(filepath_or_buffer=VALIDATION_DATA_PATH, **CSV_READ_ARGS)

In [8]:
# Turn every free-text column of the validation split into padded word-index sequences.
# The converters are called in the same order as the target names are listed.
(validTitleSequences,
 validLocationSequences,
 validDepartmentSequences,
 validCompanyProfileSequences,
 validDescriptionSequences,
 validRequirementsSequences,
 validBenefitsSequences) = [
    convertColumn(validDataDf)
    for convertColumn in (
        convertTitlesToPaddedSequences,
        convertLocationsToPaddedSequences,
        convertDepartmentsToPaddedSequences,
        convertCompanyProfilesToPaddedSequences,
        convertDescriptionsToPaddedSequences,
        convertRequirementsToPaddedSequences,
        convertBenefitsToPaddedSequences,
    )
]

In [9]:
# Class balance of the validation split: fraction of rows labelled fraudulent.
numValidExamples = validDataDf.shape[0]
posValidExamples = validDataDf.loc[validDataDf[FRAUDULENT_LABEL] == True]
numPosValidExamples = posValidExamples.shape[0]
validPosFrac = numPosValidExamples/numValidExamples

# Binary flags as float32 column vectors of shape (numValidExamples, 1).
validColumnShape = (numValidExamples, 1)
validTelecommuting = validDataDf[TELECOMMUTING_LABEL].astype("float32").to_numpy().reshape(validColumnShape)
validHasLogo = validDataDf[HAS_LOGO_LABEL].astype("float32").to_numpy().reshape(validColumnShape)
validHasQuestions = validDataDf[HAS_QUESTIONS_LABEL].astype("float32").to_numpy().reshape(validColumnShape)

# Target: a single int32 column (1 = fraudulent). A two-column [not-fraud, fraud]
# encoding of the labels was tried earlier and is disabled.
validLabels = validDataDf[FRAUDULENT_LABEL].astype("int32").to_numpy().reshape(validColumnShape)


In [10]:
# Row positions 0..numValidExamples-1 (kept defined for positional indexing).
validExamplesIndexArray = np.arange(numValidExamples)

# One-hot encode each categorical column by picking rows of an identity matrix:
# row k of np.eye(n) is the one-hot vector for category code k.
# Assumes the columns hold integer category codes below the option count -
# TODO confirm against COLUMN_DATA_TYPES.
# NOTE: the *Array names below are the same ones the training cell assigns.
employmentTypesArray = validDataDf[EMPLOYMENT_TYPE_LABEL].to_numpy()
employmentTypesValidVectors = np.eye(NUM_EMPLOYMENT_TYPE_OPTIONS, dtype="float32")[employmentTypesArray]

requiredExperienceArray = validDataDf[REQUIRED_EXPERIENCE_LABEL].to_numpy()
requiredExperienceValidVectors = np.eye(NUM_REQUIRED_EXPERIENCE_OPTIONS, dtype="float32")[requiredExperienceArray]

requiredEducationArray = validDataDf[REQUIRED_EDUCATION_LABEL].to_numpy()
requiredEducationValidVectors = np.eye(NUM_REQUIRED_EDUCATION_OPTIONS, dtype="float32")[requiredEducationArray]

industryArray = validDataDf[INDUSTRY_LABEL].to_numpy()
industryValidVectors = np.eye(NUM_INDUSTRY_OPTIONS, dtype="float32")[industryArray]

functionArray = validDataDf[FUNCTION_LABEL].to_numpy()
functionValidVectors = np.eye(NUM_FUNCTION_OPTIONS, dtype="float32")[functionArray]

# Standardized salary columns plus the missing-salary indicator.
validSalaryData, validSalaryMissingSentinel = normalizeSalaryData(validDataDf)

In [11]:
#todo implement use of word2vec pretrained embedding matrix

#no recurrent_dropout on lstm because need to use GPU


def _buildTextBranch(fieldName, sidePredName, maxLen, vocabSize):
    """Build the sub-network for one text field.

    Layers, in creation order: Input -> Embedding -> LSTM -> Dropout ->
    BatchNormalization -> Dense(1, sigmoid). The final Dense is a secondary model
    output that allows for better training of the field-specific LSTM.

    :param fieldName: prefix of the layer names, e.g. "title" gives "titleTextInput",
        "titleEmbedding", "titleLstm", "titleDropout", "titleBatchNormalization"
    :param sidePredName: name of the secondary sigmoid output layer
    :param maxLen: length of the padded input sequences
    :param vocabSize: ``input_dim`` of the embedding
    :return: tuple ``(input, embedding, lstm, dropout, batchNorm, sidePrediction)``
    """
    inputLayer = layers.Input(name=fieldName + "TextInput", shape=(maxLen,), dtype="int32")

    # mask_zero=True: index 0 is treated as padding by the layers downstream.
    embedLayer = layers.Embedding(
        name=fieldName + "Embedding",
        input_dim=vocabSize,
        output_dim=TEXT_EMBED_DIM,
        input_length=maxLen,
        mask_zero=True,
        embeddings_regularizer=regs.l2(BASE_TEXT_EMBED_LAMBDA),
    )(inputLayer)

    lstmLayer = layers.LSTM(
        name=fieldName + "Lstm",
        units=LSTM_SIZE,
        dropout=BASE_LSTM_DROPOUT,
        kernel_regularizer=regs.l2(BASE_LSTM_LAMBDA),
    )(embedLayer)

    dropoutLayer = layers.Dropout(name=fieldName + "Dropout", rate=BASE_DENSE_DROPOUT)(lstmLayer)
    #todo try omitting batch norm?
    batchNormLayer = layers.BatchNormalization(name=fieldName + "BatchNormalization")(dropoutLayer)

    sidePrediction = layers.Dense(
        1,
        name=sidePredName,
        activation="sigmoid",
        kernel_regularizer=regs.l2(BASE_DENSE_LAMBDA),
    )(batchNormLayer)

    return inputLayer, embedLayer, lstmLayer, dropoutLayer, batchNormLayer, sidePrediction


# One branch per text field, built in this order:
# description, title, location, department, company profile, requirements, benefits.
(descriptionInputLayer, descriptionEmbedLayer, descriptionLstmLayer,
 descriptionDropoutLayer, descriptionBatchNormLayer, descriptionSidePrediction) = _buildTextBranch(
    "description", "descSidePred", MAX_DESCRIPTION_LEN, DESCRIPTION_VOCAB_SIZE)

(titleInputLayer, titleEmbedLayer, titleLstmLayer,
 titleDropoutLayer, titleBatchNormLayer, titleSidePrediction) = _buildTextBranch(
    "title", "titleSidePred", MAX_TITLE_LEN, TITLE_VOCAB_SIZE)

(locationInputLayer, locationEmbedLayer, locationLstmLayer,
 locationDropoutLayer, locationBatchNormLayer, locationSidePrediction) = _buildTextBranch(
    "location", "locSidePred", MAX_LOCATION_LEN, LOCATION_VOCAB_SIZE)

(departmentInputLayer, departmentEmbedLayer, departmentLstmLayer,
 departmentDropoutLayer, departmentBatchNormLayer, departmentSidePrediction) = _buildTextBranch(
    "department", "deptSidePred", MAX_DEPARTMENT_LEN, DEPARTMENT_VOCAB_SIZE)

(companyProfileInputLayer, companyProfileEmbedLayer, companyProfileLstmLayer,
 companyProfileDropoutLayer, companyProfileBatchNormLayer, companyProfileSidePrediction) = _buildTextBranch(
    "companyProfile", "compProfSidePred", MAX_COMPANY_PROFILE_LEN, COMPANY_PROFILE_VOCAB_SIZE)

(requirementsInputLayer, requirementsEmbedLayer, requirementsLstmLayer,
 requirementsDropoutLayer, requirementsBatchNormLayer, requirementsSidePrediction) = _buildTextBranch(
    "requirements", "reqsSidePred", MAX_REQUIREMENTS_LEN, REQUIREMENTS_VOCAB_SIZE)

(benefitsInputLayer, benefitsEmbedLayer, benefitsLstmLayer,
 benefitsDropoutLayer, benefitsBatchNormLayer, benefitsSidePrediction) = _buildTextBranch(
    "benefits", "benefitsSidePred", MAX_BENEFITS_LEN, BENEFITS_VOCAB_SIZE)


# Categorical features arrive already one-hot encoded, one input per feature.
employmentTypeInputLayer = layers.Input(
    name="employmentTypeInput", shape=(NUM_EMPLOYMENT_TYPE_OPTIONS,), dtype="float32")
requiredExperienceInputLayer = layers.Input(
    name="requiredExperienceInput", shape=(NUM_REQUIRED_EXPERIENCE_OPTIONS,), dtype="float32")
requiredEducationInputLayer = layers.Input(
    name="requiredEducationInput", shape=(NUM_REQUIRED_EDUCATION_OPTIONS,), dtype="float32")
industryInputLayer = layers.Input(
    name="industryInput", shape=(NUM_INDUSTRY_OPTIONS,), dtype="float32")
functionInputLayer = layers.Input(
    name="functionInput", shape=(NUM_FUNCTION_OPTIONS,), dtype="float32")

# Single-column binary flags.
telecommutingInputLayer = layers.Input(name="telecommutingInput", shape=(1,), dtype="float32")
hasLogoInputLayer = layers.Input(name="hasLogoInput", shape=(1,), dtype="float32")
hasQuestionsInputLayer = layers.Input(name="hasQuestionsInput", shape=(1,), dtype="float32")

# Join the text branches with the categorical and binary inputs along the feature axis.
mergedFeatureTensors = [
    descriptionBatchNormLayer, titleBatchNormLayer, locationBatchNormLayer, departmentBatchNormLayer,
    companyProfileBatchNormLayer, requirementsBatchNormLayer, benefitsBatchNormLayer,
    employmentTypeInputLayer, requiredExperienceInputLayer, requiredEducationInputLayer, industryInputLayer,
    functionInputLayer, telecommutingInputLayer, hasLogoInputLayer, hasQuestionsInputLayer,
]
textCategoricalMergeLayer = layers.Concatenate(name="textCategoricalMerge", axis=1)(mergedFeatureTensors)

# Three Dense + Dropout stages that narrow from 2x to 1x to 0.5x of BASE_DENSE_SIZE.
fullDenseLayer1 = layers.Dense(
    name="firstFullDense", units=2*BASE_DENSE_SIZE, activation=DENSE_ACTIVATION,
    kernel_regularizer=regs.l2(BASE_DENSE_LAMBDA))(textCategoricalMergeLayer)
fullDropoutLayer1 = layers.Dropout(name="firstFullDropout", rate=BASE_DENSE_DROPOUT)(fullDenseLayer1)

fullDenseLayer2 = layers.Dense(
    name="secondFullDense", units=BASE_DENSE_SIZE, activation=DENSE_ACTIVATION,
    kernel_regularizer=regs.l2(BASE_DENSE_LAMBDA))(fullDropoutLayer1)
fullDropoutLayer2 = layers.Dropout(name="secondFullDropout", rate=BASE_DENSE_DROPOUT)(fullDenseLayer2)

fullDenseLayer3 = layers.Dense(
    name="thirdFullDense", units=int(0.5*BASE_DENSE_SIZE), activation=DENSE_ACTIVATION,
    kernel_regularizer=regs.l2(BASE_DENSE_LAMBDA))(fullDropoutLayer2)
fullDropoutLayer3 = layers.Dropout(name="thirdFullDropout", rate=BASE_DENSE_DROPOUT)(fullDenseLayer3)

finalBatchNormLayer = layers.BatchNormalization(name="finalBatchNormalization")(fullDropoutLayer3)

# Main output: a single sigmoid unit.
finalPrediction = layers.Dense(1, name="finalPred", activation="sigmoid")(finalBatchNormLayer)


# Input order: categorical one-hots, binary flags, then the text sequences.
# Output order: main prediction first, then one side prediction per text branch.
modelInputs = [
    employmentTypeInputLayer, requiredExperienceInputLayer, requiredEducationInputLayer,
    industryInputLayer, functionInputLayer, telecommutingInputLayer, hasLogoInputLayer,
    hasQuestionsInputLayer,
    descriptionInputLayer, titleInputLayer, locationInputLayer, departmentInputLayer,
    companyProfileInputLayer, requirementsInputLayer, benefitsInputLayer,
]
modelOutputs = [
    finalPrediction, descriptionSidePrediction, titleSidePrediction, locationSidePrediction,
    departmentSidePrediction,
    companyProfileSidePrediction, requirementsSidePrediction, benefitsSidePrediction,
]
fraudModel = Model(inputs=modelInputs, outputs=modelOutputs)

# Number of model outputs (main prediction + seven side predictions).
NUM_OUTPUTS= 8


In [12]:
# Loss weight of each model output; the eight weights add up to 1.
MAIN_PRED_WEIGHT=0.65
DESCRIPTION_SIDE_PRED_WEIGHT=0.05
TITLE_SIDE_PRED_WEIGHT=0.05
LOCATION_SIDE_PRED_WEIGHT=0.05
DEPARTMENT_SIDE_PRED_WEIGHT=0.05
COMPANY_PROFILE_SIDE_PRED_WEIGHT=0.05
REQUIREMENTS_SIDE_PRED_WEIGHT=0.05
BENEFITS_SIDE_PRED_WEIGHT=0.05
# The original, misspelled name is kept as an alias so existing references keep working.
BENEFFITS_SIDE_PRED_WEIGHT=BENEFITS_SIDE_PRED_WEIGHT

# Listed in the same order as the outputs passed to Model(...):
# main prediction first, then the side prediction of each text branch.
predictionLossWeights = [MAIN_PRED_WEIGHT, DESCRIPTION_SIDE_PRED_WEIGHT, TITLE_SIDE_PRED_WEIGHT,
                         LOCATION_SIDE_PRED_WEIGHT, DEPARTMENT_SIDE_PRED_WEIGHT,
                         COMPANY_PROFILE_SIDE_PRED_WEIGHT, REQUIREMENTS_SIDE_PRED_WEIGHT,
                         BENEFITS_SIDE_PRED_WEIGHT]


#todo figure out how to use other metrics like precision or recall?
fraudModel.compile(loss=["binary_crossentropy"]*NUM_OUTPUTS, loss_weights=predictionLossWeights, optimizer="nadam",
                   metrics=[metrics.AUC()])

# summary() prints the table itself and returns None, so it must not be wrapped in
# print() (that only appended a stray "None" to the output).
fraudModel.summary()

# from tensorflow.keras.utils import plot_model
#
# plot_model(fraudModel, to_file="model_visualisation.png")



Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
descriptionTextInput (InputLaye [(None, 301)]        0                                            
__________________________________________________________________________________________________
titleTextInput (InputLayer)     [(None, 9)]          0                                            
__________________________________________________________________________________________________
locationTextInput (InputLayer)  [(None, 6)]          0                                            
__________________________________________________________________________________________________
departmentTextInput (InputLayer [(None, 4)]          0                                            
______________________________________________________________________________________________

In [13]:
# Training length (upper bound; early stopping may end the run sooner).
numEpochs = 30
# batchSize = 1024

# Stop after 5 epochs without an improvement larger than min_delta and roll back to
# the best weights seen.
earlyStopper = callbacks.EarlyStopping(
    patience=5,
    min_delta=1e-9,
    restore_best_weights=True,
)

# Checkpoint files are named after the epoch number and that epoch's validation loss;
# only improvements are written.
CHECKPOINT_NAME_FORMAT = "weights_after_{epoch:02d}-val_loss{val_loss:.4f}.hdf5"
checkpointFilePath = os.path.join(CHECKPOINT_DIR_PATH, CHECKPOINT_NAME_FORMAT)
checkpointer = callbacks.ModelCheckpoint(
    filepath=checkpointFilePath,
    save_best_only=True,
)

# Up-weight the positive class by the inverse of its share of the training data.
trainClassWeights = {
    0: 1,
    1: 1/trainPosFrac,
}

In [14]:
# Feature arrays in the order of the model's inputs:
# categorical one-hots, binary flags, then the text sequences.
trainInputs = [
    employmentTypesTrainVectors, requiredExperienceTrainVectors, requiredEducationTrainVectors,
    industryTrainVectors, functionTrainVectors, trainTelecommuting, trainHasLogo, trainHasQuestions,
    trainDescriptionSequences, trainTitleSequences, trainLocationSequences, trainDepartmentSequences,
    trainCompanyProfileSequences, trainRequirementsSequences, trainBenefitsSequences,
]
validInputs = [
    employmentTypesValidVectors, requiredExperienceValidVectors, requiredEducationValidVectors,
    industryValidVectors, functionValidVectors, validTelecommuting, validHasLogo, validHasQuestions,
    validDescriptionSequences, validTitleSequences, validLocationSequences, validDepartmentSequences,
    validCompanyProfileSequences, validRequirementsSequences, validBenefitsSequences,
]

# Every output (main and side predictions) is trained against the same fraud label
# and uses the same class weights.
trainHist = fraudModel.fit(
    trainInputs,
    [trainLabels]*NUM_OUTPUTS,
    class_weight=[trainClassWeights]*NUM_OUTPUTS,
    validation_data=(validInputs, [validLabels]*NUM_OUTPUTS),
    epochs=numEpochs,  # batch_size=batchSize,
    callbacks=[earlyStopper, checkpointer],
)

#todo add weighting for positive examples

Train on 12515 samples, validate on 2682 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [15]:
#todo evaluate on validation data
