In [1]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras.preprocessing as preproc
from tensorflow.keras import layers
from tensorflow.keras import Model
from tensorflow.keras import metrics
from tensorflow.keras import callbacks

from DataNamesReference import *
from LstmTraining import getNonEmptyLines

# Keyword arguments shared by every pd.read_csv call in this notebook.
CSV_READ_ARGS = dict(keep_default_na=False, index_col=0, dtype=COLUMN_DATA_TYPES)

# Length (in tokens) that each text attribute is padded / truncated to.
MAX_TITLE_LEN = 8
MAX_LOCATION_LEN = 5
MAX_DEPARTMENT_LEN = 3
MAX_COMPANY_PROFILE_LEN = 200
MAX_DESCRIPTION_LEN = 300
MAX_REQUIREMENTS_LEN = 200
MAX_BENEFITS_LEN = 125

# Both padding and truncation are applied at the end of a sequence.
PADDING_TYPE = "post"
TRUNCATING_TYPE = "post"

# Vocabulary sizes; not eliminating any of the options.
# NOTE(review): the "+ 1" presumably accounts for index 0, which the Keras tokenizer
# never assigns to a word -- confirm.
TITLE_VOCAB_SIZE = 4708 + 1
LOCATION_VOCAB_SIZE = 2335 + 1
DEPARTMENT_VOCAB_SIZE = 1060 + 1
COMPANY_PROFILE_VOCAB_SIZE = 13527 + 1
DESCRIPTION_VOCAB_SIZE = 33470 + 1
REQUIREMENTS_VOCAB_SIZE = 25259 + 1
BENEFITS_VOCAB_SIZE = 11717 + 1

# Size of the embedding table (input_dim) for each categorical attribute.
NUM_EMPLOYMENT_TYPE_OPTIONS = 6
NUM_REQUIRED_EXPERIENCE_OPTIONS = 8
NUM_REQUIRED_EDUCATION_OPTIONS = 14
NUM_INDUSTRY_OPTIONS = 132
NUM_FUNCTION_OPTIONS = 38

# Embedding width used for text tokens.
TEXT_EMBED_DIM = 100  # todo try boosting up to 200 or 300

# Embedding width used for each categorical attribute.
EMPLOYMENT_TYPE_EMBED_DIM = 15  # todo try increasing to 50 or 100
REQUIRED_EXPERIENCE_EMBED_DIM = 15  # todo try increasing to 50 or 100
REQUIRED_EDUCATION_EMBED_DIM = 30  # todo try increasing to 50 or 100
INDUSTRY_EMBED_DIM = 150  # todo try increasing to 200 or 300?
FUNCTION_EMBED_DIM = 50  # todo try increasing to 100 or 200

# Number of units in the text LSTM.
LSTM_SIZE = 300

# Dropout rates for the LSTM, the dense stack and the categorical branch.
BASE_LSTM_DROPOUT = 0.4
BASE_DENSE_DROPOUT = 0.4
BASE_CATEG_DROPOUT = 0.25

# Width of the middle dense layer; the other dense layers are sized relative to it.
BASE_DENSE_SIZE = 50

DENSE_ACTIVATION = "relu"



In [2]:
#do final preprocessing on each text attrib
def _convertTextColumnToPaddedSequences(dataDf, summaryFilePath, columnLabel, vocabSize, maxLen):
    """Encode one text column of ``dataDf`` as fixed-length integer sequences.

    A new Keras tokenizer is fitted on the lines returned by
    ``getNonEmptyLines(summaryFilePath)`` on every call, so the word index does not
    depend on which DataFrame (train, validation, ...) is being converted.

    Args:
        dataDf: DataFrame containing the column ``columnLabel``.
        summaryFilePath: path handed to ``getNonEmptyLines`` to obtain the fitting corpus.
        columnLabel: name of the text column to convert.
        vocabSize: ``num_words`` limit given to the tokenizer.
        maxLen: length every sequence is padded / truncated to.

    Returns:
        The array produced by ``pad_sequences``: one row per entry of the column,
        ``maxLen`` columns, padded and truncated according to PADDING_TYPE /
        TRUNCATING_TYPE.
    """
    corpusLines = getNonEmptyLines(summaryFilePath)
    tokenizer = preproc.text.Tokenizer(num_words=vocabSize)
    tokenizer.fit_on_texts(corpusLines)

    sequences = tokenizer.texts_to_sequences(dataDf[columnLabel])
    return preproc.sequence.pad_sequences(sequences, maxlen=maxLen,
                                          padding=PADDING_TYPE, truncating=TRUNCATING_TYPE)


def convertTitlesToPaddedSequences(dataDf):
    """Return the TITLE_LABEL column of ``dataDf`` as sequences of length MAX_TITLE_LEN."""
    return _convertTextColumnToPaddedSequences(dataDf, TITLES_SUMMARY_FILE_PATH, TITLE_LABEL,
                                               TITLE_VOCAB_SIZE, MAX_TITLE_LEN)


def convertLocationsToPaddedSequences(dataDf):
    """Return the LOCATION_LABEL column of ``dataDf`` as sequences of length MAX_LOCATION_LEN."""
    return _convertTextColumnToPaddedSequences(dataDf, LOCATIONS_SUMMARY_FILE_PATH, LOCATION_LABEL,
                                               LOCATION_VOCAB_SIZE, MAX_LOCATION_LEN)


def convertDepartmentsToPaddedSequences(dataDf):
    """Return the DEPARTMENT_LABEL column of ``dataDf`` as sequences of length MAX_DEPARTMENT_LEN."""
    return _convertTextColumnToPaddedSequences(dataDf, DEPARTMENTS_SUMMARY_FILE_PATH, DEPARTMENT_LABEL,
                                               DEPARTMENT_VOCAB_SIZE, MAX_DEPARTMENT_LEN)


def convertCompanyProfilesToPaddedSequences(dataDf):
    """Return the COMPANY_PROFILE_LABEL column of ``dataDf`` as sequences of length MAX_COMPANY_PROFILE_LEN."""
    return _convertTextColumnToPaddedSequences(dataDf, COMPANY_PROFILES_SUMMARY_FILE_PATH,
                                               COMPANY_PROFILE_LABEL, COMPANY_PROFILE_VOCAB_SIZE,
                                               MAX_COMPANY_PROFILE_LEN)


def convertDescriptionsToPaddedSequences(dataDf):
    """Return the DESCRIPTION_LABEL column of ``dataDf`` as sequences of length MAX_DESCRIPTION_LEN."""
    return _convertTextColumnToPaddedSequences(dataDf, DESCRIPTIONS_SUMMARY_FILE_PATH, DESCRIPTION_LABEL,
                                               DESCRIPTION_VOCAB_SIZE, MAX_DESCRIPTION_LEN)


def convertRequirementsToPaddedSequences(dataDf):
    """Return the REQUIREMENTS_LABEL column of ``dataDf`` as sequences of length MAX_REQUIREMENTS_LEN."""
    return _convertTextColumnToPaddedSequences(dataDf, REQUIREMENTS_SUMMARY_FILE_PATH, REQUIREMENTS_LABEL,
                                               REQUIREMENTS_VOCAB_SIZE, MAX_REQUIREMENTS_LEN)


def convertBenefitsToPaddedSequences(dataDf):
    """Return the BENEFITS_LABEL column of ``dataDf`` as sequences of length MAX_BENEFITS_LEN."""
    return _convertTextColumnToPaddedSequences(dataDf, BENEFITS_SUMMARY_FILE_PATH, BENEFITS_LABEL,
                                               BENEFITS_VOCAB_SIZE, MAX_BENEFITS_LEN)

In [3]:
# Load the training split. CSV_READ_ARGS uses the first CSV column as the index,
# disables pandas' default NaN markers and applies the dtypes in COLUMN_DATA_TYPES.
trainDataDf = pd.read_csv(TRAIN_DATA_PATH, **CSV_READ_ARGS)

In [4]:
# Encode every free-text column of the training frame as a padded integer matrix.
# Each converter fits its own tokenizer on that attribute's summary file, then pads /
# truncates to the matching MAX_*_LEN constant.
trainTitleSequences = convertTitlesToPaddedSequences(trainDataDf)
trainLocationSequences = convertLocationsToPaddedSequences(trainDataDf)
trainDepartmentSequences = convertDepartmentsToPaddedSequences(trainDataDf)
trainCompanyProfileSequences = convertCompanyProfilesToPaddedSequences(trainDataDf)
trainDescriptionSequences = convertDescriptionsToPaddedSequences(trainDataDf)
trainRequirementsSequences = convertRequirementsToPaddedSequences(trainDataDf)
trainBenefitsSequences = convertBenefitsToPaddedSequences(trainDataDf)

In [14]:
# Row count of the training frame; every per-example array below is sized from it.
numTrainExamples = len(trainDataDf)

# The flag inputs of the model are declared with shape=(1, 1), so each flag column
# is cast to float and given two trailing singleton axes.
trainFlagShape = (numTrainExamples, 1, 1)
trainTelecommuting = trainDataDf[TELECOMMUTING_LABEL].astype(float).to_numpy().reshape(trainFlagShape)
trainHasLogo = trainDataDf[HAS_LOGO_LABEL].astype(float).to_numpy().reshape(trainFlagShape)
trainHasQuestions = trainDataDf[HAS_QUESTIONS_LABEL].astype(float).to_numpy().reshape(trainFlagShape)

# Targets: one float fraud label per row, kept as a column vector.
trainLabels = trainDataDf[FRAUDULENT_LABEL].astype(float).to_numpy().reshape((numTrainExamples, 1))

In [7]:
# Load the validation split with the same read options as the training split.
validDataDf = pd.read_csv(VALIDATION_DATA_PATH, **CSV_READ_ARGS)

In [8]:
# Encode every free-text column of the validation frame with the same converters used
# for training; each converter fits its tokenizer on the attribute's summary file, not
# on the frame passed in.
validTitleSequences = convertTitlesToPaddedSequences(validDataDf)
validLocationSequences = convertLocationsToPaddedSequences(validDataDf)
validDepartmentSequences = convertDepartmentsToPaddedSequences(validDataDf)
validCompanyProfileSequences = convertCompanyProfilesToPaddedSequences(validDataDf)
validDescriptionSequences = convertDescriptionsToPaddedSequences(validDataDf)
validRequirementsSequences = convertRequirementsToPaddedSequences(validDataDf)
validBenefitsSequences = convertBenefitsToPaddedSequences(validDataDf)

In [15]:
# Row count of the validation frame; every per-example array below is sized from it.
numValidExamples = len(validDataDf)

# The flag inputs of the model are declared with shape=(1, 1), so each flag column
# is cast to float and given two trailing singleton axes.
validFlagShape = (numValidExamples, 1, 1)
validTelecommuting = validDataDf[TELECOMMUTING_LABEL].astype(float).to_numpy().reshape(validFlagShape)
validHasLogo = validDataDf[HAS_LOGO_LABEL].astype(float).to_numpy().reshape(validFlagShape)
validHasQuestions = validDataDf[HAS_QUESTIONS_LABEL].astype(float).to_numpy().reshape(validFlagShape)

# Targets: one float fraud label per row, kept as a column vector.
validLabels = validDataDf[FRAUDULENT_LABEL].astype(float).to_numpy().reshape((numValidExamples, 1))


In [16]:
#todo implement use of word2vec pretrained embedding matrix

#no recurrent_dropout on lstm because need to use GPU


def _buildCategoricalEmbeddingBranch(namePrefix, numOptions, embedDim):
    """Create the input and embedding layers for one integer-coded categorical attribute.

    Args:
        namePrefix: prefix for the layer names; the layers are called
            "<namePrefix>Input" and "<namePrefix>Embedding".
        numOptions: ``input_dim`` of the embedding table.
        embedDim: ``output_dim`` of the embedding.

    Returns:
        ``(inputLayer, embedLayer)``: the int32 input of shape (1,) and the embedding
        applied to it.
    """
    inputLayer = layers.Input(name=namePrefix + "Input", shape=(1,), dtype="int32")
    embedLayer = layers.Embedding(name=namePrefix + "Embedding", input_dim=numOptions,
                                  output_dim=embedDim)(inputLayer)
    return inputLayer, embedLayer


#Description LSTM
descriptionInputLayer = layers.Input(name="descriptionTextInput", shape=(MAX_DESCRIPTION_LEN,), dtype="int32")

# mask_zero=True makes the LSTM skip the zero padding appended by pad_sequences.
descriptionEmbedLayer = layers.Embedding(name="descriptionEmbedding", input_dim=DESCRIPTION_VOCAB_SIZE, output_dim=TEXT_EMBED_DIM,
                                         mask_zero=True, input_length=MAX_DESCRIPTION_LEN)(descriptionInputLayer)

descriptionLstmLayer = layers.LSTM(name= "descriptionLstm",units=LSTM_SIZE, dropout=BASE_LSTM_DROPOUT)(descriptionEmbedLayer)
# Add a length-1 middle axis so the text branch can be concatenated on axis 2 with the
# categorical embeddings, which carry the same extra axis.
descriptionLstmLayer = layers.Reshape(name="descriptionLstmReshape", target_shape=(1, LSTM_SIZE))(descriptionLstmLayer)

descriptionDropoutLayer = layers.Dropout(name="descriptionDropout", rate=BASE_DENSE_DROPOUT)(descriptionLstmLayer)
#todo try omitting batch norm?
descriptionBatchNormLayer = layers.BatchNormalization(name="descriptionBatchNormalization")(descriptionDropoutLayer)

#secondary model output to allow for better training of the description-specific lstm
descriptionSidePrediction = layers.Dense(1, name="descSidePred", activation="sigmoid")(descriptionBatchNormLayer)

#todo!!!! duplicate description layers for each other text attribute

# Categorical attributes: one (input, embedding) pair each, created in the same order
# in which the inputs are later passed to Model(...).
employmentTypeInputLayer, employmentTypeEmbedLayer = _buildCategoricalEmbeddingBranch(
    "employmentType", NUM_EMPLOYMENT_TYPE_OPTIONS, EMPLOYMENT_TYPE_EMBED_DIM)

requiredExperienceInputLayer, requiredExperienceEmbedLayer = _buildCategoricalEmbeddingBranch(
    "requiredExperience", NUM_REQUIRED_EXPERIENCE_OPTIONS, REQUIRED_EXPERIENCE_EMBED_DIM)

requiredEducationInputLayer, requiredEducationEmbedLayer = _buildCategoricalEmbeddingBranch(
    "requiredEducation", NUM_REQUIRED_EDUCATION_OPTIONS, REQUIRED_EDUCATION_EMBED_DIM)

industryInputLayer, industryEmbedLayer = _buildCategoricalEmbeddingBranch(
    "industry", NUM_INDUSTRY_OPTIONS, INDUSTRY_EMBED_DIM)

functionInputLayer, functionEmbedLayer = _buildCategoricalEmbeddingBranch(
    "function", NUM_FUNCTION_OPTIONS, FUNCTION_EMBED_DIM)

# Flag attributes are fed directly (no embedding) with shape (1, 1), so they line up
# with the embedding outputs on axis 2.
telecommutingInputLayer = layers.Input(name="telecommutingInput", shape=(1,1)) #todo? define dtype as boolean??
hasLogoInputLayer = layers.Input(name="hasLogoInput", shape=(1,1))
hasQuestionsInputLayer = layers.Input(name="hasQuestionsInput", shape=(1,1))


categoricalMergeLayer = layers.Concatenate(name="categoricalMerge", axis=2)([employmentTypeEmbedLayer, requiredExperienceEmbedLayer,
                                             requiredEducationEmbedLayer, industryEmbedLayer, functionEmbedLayer,
                                             telecommutingInputLayer, hasLogoInputLayer, hasQuestionsInputLayer])

categoricalDropoutLayer = layers.Dropout(name="categoricalDropout", rate=BASE_CATEG_DROPOUT)(categoricalMergeLayer)
#todo try omitting batch norm?
categoricalBatchNormLayer = layers.BatchNormalization(name="categoricalBatchNormalization")(categoricalDropoutLayer)

# Join the text branch with the categorical branch before the shared dense stack.
textCategoricalMergeLayer = layers.Concatenate(name="textCategoricalMerge", axis=2)([descriptionBatchNormLayer,
                                                                            categoricalBatchNormLayer])

# Dense stack narrowing from 2x to 1x to 0.5x BASE_DENSE_SIZE, each followed by dropout.
fullDenseLayer1 = layers.Dense(name="firstFullDense", units=2*BASE_DENSE_SIZE, activation=DENSE_ACTIVATION)(
    textCategoricalMergeLayer)
fullDropoutLayer1 = layers.Dropout(name="firstFullDropout", rate=BASE_DENSE_DROPOUT)(fullDenseLayer1)

fullDenseLayer2 = layers.Dense(name="secondFullDense", units=BASE_DENSE_SIZE, activation=DENSE_ACTIVATION)(fullDropoutLayer1)
fullDropoutLayer2 = layers.Dropout(name="secondFullDropout", rate=BASE_DENSE_DROPOUT)(fullDenseLayer2)
#todo try batch norm?
fullDenseLayer3 = layers.Dense(name="thirdFullDense", units=int(0.5*BASE_DENSE_SIZE), activation=DENSE_ACTIVATION)(
    fullDropoutLayer2)
fullDropoutLayer3= layers.Dropout(name="thirdFullDropout", rate=BASE_DENSE_DROPOUT)(fullDenseLayer3)

finalPrediction = layers.Dense(1, name="finalPred", activation="sigmoid")(fullDropoutLayer3)

# Output order matters: the loss weights at compile time and the target list at fit
# time are matched to it positionally.
modelOutputs = [finalPrediction, descriptionSidePrediction]

fraudModel = Model(inputs=[employmentTypeInputLayer, requiredExperienceInputLayer, requiredEducationInputLayer,
                           industryInputLayer, functionInputLayer, telecommutingInputLayer, hasLogoInputLayer,
                           hasQuestionsInputLayer,
                           descriptionInputLayer],
                   outputs=modelOutputs)

# Derived from the outputs list so it cannot drift from the model definition.
NUM_OUTPUTS = len(modelOutputs)


In [17]:
#todo figure out how to use other metrics like precision or recall?
# One binary cross-entropy loss per output; the description-only side output
# contributes with weight 0.15 relative to the final prediction.
fraudModel.compile(loss=["binary_crossentropy"]*2, loss_weights=[1, 0.15], optimizer="nadam")

# summary() prints the layer table itself and returns None, so it must not be wrapped
# in print() (that would add a stray "None" line to the output).
fraudModel.summary()

# Checkpoint file names embed the epoch number and the validation loss of that epoch.
CHECKPOINT_NAME_FORMAT="weights_after_{epoch:02d}-loss{val_loss:.3f}.hdf5"
checkpointFilePath = os.path.join(CHECKPOINT_DIR_PATH, CHECKPOINT_NAME_FORMAT)
# Only write a checkpoint when the monitored quantity improves.
checkpointer = callbacks.ModelCheckpoint(filepath=checkpointFilePath, save_best_only=True)
# Stop after 5 epochs without improvement and roll back to the best weights seen.
earlyStopper = callbacks.EarlyStopping(patience=5, min_delta=1e-9, restore_best_weights=True)

numEpochs=15

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
descriptionTextInput (InputLaye [(None, 300)]        0                                            
__________________________________________________________________________________________________
descriptionEmbedding (Embedding (None, 300, 100)     3347100     descriptionTextInput[0][0]       
__________________________________________________________________________________________________
employmentTypeInput (InputLayer [(None, 1)]          0                                            
__________________________________________________________________________________________________
requiredExperienceInput (InputL [(None, 1)]          0                                            
____________________________________________________________________________________________

In [18]:
# Inputs are listed in the same order as the `inputs` list given to Model(...).
trainInputs = [
    trainDataDf[EMPLOYMENT_TYPE_LABEL],
    trainDataDf[REQUIRED_EXPERIENCE_LABEL],
    trainDataDf[REQUIRED_EDUCATION_LABEL],
    trainDataDf[INDUSTRY_LABEL],
    trainDataDf[FUNCTION_LABEL],
    trainTelecommuting,
    trainHasLogo,
    trainHasQuestions,
    trainDescriptionSequences,
]
validInputs = [
    validDataDf[EMPLOYMENT_TYPE_LABEL],
    validDataDf[REQUIRED_EXPERIENCE_LABEL],
    validDataDf[REQUIRED_EDUCATION_LABEL],
    validDataDf[INDUSTRY_LABEL],
    validDataDf[FUNCTION_LABEL],
    validTelecommuting,
    validHasLogo,
    validHasQuestions,
    validDescriptionSequences,
]

# Every model output is trained against the same fraud labels.
trainTargets = [trainLabels] * NUM_OUTPUTS
validTargets = [validLabels] * NUM_OUTPUTS

trainHist = fraudModel.fit(
    trainInputs,
    trainTargets,
    validation_data=(validInputs, validTargets),
    epochs=numEpochs,
    callbacks=[earlyStopper, checkpointer],
)

#todo add weighting for positive examples

Train on 12515 samples, validate on 2682 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15


In [None]:
#todo evaluate on validation data
