In [17]:
import tensorflow as tf
import tensorflow.keras.preprocessing as preproc
import tensorflow.keras.layers as layers
from tensorflow.keras import Model
import pandas as pd
import numpy as np

from DataNamesReference import *
from LstmTraining import getNonEmptyLines

# Keyword arguments shared by the pd.read_csv calls that load the dataset splits.
CSV_READ_ARGS = dict(
    keep_default_na=False,    # do not treat pandas' default NA markers (e.g. "", "NA") as NaN
    index_col=0,              # first CSV column becomes the DataFrame index
    dtype=COLUMN_DATA_TYPES,  # per-column dtypes; presumably supplied by the DataNamesReference star import
)

# --- Sequence lengths -------------------------------------------------------
# Fixed length (in tokens) that each text attribute is padded/truncated to.
MAX_TITLE_LEN = 8
MAX_LOCATION_LEN = 5
MAX_DEPARTMENT_LEN = 3
MAX_COMPANY_PROFILE_LEN = 200
MAX_DESCRIPTION_LEN = 300
MAX_REQUIREMENTS_LEN = 200
MAX_BENEFITS_LEN = 125

# "post" = pad with zeros at the end / drop tokens from the end of a sequence.
PADDING_TYPE = "post"
TRUNCATING_TYPE = "post"

# --- Vocabulary sizes -------------------------------------------------------
# No vocabulary pruning: every distinct word of an attribute is kept.
# Each value is written as <word count> + 1; presumably the extra slot is for
# index 0, which is used for padding and never assigned to a word -- TODO
# confirm the counts against the summary files.
TITLE_VOCAB_SIZE = 4708 + 1
LOCATION_VOCAB_SIZE = 2335 + 1
DEPARTMENT_VOCAB_SIZE = 1060 + 1
COMPANY_PROFILE_VOCAB_SIZE = 13527 + 1
DESCRIPTION_VOCAB_SIZE = 33470 + 1
REQUIREMENTS_VOCAB_SIZE = 25259 + 1
BENEFITS_VOCAB_SIZE = 11717 + 1

# --- Model hyper-parameters -------------------------------------------------
EMBED_DIM = 100  # todo try boosting up to 200 or 300
LSTM_SIZE = 150

# Baseline dropout rates for the embedding, LSTM and dense parts of the model.
BASE_EMBED_DROPOUT = 0.2
BASE_LSTM_DROPOUT = 0.2
BASE_DENSE_DROPOUT = 0.2




In [18]:
# Load the training split; the parsing options are the shared CSV_READ_ARGS.
trainDataDf = pd.read_csv(
    TRAIN_DATA_PATH,
    **CSV_READ_ARGS,
)

In [19]:
#final preprocessing: tokenize each text attribute and pad/truncate it to a fixed-length integer sequence
def _convertTextColumnToPaddedSequences(dataDf, columnLabel, summaryFilePath, vocabSize, maxLen):
    """Tokenize one text column of ``dataDf`` and pad it to a fixed length.

    Shared implementation behind the per-attribute ``convert*ToPaddedSequences``
    functions, which differ only in the four constants they pass in.

    A new Keras ``Tokenizer`` is fitted on every call, using the lines that
    ``getNonEmptyLines(summaryFilePath)`` returns (presumably every non-empty
    line of the attribute's summary file -- confirm in LstmTraining). It is
    fitted on that file rather than on ``dataDf``, so the word index does not
    depend on which split is passed in.

    Args:
        dataDf: DataFrame (or other mapping) that contains ``columnLabel``.
        columnLabel: name of the text column to convert.
        summaryFilePath: path handed to ``getNonEmptyLines`` to obtain the
            texts the tokenizer is fitted on.
        vocabSize: value passed as ``Tokenizer(num_words=...)``.
        maxLen: value passed as ``pad_sequences(maxlen=...)``.

    Returns:
        The array produced by ``pad_sequences``: one row per entry of the
        column and ``maxLen`` columns, padded/truncated according to
        PADDING_TYPE and TRUNCATING_TYPE.
    """
    fittingTexts = getNonEmptyLines(summaryFilePath)
    tokenizer = preproc.text.Tokenizer(num_words=vocabSize)
    tokenizer.fit_on_texts(fittingTexts)

    sequences = tokenizer.texts_to_sequences(dataDf[columnLabel])
    return preproc.sequence.pad_sequences(sequences, maxlen=maxLen,
                                          padding=PADDING_TYPE, truncating=TRUNCATING_TYPE)


def convertTitlesToPaddedSequences(dataDf):
    """Return the TITLE_LABEL column of ``dataDf`` as padded token-id sequences of length MAX_TITLE_LEN."""
    return _convertTextColumnToPaddedSequences(dataDf,
                                               columnLabel=TITLE_LABEL,
                                               summaryFilePath=TITLES_SUMMARY_FILE_PATH,
                                               vocabSize=TITLE_VOCAB_SIZE,
                                               maxLen=MAX_TITLE_LEN)


def convertLocationsToPaddedSequences(dataDf):
    """Return the LOCATION_LABEL column of ``dataDf`` as padded token-id sequences of length MAX_LOCATION_LEN."""
    return _convertTextColumnToPaddedSequences(dataDf,
                                               columnLabel=LOCATION_LABEL,
                                               summaryFilePath=LOCATIONS_SUMMARY_FILE_PATH,
                                               vocabSize=LOCATION_VOCAB_SIZE,
                                               maxLen=MAX_LOCATION_LEN)


def convertDepartmentsToPaddedSequences(dataDf):
    """Return the DEPARTMENT_LABEL column of ``dataDf`` as padded token-id sequences of length MAX_DEPARTMENT_LEN."""
    return _convertTextColumnToPaddedSequences(dataDf,
                                               columnLabel=DEPARTMENT_LABEL,
                                               summaryFilePath=DEPARTMENTS_SUMMARY_FILE_PATH,
                                               vocabSize=DEPARTMENT_VOCAB_SIZE,
                                               maxLen=MAX_DEPARTMENT_LEN)


def convertCompanyProfilesToPaddedSequences(dataDf):
    """Return the COMPANY_PROFILE_LABEL column of ``dataDf`` as padded token-id sequences of length MAX_COMPANY_PROFILE_LEN."""
    return _convertTextColumnToPaddedSequences(dataDf,
                                               columnLabel=COMPANY_PROFILE_LABEL,
                                               summaryFilePath=COMPANY_PROFILES_SUMMARY_FILE_PATH,
                                               vocabSize=COMPANY_PROFILE_VOCAB_SIZE,
                                               maxLen=MAX_COMPANY_PROFILE_LEN)


def convertDescriptionsToPaddedSequences(dataDf):
    """Return the DESCRIPTION_LABEL column of ``dataDf`` as padded token-id sequences of length MAX_DESCRIPTION_LEN."""
    return _convertTextColumnToPaddedSequences(dataDf,
                                               columnLabel=DESCRIPTION_LABEL,
                                               summaryFilePath=DESCRIPTIONS_SUMMARY_FILE_PATH,
                                               vocabSize=DESCRIPTION_VOCAB_SIZE,
                                               maxLen=MAX_DESCRIPTION_LEN)


def convertRequirementsToPaddedSequences(dataDf):
    """Return the REQUIREMENTS_LABEL column of ``dataDf`` as padded token-id sequences of length MAX_REQUIREMENTS_LEN."""
    return _convertTextColumnToPaddedSequences(dataDf,
                                               columnLabel=REQUIREMENTS_LABEL,
                                               summaryFilePath=REQUIREMENTS_SUMMARY_FILE_PATH,
                                               vocabSize=REQUIREMENTS_VOCAB_SIZE,
                                               maxLen=MAX_REQUIREMENTS_LEN)


def convertBenefitsToPaddedSequences(dataDf):
    """Return the BENEFITS_LABEL column of ``dataDf`` as padded token-id sequences of length MAX_BENEFITS_LEN."""
    return _convertTextColumnToPaddedSequences(dataDf,
                                               columnLabel=BENEFITS_LABEL,
                                               summaryFilePath=BENEFITS_SUMMARY_FILE_PATH,
                                               vocabSize=BENEFITS_VOCAB_SIZE,
                                               maxLen=MAX_BENEFITS_LEN)

In [21]:
# Build the padded token-id matrix for every text attribute of the training
# split. The converters run in the order listed; each result is bound to the
# name at the same position on the left-hand side.
(
    trainTitleSequences,
    trainLocationSequences,
    trainDepartmentSequences,
    trainCompanyProfileSequences,
    trainDescriptionSequences,
    trainRequirementsSequences,
    trainBenefitsSequences,
) = (
    convertTitlesToPaddedSequences(trainDataDf),
    convertLocationsToPaddedSequences(trainDataDf),
    convertDepartmentsToPaddedSequences(trainDataDf),
    convertCompanyProfilesToPaddedSequences(trainDataDf),
    convertDescriptionsToPaddedSequences(trainDataDf),
    convertRequirementsToPaddedSequences(trainDataDf),
    convertBenefitsToPaddedSequences(trainDataDf),
)

In [22]:
# Target column for the fraud classifier, cast to float.
trainLabels = trainDataDf[FRAUDULENT_LABEL].astype(float)

In [7]:
#todo implement use of word2vec pretrained embedding matrix

#no recurrent_dropout on the lstm: a non-zero recurrent_dropout prevents Keras from using the fast cuDNN (GPU) kernel

#todo build lstm model for a text attrib
# Description branch: token ids -> embedding -> LSTM -> sigmoid output.
# The Input layer fixes the sequence length to MAX_DESCRIPTION_LEN.
descriptionInputLayer = layers.Input(shape=(MAX_DESCRIPTION_LEN,), dtype="int32")

# mask_zero=True marks token id 0 (the padding value) as masked for downstream layers.
# `input_length` is intentionally not passed: the length already comes from the
# Input layer above, and the argument is deprecated in Keras 3.
descriptionEmbedLayer = layers.Embedding(input_dim=DESCRIPTION_VOCAB_SIZE, output_dim=EMBED_DIM,
                                         mask_zero=True)
descriptionEmbedOutput = descriptionEmbedLayer(descriptionInputLayer)

# Only input dropout is set; recurrent_dropout stays at its default of 0.
descriptionLstmLayer = layers.LSTM(units=LSTM_SIZE, dropout=BASE_LSTM_DROPOUT)
descriptionLstmOutput = descriptionLstmLayer(descriptionEmbedOutput)

#secondary model output to allow for better training of the description-specific lstm
descriptionSidePredLayer = layers.Dense(1, activation="sigmoid")
descriptionSidePrediction = descriptionSidePredLayer(descriptionLstmOutput)

#todo insert batchnormalization layers?

# For now the model consists of the description branch only.
fraudModel = Model(inputs=[descriptionInputLayer], outputs=[descriptionSidePrediction])


#todo build mlp model for some categ attribs

#todo combine component models into final output model

#todo add extra model output for each lstm in order to improve their training

#todo add weighting for positive examples

In [8]:
#todo train on training data

In [9]:
#todo evaluate on validation data
