In [1]:
import os

# Silence TensorFlow's native (C++) logging. This has to happen *before*
# TensorFlow is imported so that messages emitted while the library loads
# are filtered as well.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf

In [2]:
# Read the employee extract from the CSV file (path is relative to the notebook).
employees_csv_path = '../Data/SQLout_employees.csv'
data_import = pd.read_csv(employees_csv_path)

In [3]:
# Columns carried forward into the model; 'hourly_rate' becomes the target.
model_columns = ['nace', 'esize_class', 'gender', 'age_class',
                 'lpk', 'education', 'experience', 'hourly_rate']

# Keep only rows with arrangement "FT" recorded for year 2018.
rows_2018_ft = data_import.query('arrangement == "FT" & year==2018')
data2018ft = rows_2018_ft[model_columns]

# Work on an independent copy so later edits never touch `data2018ft`.
data = data2018ft.copy().rename(columns={'hourly_rate': 'target'})
data.shape

(36785, 8)

In [4]:
def outlier_iqr_upper(x, n=1.5):
    """Flag values that lie above the upper IQR fence.

    The fence is ``Q3 + n * (Q3 - Q1)``, where Q1 and Q3 are the 25th and
    75th percentiles of ``x``.

    Parameters
    ----------
    x : pandas.Series
        Numeric values to test.
    n : float, default 1.5
        Multiplier applied to the inter-quartile range.

    Returns
    -------
    pandas.Series
        Boolean series with the same index as ``x``; ``True`` marks a value
        strictly above the fence.
    """
    q1, q3 = x.quantile(0.25), x.quantile(0.75)
    upper_lim = q3 + (q3 - q1) * n
    # Strict comparison: with `>=`, a group whose IQR is zero (a single row,
    # or all values identical) has fence == Q3 and every row in it would be
    # flagged as an outlier.
    return x > upper_lim

In [5]:
# Group rows by the `lpk` code with its last digit dropped and flag, within
# each group, targets above the upper IQR fence (n=1.0).
lpk_group = data['lpk'] // 10

# `transform` returns a result aligned to `data`'s own index. The previous
# `groupby(...).apply(...)` prepends the group key to the index on
# pandas >= 2.0, which breaks the column assignment it was used for.
is_outlier = (
    data['target']
    .groupby(lpk_group)
    .transform(outlier_iqr_upper, n=1.0)
    .astype(bool)
)

# Drop flagged rows via a mask (no temporary columns needed) and take a copy
# so the assignment below does not write into a view of the old frame.
data = data.loc[~is_outlier].copy()

# `lpk` is a categorical code, not a quantity: store it as a string so it is
# later treated as a string feature rather than a numeric one.
data['lpk'] = data['lpk'].astype(str)

In [6]:
# Report the frame size after outlier removal and preview the first rows.
print(data.shape)
data.head()

(34259, 8)


Unnamed: 0,nace,esize_class,gender,age_class,lpk,education,experience,target
44952,C,1_49,M,50-59,722,G2,13,8.13
44953,C,1_49,M,40-49,721,G2,13,8.2
44954,C,1_49,M,50-59,722,G2,13,8.2
44955,C,1_49,F,40-49,334,G2,0,2.51
44956,M,50_249,F,40-49,522,G2,18,2.19


# Preprocessing

In [8]:
# Split off the label. `pop` removes the "target" column from `data` in place
# and returns it, so `data` holds only feature columns from here on.
# Note: re-running this cell on its own raises KeyError (column already removed).
target = data.pop("target")

In [9]:
# Build one symbolic Keras input per feature column: object-dtype columns
# are declared as tf.string inputs, every other column as tf.float32.
inputs = {}
for name, column in data.items():
  input_dtype = tf.string if column.dtype == object else tf.float32
  inputs[name] = tf.keras.Input(shape=(1,), name=name, dtype=input_dtype)
inputs

{'nace': <KerasTensor: shape=(None, 1) dtype=string (created by layer 'nace')>,
 'esize_class': <KerasTensor: shape=(None, 1) dtype=string (created by layer 'esize_class')>,
 'gender': <KerasTensor: shape=(None, 1) dtype=string (created by layer 'gender')>,
 'age_class': <KerasTensor: shape=(None, 1) dtype=string (created by layer 'age_class')>,
 'lpk': <KerasTensor: shape=(None, 1) dtype=string (created by layer 'lpk')>,
 'education': <KerasTensor: shape=(None, 1) dtype=string (created by layer 'education')>,
 'experience': <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'experience')>}

In [10]:
# Pick out the float32 inputs; these are the numeric features.
numeric_inputs = {
    feature_name: keras_input
    for feature_name, keras_input in inputs.items()
    if keras_input.dtype == tf.float32
}
numeric_columns = list(numeric_inputs)

# Concatenate the numeric inputs and pass them through a Normalization layer
# whose statistics are fitted on the matching DataFrame columns.
x = tf.keras.layers.Concatenate()(list(numeric_inputs.values()))
norm = tf.keras.layers.Normalization()
norm.adapt(np.array(data[numeric_columns]))
all_numeric_inputs = norm(x)


In [11]:
# Collect the preprocessed feature tensors, starting with the normalized
# numeric block; the categorical encodings are appended in the next cell.
preprocessed_inputs = [all_numeric_inputs]

In [12]:
# Start from the normalized numeric block so that re-running this cell
# rebuilds the list instead of appending a second copy of every encoding.
preprocessed_inputs = [all_numeric_inputs]

# `keras_input` (not `input`) avoids shadowing the builtin at notebook scope.
for name, keras_input in inputs.items():
  if keras_input.dtype == tf.float32:
    continue  # numeric features were handled by the Normalization layer

  # Map each category string to an integer index using the values observed
  # in the data, then expand that index into an indicator vector whose width
  # is the lookup's vocabulary size.
  lookup = tf.keras.layers.StringLookup(vocabulary=np.unique(data[name]))
  one_hot = tf.keras.layers.CategoryEncoding(num_tokens=lookup.vocabulary_size())

  x = lookup(keras_input)
  x = one_hot(x)
  preprocessed_inputs.append(x)

In [13]:
# Display the list of preprocessed tensors to check each block's width.
preprocessed_inputs

[<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'normalization')>,
 <KerasTensor: shape=(None, 19) dtype=float32 (created by layer 'category_encoding')>,
 <KerasTensor: shape=(None, 4) dtype=float32 (created by layer 'category_encoding_1')>,
 <KerasTensor: shape=(None, 3) dtype=float32 (created by layer 'category_encoding_2')>,
 <KerasTensor: shape=(None, 6) dtype=float32 (created by layer 'category_encoding_3')>,
 <KerasTensor: shape=(None, 119) dtype=float32 (created by layer 'category_encoding_4')>,
 <KerasTensor: shape=(None, 5) dtype=float32 (created by layer 'category_encoding_5')>]

In [14]:
# Join the numeric and categorical blocks into a single feature tensor.
concat_layer = tf.keras.layers.Concatenate()
preprocessed_inputs_cat = concat_layer(preprocessed_inputs)

# Wrap the preprocessing graph in a model mapping the raw input dict to the
# concatenated feature tensor.
ltdu_preprocessing = tf.keras.Model(inputs, preprocessed_inputs_cat)

# Draw the preprocessing graph left-to-right (requires pydot and graphviz).
tf.keras.utils.plot_model(
    model=ltdu_preprocessing,
    rankdir="LR",
    dpi=72,
    show_shapes=True,
)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model/model_to_dot to work.


In [15]:
# Convert every feature column to a NumPy array, keyed by column name, so the
# dict can be fed to the Keras models built from `inputs`.
ltdu_features_dict = {
    column_name: np.array(column_values)
    for column_name, column_values in data.items()
}

In [16]:
# Smoke test: run only the first row of every feature through the
# preprocessing model and show the resulting feature vector.
features_dict = {
    feature_name: feature_values[:1]
    for feature_name, feature_values in ltdu_features_dict.items()
}
ltdu_preprocessing(features_dict)

<tf.Tensor: shape=(1, 157), dtype=float32, numpy=
array([[0.67940295, 0.        , 0.        , 1.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 1.        , 0.        , 0.        , 0.        ,
        0.        , 1.        , 0.        , 0.        , 0.        ,
        0.        , 1.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        

# Model and predictions

In [17]:
def build_ltdu_model(preprocessing_head, inputs):
  """Build and compile the regression model on top of a preprocessing model.

  Args:
    preprocessing_head: Keras model that maps the raw input dict to a single
      preprocessed feature tensor.
    inputs: dict of `tf.keras.Input` tensors keyed by feature name.

  Returns:
    A compiled `tf.keras.Model` taking `inputs` and producing one output per
    row, trained with mean squared error and reporting RMSE.
  """
  # NOTE(review): Dense(64) has no activation, so apart from dropout the body
  # is a composition of linear layers -- confirm this is intended.
  body = tf.keras.Sequential([
    tf.keras.layers.Dense(64),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1)
  ])

  preprocessed_inputs = preprocessing_head(inputs)
  result = body(preprocessed_inputs)
  model = tf.keras.Model(inputs, result)

  # The stray trailing comma after this call (which built a throwaway tuple)
  # has been removed.
  model.compile(loss=tf.keras.losses.MeanSquaredError(),
                optimizer=tf.keras.optimizers.Adam(),
                metrics=[tf.keras.metrics.RootMeanSquaredError()])
  return model

# The builder has its own name so that `ltdu_model` is only ever the model
# instance; previously the function was overwritten by its own result and the
# cell failed when run a second time.
ltdu_model = build_ltdu_model(ltdu_preprocessing, inputs)

In [18]:
# Train on the full feature dict for 50 epochs; the returned History object
# is displayed as the cell output.
ltdu_model.fit(
    x=ltdu_features_dict,
    y=target,
    epochs=50,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f8b781779a0>