# Import Libraries
See requirements.txt for versions

In [None]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from scipy.special import logit
import tensorflow as tf
from tensorflow.keras import layers, optimizers, callbacks, utils
from tensorflow.keras import backend as K
from tensorflow.keras.models import Model, load_model
tf.config.run_functions_eagerly(True)
import warnings
warnings.filterwarnings('ignore')

# Import Data
This step is workspace-specific; we worked inside the All of Us Research Workspace.

In [None]:
# Load the raw table and keep only the outcome column plus the seven
# predictor columns used by the model.
df = pd.read_csv('data.csv')
output = 'lNH'
dataset = df[[output, 'AGE', 'RACE', 'GENDER', 'marital', 'income', 'employ', 'edu']].copy()

# 70 % of the rows go to training; the remaining 30 % is halved into
# validation and test. The second split must draw from `rem`, not from
# `dataset`: re-splitting `dataset` would place training rows into the
# validation and test sets and inflate every held-out metric.
train, rem = train_test_split(dataset, train_size=.7, random_state=42)
valid, test = train_test_split(rem, train_size=.5, random_state=42)

# Model inputs, in the order they are fed to the network.
features = ['RACE', 'GENDER', 'marital', 'income', 'employ', 'edu', 'AGE']
# Rebind `output` from a single column name to the list of every
# non-feature column, so `len(output)` is the number of model outputs.
output = dataset.loc[:, ~dataset.columns.isin(features)].columns.tolist()

# Neural Network

### Define the Model

In [None]:
def create_model(data, catcols):
    """Build an embedding-based feed-forward network over the given columns.

    Every column in ``catcols`` gets its own single-value input and an
    embedding layer named after the column. The embeddings are concatenated
    and passed through two ReLU dense layers (32 then 64 units), with
    dropout of 0.3 after the concatenation and after each dense layer.

    Args:
        data: Table indexable by column name; only ``nunique()`` of each
            column is read, to size that column's embedding.
        catcols: Iterable of column names, one model input per name.

    Returns:
        An uncompiled Keras ``Model`` with one input per column and a
        sigmoid output layer of ``len(output)`` units.
    """
    model_inputs = []
    embedded = []
    for col in catcols:
        cardinality = int(data[col].nunique())
        # Half the number of distinct values (rounded up), capped at 50.
        width = int(min(np.ceil(cardinality / 2), 50))
        col_input = layers.Input(shape=(1,))
        col_embedding = layers.Embedding(cardinality + 1, width, name=col)(col_input)
        # Drop the length-1 sequence axis so embeddings can be concatenated.
        col_vector = layers.Reshape(target_shape=(width,))(col_embedding)
        model_inputs.append(col_input)
        embedded.append(col_vector)

    hidden = layers.Concatenate()(embedded)
    hidden = layers.Dropout(0.3)(hidden)

    for units in (32, 64):
        hidden = layers.Dense(units, activation="relu")(hidden)
        hidden = layers.Dropout(0.3)(hidden)

    # NOTE: the output width is read from the module-level `output` list,
    # not from an argument, so `output` must be defined before calling.
    prediction = layers.Dense(len(output), activation="sigmoid")(hidden)

    return Model(inputs=model_inputs, outputs=prediction)

### Training

In [None]:
# Reset Keras' global state before building a fresh model.
tf.keras.backend.clear_session()

model = create_model(train, features)

optimizer = optimizers.Adam(
    learning_rate=0.001,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-07,
    amsgrad=False,
)

# Stop when validation AUC has not improved by at least 0.01 for 50 epochs,
# and roll the model back to the best weights seen.
es = callbacks.EarlyStopping(
    monitor='val_auc',
    min_delta=0.01,
    patience=50,
    mode='max',
    restore_best_weights=True,
)

# Use per-label AUC only when the model has more than one output column.
mlabel = len(output) > 1
auc_metric = tf.keras.metrics.AUC(multi_label=mlabel)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=[auc_metric])

# One array per feature, matching the one-input-per-column model layout.
train_inputs = [train.loc[:, f].values for f in features]
valid_inputs = [valid.loc[:, f].values for f in features]

history = model.fit(
    x=train_inputs,
    y=train[output].values,
    validation_data=(valid_inputs, valid[output].values),
    epochs=1000,
    batch_size=128,
    verbose=2,
    callbacks=[es],
)

### Evaluate Model

In [None]:
# Score the trained model on the held-out test split; this reports the loss
# and the AUC metric the model was compiled with.
# The EarlyStopping callback is intentionally not passed here: it only acts
# during training, so handing it to `evaluate` had no effect and was
# misleading.
model.evaluate(x=[test.loc[:, f].values for f in features],
               y=test[output].values,
               verbose=2)

# Impact Score Measurement
- First, a saved model is loaded and used to make predictions, the results of which are passed through a logit function.
- Next, a reference value is chosen for each feature (we used the most frequent value, but this is not required).
- Then, the impact score can be calculated.

This cell contains encoding information for the purposes of calculating impact scores

In [None]:
# One-row frame holding the encoded reference value chosen for each feature
# (the same codes are written to the 't'-prefixed columns of `impactdf`).
xr = pd.DataFrame(
    {
        'RACE': [3],
        'GENDER': [0],
        'marital': [2],
        'income': [1],
        'employ': [2],
        'edu': [2],
        'AGE': [4],
    }
)

# Integer code -> human-readable label for each encoded feature.
race = {0: 'Black or African American', 1: 'Other', 2: 'Skip', 3: 'White'}
gender = {0: 'Female', 1: 'Male', 2: 'Unknown'}
marital = {0: '0', 1: 'DSW', 2: 'MLWP', 3: 'NM'}
income = {0: '0', 1: '100-150K', 2: '150K+', 3: '25-75K', 4: '75-100K', 5: '<25K'}
employ = {0: '0', 1: 'Employed', 2: 'Unemployed'}
edu = {0: '0', 1: '<HS', 2: 'College+', 3: 'HS/GED', 4: 'someCollege'}
# Codes 0..24 map to ages 66..90.
age = {code: 66 + code for code in range(25)}

# Feature name -> decoding table consulted by translate().
_DECODERS = {
    'RACE': race,
    'GENDER': gender,
    'marital': marital,
    'income': income,
    'employ': employ,
    'edu': edu,
    'AGE': age,
}


def translate(feat, variable):
    """Return the human-readable label for an encoded feature value.

    The feature name is matched by equality. The previous ``feat is 'RACE'``
    comparisons tested object identity, which only succeeds when the
    interpreter happens to intern both strings.

    Args:
        feat: Feature name ('RACE', 'GENDER', 'marital', 'income',
            'employ', 'edu' or 'AGE').
        variable: Encoded value of that feature.

    Returns:
        The decoded label (an ``int`` age for 'AGE', a ``str`` otherwise),
        or ``None`` when the feature name or the code is unknown.
    """
    return _DECODERS.get(feat, {}).get(variable)

In [None]:
# Load the saved model and compute logit-transformed predictions for every
# row of `dataset`.
# NOTE(review): `model_file` is not defined in the visible cells; it must be
# set to the saved model's path before this cell runs.
model = tf.keras.models.load_model(model_file)
observed_inputs = [dataset.loc[:, f].values for f in features]
lipred = logit(model.predict(x=observed_inputs, verbose=0))

# Keep only output column `mIndex` of each prediction row.
mIndex = 0
templist = lipred.tolist()
mlipred = [row[mIndex] for row in templist]

impactdf = dataset.copy()
impactdf['lipred'] = mlipred

# Constant 't'-prefixed columns hold the encoded reference value of each
# feature; they stand in for the real column when computing impact scores.
reference_codes = [
    ('tAGE', 4),
    ('tRACE', 3),
    ('tGENDER', 0),
    ('tmarital', 2),
    ('tincome', 1),
    ('temploy', 2),
    ('tedu', 2),
]
for ref_col, ref_code in reference_codes:
    impactdf[ref_col] = ref_code

In [None]:
def impactscore(fxi, fxr, xi, xr, feat):
    """Compute the impact score of one observation against its reference.

    Comparisons use equality rather than ``is``: identity checks against
    ``0`` or ``'AGE'`` fail for numpy integers and for strings that are not
    interned, which silently changes the result.

    Args:
        fxi: Model output (logit scale) for the observed feature value.
        fxr: Model output (logit scale) with the feature set to its
            reference value.
        xi: Encoded observed value of the feature.
        xr: Encoded reference value of the feature.
        feat: Feature name; 'AGE' is treated as numeric, so its difference
            is scaled by the distance from the reference.

    Returns:
        ``None`` when the observation equals the reference; otherwise
        ``(fxi - fxr) / (xi - xr)`` for 'AGE' and ``fxi - fxr`` for every
        other feature.
    """
    if xi == xr:
        # The observation is the reference itself: no impact to measure.
        return None
    difference = fxi - fxr
    if feat == 'AGE':
        return difference / (xi - xr)
    return difference
    
# Collect one [label, mean impact score] row per observed value of each
# feature; the output frame is built once after the loop instead of being
# grown row by row.
impact_rows = []
for feat in tqdm(features):
    # Swap this feature for its reference column ('t' + feat) and leave the
    # others untouched. Names are compared with ==, not identity.
    xfeatures = ['t' + xfeat if xfeat == feat else xfeat for xfeat in features]
    lrpred = logit(model.predict(x=[impactdf.loc[:, f].values for f in xfeatures],
                                 verbose=0))
    # Keep only output column `mIndex` of each reference prediction.
    impactdf['lrpred'] = [row[mIndex] for row in lrpred.tolist()]

    label = feat + '_impactscore_'
    impactdf[label] = [
        impactscore(fxi, fxr, observed, reference, feat)
        for fxi, fxr, observed, reference in zip(
            impactdf['lipred'], impactdf['lrpred'], impactdf[feat], impactdf['t' + feat]
        )
    ]

    # Average the per-row scores within each observed value of the feature.
    for variable in impactdf[feat].unique().tolist():
        mean_score = impactdf.loc[impactdf[feat] == variable, label].mean()
        impact_rows.append([feat + '_' + str(translate(feat, variable)), mean_score])

opdf = pd.DataFrame(impact_rows, columns=['label', 'impactscore'])

# `Styler.hide_index()` was deprecated in pandas 1.4 and removed in 2.0;
# use `hide(axis='index')` when available and fall back on older versions.
styler = opdf.style
styler.hide(axis='index') if hasattr(styler, 'hide') else styler.hide_index()