In [1]:
import pandas as pd
import os
import sys
import tensorflow as tf
import numpy as np
import tensorflow_hub as hub

In [2]:
import wandb
from wandb.keras import WandbCallback

In [3]:
# Put the parent of the current working directory on the import path (once),
# so modules living one level above the notebook can be imported.
nb_dir = os.path.dirname(os.getcwd())
already_on_path = nb_dir in sys.path
if not already_on_path:
    sys.path.append(nb_dir)

In [4]:
# Columns read from the complaints CSV.
feature_names = [
    "product",
    "sub_product",
    "issue",
    "sub_issue",
    "state",
    "zip_code",
    "company",
    "company_response",
    "timely_response",
    "consumer_disputed",
    "consumer_complaint_narrative",
]

# Columns grouped by how they are encoded for the model.
one_hot_features = ["product", "sub_product", "company_response", "state", "issue"]
numeric_features = ["zip_code"]
text_features = ["consumer_complaint_narrative"]

In [6]:
# Load only the columns listed in feature_names.
# (An earlier snapshot of the data lived at '../data/6Mar/consumer_complaints_with_narrative.csv'.)
df = pd.read_csv(
    '../data/consumer_complaints_with_narrative.csv',
    usecols=feature_names,
)

In [7]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,product,sub_product,issue,sub_issue,consumer_complaint_narrative,company,state,zip_code,company_response,timely_response,consumer_disputed
0,Debt collection,I do not know,Disclosure verification of debt,Right to dispute notice not received,I was denied employment because of a judgment ...,Encore Capital Group,NY,113XX,Closed with explanation,Yes,0
1,Credit reporting,,Improper use of my credit report,Report improperly shared by CRC,I have a credit card through XXXX XXXX and XXX...,Experian,IL,606XX,Closed with non-monetary relief,Yes,0
2,Debt collection,I do not know,Cont'd attempts collect debt not owed,Debt is not mine,Almost daily phone calls from Stellar Recovery...,Stellar Recovery Inc.,MI,480XX,Closed with explanation,Yes,1
3,Mortgage,Conventional fixed mortgage,"Loan servicing, payments, escrow account",,I submitted my monthly mortgage payment to Pri...,Primary Residential Mortgage,CT,066XX,Closed with monetary relief,Yes,0
4,Student loan,Non-federal student loan,Dealing with my lender or servicer,Received bad information about my loan,I contacted America Education Services in XX/X...,AES/PHEAA,FL,321XX,Closed with explanation,Yes,1


In [8]:
# Print each categorical column followed by its number of distinct values;
# these counts are the one-hot input widths used by the model.
for col in one_hot_features:
    print(col, df[col].nunique(), sep="\n")

product
11
sub_product
45
company_response
5
state
60
issue
90


In [9]:
# Encode the target as 1 (disputed) / 0 (not disputed).
# The mapping accepts the textual 'Yes'/'No' labels as well as values that are
# already 1/0: the df.head() preview shows the column pre-encoded, and a plain
# {'Yes': 1, 'No': 0} map would turn such a column into all-NaN. Including the
# identity entries also makes this cell safe to re-run.
df['consumer_disputed'] = df['consumer_disputed'].map({'Yes': 1, 'No': 0, 1: 1, 0: 0})

In [10]:
# Replace every categorical column with its integer category code so it can be
# one-hot encoded afterwards.
# NOTE(review): pandas gives missing values the code -1; confirm how the
# one-hot step treats those rows (sub_product has missing entries in the preview).
for feature in one_hot_features:
    as_category = df[feature].astype("category")
    df[feature] = as_category.cat.codes

In [11]:
# One one-hot matrix per categorical feature, in the order of one_hot_features.
# Uses the directly imported numpy: the `pd.np` alias is deprecated since
# pandas 1.0 and no longer exists in pandas 2.0.
one_hot_x = [
    np.asarray(tf.keras.utils.to_categorical(df[feature_name].values))
    for feature_name in one_hot_features
]

In [12]:
# One flat array of raw strings per text feature; the embedding itself happens
# inside the model. Uses the directly imported numpy because the `pd.np` alias
# is deprecated since pandas 1.0 and no longer exists in pandas 2.0.
embedding_x = [
    np.asarray(df[feature_name].values).reshape(-1)
    for feature_name in text_features
]

In [19]:
# Masked zip codes such as "113XX" become "11300": every 'X' is replaced by '0'.
# 'X' has no special regex meaning, so the result is the same whether the
# installed pandas treats the pattern as a regex or as a literal by default.
df['zip_code'] = df['zip_code'].str.replace(pat='X', repl='0')

In [23]:
# Replace stray punctuation/whitespace characters in the zip code
# ([ * + - ` . space $ / ! and '(') with '0' so the column can be cast to int.
# The pattern is a regular expression, hence the explicit regex=True.
df['zip_code'] = df['zip_code'].str.replace(
    r'\[|\*|\+|\-|`|\.|\ |\$|\/|!|\(',
    '0',
    regex=True,
)

In [24]:
# Missing zip codes are encoded as 0.
df['zip_code'] = df['zip_code'].fillna(value=0)

In [25]:
# The cleaned zip codes are now purely numeric; store them as 32-bit integers.
df['zip_code'] = df['zip_code'].astype(np.int32)

In [26]:
# Keep only the leading digit of the five-digit code (e.g. 11300 -> 1).
# Vectorised floor division replaces the per-row apply(lambda ...): same values,
# no Python-level loop, and the column keeps its int32 dtype.
df['zip_code'] = df['zip_code'] // 10000

In [27]:
# Numeric model inputs: a single array holding the reduced zip code.
zip_code_values = df['zip_code'].values
numeric_x = [zip_code_values]

In [28]:
# Model inputs in the order the model declares them:
# one-hot features, then the numeric feature, then the raw text.
X = [*one_hot_x, *numeric_x, *embedding_x]

In [29]:
# Target vector: the encoded dispute flag as a flat uint8 array.
y = np.asarray(df["consumer_disputed"].values, dtype=np.uint8).ravel()

In [31]:
def get_model(show_summary=True, learning_rate=0.001):
    """
    Build and compile the wide-and-deep Keras classifier and register the run with wandb.

    Starts a wandb run in the "consumer-complaints" project, records the run
    configuration, and assembles a model with two branches:

    * deep: the complaint narrative is embedded with the Universal Sentence
      Encoder (reshaped to 512 values per narrative) and passed through Dense
      layers of 256, 64 and 16 units;
    * wide: the one-hot categorical inputs and the zip-code input are
      concatenated and passed through one Dense layer of 16 units.

    Both branches are concatenated and fed to a single sigmoid output unit.

    Args:
        show_summary: when true, print ``keras_model.summary()`` before returning.
        learning_rate: learning rate of the Adam optimizer. The same value is
            written to ``wandb.config`` so the logged configuration cannot
            drift from the optimizer that is actually used.

    Returns:
        The compiled ``tf.keras`` model. Its inputs are, in order: product,
        sub_product, company_response, state, issue, zip_code, narrative.
    """
    # Register the run and log its configuration.
    wandb.init(project="consumer-complaints")
    config = wandb.config
    config.name = 'final_features_wide'
    config.optimizer = 'adam'
    config.learning_rate = learning_rate
    config.data_version = 'cc_imbalanced_narrative'
    config.one_hot_features = one_hot_features
    config.numeric_features = numeric_features
    config.text_features = text_features

    # Widths of the one-hot categorical inputs (number of distinct values per column).
    num_products = 11
    num_sub_products = 45
    num_company_responses = 5
    num_states = 60
    num_issues = 90

    input_product = tf.keras.Input(shape=(num_products,), name="product_xf")
    input_sub_product = tf.keras.Input(shape=(num_sub_products,), name="sub_product_xf")
    input_company_response = tf.keras.Input(shape=(num_company_responses,), name="company_response_xf")
    input_state = tf.keras.Input(shape=(num_states,), name="state_xf")
    input_issue = tf.keras.Input(shape=(num_issues,), name="issue_xf")

    # Numeric feature: one value per sample.
    input_zip_code = tf.keras.Input(shape=(1,), name="zip_code_xf")

    # Text feature: one raw string per sample.
    input_narrative = tf.keras.Input(shape=(1,), name="narrative_xf", dtype=tf.string)

    # Deep branch: flatten the (batch, 1) strings to (batch,) before embedding.
    module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
    embed = hub.KerasLayer(module_url)
    reshaped_narrative = tf.reshape(input_narrative, [-1])
    embed_narrative = embed(reshaped_narrative)
    deep_ff = tf.keras.layers.Reshape((512, ), input_shape=(1, 512))(embed_narrative)

    deep = tf.keras.layers.Dense(256, activation='relu')(deep_ff)
    deep = tf.keras.layers.Dense(64, activation='relu')(deep)
    deep = tf.keras.layers.Dense(16, activation='relu')(deep)

    # Wide branch: all categorical and numeric inputs side by side.
    wide_ff = tf.keras.layers.concatenate(
        [input_product, input_sub_product, input_company_response,
         input_state, input_issue, input_zip_code])
    wide = tf.keras.layers.Dense(16, activation='relu')(wide_ff)

    # Merge both branches into a single dispute probability.
    both = tf.keras.layers.concatenate([deep, wide])
    output = tf.keras.layers.Dense(1, activation='sigmoid')(both)

    _inputs = [input_product, input_sub_product, input_company_response,
               input_state, input_issue, input_zip_code, input_narrative]

    keras_model = tf.keras.models.Model(_inputs, output)
    keras_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                        loss='binary_crossentropy',
                        metrics=[
                            tf.keras.metrics.BinaryAccuracy(),
                            tf.keras.metrics.TruePositives()
                        ])
    if show_summary:
        keras_model.summary()

    return keras_model

In [34]:
# Build the compiled model; show_summary=False skips printing the layer summary.
model = get_model(show_summary=False)

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

In [35]:
# Train for five epochs, holding out 20% of the samples for validation and
# streaming metrics to wandb through its Keras callback.
model.fit(
    x=X,
    y=y,
    batch_size=32,
    epochs=5,
    validation_split=0.2,
    callbacks=[WandbCallback()],
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f407c2fb2e8>

In [36]:
# Write a diagram of the model graph to a PNG file (requires pydot and graphviz).
# To show the picture inline afterwards, use IPython.display.Image(filename=file_name).
file_name = 'model.png'
tf.keras.utils.plot_model(model, to_file=file_name)

Failed to import pydot. You must install pydot and graphviz for `pydotprint` to work.


In [37]:
def get_model(show_summary=True):
    """
    Build and compile the wide-and-deep Keras classifier (no wandb set-up).

    The deep branch embeds the complaint narrative with the Universal Sentence
    Encoder and applies Dense layers of 256, 64 and 16 units. The wide branch
    concatenates the one-hot categorical inputs with the zip-code input and
    applies one Dense layer of 16 units. Both branches are concatenated and
    fed to a single sigmoid output unit.

    Args:
        show_summary: when true, print the model summary before returning.

    Returns:
        The compiled ``tf.keras`` model. Its inputs are, in order: product,
        sub_product, company_response, state, issue, zip_code, narrative.
    """
    layers = tf.keras.layers

    # (input name, one-hot width) for every categorical feature, in input order.
    one_hot_specs = [
        ("product_xf", 11),
        ("sub_product_xf", 45),
        ("company_response_xf", 5),
        ("state_xf", 60),
        ("issue_xf", 90),
    ]
    categorical_inputs = [
        tf.keras.Input(shape=(width,), name=input_name)
        for input_name, width in one_hot_specs
    ]

    # Numeric feature: one value per sample.
    zip_code_input = tf.keras.Input(shape=(1,), name="zip_code_xf")

    # Text feature: one raw string per sample.
    narrative_input = tf.keras.Input(shape=(1,), name="narrative_xf", dtype=tf.string)

    # Deep branch: flatten the strings, embed them, then narrow step by step.
    sentence_encoder = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4")
    flat_narrative = tf.reshape(narrative_input, [-1])
    narrative_embedding = sentence_encoder(flat_narrative)
    deep = layers.Reshape((512, ), input_shape=(1, 512))(narrative_embedding)
    for units in (256, 64, 16):
        deep = layers.Dense(units, activation='relu')(deep)

    # Wide branch: categorical and numeric inputs side by side.
    wide_inputs = categorical_inputs + [zip_code_input]
    wide = layers.concatenate(wide_inputs)
    wide = layers.Dense(16, activation='relu')(wide)

    # Merge both branches into a single dispute probability.
    merged = layers.concatenate([deep, wide])
    output = layers.Dense(1, activation='sigmoid')(merged)

    keras_model = tf.keras.models.Model(wide_inputs + [narrative_input], output)
    keras_model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=[
            tf.keras.metrics.BinaryAccuracy(),
            tf.keras.metrics.TruePositives(),
        ],
    )

    if show_summary:
        keras_model.summary()
    return keras_model