In [1]:
import pandas as pd
import os
import sys
import tensorflow as tf
import numpy as np
import tensorflow_hub as hub

In [2]:
import wandb
from wandb.keras import WandbCallback

In [3]:
# Put the parent of the current working directory on the import path
# (only once), so modules living one level above the notebook can be imported.
nb_dir = os.path.dirname(os.getcwd())
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

In [4]:
# Columns read from the CSV file.
feature_names = [
    "product",
    "sub_product",
    "issue",
    "sub_issue",
    "state",
    "zip_code",
    "company",
    "company_response",
    "timely_response",
    "consumer_disputed",
    "consumer_complaint_narrative",
]

# Integer-coded features fed to embedding layers
# ("company" and "sub_issue" are currently left out).
cat_features = ["sub_product", "state", "issue"]

# Free-text features.
text_features = ['consumer_complaint_narrative']

# Features that are one-hot encoded ('zip_code_short' is currently left out,
# and no bucketised 'zip_code' feature is used).
one_hot_features = ['product', 'company_response', 'timely_response']

In [17]:
# Alternative input files (currently disabled):
#   ../data/21Feb/26k-consumer-complaints-labels.csv
#   ../data/29Feb/cc_large_balanced.csv
#
# Only the columns listed in feature_names are loaded. The default NA markers
# are switched off and 0 is passed as the NA value instead.
df = pd.read_csv(
    '../data/29Feb/cc_balanced_narrative.csv',
    usecols=feature_names,
    na_values=0,
    keep_default_na=False,
)

In [None]:
# Peek at the first rows whose consumer_disputed label equals 1.
df[df['consumer_disputed']==1].head()

In [20]:
# The CSV is read with na_values=0, so label values of 0 arrive as NaN;
# map the missing labels back to 0.
df['consumer_disputed'] = df['consumer_disputed'].fillna(0)

In [16]:
# Report each categorical feature's name followed by its number of distinct
# values, one item per line.
for col in cat_features:
    print(col, df[col].nunique(), sep="\n")

sub_product
45
state
61
issue
88


In [32]:
# Frequency of each value in the issue column.
df['issue'].value_counts()

48    3666
24    3455
54    3187
53    2482
37    1498
      ... 
5        3
2        2
56       2
77       1
52       1
Name: issue, Length: 88, dtype: int64

In [21]:
# Class balance of the target column.
df['consumer_disputed'].value_counts()

1.0    15577
0.0    15577
Name: consumer_disputed, dtype: int64

In [22]:
# Keep the top_n most frequent companies and collapse every other company
# into a single 'Other' bucket.
top_n = 100
top_companies = df['company'].value_counts().head(top_n).index.tolist()
df['company'] = df['company'].where(df['company'].isin(top_companies), 'Other')

In [23]:
# Replace every categorical / one-hot column by its integer category code.
for feature in [*cat_features, *one_hot_features]:
    as_category = df[feature].astype("category")
    df[feature] = as_category.cat.codes

In [24]:
# One one-hot encoded array per feature in one_hot_features.
# Uses numpy directly: the `pd.np` alias was deprecated in pandas 1.0 and
# removed in pandas 2.0.
one_hot_x = [
    np.asarray(tf.keras.utils.to_categorical(df[feature_name].values))
    for feature_name in one_hot_features
]

In [25]:
# One flat (1-D) array per embedded feature: the integer-coded categorical
# columns followed by the raw text column(s).
# Uses numpy directly: the `pd.np` alias was deprecated in pandas 1.0 and
# removed in pandas 2.0.
embedding_x = [
    np.asarray(df[feature_name].values).reshape(-1)
    for feature_name in cat_features + text_features
]

In [26]:
# Number of one-hot input arrays (one per entry in one_hot_features).
len(one_hot_x)

3

In [27]:
# Model inputs as one flat list: the one-hot arrays first, then the
# embedding / text arrays.
X = [*one_hot_x, *embedding_x]

In [28]:
# Target labels as a flat uint8 vector.
y = np.asarray(df["consumer_disputed"], dtype=np.uint8).ravel()

In [29]:
def wide_and_deep_model(show_summary=True):
    """
    Build and compile the wide-and-deep Keras model for the complaint data.

    Side effect: calls ``wandb.init(project="consumer-complaints")`` and stores
    the run settings (name, optimizer, learning rate, data version and the
    module-level ``one_hot_features`` / ``cat_features`` lists) in
    ``wandb.config``.

    Architecture:
        * wide part - the one-hot inputs ``product_xf`` (11 wide),
          ``company_response_xf`` (5 wide) and ``timely_response_xf`` (2 wide),
          concatenated unchanged;
        * deep part - embeddings of ``sub_product_xf``, ``state_xf`` and
          ``issue_xf`` plus a Universal Sentence Encoder embedding of
          ``narrative_xf``, concatenated and passed through Dense layers of
          256, 64 and 16 ReLU units;
        * output - both parts concatenated into one sigmoid unit.

    Args:
        show_summary: when true, print ``keras_model.summary()``.

    Returns:
        A ``tf.keras`` model compiled with Adam, binary cross-entropy loss and
        a binary-accuracy metric. Its inputs are, in order: product,
        company_response, timely_response, sub_product, state, issue,
        narrative.
    """
    # Single definition of the learning rate so the value logged to wandb and
    # the value handed to the optimizer cannot drift apart.
    learning_rate = 0.001

    wandb.init(project="consumer-complaints")
    config = wandb.config
    config.name = 'test_wide_and_deep_with_narrative'
    config.optimizer = 'adam'
    config.learning_rate = learning_rate
    config.data_version = 'cc_balanced_narrative'
    config.one_hot_features = one_hot_features
    config.cat_features = cat_features

    # Widths of the one-hot encoded (wide) inputs.
    num_products = 11
    num_company_responses = 5
    num_timely_responses = 2

    # One-hot inputs. Company, sub_issue and zip code inputs are currently
    # not part of the model.
    input_product = tf.keras.Input(shape=(num_products,), name="product_xf")
    input_company_response = tf.keras.Input(shape=(num_company_responses,), name="company_response_xf")
    input_timely_response = tf.keras.Input(shape=(num_timely_responses,), name="timely_response_xf")

    # Integer-coded categorical inputs and the raw narrative string.
    input_sub_product = tf.keras.Input(shape=(1,), name="sub_product_xf")
    input_state = tf.keras.Input(shape=(1,), name="state_xf")
    input_issue = tf.keras.Input(shape=(1,), name="issue_xf")
    input_narrative = tf.keras.Input(shape=(1,), name="narrative_xf", dtype=tf.string)

    # Embed each categorical input, then drop the length-1 sequence axis that
    # the Embedding layer adds: (batch, 1, dim) -> (batch, dim).
    embed_sub_product = tf.keras.layers.Embedding(75, 35)(input_sub_product)
    embed_sub_product = tf.keras.layers.Reshape((35, ))(embed_sub_product)

    embed_state = tf.keras.layers.Embedding(75, 35)(input_state)
    embed_state = tf.keras.layers.Reshape((35, ))(embed_state)

    embed_issue = tf.keras.layers.Embedding(100, 50)(input_issue)
    embed_issue = tf.keras.layers.Reshape((50, ))(embed_issue)

    # Universal Sentence Encoder for the narrative. The input is flattened to
    # a 1-D batch of strings before it is handed to the hub layer.
    module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
    embed = hub.KerasLayer(module_url)
    reshaped_narrative = tf.reshape(input_narrative, [-1])
    embed_narrative = embed(reshaped_narrative)
    embed_narrative = tf.keras.layers.Reshape((512, ))(embed_narrative)

    # Deep tower.
    deep_ff = tf.keras.layers.concatenate(
        [embed_sub_product, embed_state, embed_issue, embed_narrative])

    deep = tf.keras.layers.Dense(256, activation='relu')(deep_ff)
    deep = tf.keras.layers.Dense(64, activation='relu')(deep)
    deep = tf.keras.layers.Dense(16, activation='relu')(deep)

    # Wide tower: the one-hot inputs, concatenated unchanged.
    wide = tf.keras.layers.concatenate(
        [input_product, input_company_response, input_timely_response])

    both = tf.keras.layers.concatenate([deep, wide])

    output = tf.keras.layers.Dense(1, activation='sigmoid')(both)

    # The order here fixes the order in which input arrays must be supplied
    # when the model is fed a list.
    _inputs = [input_product, input_company_response, input_timely_response,
               input_sub_product, input_state,
               input_issue, input_narrative]

    keras_model = tf.keras.models.Model(_inputs, output)
    keras_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                        loss='binary_crossentropy',
                        metrics=[
                            tf.keras.metrics.BinaryAccuracy(),
                        ])
    if show_summary:
        keras_model.summary()

    return keras_model

In [30]:
# Build the compiled model without printing the layer summary.
# Note: wide_and_deep_model also calls wandb.init, so this starts a wandb run.
model = wide_and_deep_model(show_summary=False)

In [31]:
# Train for at most 40 epochs with 20% of the samples held out for validation.
# Early stopping uses a patience of 5 epochs and restores the best weights;
# WandbCallback is attached alongside it.
early_stopping = tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)
model.fit(
    x=X,
    y=y,
    batch_size=128,
    validation_split=0.2,
    epochs=40,
    callbacks=[WandbCallback(), early_stopping],
)

Train on 24923 samples, validate on 6231 samples
Epoch 1/40




Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40


<tensorflow.python.keras.callbacks.History at 0x1a2b213c8>