In [1]:
import pandas as pd
import os
import sys
import tensorflow as tf
import numpy as np

In [31]:
import wandb
from wandb.keras import WandbCallback

In [2]:
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

In [3]:
feature_names = ["product", "sub_product", "issue", "sub_issue", "state", "zip_code", "company", "company_response", "timely_response", "consumer_disputed"]
cat_features = ["sub_product", "state", "issue", "company", "sub_issue"]
#bucket_features = ['zip_code']
one_hot_features = ['product', 'company_response', 'timely_response', 'zip_code_short']

In [4]:
#df = pd.read_csv('../data/21Feb/26k-consumer-complaints-labels.csv', usecols=feature_names, na_values=0, keep_default_na=False)
df = pd.read_csv('../data/29Feb/cc_large_balanced.csv', usecols=feature_names, na_values=0, keep_default_na=False)

In [5]:
df.head()

Unnamed: 0,product,sub_product,issue,sub_issue,company,state,zip_code,company_response,timely_response,consumer_disputed
0,Credit card,,Identity theft / Fraud / Embezzlement,,Bank of America,TX,77411,Closed with explanation,Yes,
1,Credit reporting,,Incorrect information on credit report,Information is not mine,Equifax,NY,14075,Closed with explanation,Yes,1.0
2,Debt collection,Credit card,Cont'd attempts collect debt not owed,Debt was paid,"Oxford Law, LLC",FL,32225,Closed with explanation,Yes,1.0
3,Credit reporting,,Incorrect information on credit report,Information is not mine,Equifax,NE,69341,Closed with explanation,Yes,
4,Mortgage,Other mortgage,"Loan servicing, payments, escrow account",,Fifth Third Financial Corporation,PA,15137,Closed with explanation,Yes,1.0


In [6]:
len(df)

120212

In [None]:
df['product'].nunique()

In [None]:
for col in one_hot_features:
    print(col)
    print(df[col].nunique())

In [7]:
df['zip_code_short'] = df['zip_code'].apply(lambda x: str(x)[0] if len(x)>0 else np.nan)

In [None]:
for col in cat_features:
    print(col)
    print(df[col].nunique())

In [8]:
top_n = 100
top_companies = df['company'].value_counts().head(top_n).index.tolist()
df.loc[~df['company'].isin(top_companies), 'company'] = 'Other'

In [None]:
#top_issues = df['issue'].value_counts().head(top_n).index.tolist()
#df.loc[~df['issue'].isin(top_issues), 'issue'] = 'Other'

In [9]:
for feature in cat_features + one_hot_features:
    df[feature] = df[feature].astype("category").cat.codes

In [None]:
#df['consumer_disputed'] = df['consumer_disputed'].fillna(0)

In [None]:
#df_majority = df[df['consumer_disputed']==0.]
#df_minority = df[df['consumer_disputed']==1.]

In [None]:
#len(df_minority)

In [None]:
#df_majority = df_majority.sample(n=4708, replace=False)

In [None]:
#df_resampled = pd.concat([df_majority, df_minority])

In [None]:
#df_resampled = df_resampled.sample(frac=1).reset_index(drop=True)


In [10]:
one_hot_x = [pd.np.asarray(tf.keras.utils.to_categorical(df[feature_name].values)) for feature_name in one_hot_features]

In [11]:
embedding_x = [pd.np.asarray(df[feature_name].values).reshape(-1) for feature_name in cat_features]#[:-1]

In [12]:
len(one_hot_x)

4

In [13]:
X = one_hot_x + embedding_x

In [14]:
y = np.asarray(df["consumer_disputed"], dtype=np.uint8).reshape(-1)

In [19]:
 def get_model(hp, show_summary=True):
    """
    Function defines a Keras model and returns the model as Keras object
    """
    #wandb.init(project="consumer-complaints")
    #config = wandb.config
    #config.hidden_layer_size = 256
    #config.optimizer = 'adam'
    #config.data_version = 'cc_large_balanced'

    #cat_features = ["sub_product", "state", "issue", "company", "sub_issue"]
    # one_hot_features = ['product', 'company_response', 'timely_response', 'company', 'issue', 'zip_code_short']

    # one-hot categorical features
    num_products = 11
    num_company_responses = 5
    num_timely_responses = 2
    num_companies = 41
    num_issues = 40
    num_zip_codes = 14

    input_product = tf.keras.Input(shape=(num_products,), name="product_xf")
    input_company_response = tf.keras.Input(shape=(num_company_responses,), name="company_response_xf")
    input_timely_response = tf.keras.Input(shape=(num_timely_responses,), name="timely_response_xf")
    input_zip_code = tf.keras.Input(shape=(num_zip_codes,), name="zip_code_xf")

    # categorical features
    input_sub_product = tf.keras.Input(shape=(1,), name="sub_product_xf")
    input_state = tf.keras.Input(shape=(1,), name="state_xf")
    input_issue = tf.keras.Input(shape=(1,), name="issue_xf")
    input_company = tf.keras.Input(shape=(1,), name="company_xf")
    input_sub_issue = tf.keras.Input(shape=(1,), name="sub_issue_xf")


    # convert to embeddings
    embed_sub_product = tf.keras.layers.Embedding(70, 35)(input_sub_product)
    embed_sub_product = tf.keras.layers.Reshape((35, ), input_shape=(1, 35))(embed_sub_product)

    embed_state = tf.keras.layers.Embedding(70, 35)(input_state)
    embed_state = tf.keras.layers.Reshape((35, ), input_shape=(1, 35))(embed_state)
    
    embed_issue = tf.keras.layers.Embedding(100, 50)(input_issue)
    embed_issue = tf.keras.layers.Reshape((50, ), input_shape=(1, 50))(embed_issue)
    
    embed_company = tf.keras.layers.Embedding(110, 50)(input_company)
    embed_company = tf.keras.layers.Reshape((50, ), input_shape=(1, 50))(embed_company)
    
    embed_sub_issue = tf.keras.layers.Embedding(70, 35)(input_sub_issue)
    embed_sub_issue = tf.keras.layers.Reshape((35, ), input_shape=(1, 35))(embed_sub_issue)

    x_feed_forward = tf.keras.layers.concatenate(
        [input_product, input_company_response, input_timely_response, input_zip_code, 
         embed_sub_product, embed_state, embed_issue, embed_company, embed_sub_issue])

    
    x = tf.keras.layers.Dense(256, activation='relu')(x_feed_forward)
    x = tf.keras.layers.Dropout(0.35)(x)
    x = tf.keras.layers.Dense(256, activation='relu')(x)
    x = tf.keras.layers.Dropout(0.15)(x)
    x = tf.keras.layers.Dense(32, activation='relu')(x)
    x = tf.keras.layers.Dropout(0.35)(x)

    output = tf.keras.layers.Dense(1, activation='sigmoid')(x) 

    _inputs = [input_product, input_company_response, input_timely_response, input_zip_code, 
               input_sub_product, input_state, 
              input_issue, input_company, input_sub_issue] 

    keras_model = tf.keras.models.Model(_inputs, output)
    keras_model.compile(optimizer=tf.keras.optimizers.Adam('learning_rate'=0.001),
                     loss='binary_crossentropy',  # categorical_crossentropy
                     metrics=[
                         tf.keras.metrics.BinaryAccuracy(),
                         ])
    if show_summary:
        keras_model.summary()

    return keras_model

In [None]:
model = get_model(show_summary=False)

In [None]:
model.fit(x=X, y=y, batch_size=128, validation_split=0.2, epochs=40, 
          callbacks=[WandbCallback(), tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)])

In [23]:
from kerastuner.tuners import RandomSearch

tuner = RandomSearch(
    get_model,
    objective='val_binary_accuracy',
    max_trials=5,
    executions_per_trial=3,
    directory='./tuner_experiment',
    project_name='helloworld')

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
sub_product_xf (InputLayer)     [(None, 1)]          0                                            
__________________________________________________________________________________________________
state_xf (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
issue_xf (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
company_xf (InputLayer)         [(None, 1)]          0                                            
______________________________________________________________________________________________

In [24]:
tuner.search(X, y,
             epochs=5,
             validation_split=0.2)

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
sub_product_xf (InputLayer)     [(None, 1)]          0                                            
__________________________________________________________________________________________________
state_xf (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
issue_xf (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
company_xf (InputLayer)         [(None, 1)]          0                                            
______________________________________________________________________________________________

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
sub_product_xf (InputLayer)     [(None, 1)]          0                                            
__________________________________________________________________________________________________
state_xf (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
issue_xf (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
company_xf (InputLayer)         [(None, 1)]          0                                            
______________________________________________________________________________________________

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
sub_product_xf (InputLayer)     [(None, 1)]          0                                            
__________________________________________________________________________________________________
state_xf (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
issue_xf (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
company_xf (InputLayer)         [(None, 1)]          0                                            
______________________________________________________________________________________________

INFO:tensorflow:Oracle triggered exit


In [None]:
# https://towardsdatascience.com/how-to-build-a-wide-and-deep-model-using-keras-in-tensorflow-2-0-2f7a236b5a4b
def wide_and_deep_classifier(inputs, linear_feature_columns, dnn_feature_columns, dnn_hidden_units):
    deep = tf.keras.layers.DenseFeatures(dnn_feature_columns)(inputs)
    for numnodes in dnn_hidden_units:
        deep = tf.keras.layers.Dense(numnodes, activation='relu')(deep)        
    wide = tf.keras.layers.DenseFeatures(linear_feature_columns)(inputs)
    both = tf.keras.layers.concatenate([deep, wide])
    output = tf.keras.layers.Dense(1, activation='sigmoid')(both)
    model = tf.keras.Model(inputs, output)
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model


In [None]:
model = wide_and_deep_classifier(inputs, sparse.values(), real.values(), [64, 16])

In [29]:
def wide_and_deep_model(show_summary=True):
    """
    Function defines a Keras model and returns the model as Keras object
    """
    wandb.init(project="consumer-complaints")
    config = wandb.config
    config.name='test_wide_and_deep'
    #config.hidden_layer_size = 256
    config.optimizer = 'adam'
    config.learning_rate = 0.001
    config.data_version = 'cc_large_balanced'

    #cat_features = ["sub_product", "state", "issue", "company", "sub_issue"]
    # one_hot_features = ['product', 'company_response', 'timely_response', 'company', 'issue', 'zip_code_short']

    # one-hot categorical features
    num_products = 11
    num_company_responses = 5
    num_timely_responses = 2
    num_companies = 41
    num_issues = 40
    num_zip_codes = 14

    input_product = tf.keras.Input(shape=(num_products,), name="product_xf")
    input_company_response = tf.keras.Input(shape=(num_company_responses,), name="company_response_xf")
    input_timely_response = tf.keras.Input(shape=(num_timely_responses,), name="timely_response_xf")
    input_zip_code = tf.keras.Input(shape=(num_zip_codes,), name="zip_code_xf")

    # categorical features
    input_sub_product = tf.keras.Input(shape=(1,), name="sub_product_xf")
    input_state = tf.keras.Input(shape=(1,), name="state_xf")
    input_issue = tf.keras.Input(shape=(1,), name="issue_xf")
    input_company = tf.keras.Input(shape=(1,), name="company_xf")
    input_sub_issue = tf.keras.Input(shape=(1,), name="sub_issue_xf")


    # convert to embeddings
    embed_sub_product = tf.keras.layers.Embedding(70, 35)(input_sub_product)
    embed_sub_product = tf.keras.layers.Reshape((35, ), input_shape=(1, 35))(embed_sub_product)

    embed_state = tf.keras.layers.Embedding(70, 35)(input_state)
    embed_state = tf.keras.layers.Reshape((35, ), input_shape=(1, 35))(embed_state)
    
    embed_issue = tf.keras.layers.Embedding(100, 50)(input_issue)
    embed_issue = tf.keras.layers.Reshape((50, ), input_shape=(1, 50))(embed_issue)
    
    embed_company = tf.keras.layers.Embedding(110, 50)(input_company)
    embed_company = tf.keras.layers.Reshape((50, ), input_shape=(1, 50))(embed_company)
    
    embed_sub_issue = tf.keras.layers.Embedding(70, 35)(input_sub_issue)
    embed_sub_issue = tf.keras.layers.Reshape((35, ), input_shape=(1, 35))(embed_sub_issue)
    
    # add USE for narrative, and reduce number of dimensions
    
    deep_ff = tf.keras.layers.concatenate(
        [embed_sub_product, embed_state, embed_issue, embed_company, embed_sub_issue])
    
    deep = tf.keras.layers.Dense(64, activation='relu')(deep_ff)
    deep = tf.keras.layers.Dense(64, activation='relu')(deep)


    wide = tf.keras.layers.concatenate(
        [input_product, input_company_response, input_timely_response, input_zip_code])

    both = tf.keras.layers.concatenate([deep, wide])

    output = tf.keras.layers.Dense(1, activation='sigmoid')(both) 

    _inputs = [input_product, input_company_response, input_timely_response, input_zip_code, 
               input_sub_product, input_state, 
              input_issue, input_company, input_sub_issue] 

    keras_model = tf.keras.models.Model(_inputs, output)
    keras_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                     loss='binary_crossentropy',  # categorical_crossentropy
                     metrics=[
                         tf.keras.metrics.BinaryAccuracy(),
                         ])
    if show_summary:
        keras_model.summary()

    return keras_model

In [32]:
model = wide_and_deep_model(show_summary=False)

In [33]:
model.fit(x=X, y=y, batch_size=128, validation_split=0.2, epochs=40, 
          callbacks=[WandbCallback(), tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)])

Train on 96169 samples, validate on 24043 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40


<tensorflow.python.keras.callbacks.History at 0x142fa9d68>