In [1]:
import pandas as pd
import os
import sys
import tensorflow as tf
import numpy as np

In [41]:
import wandb
from wandb.keras import WandbCallback

W&B Run: https://app.wandb.ai/drcat101/consumer-complaints/runs/dqu609d4

In [2]:
# Put the parent of the current working directory on the import path (once),
# so that modules living one level above the notebook can be imported.
nb_dir = os.path.dirname(os.getcwd())
if sys.path.count(nb_dir) == 0:
    sys.path.append(nb_dir)

In [3]:
# Columns loaded from the complaints CSV.
feature_names = [
    "product",
    "sub_product",
    "issue",
    "sub_issue",
    "state",
    "zip_code",
    "company",
    "company_response",
    "timely_response",
    "consumer_disputed",
]

# Integer-coded features that are passed to the model as single ids.
cat_features = ["sub_product", "state"]

# bucket_features = ['zip_code']  (kept for reference; not used)

# Features that are expanded into one-hot vectors.
one_hot_features = [
    'product',
    'company_response',
    'timely_response',
    'company',
    'issue',
    'zip_code_short',
]

In [4]:
# Load only the columns listed in feature_names. Because the default NaN
# markers are switched off, 0 is the only value parsed as missing; empty text
# fields stay as empty strings.
df = pd.read_csv(
    '../data/21Feb/26k-consumer-complaints-labels.csv',
    usecols=feature_names,
    keep_default_na=False,
    na_values=0,
)

In [5]:
# Peek at the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,product,sub_product,issue,sub_issue,state,zip_code,company,company_response,timely_response,consumer_disputed
0,Debt collection,,Communication tactics,Frequent or repeated calls,TX,76119,"Premium Asset Services, LLC",In progress,Yes,
1,Debt collection,Medical,Cont'd attempts collect debt not owed,Debt is not mine,TX,77479,Accounts Receivable Consultants Inc.,Closed with explanation,Yes,
2,Mortgage,FHA mortgage,"Application, originator, mortgage broker",,MA,2127,RBS Citizens,Closed with explanation,Yes,1.0
3,Credit card,,Other,,CA,92592,Navy FCU,In progress,Yes,
4,Debt collection,Non-federal student loan,Cont'd attempts collect debt not owed,Debt resulted from identity theft,,43068,Transworld Systems Inc.,In progress,Yes,


In [66]:
# Frequency of every sub_issue value, most common first.
df.loc[:, 'sub_issue'].value_counts()

                                            14945
Debt is not mine                             2125
Account status                               1604
Information is not mine                      1307
Debt was paid                                 890
Not given enough info to verify debt          815
Frequent or repeated calls                    754
Account terms                                 484
Attempted to collect wrong amount             482
Right to dispute notice not received          387
Problem getting my free annual report         386
Public record                                 356
Talked to a third party about my debt         289
Personal information                          258
Threatened to take legal action               246
Problem with statement of dispute             243
No notice of investigation status/result      241
Debt resulted from identity theft             205
Problem getting report or credit score        198
Report improperly shared by CRC               162


In [7]:
# Keep the top_n most frequent companies and collapse every other company
# into the single label 'other'.
top_n = 20
company_counts = df['company'].value_counts()
top_companies = company_counts.index[:top_n].tolist()
df['company'] = df['company'].where(df['company'].isin(top_companies), 'other')

In [8]:
# Same treatment for issues: keep the top_n most frequent values and relabel
# the remainder as 'Other'.
issue_counts = df['issue'].value_counts()
top_issues = issue_counts.index[:top_n].tolist()
df['issue'] = df['issue'].where(df['issue'].isin(top_issues), 'Other')

In [9]:
# One-character prefix of the zip code's string form.
# NOTE(review): the df.head() output shows a Massachusetts zip as 2127, so
# zip_code appears to be parsed as a number and leading zeros are already
# lost here — confirm before relying on this prefix.
df['zip_code_short'] = df['zip_code'].astype(str).str[0]

In [10]:
# Print each one-hot feature name followed by its number of distinct values.
for col in one_hot_features:
    print(col, df[col].nunique(), sep="\n")

product
11
company_response
6
timely_response
2
company
21
issue
20
zip_code_short
9


In [11]:
# Replace every categorical column (embedding and one-hot features alike)
# with the integer codes pandas assigns to its categories.
for feature in [*cat_features, *one_hot_features]:
    as_category = df[feature].astype("category")
    df[feature] = as_category.cat.codes

In [19]:
# Missing labels become 0 (zeros were read in as NaN by the CSV loader).
df['consumer_disputed'] = df['consumer_disputed'].mask(df['consumer_disputed'].isna(), 0)

In [21]:
# Split the rows by label value: 0 -> df_majority, 1 -> df_minority.
df_majority = df.loc[df['consumer_disputed'].eq(0.)]
df_minority = df.loc[df['consumer_disputed'].eq(1.)]

In [18]:
# Number of rows in the minority class.
df_minority.shape[0]

4708

In [22]:
# Downsample the majority class (without replacement) to exactly the size of
# the minority class. The target size is taken from df_minority rather than
# hard-coded, and the draw is seeded so that a re-run picks the same rows.
df_majority = df_majority.sample(n=len(df_minority), replace=False, random_state=42)

In [23]:
# Stack the downsampled majority rows on top of the minority rows.
df_resampled = pd.concat((df_majority, df_minority), axis=0)

In [24]:
# Shuffle all rows and renumber the index. Keras' validation_split takes the
# last fraction of the data as-is, so the row order decides which rows are
# held out; the shuffle is seeded to make that split reproducible.
df_resampled = df_resampled.sample(frac=1, random_state=42).reset_index(drop=True)


In [26]:
# One one-hot matrix per feature in one_hot_features.
# - Uses numpy directly: the `pd.np` alias was deprecated in pandas 1.0 and
#   removed in pandas 2.0.
# - num_classes is pinned to the number of codes in the full frame, so the
#   matrix keeps its width even if the resampled rows happen to miss the
#   highest-coded category.
one_hot_x = [
    np.asarray(
        tf.keras.utils.to_categorical(
            df_resampled[feature_name].values,
            num_classes=int(df[feature_name].max()) + 1,
        )
    )
    for feature_name in one_hot_features
]

In [27]:
# One flat array of integer codes per feature in cat_features.
# Uses numpy directly: the `pd.np` alias was deprecated in pandas 1.0 and
# removed in pandas 2.0.
embedding_x = [
    np.asarray(df_resampled[feature_name].values).reshape(-1)
    for feature_name in cat_features
]

In [28]:
# Model inputs as one list: the one-hot matrices first, then the integer-coded
# arrays, which is the order in which get_model lists its inputs.
X = [*one_hot_x, *embedding_x]

In [33]:
# Target labels as a flat uint8 array.
y = df_resampled["consumer_disputed"].to_numpy(dtype=np.uint8).reshape(-1)

In [63]:
def get_model(show_summary=True):
    """
    Build and compile the Keras binary classifier for the complaints data.

    Side effect: every call starts a Weights & Biases run in the
    "consumer-complaints" project and records the first hidden layer size
    and the optimizer name in that run's config.

    The network has six one-hot inputs and two single-id inputs. Each
    single-id input goes through its own Embedding(70, 5) layer and is
    flattened to 5 values. Everything is concatenated and passed through
    Dense layers of 256, 128 and 16 ReLU units and a final sigmoid unit.

    Args:
        show_summary: when truthy, print the Keras model summary.

    Returns:
        The compiled tf.keras model.
    """
    wandb.init(project="consumer-complaints")
    config = wandb.config
    config.hidden_layer_size = 256
    config.optimizer = 'adam'

    layers = tf.keras.layers

    # Width of every one-hot input, keyed by input name. The dict order is the
    # order of the model's inputs.
    one_hot_widths = {
        "product_xf": 11,
        "company_response_xf": 6,
        "timely_response_xf": 2,
        "company_xf": 21,
        "issue_xf": 20,
        "zip_code_xf": 9,
    }
    one_hot_inputs = [
        tf.keras.Input(shape=(width,), name=input_name)
        for input_name, width in one_hot_widths.items()
    ]

    # Integer-coded categorical inputs: one id per example.
    id_inputs = [
        tf.keras.Input(shape=(1,), name=input_name)
        for input_name in ("sub_product_xf", "state_xf")
    ]

    # Embed each id and drop the length-1 axis so it can be concatenated with
    # the one-hot vectors.
    # NOTE(review): the vocabulary size 70 is hard-coded for both features;
    # confirm that no category code reaches 70.
    embedded = []
    for id_input in id_inputs:
        vectors = layers.Embedding(70, 5)(id_input)
        embedded.append(layers.Reshape((5, ), input_shape=(1, 5))(vectors))

    merged = layers.concatenate(one_hot_inputs + embedded)

    hidden = layers.Dense(config.hidden_layer_size, activation='relu')(merged)
    for units in (128, 16):
        hidden = layers.Dense(units, activation='relu')(hidden)
    output = layers.Dense(1, activation='sigmoid')(hidden)

    keras_model = tf.keras.models.Model(one_hot_inputs + id_inputs, output)
    keras_model.compile(
        optimizer=config.optimizer,
        loss='binary_crossentropy',
        metrics=[tf.keras.metrics.BinaryAccuracy()],
    )
    if show_summary:
        keras_model.summary()

    return keras_model

In [64]:
# Build the compiled model without printing its summary. The call also starts
# a new W&B run, because get_model calls wandb.init.
model = get_model(show_summary=False)

In [65]:
# Log to W&B and stop early once the monitored metric has not improved for 10
# epochs, restoring the best weights seen.
training_callbacks = [
    WandbCallback(),
    tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True),
]

# Train for at most 40 epochs, holding out 25% of the samples for validation.
model.fit(
    x=X,
    y=y,
    batch_size=32,
    epochs=40,
    validation_split=0.25,
    callbacks=training_callbacks,
)

Train on 7062 samples, validate on 2354 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40


<tensorflow.python.keras.callbacks.History at 0x149fd6be0>