In [1]:
import sys,os
import importlib
from dataclasses import dataclass
from typing import Callable

sys.path.insert(0,'/home/ubuntu/notebooks/')

import pandas as pd
import numpy as np

import tensorflow as tf

from tensorflow.keras.layers import IntegerLookup
from tensorflow.keras.layers import Normalization
from tensorflow.keras.layers import StringLookup

In [5]:
from server_api import parse

In [None]:
# Show the installed TensorFlow version so the recorded results can be tied to it.
tf.__version__

'2.6.0'

# Tasks

- Create an API endpoint which accepts a CSV file as the input via POST method


- Load the data into a database of your choice


- Create an ML model to predict the likelihood of a customer subscribing to a banking product. Note that this can be a very simple model, and the goal here is not to spend time on tweaking the model to achieve the best classification performance. 


- Create an API endpoint which accepts relevant parameters from a potential customer and returns the likelihood of them also subscribing to the product


# EDA and parsing of data

In [7]:
# Load the bank marketing CSV through the project parser, which returns the
# frame plus a second value bound to `missing_cols`.
# NOTE(review): the preview below shows the categorical columns as integer codes,
# so parse_csv presumably encodes them already — confirm in server_api.parse.
df, missing_cols = parse.parse_csv("bank-full.csv")
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,58,4,1,2,0,2143,1,0,2,5,8,261,1,-1,0,3
1,44,9,2,1,0,29,1,0,2,5,8,151,1,-1,0,3
2,33,2,1,1,0,2,1,1,2,5,8,76,1,-1,0,3
3,47,1,1,3,0,1506,1,0,2,5,8,92,1,-1,0,3
4,33,11,2,3,0,1,0,0,2,5,8,198,1,-1,0,3


# Categorical Encoding


## target column "y"

In [13]:
# Look at the distinct label values, then count the rows of each class and
# finish with the positive-to-negative ratio as the cell's displayed value.
df["y"].unique()

n_positive = int((df["y"] == 1).sum())
print(n_positive)

n_negative = int((df["y"] == 0).sum())
print(n_negative)

n_positive / n_negative

5289
39922


0.1324833425179099

# Datasets

## Data functions

In [55]:
def dataframe_to_dataset(df, shuffle=True):
    """Turn a DataFrame into a ``tf.data.Dataset`` of ``(features, label)`` pairs.

    The input frame is copied first, so the caller's frame is left untouched.
    A ``"duration"`` column, when present, is removed from the features.
    The ``"y"`` column, when present, becomes the labels; otherwise every row
    gets the placeholder label ``-1``.

    Args:
        df: Source DataFrame; each remaining column becomes one entry of the
            per-example feature dict, keyed by column name.
        shuffle: When true, shuffle with a buffer as large as the frame.

    Returns:
        The resulting (unbatched) ``tf.data.Dataset``.
    """
    frame = df.copy()

    if "duration" in frame.columns:
        frame = frame.drop(columns=["duration"])

    if "y" in frame.columns:
        targets = frame.pop("y")
    else:
        # No label column: use a -1 placeholder label for every row.
        targets = [-1] * len(frame)

    dataset = tf.data.Dataset.from_tensor_slices((dict(frame), targets))
    return dataset.shuffle(buffer_size=len(frame)) if shuffle else dataset

def encode_numerical_feature(feature, name, dataset):
    """Normalize one numerical input using statistics learned from ``dataset``.

    Args:
        feature: The Keras input (symbolic tensor) to normalize.
        name: Key of the feature inside each example's feature dict.
        dataset: Dataset of ``(features_dict, label)`` pairs to adapt on.

    Returns:
        The output of a freshly adapted ``Normalization`` layer applied to
        ``feature``.
    """
    # Keep only the requested column (labels are discarded) and add a trailing
    # axis to every value before handing the stream to the layer.
    column_values = dataset.map(lambda x, y: x[name]).map(
        lambda value: tf.expand_dims(value, -1)
    )

    scaler = Normalization()
    # Learn the statistics of the column from the data.
    scaler.adapt(column_values)

    return scaler(feature)


def encode_categorical_feature(feature, name, dataset, is_string=False):
    """Encode one categorical input with a lookup layer adapted on ``dataset``.

    Args:
        feature: The Keras input (symbolic tensor) to encode.
        name: Key of the feature inside each example's feature dict.
        dataset: Dataset of ``(features_dict, label)`` pairs to adapt on.
        is_string: Use ``StringLookup`` when true, ``IntegerLookup`` otherwise.

    Returns:
        The output of the adapted lookup layer (built with
        ``output_mode="binary"``) applied to ``feature``.
    """
    # Keep only the requested column (labels are discarded) and add a trailing
    # axis to every value before handing the stream to the layer.
    column_values = dataset.map(lambda x, y: x[name]).map(
        lambda value: tf.expand_dims(value, -1)
    )

    if is_string:
        lookup = StringLookup(output_mode="binary")
    else:
        lookup = IntegerLookup(output_mode="binary")

    # Learn the set of values that occur in the column.
    lookup.adapt(column_values)

    return lookup(feature)


In [15]:
@dataclass
class Feature:
    """A single model input column: its Keras input and its encoded tensor.

    ``dtyper`` is both the dtype given to ``tf.keras.Input`` and the switch
    that selects the encoder in ``encode_func``.
    """
    # Column name; also used as the name of the Keras input.
    name: str
    # dtype string: "string", "int64" or "float32" are the values encode_func accepts.
    dtyper: str
    # Not read by any method of this class.
    is_string: bool=False
    # Cache for the lazily created Keras input; None until first requested.
    __input_placeholder: Callable=None

    @property
    def input_placeholder(self):
        """Return this feature's Keras input, creating it on first access."""
        cached = self.__input_placeholder
        if cached is None:
            cached = tf.keras.Input(shape=(1,), name=self.name, dtype=self.dtyper)
            self.__input_placeholder = cached
        return cached

    @property
    def encode_func(self):
        """Return the encoded tensor for this feature.

        Reads the notebook-level ``train_ds`` to adapt the encoding layer, so
        ``train_ds`` must exist before this is accessed. A new layer is built
        and adapted on every access.

        Raises:
            AttributeError: If ``dtyper`` is not "string", "int64" or "float32".
        """
        kind = self.dtyper
        if kind == "float32":
            return encode_numerical_feature(self.input_placeholder, self.name, train_ds)
        if kind == "string" or kind == "int64":
            return encode_categorical_feature(
                self.input_placeholder, self.name, train_ds, kind == "string"
            )
        raise AttributeError(f"Type {self.dtyper} not recognised no valid logic ")
        

## Sample balanced proportion of positive and negative for validation

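- Ideally we don't want to lose 50% of the positive-class training data, but at this stage confirming the findings and the predictive power of the model is more important than optimisation.
- Note: the split below draws a uniform 5% random sample, so the validation set follows the overall class ratio rather than being balanced.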

In [11]:
# Per-row weight = 1 / (number of rows sharing that row's "y" value), so the
# weights of each class sum to 1.
# NOTE(review): `weights` is not referenced by any later cell visible here —
# presumably meant for df.sample(..., weights=weights) to draw a class-balanced
# validation set; confirm or remove.
weights = 1 / df.groupby('y')['y'].transform('count')

In [16]:
# Hold out a random 5% of the rows for validation. Fixing random_state makes the
# split — and every metric computed from it — reproducible across kernel restarts.
validation_df = df.sample(frac=0.05, random_state=42)
train_df = df.drop(validation_df.index)

# The two sets must partition df: no row lost and none shared between them.
assert len(train_df) + len(validation_df) == len(df)

# Show the class counts of the hold-out set so its balance can be checked.
print(validation_df["y"].value_counts())

train_ds = dataframe_to_dataset(train_df)
validation_ds = dataframe_to_dataset(validation_df)


## Encode the variables

In [19]:
# Column names grouped by how they are encoded. The group key is what the
# feature-building loop uses to choose each column's dtype and encoder.
feature_names_lookup = {
    # Integer-coded categorical columns.
    "cat_int": [
        "day",
        "job",
        "marital",
        "education",
        "default",
        "housing",
        "loan",
        "contact",
        "month",
        "poutcome",
    ],
    # Columns treated as continuous values.
    "numerical_features": [
        "age",
        "balance",
        "campaign",
        "pdays",
        "previous",
    ],
}

In [20]:
%%time
# Build one Feature per column and collect its Keras input and encoded tensor.
# The group key picks the dtype: "cat_strings" -> string, "cat_int" -> int64,
# and any other key -> float32.
dtype_for_group = {"cat_strings": "string", "cat_int": "int64"}

inputs, features = [], []

for feature_cat, feat_name_list in feature_names_lookup.items():
    feature_dtype = dtype_for_group.get(feature_cat, "float32")
    feature_list = [Feature(feat_name, feature_dtype) for feat_name in feat_name_list]

    # Inputs are created first; accessing encode_func then adapts an encoding
    # layer on train_ds for each feature, which is the slow part of this cell.
    inputs.extend([f.input_placeholder for f in feature_list])
    features.extend([f.encode_func for f in feature_list])

CPU times: user 4min 43s, sys: 1min 27s, total: 6min 11s
Wall time: 3min 12s


# Model

Ridiculously simple baseline model 

In [21]:
keras_layers = tf.keras.layers
keras_metrics = tf.keras.metrics

# Join every encoded feature into one input vector for the dense network.
all_features = keras_layers.concatenate(features)

# Two ReLU hidden layers (60 and 30 units) with 50% dropout between them.
x = keras_layers.Dense(60, activation="relu")(all_features)
x = keras_layers.Dropout(0.5)(x)
x = keras_layers.Dense(30, activation="relu")(x)

# Single sigmoid unit: the predicted probability of the positive class.
output = keras_layers.Dense(1, activation="sigmoid")(x)

model = tf.keras.Model(inputs, output)
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=[
        keras_metrics.BinaryAccuracy(),
        keras_metrics.AUC(),
        keras_metrics.Recall(),
        keras_metrics.Precision(),
    ],
)

In [22]:
# Group both datasets into mini-batches of 70 examples for fitting and evaluation.
__train_ds = train_ds.batch(70)
__validation_ds = validation_ds.batch(70)

In [23]:
# Train for 50 epochs. validation_freq=5 evaluates on the validation set only
# on every fifth epoch, which is why val_* metrics appear only on those lines.
model.fit(__train_ds, epochs=50, validation_data=__validation_ds, validation_freq=5, verbose=2)

Epoch 1/50
614/614 - 3s - loss: 0.3321 - binary_accuracy: 0.8846 - auc: 0.7042 - recall: 0.0570 - precision: 0.5586
Epoch 2/50
614/614 - 2s - loss: 0.3058 - binary_accuracy: 0.8912 - auc: 0.7585 - recall: 0.1571 - precision: 0.6391
Epoch 3/50
614/614 - 2s - loss: 0.3006 - binary_accuracy: 0.8924 - auc: 0.7681 - recall: 0.1884 - precision: 0.6321
Epoch 4/50
614/614 - 2s - loss: 0.2960 - binary_accuracy: 0.8925 - auc: 0.7790 - recall: 0.2000 - precision: 0.6241
Epoch 5/50
614/614 - 2s - loss: 0.2950 - binary_accuracy: 0.8922 - auc: 0.7801 - recall: 0.2030 - precision: 0.6162 - val_loss: 0.2952 - val_binary_accuracy: 0.8921 - val_auc: 0.8055 - val_recall: 0.2007 - val_precision: 0.6875
Epoch 6/50
614/614 - 2s - loss: 0.2926 - binary_accuracy: 0.8925 - auc: 0.7852 - recall: 0.2090 - precision: 0.6172
Epoch 7/50
614/614 - 2s - loss: 0.2898 - binary_accuracy: 0.8942 - auc: 0.7908 - recall: 0.2301 - precision: 0.6275
Epoch 8/50
614/614 - 2s - loss: 0.2885 - binary_accuracy: 0.8934 - auc: 0.79

<keras.callbacks.History at 0x7fa0f479d700>

## Save to disk

In [25]:
# Persist the trained model to disk.
# NOTE(review): the recorded output below reports assets written under
# "server_api/simple_bank_model.model", which does not match this relative
# path — confirm the intended location (presumably wherever
# server_api.prediction_model loads the model from).
model.save("simple_bank_model.model")

INFO:tensorflow:Assets written to: server_api/simple_bank_model.model/assets


In [26]:
# Take the first two rows as a tiny smoke-test input for prediction.
test = df.iloc[:2]

In [30]:
# Remove the label and the unused "duration" column from the smoke-test rows.
# errors="ignore" keeps this cell idempotent: re-running it after the columns
# are already gone is a no-op instead of a KeyError.
test = test.drop(columns=["y", "duration"], errors="ignore")

In [85]:
# Keep the row order (no shuffling) so each prediction lines up with its input
# row, batch the rows, and flatten the model output into a 1-D array.
test_ds = dataframe_to_dataset(test, shuffle=False).batch(70)

preds = model.predict(test_ds).ravel()

preds

array([0.05408671, 0.04288122], dtype=float32)

# Validate model reloads

In [76]:
from server_api import prediction_model

In [80]:
# Python 3 has no builtin `reload`; importlib.reload re-imports the module so
# edits made to it on disk take effect without restarting the kernel.
importlib.reload(prediction_model)

# Predict on the same smoke-test rows through the server module; the result
# should match the in-notebook predictions above.
prediction_model.predict_liklihood(test)

array([0.05408671, 0.04288122], dtype=float32)