### Kiva Loan Default Prediction


In [None]:
!pip install transformers



In [None]:
import matplotlib.pyplot as plt
import tensorflow.keras as keras
import pandas as pd

try: # this is only working on the 2nd try in colab :)
  from transformers import DistilBertTokenizer, TFDistilBertModel
  from transformers import BertTokenizer, TFBertModel
except Exception as err: # so we catch the error and import it again
  from transformers import DistilBertTokenizer, TFDistilBertModel
  from transformers import BertTokenizer, TFBertModel


import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.layers import Dense, Input, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


# Pretrained checkpoints for the two tokenizers used in this notebook.
DISTILBERT_CHECKPOINT = 'distilbert-base-uncased'
BERT_CHECKPOINT = 'bert-base-cased'

# Both tokenizers are loaded up front; `bert_tok` is the one the encoding code
# further down calls.
dbert_tokenizer = DistilBertTokenizer.from_pretrained(DISTILBERT_CHECKPOINT)
bert_tok = BertTokenizer.from_pretrained(BERT_CHECKPOINT)

# Data Preparation

## Clean the text and your targets
Hints: 

3. These resources will help you understand what type of cleaning will be required and how you can encode your text for the network:
    - a) Preprocessing: https://huggingface.co/transformers/preprocessing.html
    - b) Summary of tokenizers (DistilBERT uses WordPiece): https://huggingface.co/transformers/tokenizer_summary.html#wordpiece
4. Consider the text length: is it too long or too short for DistilBERT? What impact would padding/truncation have?
5. In the load-data step you generated a profiling report of this dataset; it may be helpful to review that as well.

In [None]:
# Locations of the labelled training set and the unlabelled Kaggle test set.
TRAIN_CSV_URL = 'https://raw.githubusercontent.com/DDave94/Loan-Default-Prediction/main/kiva_train.csv'
TEST_CSV_URL = 'https://raw.githubusercontent.com/DDave94/Loan-Default-Prediction/main/kiva_test.csv'

# Read both CSV files into dataframes.
kiva_train = pd.read_csv(TRAIN_CSV_URL)
kiva_test = pd.read_csv(TEST_CSV_URL)

# Keep only the text column of the Kaggle test set; it is encoded later on
# as the test input.
kaggle_test = kiva_test['en_clean']

# Preview the first rows of the training data.
kiva_train.head(10)

Unnamed: 0,loan_id,en_clean,defaulted
0,7779,She opened a colmado out of the side of her ho...,0
1,2777,(First Loan): Joffre continues to run his loc...,1
2,6007,"Dina Santana is the mother of two children, Ju...",0
3,76,"Rosemary is 50 years old, single, and has 6 ch...",1
4,4217,"Segundo has a shop where he sells animal feed,...",0
5,5077,"I am a single parent, mother of 4. I sell groc...",0
6,6033,Mariana Jose Serda owns a general store in Mag...,0
7,843,Mary is a mother of four children. One of the ...,1
8,4357,Wilson is seeking his second loan with Kiva an...,0
9,7007,Clara lives with her two children (ages: 20 an...,0


In [None]:
# Features are the loan description texts; the label is the `defaulted` column.
X, y = kiva_train['en_clean'], kiva_train['defaulted']

# Hold out 20% of the labelled rows for validation, with a fixed seed so the
# split is the same on every run.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# The Kaggle texts are used as the test input unchanged.
X_test = kaggle_test

### Tokenization of Data

In [None]:
def encode_text(text, max_length=300, tokenizer=None):
    """Tokenize a batch of texts into fixed-length model inputs.

    Parameters
    ----------
    text : pandas.Series, iterable of str, or str
        Texts to encode. Anything exposing ``to_list()`` (e.g. a pandas
        Series) is converted with it; a bare string is treated as a batch of
        one text; any other iterable is materialised with ``list``.
    max_length : int, default 300
        Every sequence is padded or truncated to exactly this many tokens.
    tokenizer : callable, optional
        Tokenizer to call. Defaults to the module-level ``bert_tok``. It must
        return a mapping containing ``'input_ids'``, ``'token_type_ids'`` and
        ``'attention_mask'``.

    Returns
    -------
    tuple
        ``(input_ids, token_type_ids, attention_mask)`` exactly as produced
        by the tokenizer (TensorFlow tensors, since ``return_tensors="tf"``
        is requested).
    """
    if tokenizer is None:
        # Resolved at call time so the notebook-level tokenizer can be swapped.
        tokenizer = bert_tok

    if isinstance(text, str):
        # Guard against list("abc") silently splitting a string into characters.
        texts = [text]
    elif hasattr(text, "to_list"):
        texts = text.to_list()
    else:
        texts = list(text)

    encoded = tokenizer(
        texts,
        return_tensors="tf",
        padding='max_length',
        truncation=True,
        max_length=max_length,
    )

    return (
        encoded['input_ids'],
        encoded['token_type_ids'],
        encoded['attention_mask'],
    )

def prepare_target(raw_y):
    """Encode raw class labels with Keras' ``to_categorical``.

    Label meaning: 0 = "non-default", 1 = "default".

    Args:
        raw_y: Class labels to encode (presumably integer 0/1 values; confirm
            against the caller).

    Returns:
        A NumPy array holding the ``to_categorical`` encoding of ``raw_y``.
    """
    encoded = keras.utils.to_categorical(raw_y)
    return np.array(encoded)



In [None]:
# Encode every split with `encode_text` (which calls the BERT tokenizer) and
# bundle the resulting tensors for the model.

train_input, train_tokens, train_mask = encode_text(X_train)
val_input, val_tokens, val_mask = encode_text(X_val)
test_input, test_tokens, test_mask = encode_text(X_test)

# Labels are passed through unchanged.
train_y, val_y = y_train, y_val

# The dictionary keys must match the names of the model's Keras Input layers
# ('inputs', 'tokens', 'masks').
train_model_inputs_and_masks = dict(
    inputs=train_input,
    tokens=train_tokens,
    masks=train_mask,
)

val_model_inputs_and_masks = dict(
    inputs=val_input,
    tokens=val_tokens,
    masks=val_mask,
)

test_model_inputs_and_masks = dict(
    inputs=test_input,
    tokens=test_tokens,
    masks=test_mask,
)

# Modelling

Resources:
- DistilBERT paper: https://arxiv.org/abs/1910.01108
- DistilBERT Tensorflow Documentation: https://huggingface.co/transformers/model_doc/distilbert.html#tfdistilbertmodel

In [None]:
def build_model(base_model, trainable=True, params=None):
    """Build a binary classifier on top of a pretrained transformer encoder.

    The encoder's hidden state at sequence position 0 is fed through three
    Dense+Dropout stages and a final single-unit sigmoid layer.

    Parameters
    ----------
    base_model
        Pretrained encoder. It is called as
        ``base_model(inputs, token_type_ids=..., attention_mask=...)`` and its
        output must expose ``last_hidden_state``.
    trainable : bool, default True
        Assigned to ``base_model.trainable``; set to False to freeze the encoder.
    params : dict
        Hyperparameters. Required keys: ``max_seq_len``, ``layer_width1``,
        ``dropout1``, ``layer_width2``, ``dropout2``, ``layer_width3``,
        ``dropout3``.

    Returns
    -------
    keras.Model
        Uncompiled model with inputs named 'inputs', 'tokens' and 'masks'.

    Raises
    ------
    KeyError
        If any required hyperparameter is missing. All missing keys are
        reported at once, before any layer is created.
    """
    required_keys = (
        "max_seq_len",
        "layer_width1", "dropout1",
        "layer_width2", "dropout2",
        "layer_width3", "dropout3",
    )
    # A None default avoids sharing one mutable dict between calls.
    params = {} if params is None else params
    missing = [key for key in required_keys if key not in params]
    if missing:
        raise KeyError(
            "build_model params is missing required keys: " + ", ".join(missing)
        )

    max_seq_len = params["max_seq_len"]
    inputs = Input(shape=(max_seq_len,), dtype='int64', name='inputs')
    tokens = Input(shape=(max_seq_len,), dtype='int64', name='tokens')
    masks = Input(shape=(max_seq_len,), dtype='int64', name='masks')

    base_model.trainable = trainable

    model_output = base_model(inputs, token_type_ids=tokens, attention_mask=masks)
    last_hidden_state = model_output.last_hidden_state

    # Use only the first position of the sequence as the pooled representation
    # (the [CLS] position for BERT-style tokenizers).
    cls_output = last_hidden_state[:, 0, :]

    # Three fully connected stages with dropout; widths and rates come from
    # `params` to make experimentation easy.
    x = cls_output
    for stage in (1, 2, 3):
        x = Dense(params[f"layer_width{stage}"], activation='relu')(x)
        x = Dropout(params[f"dropout{stage}"])(x)

    # Single sigmoid unit: one probability per example.
    probs = Dense(1, activation='sigmoid')(x)

    model = keras.Model(inputs=[inputs, tokens, masks], outputs=probs)
    model.summary()
    return model



In [None]:
def compile_model(model):
    """Compile `model` for binary classification and return it.

    Uses binary cross-entropy loss, Adam with a learning rate of 4e-5, and
    tracks accuracy, precision, recall and ROC AUC during training.

    NOTE: the inputs and outputs of this function must not change.

    Args:
        model: An uncompiled Keras model with a single sigmoid output.

    Returns:
        The same model instance, compiled in place.
    """
    model.compile(
        loss=keras.losses.BinaryCrossentropy(),
        optimizer=keras.optimizers.Adam(learning_rate=4e-5),
        metrics=[
            'accuracy',
            keras.metrics.Precision(),
            keras.metrics.Recall(),
            # The model has one output, so AUC keeps its default
            # multi_label=False setting.
            keras.metrics.AUC(name='auc'),
        ]
    )

    return model

In [None]:
def train_model(model, model_inputs_and_masks_train, model_inputs_and_masks_val, y_train, y_val, batch_size, num_epochs):
    """Fit `model` with early stopping on the validation loss.

    Args:
        model: Compiled Keras model.
        model_inputs_and_masks_train: Training inputs passed to ``model.fit``.
        model_inputs_and_masks_val: Validation inputs.
        y_train: Training labels.
        y_val: Validation labels.
        batch_size: Batch size passed to ``model.fit``.
        num_epochs: Upper bound on the number of epochs.

    Returns:
        Tuple ``(model, history)``, where ``history`` is the object returned
        by ``model.fit``.
    """
    # Stop once val_loss has gone one epoch without improving.
    early_stopping = keras.callbacks.EarlyStopping(
        monitor="val_loss",
        mode='min',
        verbose=1,
        patience=1,
    )

    fit_options = dict(
        batch_size=batch_size,
        epochs=num_epochs,
        verbose=1,
        validation_data=(model_inputs_and_masks_val, y_val),
        callbacks=[early_stopping],
    )
    history = model.fit(model_inputs_and_masks_train, y_train, **fit_options)

    return model, history

# Execution




Use the cell below to execute and experiment with your model

In [None]:
# Pretrained encoder to fine-tune. The DistilBERT alternative is
# TFDistilBertModel.from_pretrained('distilbert-base-uncased').
bert_model = TFBertModel.from_pretrained('bert-base-cased')

# Training-loop settings.
BATCH_SIZE = 20
NUM_EPOCHS = 14

# Hyperparameters for the classification head. max_seq_len is read from the
# encoded training data so it always matches the tokenizer's padding length.
params = {
    "max_seq_len": train_input.shape[1],
    "layer_width1": 64,
    "dropout1": 0.2,
    "layer_width2": 32,
    "dropout2": 0.2,
    "layer_width3": 16,
    "dropout3": 0.2,
}

model = build_model(bert_model, params=params)
model = compile_model(model)
model, history = train_model(
    model,
    train_model_inputs_and_masks,
    val_model_inputs_and_masks,
    train_y,
    val_y,
    batch_size=BATCH_SIZE,
    num_epochs=NUM_EPOCHS,
)


Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Cause: while/else statement not yet supported
Cause: while/else statement not yet supported

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
inputs (InputLayer)             [(None, 300)]        0                                            
_________________________________________

# Validation Data

In [None]:
# NOTE(review): this empty frame is never filled; `pred_df` is rebound to the
# result of `X_val.apply(create_preds)` later in this cell, so this
# assignment has no lasting effect.
pred_df = pd.DataFrame(columns = ['preds'])

def create_preds(text):
    """Score one text with the notebook-level `model`.

    The text is tokenized with `bert_tok`, padded or truncated to 300 tokens,
    and passed through `model` in inference mode.

    Args:
        text: The text to score.

    Returns:
        The model output converted to a NumPy array.
    """
    encoded = bert_tok(
        text,
        return_tensors="tf",
        padding='max_length',
        truncation=True,
        max_length=300,
    )

    # Keys mirror the names of the model's three Input layers.
    feed = {
        'inputs': encoded['input_ids'],
        'tokens': encoded['token_type_ids'],
        'masks': encoded['attention_mask'],
    }

    # training=False requests inference behaviour from the model.
    prediction = model(feed, training=False)

    return np.array(prediction)

# 
# Score every validation text one at a time; each entry is the model output
# for that text as a NumPy array.
pred_df = X_val.apply(create_preds)

# Round each probability to a hard 0/1 label in a single pass. `.item()` pulls
# the single value out as a Python scalar first, because calling int() directly
# on an array with ndim > 0 is deprecated in recent NumPy releases.
new_preds = pred_df.apply(lambda pred: int(pred.round().item()))

print(classification_report(y_val, new_preds))

              precision    recall  f1-score   support

           0       0.82      0.90      0.86       629
           1       0.88      0.79      0.83       599

    accuracy                           0.85      1228
   macro avg       0.85      0.84      0.85      1228
weighted avg       0.85      0.85      0.85      1228



### Kaggle Predictions

In [None]:
#model.save('/content/drive/MyDrive/Model')