In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pandas as pd
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt


# Import checkpoint dependencies
import os
from tensorflow.keras.callbacks import ModelCheckpoint

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


### Description Of The Source Of Data

Fannie Mae provides loan performance data on a portion of its single-family mortgage loans to promote better understanding of the credit performance of Fannie Mae mortgage loans.

Link: https://www.fanniemae.com/portal/funding-the-market/data/loan-performance-data.html

In [2]:
# Import our input dataset
mortgage_df = pd.read_csv('Final Project Data.csv')
mortgage_df.head()


Unnamed: 0,Loan Identifier,Origination Channel,Seller Name,Original Interest Rate,Original UPB,Original Loan Term,Origination Date,First Payment Date,Original Loan-to-value (LTV),Original Combined Loan-to-value (CLTV),...,Property Type,Number of Units,Occupancy Type,Property State,Zip Code Short,Primary Mortgage Insurance Percent,Product Type,Co-borrower Credit Score at Origination,Mortgage Insurance Type,Relocation Mortgage Indicator
0,100000913397,C,"JPMORGAN CHASE BANK, NATIONAL ASSOCIATION",6,324000,360,2018-09-01,2018-11-01,80,80,...,PU,1,P,CA,925,,FRM,665.0,,N
1,100017539727,B,OTHER,5,307000,360,2018-12-01,2019-02-01,90,90,...,PU,1,P,TX,770,25.0,FRM,,1.0,N
2,100018053040,R,"JPMORGAN CHASE BANK, NATIONAL ASSOCIATION",5,256000,360,2018-11-01,2019-01-01,90,90,...,SF,1,S,NC,286,25.0,FRM,738.0,1.0,N
3,100019764317,C,"WELLS FARGO BANK, N.A.",5,248000,360,2018-12-01,2019-02-01,90,90,...,SF,1,P,IL,600,25.0,FRM,791.0,1.0,N
4,100019765730,B,"UNITED SHORE FINANCIAL SERVICES, LLC DBA UNITE...",4,490000,360,2019-03-01,2019-05-01,67,67,...,CO,1,P,CA,945,,FRM,,,N


In [3]:
len(mortgage_df.index)

297452

### Drop Cloumns which are not adding value to the learning process
Right away we can observe that Loan Identifier column does not provide any value. So we will remove that column.

In [4]:
# Drop the unwanted variables from the data
mortgage_input = mortgage_df.drop(columns = ['Loan Identifier', 'Seller Name'])
mortgage_input.head()

Unnamed: 0,Origination Channel,Seller Name,Original Interest Rate,Original UPB,Original Loan Term,Origination Date,First Payment Date,Original Loan-to-value (LTV),Original Combined Loan-to-value (CLTV),Number of Borrowers,...,Property Type,Number of Units,Occupancy Type,Property State,Zip Code Short,Primary Mortgage Insurance Percent,Product Type,Co-borrower Credit Score at Origination,Mortgage Insurance Type,Relocation Mortgage Indicator
0,C,"JPMORGAN CHASE BANK, NATIONAL ASSOCIATION",6,324000,360,2018-09-01,2018-11-01,80,80,2,...,PU,1,P,CA,925,,FRM,665.0,,N
1,B,OTHER,5,307000,360,2018-12-01,2019-02-01,90,90,1,...,PU,1,P,TX,770,25.0,FRM,,1.0,N
2,R,"JPMORGAN CHASE BANK, NATIONAL ASSOCIATION",5,256000,360,2018-11-01,2019-01-01,90,90,2,...,SF,1,S,NC,286,25.0,FRM,738.0,1.0,N
3,C,"WELLS FARGO BANK, N.A.",5,248000,360,2018-12-01,2019-02-01,90,90,2,...,SF,1,P,IL,600,25.0,FRM,791.0,1.0,N
4,B,"UNITED SHORE FINANCIAL SERVICES, LLC DBA UNITE...",4,490000,360,2019-03-01,2019-05-01,67,67,1,...,CO,1,P,CA,945,,FRM,,,N


### Remove all Nan Values

In [5]:
# Drop NA rows
mortgage_input = mortgage_input.dropna()
mortgage_input.head()

Unnamed: 0,Origination Channel,Seller Name,Original Interest Rate,Original UPB,Original Loan Term,Origination Date,First Payment Date,Original Loan-to-value (LTV),Original Combined Loan-to-value (CLTV),Number of Borrowers,...,Property Type,Number of Units,Occupancy Type,Property State,Zip Code Short,Primary Mortgage Insurance Percent,Product Type,Co-borrower Credit Score at Origination,Mortgage Insurance Type,Relocation Mortgage Indicator
2,R,"JPMORGAN CHASE BANK, NATIONAL ASSOCIATION",5,256000,360,2018-11-01,2019-01-01,90,90,2,...,SF,1,S,NC,286,25.0,FRM,738.0,1.0,N
3,C,"WELLS FARGO BANK, N.A.",5,248000,360,2018-12-01,2019-02-01,90,90,2,...,SF,1,P,IL,600,25.0,FRM,791.0,1.0,N
6,R,"JPMORGAN CHASE BANK, NATIONAL ASSOCIATION",5,264000,360,2018-10-01,2018-12-01,95,95,2,...,PU,1,P,TX,773,30.0,FRM,778.0,1.0,Y
8,R,OTHER,5,174000,360,2018-12-01,2019-02-01,97,97,2,...,SF,1,P,IN,479,35.0,FRM,708.0,1.0,N
9,B,"UNITED SHORE FINANCIAL SERVICES, LLC DBA UNITE...",5,204000,360,2019-01-01,2019-03-01,95,95,2,...,PU,1,P,UT,844,25.0,FRM,722.0,1.0,N


In [None]:
len(mortgage_input.index)

In [6]:
# Generate our categorical variable list
mortgage_cat = mortgage_input.dtypes[mortgage_input.dtypes == "object"].index.tolist()
mortgage_cat

['Origination Channel',
 'Seller Name',
 'Origination Date',
 'First Payment Date',
 'First Time Home Buyer Indicator',
 'Loan Purpose ',
 'Property Type',
 'Occupancy Type',
 'Property State',
 'Product Type',
 'Relocation Mortgage Indicator']

In [7]:
# Check the number of unique values in each column
mortgage_input[mortgage_cat].nunique()

Origination Channel                 3
Seller Name                        17
Origination Date                   19
First Payment Date                 19
First Time Home Buyer Indicator     2
Loan Purpose                        3
Property Type                       5
Occupancy Type                      3
Property State                     52
Product Type                        1
Relocation Mortgage Indicator       2
dtype: int64

### We will now use One Hot Encoder to convert text based entries to number values which we will be able to use in our machine learning model.

In [8]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(mortgage_input[mortgage_cat]))

# Add the encoded variable names to the DataFrame
encode_df.columns = enc.get_feature_names(mortgage_cat)
encode_df.head()

Unnamed: 0,Origination Channel_B,Origination Channel_C,Origination Channel_R,"Seller Name_AMERIHOME MORTGAGE COMPANY, LLC",Seller Name_DITECH FINANCIAL LLC,"Seller Name_EAGLE HOME MORTGAGE, LLC",Seller Name_FAIRWAY INDEPENDENT MORTGAGE CORPORATION,"Seller Name_FLAGSTAR BANK, FSB",Seller Name_FREEDOM MORTGAGE CORP.,"Seller Name_JPMORGAN CHASE BANK, NATIONAL ASSOCIATION",...,Property State_UT,Property State_VA,Property State_VT,Property State_WA,Property State_WI,Property State_WV,Property State_WY,Product Type_FRM,Relocation Mortgage Indicator_N,Relocation Mortgage Indicator_Y
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [9]:
# Merge one-hot encoded features and drop the originals
mortgage_merged = mortgage_input.merge(encode_df,left_index=True, right_index=True)
mortgage_merged = mortgage_merged.drop(mortgage_cat,1)
mortgage_merged.head()

Unnamed: 0,Original Interest Rate,Original UPB,Original Loan Term,Original Loan-to-value (LTV),Original Combined Loan-to-value (CLTV),Number of Borrowers,Original Debt to Income Ratio,Borrower Credit Score at Origination,Number of Units,Zip Code Short,...,Property State_UT,Property State_VA,Property State_VT,Property State_WA,Property State_WI,Property State_WV,Property State_WY,Product Type_FRM,Relocation Mortgage Indicator_N,Relocation Mortgage Indicator_Y
2,5,256000,360,90,90,2,41.0,728.0,1,286,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
3,5,248000,360,90,90,2,40.0,730.0,1,600,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
6,5,264000,360,95,95,2,45.0,710.0,1,773,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
8,5,174000,360,97,97,2,37.0,747.0,1,479,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
9,5,204000,360,95,95,2,42.0,717.0,1,844,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [10]:
def decision(probability):
    return np.random.choice([0, 1], size = len(mortgage_merged.index), p = [1 - probability, probability])

foreclosure_rng = decision(0.266)
foreclosure_rng

array([1, 1, 1, ..., 0, 1, 1])

In [11]:
mortgage_merged["Foreclosure"] = foreclosure_rng
mortgage_merged

Unnamed: 0,Original Interest Rate,Original UPB,Original Loan Term,Original Loan-to-value (LTV),Original Combined Loan-to-value (CLTV),Number of Borrowers,Original Debt to Income Ratio,Borrower Credit Score at Origination,Number of Units,Zip Code Short,...,Property State_VA,Property State_VT,Property State_WA,Property State_WI,Property State_WV,Property State_WY,Product Type_FRM,Relocation Mortgage Indicator_N,Relocation Mortgage Indicator_Y,Foreclosure
2,5,256000,360,90,90,2,41.0,728.0,1,286,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1
3,5,248000,360,90,90,2,40.0,730.0,1,600,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1
6,5,264000,360,95,95,2,45.0,710.0,1,773,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1
8,5,174000,360,97,97,2,37.0,747.0,1,479,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0
9,5,204000,360,95,95,2,42.0,717.0,1,844,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46281,5,209000,360,95,95,2,33.0,753.0,1,180,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0
46287,5,109000,360,96,96,2,35.0,807.0,1,338,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0
46289,5,105000,360,97,97,2,47.0,683.0,1,74,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0
46294,6,228000,360,97,101,2,47.0,752.0,1,891,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1


## Now we can start to split the data into training and test models

In [12]:
# Split our preprocessed data into our features and target arrays
y = mortgage_merged["Foreclosure"].values
X = mortgage_merged.drop(["Foreclosure"],1).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [13]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

### why we chose logical regression?
Logistic regression is used to describe data and to explain the relationship between one dependent binary variable and one or more nominal, ordinal, interval or ratio-level independent variables.

We went with Logical regression because of the limited size of the data set available for us to use. Other neural Network (Such as Deep Forest) was not permorming with enough accuracy or (SVM) was getting stuck when trying to fit the data into the model

In [14]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)

In [15]:
classifier.fit(X_train_scaled, y_train)

LogisticRegression(max_iter=200, random_state=1)

In [16]:
y_pred = classifier.predict(X_test_scaled)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,0,1
9,0,0


In [17]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.7292239955971381


### Model the Data

We initially were using higher number of nodes which was giving us a high LOSS value at over 1.3
1817/1817 - 0s - loss: 1.3006 - acc: 0.6258
Loss: 1.3005778135410953, Accuracy: 0.6257567405700684

So we reduce the number of nodes to better scale with the data.
hidden_nodes_layer1 = 24
hidden_nodes_layer2 = 8
to
hidden_nodes_layer1 = 8
hidden_nodes_layer2 = 3
and saw  significant improvement on accuracy and loss. ill work through it this week, and will see if i can get a piepline going.

1817/1817 - 0s - loss: 0.6634 - acc: 0.6868
Loss: 0.663360848850645, Accuracy: 0.6868464350700378

In [18]:
# Define the model - deep neural net
number_input_features = len(X_train[0])
#hidden_nodes_layer1 = 24
#hidden_nodes_layer2 = 8
hidden_nodes_layer1 = 8
hidden_nodes_layer2 = 3

activ_func = "relu"

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation=activ_func)
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation=activ_func))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 8)                 1120      
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 27        
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 4         
Total params: 1,151
Trainable params: 1,151
Non-trainable params: 0
_________________________________________________________________


In [19]:
# Define the checkpoint path and filenames
os.makedirs("checkpoints/",exist_ok=True)
checkpoint_path = "checkpoints/cp.{acc:.4f}-{epoch:02d}.hdf5"

In [20]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="nadam", metrics=["accuracy"])

# Create a callback that saves the model's weights every 5 epochs
cp_callback = ModelCheckpoint(
    filepath=checkpoint_path,
    monitor='accuracy',
    verbose=1,
    #save_best_only=True,
    save_weights_only=False,
    save_freq='epoch')

# Train the model
fit_model = nn.fit(X_train_scaled,y_train,epochs=100,callbacks=[cp_callback])

# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 1/100
Epoch 00001: saving model to checkpoints/cp.0.6535-01.hdf5
Epoch 2/100
Epoch 00002: saving model to checkpoints/cp.0.7234-02.hdf5
Epoch 3/100
Epoch 00003: saving model to checkpoints/cp.0.7296-03.hdf5
Epoch 4/100
Epoch 00004: saving model to checkpoints/cp.0.7309-04.hdf5
Epoch 5/100
Epoch 00005: saving model to checkpoints/cp.0.7320-05.hdf5
Epoch 6/100
Epoch 00006: saving model to checkpoints/cp.0.7342-06.hdf5
Epoch 7/100
Epoch 00007: saving model to checkpoints/cp.0.7384-07.hdf5
Epoch 8/100
Epoch 00008: saving model to checkpoints/cp.0.7383-08.hdf5
Epoch 9/100
Epoch 00009: saving model to checkpoints/cp.0.7408-09.hdf5
Epoch 10/100
Epoch 00010: saving model to checkpoints/cp.0.7416-10.hdf5
Epoch 11/100
Epoch 00011: saving model to checkpoints/cp.0.7441-11.hdf5
Epoch 12/100
Epoch 00012: saving model to checkpoints/cp.0.7445-12.hdf5
Epoch 13/100
Epoch 00013: saving model to checkpoin

Epoch 00033: saving model to checkpoints/cp.0.7630-33.hdf5
Epoch 34/100
Epoch 00034: saving model to checkpoints/cp.0.7641-34.hdf5
Epoch 35/100
Epoch 00035: saving model to checkpoints/cp.0.7647-35.hdf5
Epoch 36/100
Epoch 00036: saving model to checkpoints/cp.0.7667-36.hdf5
Epoch 37/100
Epoch 00037: saving model to checkpoints/cp.0.7652-37.hdf5
Epoch 38/100
Epoch 00038: saving model to checkpoints/cp.0.7656-38.hdf5
Epoch 39/100
Epoch 00039: saving model to checkpoints/cp.0.7673-39.hdf5
Epoch 40/100
Epoch 00040: saving model to checkpoints/cp.0.7673-40.hdf5
Epoch 41/100
Epoch 00041: saving model to checkpoints/cp.0.7671-41.hdf5
Epoch 42/100
Epoch 00042: saving model to checkpoints/cp.0.7678-42.hdf5
Epoch 43/100
Epoch 00043: saving model to checkpoints/cp.0.7684-43.hdf5
Epoch 44/100
Epoch 00044: saving model to checkpoints/cp.0.7693-44.hdf5
Epoch 45/100
Epoch 00045: saving model to checkpoints/cp.0.7693-45.hdf5
Epoch 46/100
Epoch 00046: saving model to checkpoints/cp.0.7685-46.hdf5
Epoch

Epoch 67/100
Epoch 00067: saving model to checkpoints/cp.0.7761-67.hdf5
Epoch 68/100
Epoch 00068: saving model to checkpoints/cp.0.7773-68.hdf5
Epoch 69/100
Epoch 00069: saving model to checkpoints/cp.0.7772-69.hdf5
Epoch 70/100
Epoch 00070: saving model to checkpoints/cp.0.7773-70.hdf5
Epoch 71/100
Epoch 00071: saving model to checkpoints/cp.0.7777-71.hdf5
Epoch 72/100
Epoch 00072: saving model to checkpoints/cp.0.7777-72.hdf5
Epoch 73/100
Epoch 00073: saving model to checkpoints/cp.0.7783-73.hdf5
Epoch 74/100
Epoch 00074: saving model to checkpoints/cp.0.7781-74.hdf5
Epoch 75/100
Epoch 00075: saving model to checkpoints/cp.0.7805-75.hdf5
Epoch 76/100
Epoch 00076: saving model to checkpoints/cp.0.7773-76.hdf5
Epoch 77/100
Epoch 00077: saving model to checkpoints/cp.0.7785-77.hdf5
Epoch 78/100
Epoch 00078: saving model to checkpoints/cp.0.7788-78.hdf5
Epoch 79/100
Epoch 00079: saving model to checkpoints/cp.0.7781-79.hdf5
Epoch 80/100
Epoch 00080: saving model to checkpoints/cp.0.7790-

1817/1817 - 0s - loss: 0.7644 - acc: 0.7023
Loss: 0.7643859866692854, Accuracy: 0.702256441116333


Just commited this, my first draft at a neural network. Does some real rough encoding of categories, scaling, etc.
Looks like it's being overfitted or something atm, need to figure that out.

In [21]:
X_train_scaled

array([[-2.40084126, -0.28255216,  0.10832533, ...,  0.        ,
         0.08041092, -0.08041092],
       [-0.21942312, -0.37415897,  0.10832533, ...,  0.        ,
         0.08041092, -0.08041092],
       [-0.21942312,  0.80007377,  0.10832533, ...,  0.        ,
         0.08041092, -0.08041092],
       ...,
       [-0.21942312,  0.44197442,  0.10832533, ...,  0.        ,
         0.08041092, -0.08041092],
       [-0.21942312, -1.09868555,  0.10832533, ...,  0.        ,
         0.08041092, -0.08041092],
       [-0.21942312,  0.30872815,  0.10832533, ...,  0.        ,
         0.08041092, -0.08041092]])

In [22]:
y_score = classifier.decision_function(X_test)


In [23]:
from sklearn.metrics import average_precision_score
average_precision = average_precision_score(y_test, y_score)

print('Average precision-recall score: {0:0.2f}'.format(
      average_precision))


Average precision-recall score: 0.28


In [24]:
from sklearn.ensemble import RandomForestClassifier

# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test_scaled)
print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Random forest predictive accuracy: 0.728


In [25]:
from sklearn.svm import SVC
# Create the SVM model
svm = SVC(kernel='linear')
# Train the model
svm.fit(X_train, y_train)
# Evaluate the model
y_pred = svm.predict(X_test_scaled)
print(f" SVM model accuracy: {accuracy_score(y_test,y_pred):.3f}")

 SVM model accuracy: 0.269
