In [1]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import Ridge
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
import pickle
import joblib
warnings.filterwarnings("ignore")

In [2]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [3]:
data=pd.read_csv('/content/drive/MyDrive/LoanExport.csv')

In [4]:
data.head()

Unnamed: 0,CreditScore,FirstPaymentDate,FirstTimeHomebuyer,MaturityDate,MSA,MIP,Units,Occupancy,OCLTV,DTI,...,PostalCode,LoanSeqNum,LoanPurpose,OrigLoanTerm,NumBorrowers,SellerName,ServicerName,EverDelinquent,MonthsDelinquent,MonthsInRepayment
0,0,199902,N,202901,16974,25,1,O,89,27,...,60400,F199Q1268030,P,360,2,FL,WASHINGTONMUTUALBANK,0,0,52
1,0,199902,N,202901,19740,0,1,O,73,17,...,80200,F199Q1015092,N,360,1,FT,CHASEHOMEFINANCELLC,0,0,144
2,0,199902,N,202901,29940,0,1,O,75,16,...,66000,F199Q1266886,N,360,2,FL,WASHINGTONMUTUALBANK,0,0,67
3,0,199902,N,202901,31084,0,1,O,76,14,...,90700,F199Q1178167,N,360,2,GM,GMACMTGECORP,0,0,35
4,0,199902,N,202901,35644,0,1,O,78,18,...,7600,F199Q1178517,N,360,2,GM,GMACMTGECORP,0,0,54


In [5]:
data['NumBorrowers'] = pd.to_numeric(data['NumBorrowers'], errors='coerce')
data['MSA'] = pd.to_numeric(data['MSA'], errors='coerce')

In [6]:
def replace_X_with_NAN(df):
    columns = data.select_dtypes(include=['object']).columns
    for column in columns:
        data.loc[data[column].str.strip() == 'X', column] = np.nan
    return df
data = replace_X_with_NAN(data)

In [7]:
data.isnull().sum()

CreditScore               0
FirstPaymentDate          0
FirstTimeHomebuyer    78015
MaturityDate              0
MSA                   39100
MIP                       0
Units                     0
Occupancy                 0
OCLTV                     0
DTI                       0
OrigUPB                   0
LTV                       0
OrigInterestRate          0
Channel                   0
PPM                    5405
ProductType               0
PropertyState             0
PropertyType             22
PostalCode                6
LoanSeqNum                0
LoanPurpose               0
OrigLoanTerm              0
NumBorrowers            339
SellerName            24994
ServicerName              0
EverDelinquent            0
MonthsDelinquent          0
MonthsInRepayment         0
dtype: int64

In [8]:
mode_value_SellerName = data['SellerName'].mode()[0]
mode_value_PropertyType = data['PropertyType'].mode()[0]
mode_value_MSA = data['MSA'].mode()[0]
mode_value_PostalCode = data['PostalCode'].mode()[0]
mode_value_PPM = data['PPM'].mode()[0]
mode_value_NumBorrowers = data['NumBorrowers'].mode()[0]

In [9]:
data['FirstTimeHomebuyer'].fillna('Y', inplace=True)
data['SellerName'].fillna(mode_value_SellerName, inplace=True)
data['PropertyType'].fillna(mode_value_PropertyType, inplace=True)
data['MSA'].fillna(mode_value_MSA, inplace=True)
data['PostalCode'].fillna(mode_value_PostalCode, inplace=True)
data['PPM'].fillna(mode_value_PPM, inplace=True)
data['NumBorrowers'].fillna(mode_value_NumBorrowers, inplace=True)

In [10]:
data.isnull().sum()

CreditScore           0
FirstPaymentDate      0
FirstTimeHomebuyer    0
MaturityDate          0
MSA                   0
MIP                   0
Units                 0
Occupancy             0
OCLTV                 0
DTI                   0
OrigUPB               0
LTV                   0
OrigInterestRate      0
Channel               0
PPM                   0
ProductType           0
PropertyState         0
PropertyType          0
PostalCode            0
LoanSeqNum            0
LoanPurpose           0
OrigLoanTerm          0
NumBorrowers          0
SellerName            0
ServicerName          0
EverDelinquent        0
MonthsDelinquent      0
MonthsInRepayment     0
dtype: int64

In [11]:
data.loc[(data['CreditScore'] < 300) | (data['CreditScore'] > 850),'CreditScore']=709

In [12]:
data.loc[data['MIP'] > 55, 'MIP']=data['MIP'].mean()

In [13]:
data.loc[data['Units']==0,'Units']=data['Units'].mode()[0]

In [14]:
data.loc[(data['DTI'] < 1) | (data['DTI'] > 65),'DTI'] = 30

In [15]:
data.loc[(data['LTV'] < 1) | (data['LTV'] > 998),'LTV'] = 77

In [16]:
data.loc[data['PropertyType']=='LH','PropertyType']='MH'

In [17]:
def credit_transformation(x):
    if x >= 0 and x <= 650:
        return 'Poor'

    elif x >= 651 and x <= 700:
        return 'Fair'

    elif x >= 700 and x <= 750:
        return 'Good'

    else:
        return 'Excellent'


data['credit_bins'] = data['CreditScore'].apply(credit_transformation)

In [18]:
def LTV_transformation(x):

    if x >= 0 and x <= 25:
        return 'Low'

    elif x >= 25 and x <= 50:
        return 'Medium'

    else:
        return 'High'

data['LTV_bins'] = data['LTV'].apply(LTV_transformation)

In [19]:
def MonthsInRepayment_transformation(x):
    z = x/12
    if z >= 0 and z <= 4:
        return '0 - 4 yrs'

    elif z > 4 and z <= 8:
        return '4 - 8 yrs'

    elif z > 8 and z <= 12:
        return '8 - 12 yrs'

    elif z > 12 and z <= 16:
        return '12 - 16 yrs'

    else:
        return '16 - 20 yrs'

data['MonthsInRepayment_bins'] = data['MonthsInRepayment'].apply(MonthsInRepayment_transformation)

In [20]:
data['IsFirstTime'] = data['FirstTimeHomebuyer'].replace({'Y': 'yes', 'N': 'no'})

In [21]:
data.IsFirstTime.value_counts()

no     184154
yes    107297
Name: IsFirstTime, dtype: int64

In [22]:
def calculate_monthly_payment(orig_upb, orig_loan_term, orig_interest_rate):
    monthly_interest_rate = orig_interest_rate / 12 / 100
    monthly_payment = orig_upb * (monthly_interest_rate * (1 + monthly_interest_rate) ** orig_loan_term) / ((1 + monthly_interest_rate) ** orig_loan_term - 1)

    return monthly_payment

In [23]:
data['monthly_payment'] = calculate_monthly_payment(data['OrigUPB'], data['OrigLoanTerm'], data['OrigInterestRate'])

In [24]:
def calculate_monthly_income(row):
    dti = row['DTI']
    emi = row['monthly_payment']
    if dti != 0:
      monthly_income = emi / dti
    else:
      return None

    return monthly_income
data['monthly_income'] = data.apply(calculate_monthly_income, axis=1)

In [25]:
data['monthly_rate'] = data['OrigInterestRate'] / 12 / 100

In [26]:
data['Total_payment'] = data['monthly_payment'] * data['OrigLoanTerm']
data['interest_amount'] = data['Total_payment'] - data['OrigUPB']

In [27]:
def principle(monthly_rate, amount, emi, month):
  for i in range(month):
    interest = monthly_rate * amount
    p = emi - interest
    amount -= p
    return amount

In [28]:
data['cur_principle'] = np.vectorize(principle)(data['monthly_rate'],data['OrigUPB'],data['monthly_payment'],data['MonthsInRepayment'])

In [29]:
def prepay(dti, income):
  if(dti < 40):
    p = income / 2
  else:
    p = income*3/4

  return p

In [30]:
data['prepayment'] = np.vectorize(prepay)(data['DTI'],data['monthly_income']*24)
data['prepayment'] = data['prepayment'] - (data['monthly_payment']*24)

In [31]:
data['ScheduledPrincipalPayment'] = data['OrigUPB'] * (data['OrigInterestRate'] / 12 / 100) * (1 - (1 + (data['OrigInterestRate'] / 12 / 100)) ** -data['OrigLoanTerm'])
data

Unnamed: 0,CreditScore,FirstPaymentDate,FirstTimeHomebuyer,MaturityDate,MSA,MIP,Units,Occupancy,OCLTV,DTI,...,MonthsInRepayment_bins,IsFirstTime,monthly_payment,monthly_income,monthly_rate,Total_payment,interest_amount,cur_principle,prepayment,ScheduledPrincipalPayment
0,709,199902,N,202901,16974.0,25.0,1,O,89,27,...,4 - 8 yrs,no,758.859773,28.105918,0.005625,273189.518275,156189.518275,116899.265227,-17875.363541,570.762255
1,709,199902,N,202901,19740.0,0.0,1,O,73,17,...,8 - 12 yrs,no,688.954146,40.526714,0.005417,248023.492419,139023.492419,108901.462521,-16048.578921,505.972484
2,709,199902,N,202901,29940.0,0.0,1,O,75,16,...,4 - 8 yrs,no,578.097356,36.131085,0.005729,208115.048172,120115.048172,87926.069311,-13440.763528,439.690694
3,709,199902,N,202901,31084.0,0.0,1,O,76,14,...,0 - 4 yrs,no,1051.086102,75.077579,0.005729,378390.996677,218390.996677,159865.580565,-24325.135501,799.437626
4,709,199902,N,202901,35644.0,0.0,1,O,78,18,...,4 - 8 yrs,no,734.353190,40.797399,0.005938,264367.148543,155367.148543,108912.834310,-17134.907776,570.368136
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
291446,839,199903,N,202902,37964.0,0.0,1,O,60,32,...,0 - 4 yrs,no,598.772246,18.711633,0.005833,215558.008438,125558.008438,89926.227754,-14145.994304,460.316927
291447,840,200210,N,202904,35644.0,0.0,1,O,73,30,...,4 - 8 yrs,no,1739.309232,57.976974,0.005313,554839.644990,287839.644990,266679.128268,-41047.697874,1156.760917
291448,840,200304,N,202904,42044.0,0.0,1,O,73,31,...,0 - 4 yrs,no,530.567053,17.115066,0.004687,166067.487620,79067.487620,86877.245447,-12528.228480,313.459033
291449,845,199904,N,202903,37964.0,0.0,1,I,80,54,...,0 - 4 yrs,no,585.466196,10.841967,0.005833,210767.830473,122767.830473,87927.867138,-13856.033300,450.087662


In [32]:
features = data.drop(columns=['EverDelinquent'])
target = data['EverDelinquent']

In [33]:
data['Actual_Principal_Payment'] = data['Total_payment'] - data['interest_amount']

In [34]:
prepayment_risk = []

for i in range(len(data['ScheduledPrincipalPayment'])):
    ppr = (data['ScheduledPrincipalPayment'][i] - data['Actual_Principal_Payment'][i]) / data['cur_principle'][i]
    prepayment_risk.append(ppr)


# for i, ppr in enumerate(prepayment_risk):
#     print(f"Period {i + 1}: PPR = {ppr:.2%}")

In [35]:
data['ppr'] = ((data['ScheduledPrincipalPayment'] - data['Actual_Principal_Payment']) / data['cur_principle'])*100

In [36]:
data  = data.drop(['FirstPaymentDate','MaturityDate','OrigLoanTerm','PostalCode','PPM', 'LoanSeqNum', 'ProductType', 'Occupancy', 'FirstTimeHomebuyer','Channel', 'PropertyType', 'LoanPurpose','PropertyState'],axis=1)

In [37]:
encoder=LabelEncoder()
obj = ['NumBorrowers','LTV_bins','credit_bins','MonthsInRepayment_bins','IsFirstTime']
for i in obj:
  data[i]=encoder.fit_transform(data[i])

In [38]:
columns_to_encode = ['ServicerName', 'SellerName', 'MSA']
encoder = OneHotEncoder(sparse=False, drop='first')
encoded_features = encoder.fit_transform(data[columns_to_encode])
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(columns_to_encode))
data = pd.concat([data.drop(columns=columns_to_encode), encoded_df], axis=1)

In [39]:
features = data.drop(columns=['ppr','EverDelinquent','MonthsDelinquent'])

In [40]:
target = data['ppr']

In [41]:
features.columns

Index(['CreditScore', 'MIP', 'Units', 'OCLTV', 'DTI', 'OrigUPB', 'LTV',
       'OrigInterestRate', 'NumBorrowers', 'MonthsInRepayment',
       ...
       'MSA_48900.0', 'MSA_49020.0', 'MSA_49180.0', 'MSA_49340.0',
       'MSA_49420.0', 'MSA_49500.0', 'MSA_49620.0', 'MSA_49660.0',
       'MSA_49700.0', 'MSA_49740.0'],
      dtype='object', length=451)

In [42]:
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.4, random_state=100)

# **Linear Regression Model**

In [43]:
Steps = [('scaler',StandardScaler()),
         ('pca', PCA(n_components=30)),
         ('model',LinearRegression())]

In [44]:
pipelineModelLinear = Pipeline(Steps)
pipelineModelLinear.fit(x_train, y_train)

In [45]:
print('pipeline model score for training data is',pipelineModelLinear.score(x_train, y_train)*100)
print('pipeline model score for testing data is',pipelineModelLinear.score(x_test, y_test)*100)

pipeline model score for training data is 93.42516167542763
pipeline model score for testing data is 93.26155942054025


In [46]:
predicted_v = pipelineModelLinear.predict(x_test)
mse = mean_squared_error(y_test, predicted_v)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 9.692079368554288e-05


# **Ridge Regression Model**

In [47]:
Steps = [('scaler',StandardScaler()),
         ('pca', PCA(n_components=30)),
         ('model',Ridge(alpha=1.0))]

In [48]:
pipelineModelRidge = Pipeline(Steps)
pipelineModelRidge.fit(x_train, y_train)

In [49]:
print('pipeline model score for training data is',pipelineModelRidge.score(x_train, y_train)*100)
print('pipeline model score for testing data is',pipelineModelRidge.score(x_test, y_test)*100)

pipeline model score for training data is 93.27130603370777
pipeline model score for testing data is 93.16807738639419


In [50]:
predicted_v = pipelineModelRidge.predict(x_test)
mse = mean_squared_error(y_test, predicted_v)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 9.826537079324831e-05


# **Save Model**

In [51]:
joblib.dump(pipelineModelLinear,'pipelineModelLinear.pk1')
joblib.dump(pipelineModelRidge,'pipelineModelRidge.pk1')

['pipelineModelRidge.pk1']