# Target variable creation && Regression modelling

## Libraries

In [1]:
#!pip show scikit-learn
#!pip uninstall scikit-learn==1.0.2

In [2]:
#!pip install scikit-learn==1.3.1
#!pip show scikit-learn

In [3]:
#pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [4]:
#!pip install xgboost

Collecting xgboost
  Using cached xgboost-2.0.0-py3-none-win_amd64.whl (99.7 MB)
Installing collected packages: xgboost
Successfully installed xgboost-2.0.0


In [5]:
#%pip install xgboost

In [33]:
import sklearn
print(sklearn.__version__)


1.0.2


In [6]:
import pandas as pd
from pandas.api.types import is_numeric_dtype
import numpy as np

# plotting 
import seaborn as sns
import matplotlib.pyplot as plt


# Preprocessing
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, FunctionTransformer, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV

# Feature selection
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
# Models
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier


from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Scores
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, fbeta_score

# Settings & warning handling
import warnings
pd.options.display.max_columns
warnings.filterwarnings("always")
warnings.filterwarnings("ignore")


#--------------------------------------
import pickle

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA

In [7]:
import sklearn
print(sklearn.__version__)

1.0.2


### Loading Data

In [8]:
df = pd.read_csv("../Datasets/X_full.csv", index_col=0)
raw_df = pd.read_csv("../Datasets/LoanExport.csv", index_col=0)

In [9]:
df.head()

Unnamed: 0,CreditScore,FirstTimeHomebuyer,MSA,MIP,Units,Occupancy,OCLTV,DTI,OrigUPB,LTV,...,FirstPayment_Year,FirstPayment_Month,Maturity_Year,Maturity_Month,LTV_range,Credit_range,YearsInRepayment,Repay_range,IsFirstTimeHomebuyer,Duration
0,711.841009,0,16974,25,1,0,89,27.0,117000,89.0,...,1999,2,2029,1,0,0,4.333333,2,0,30
1,711.841009,0,19740,0,1,0,73,17.0,109000,73.0,...,1999,2,2029,1,0,0,12.0,4,0,30
2,711.841009,0,29940,0,1,0,75,16.0,88000,75.0,...,1999,2,2029,1,0,0,5.583333,2,0,30
3,711.841009,0,31084,0,1,0,76,14.0,160000,76.0,...,1999,2,2029,1,0,0,2.916667,1,0,30
4,711.841009,0,35644,0,1,0,78,18.0,109000,78.0,...,1999,2,2029,1,0,0,4.5,2,0,30


In [10]:
raw_df.head()

Unnamed: 0_level_0,FirstPaymentDate,FirstTimeHomebuyer,MaturityDate,MSA,MIP,Units,Occupancy,OCLTV,DTI,OrigUPB,...,PostalCode,LoanSeqNum,LoanPurpose,OrigLoanTerm,NumBorrowers,SellerName,ServicerName,EverDelinquent,MonthsDelinquent,MonthsInRepayment
CreditScore,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,199902,N,202901,16974,25,1,O,89,27,117000,...,60400,F199Q1268030,P,360,2,FL,WASHINGTONMUTUALBANK,0,0,52
0,199902,N,202901,19740,0,1,O,73,17,109000,...,80200,F199Q1015092,N,360,1,FT,CHASEHOMEFINANCELLC,0,0,144
0,199902,N,202901,29940,0,1,O,75,16,88000,...,66000,F199Q1266886,N,360,2,FL,WASHINGTONMUTUALBANK,0,0,67
0,199902,N,202901,31084,0,1,O,76,14,160000,...,90700,F199Q1178167,N,360,2,GM,GMACMTGECORP,0,0,35
0,199902,N,202901,35644,0,1,O,78,18,109000,...,7600,F199Q1178517,N,360,2,GM,GMACMTGECORP,0,0,54


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 291449 entries, 0 to 291450
Data columns (total 32 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   CreditScore           291449 non-null  float64
 1   FirstTimeHomebuyer    291449 non-null  int64  
 2   MSA                   291449 non-null  int64  
 3   MIP                   291449 non-null  int64  
 4   Units                 291449 non-null  int64  
 5   Occupancy             291449 non-null  int64  
 6   OCLTV                 291449 non-null  int64  
 7   DTI                   291449 non-null  float64
 8   OrigUPB               291449 non-null  int64  
 9   LTV                   291449 non-null  float64
 10  OrigInterestRate      291449 non-null  float64
 11  Channel               291449 non-null  int64  
 12  PPM                   291449 non-null  int64  
 13  PropertyState         291449 non-null  int64  
 14  PropertyType          291449 non-null  int64  
 15  

# Creating target Variable

### Equated Monthly Installment - EMI

EMI for short - is the amount payable every month to the bank or any other financial institution until the loan amount is fully paid off. It consists of the interest on loan as well as part of the principal amount to be repaid.



**Here the EMI formula:** 
<center>
  <img src="https://github-production-user-asset-6210df.s3.amazonaws.com/89405673/263004315-582c9aab-6129-4372-8939-edc73b75119b.jpeg" width="300" height="120"/>
</center>


Where:  
P: Principal loan amount (= OrigUPB)  
n: Loan tenure in months (= OrigLoanTerm)  
r: Interest rate per month (= OrigInterestRate)  
DEBT: EMI (so income = EMI / DTI)  

In [12]:
# Let's use X**(-n) instead of X**(n) because n values are really big for X**(n) operations
def calculate_EMI(P, r, n):
    return P * r * (1 / (1 - (1 + r) ** (-n)))

df['EMI'] = np.vectorize(calculate_EMI)(df["OrigUPB"], df["OrigInterestRate"] , df["OrigLoanTerm"])

# first 5 rows of EMI column
print(df["EMI"].head())

0     789750.0
1     708500.0
2     605000.0
3    1100000.0
4     776625.0
Name: EMI, dtype: float64


### Total Payment and Interest Amount
We can calculate total payment by multiplying the EMI with the loan tenure and interest amount by simply subtracting total payment and principal amount.


In [13]:
df["totalPayment"]    = df["EMI"] * df["OrigLoanTerm"]
df["interestAmount"]  = df["totalPayment"] - df["OrigUPB"]
df["monthlyInterest"] = df["interestAmount"] / df["OrigLoanTerm"]


print(df[["EMI", "OrigLoanTerm", "OrigUPB", "interestAmount", "monthlyInterest"]].head())

         EMI  OrigLoanTerm  OrigUPB  interestAmount  monthlyInterest
0   789750.0           360   117000     284193000.0     7.894250e+05
1   708500.0           360   109000     254951000.0     7.081972e+05
2   605000.0           360    88000     217712000.0     6.047556e+05
3  1100000.0           360   160000     395840000.0     1.099556e+06
4   776625.0           360   109000     279476000.0     7.763222e+05


### Current principal
The principal amount remaining depends on the months in repayment, the more months you have paid the less will be the left over principal. We can find out the current principal for each person with the help of monthly interest rate, actual principal, EMI they pay per month and months in repayment.

In [14]:
def principal(r, amount, emi, month):
    for i in range(month):
        interest = r * amount
        p = emi - interest
        amount -= p
    return amount

df["currentPrincipal"] = np.vectorize(principal)(df["OrigInterestRate"], df["OrigUPB"], df["EMI"], df["MonthsInRepayment"])
print(df["currentPrincipal"].head())

0    117000.0
1    109000.0
2     88000.0
3    160000.0
4    109000.0
Name: currentPrincipal, dtype: float64


# Monthly Income
We are give DTI which is the ratio of debt to income, and because we know the monthly debt i.e. the monthly EMI we can calculate the monthly income by swapping it with DTI .

In [15]:
def compute_monthly_income(dti, emi):
    if dti == 0:
        return 0.01
    return  emi / dti

df["monthlyIncome"] = np.vectorize(compute_monthly_income)(df["DTI"], df["EMI"])
print(df["monthlyIncome"].head())

0    29250.000000
1    41676.470588
2    37812.500000
3    78571.428571
4    43145.833333
Name: monthlyIncome, dtype: float64


### prePayment

For having a simple model, we are assuming that a person is going to save up money based on their DTI for 2 years and pre-pay the residual amount after subtracting it with the EMI of those 2 years. We have also assumed that a person with DTI less than 40 is going to save up 50% of his income while a person with DTI more than 40% is going to save up 75% of his income and these savings are inclusive of EMI, so for example if a person has DTI of 25% and he is going to pre-pay 50-25=25% of his income.

In [16]:
def compute_pre_payment(dti, income):
    if dti < 40:
        return income/2
    else:
        return income * (3/4)

df["prePayment"] = np.vectorize(compute_pre_payment)(df["DTI"], df["monthlyIncome"] * 24)

M = max(df["prePayment"])
m = min(df["prePayment"])
print(f"max : {M}")
print(f"min : {m}")

max : 22680000.0
min : 21710.526315789473


In [17]:
df["SPP"] = (df["EMI"]-df["monthlyInterest"]) * df["OrigLoanTerm"]
df["APP"] = (df["EMI"]-df["monthlyInterest"]) * df["MonthsInRepayment"] + df["prePayment"]
df["OPB"] = df["currentPrincipal"]

df["PPR"] = abs(df["SPP"] - df["APP"]) / df["OPB"]

features = ["SPP", "APP", "OPB", "PRR"]
M = max(df["PPR"])
m = min(df["PPR"])

print(f"max : {M}")
print(f"min : {m}")

max : 98.37500000000003
min : 0.6443404837973886


In [18]:
"""plt.figure(figsize=(12, 4))

# Kernel Density Plot - KDE
sns.kdeplot(data=df['PPR'], color='blue', fill=True)
plt.xlabel('PPR (Prepayment Risk Ratio)')
plt.ylabel('Density')
plt.title('Distribution of PPR KDEplot')
plt.show()"""

"plt.figure(figsize=(12, 4))\n\n# Kernel Density Plot - KDE\nsns.kdeplot(data=df['PPR'], color='blue', fill=True)\nplt.xlabel('PPR (Prepayment Risk Ratio)')\nplt.ylabel('Density')\nplt.title('Distribution of PPR KDEplot')\nplt.show()"

# Preprocessing step

In [19]:
data = df.copy()

In [20]:
X = data[[  'FirstPayment_Month', 'IsFirstTimeHomebuyer', 'Maturity_Month', 'MIP', 'Occupancy',
            'DTI', 'OrigUPB', 'OrigInterestRate', 'Channel', 'PPM',
            'PropertyState', 'PropertyType', 'LoanPurpose',
            'OrigLoanTerm', 'NumBorrowers', 'MonthsDelinquent',
            'Credit_range', 'LTV_range', 'Repay_range'
            ]]
y = data["PPR"]

In [21]:
cat_cols = X.select_dtypes("object").columns
num_cols = X.select_dtypes(exclude="object").columns
target = 'PPR'


# pipelines categorical variables
categorical_pipeline = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="constant")),
        ("one_hot_encode", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# pipeline of num variables
numeric_pipeline = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="mean"))
    ]
)

# preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ("numeric", numeric_pipeline, num_cols),
        ("categorical", categorical_pipeline, cat_cols),
    ]
)

In [22]:
"""from sklearn.feature_selection import SelectKBest, SelectPercentile, mutual_info_regression

def make_mi_score_and_plot(X, y):
    mi_scores = mutual_info_regression(X, y)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)

    # ploting the scores for each feature

    plt.figure(dpi=100, figsize=(10, 6))
    mi_scores = mi_scores.sort_values(ascending=True)
    width = np.arange(len(mi_scores))
    ticks = list(mi_scores.index)
    plt.barh(width, mi_scores)
    plt.yticks(width, ticks)
    plt.title("Mutual Information Scores")
    return mi_scores


mi_scores = make_mi_score_and_plot(X, y)"""

'from sklearn.feature_selection import SelectKBest, SelectPercentile, mutual_info_regression\n\ndef make_mi_score_and_plot(X, y):\n    mi_scores = mutual_info_regression(X, y)\n    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)\n    mi_scores = mi_scores.sort_values(ascending=False)\n\n    # ploting the scores for each feature\n\n    plt.figure(dpi=100, figsize=(10, 6))\n    mi_scores = mi_scores.sort_values(ascending=True)\n    width = np.arange(len(mi_scores))\n    ticks = list(mi_scores.index)\n    plt.barh(width, mi_scores)\n    plt.yticks(width, ticks)\n    plt.title("Mutual Information Scores")\n    return mi_scores\n\n\nmi_scores = make_mi_score_and_plot(X, y)'

In [23]:
"""from sklearn.decomposition import PCA

# new dimensionality
n_components = 15

# Create principal components
pca = PCA(n_components=n_components) # n_components = n_features by default
X_pca = pca.fit_transform(X)

# Convert to dataframe
pca_names = [f"PC{i+1}" for i in range(X_pca.shape[1])]
X_pca = pd.DataFrame(X_pca, columns=pca_names)

X_pca_mi_score = make_mi_score_and_plot(X_pca, y)"""

'from sklearn.decomposition import PCA\n\n# new dimensionality\nn_components = 15\n\n# Create principal components\npca = PCA(n_components=n_components) # n_components = n_features by default\nX_pca = pca.fit_transform(X)\n\n# Convert to dataframe\npca_names = [f"PC{i+1}" for i in range(X_pca.shape[1])]\nX_pca = pd.DataFrame(X_pca, columns=pca_names)\n\nX_pca_mi_score = make_mi_score_and_plot(X_pca, y)'

In [24]:
"""loadings = pd.DataFrame(
    pca.components_.T,
    columns=pca_names,
    index=X.columns,
)
loadings"""

'loadings = pd.DataFrame(\n    pca.components_.T,\n    columns=pca_names,\n    index=X.columns,\n)\nloadings'

In [25]:
# splitting data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
DT_model = DecisionTreeRegressor(criterion='squared_error')
DT_model.fit(X_train, y_train)
DT_pred = DT_model.predict(X_test)

DT_MSE = mean_squared_error(y_test, DT_pred)
DT_R2 = r2_score(y_test, DT_pred)

In [27]:
print(f"MSE ------>: {DT_MSE}")
print(f"R2 score ------->: {DT_R2}")

MSE ------>: 0.0034534515813850713
R2 score ------->: 0.9992920320122389


In [28]:
XGB_model = XGBRegressor( max_depth=7, n_estimators=300, random_state=40 )
XGB_model.fit(X_train, y_train)
XGB_pred = XGB_model.predict(X_test)

XGB_MSE = mean_squared_error(y_test, XGB_pred)
XGB_R2 = r2_score(y_test, XGB_pred)

In [29]:
print(f"MSE ------>: {XGB_MSE}")
print(f"R2 score ------->: {XGB_R2}")

MSE ------>: 0.0013700007955260998
R2 score ------->: 0.9997191457058011


# Main Pipeline with XGBoost

In [30]:
# now let's build our main pipeline with XGBoost
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("scaler", StandardScaler()),
        ("pca", PCA(n_components=15)),
        ("model", XGBRegressor()),
    ]
)


In [31]:
pipeline.fit(X_train, y_train)
accuracy = pipeline.score(X_test, y_test)

print("Accuracy of the XGBoost model:", accuracy)

Accuracy of the XGBoost model: 0.6410161885261072


# Saving the model to deploy

In [32]:
pickle.dump(pipeline, open("XGBreg_pipeline.pkl", "wb"))