<div style="text-align:right">Update date: Mar 30, 2024</div><br>

# Risk scoring<br>
## Production code<br>
### Objective<br>
Prepare the training and execution scripts for the production phase. These scripts project the expected loss (EL) of a bank loan, so that the risk can be assessed before the loan is granted to the client.<br><br>
### The general outline of this notebook is as follows:<br>

    1. Imports
    2. Load data
    3. Dataset structure
    4. Generate training and execution pipelines
    5. Evaluate risk scoring models
    6. Save training and execution pipelines
    
### Main work tools<br>

|Package|                           Version|
|:---------------------------------|--------:|
|matplotlib                        |3.8.0|
|numpy                             | 1.26.4|
|notebook                          |6.5.4|
|pandas                            | 2.2.1|
|python                            |3.12.2|
|pickle                            |4.0|
|sklearn                           |1.3.0|


### Imports

In [1]:
import os
import pickle

import numpy as np
import pandas as pd

from sklearn.base import clone

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Binarizer

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import roc_auc_score

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

%config IPCompleter.greedy = True

### Load data

In [149]:
# Read the raw loan file; the first CSV column becomes the row index.
df = pd.read_csv(filepath_or_buffer='../data/original/prestamos.csv', index_col=0)
print(f'{df.shape=}')
df.head(n=5)

df.shape=(200000, 24)


Unnamed: 0_level_0,empleo,antigüedad_empleo,ingresos,ingresos_verificados,rating,dti,vivienda,num_hipotecas,num_lineas_credito,porc_tarjetas_75p,...,id_prestamo,descripcion,finalidad,principal,tipo_interes,num_cuotas,imp_cuota,imp_amortizado,estado,imp_recuperado
id_cliente,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
137387967,Hvac technician,3 years,54000.0,Source Verified,A,19.31,MORTGAGE,2.0,10.0,33.3,...,,,debt_consolidation,15000.0,7.21,36 months,464.6,2669.06,Current,0.0
4798121,"Target Promotions and Marketing,Inc",10+ years,65000.0,Not Verified,D,25.4,RENT,1.0,15.0,85.7,...,,,debt_consolidation,10000.0,17.77,36 months,360.38,6362.96,Charged Off,0.0
46641215,Banker,5 years,135000.0,Verified,A,14.68,RENT,0.0,19.0,0.0,...,,,debt_consolidation,24000.0,6.39,36 months,734.38,24000.0,Fully Paid,0.0
87998444,executive director,9 years,188000.0,Source Verified,B,11.69,MORTGAGE,3.0,15.0,0.0,...,,,credit_card,27000.0,8.99,60 months,560.35,12443.0,Current,0.0
132883631,Subsea Technician,7 years,125000.0,Source Verified,B,9.0,MORTGAGE,1.0,6.0,33.3,...,,,debt_consolidation,22000.0,10.9,36 months,719.22,22000.0,Fully Paid,0.0


#### Select variables

In [150]:
# Columns kept for modelling, in the order the rest of the notebook expects.
final_variables = [
    # one-hot encoded downstream
    'ingresos_verificados', 'vivienda', 'finalidad', 'num_cuotas',
    # ordinal encoded downstream
    'antigüedad_empleo', 'rating',
    # min-max scaled downstream
    'ingresos', 'dti', 'num_lineas_credito', 'porc_uso_revolving',
    'principal', 'tipo_interes', 'imp_cuota',
    # binarized downstream
    'num_derogatorios',
    # only used to derive the PD / EAD / LGD targets
    'estado', 'imp_amortizado', 'imp_recuperado',
]

### Dataset structure

#### Delete records

In [151]:
# Drop exact duplicate rows, then discard every row whose index label belongs
# to a record reporting an income above 300,000.
df = df.drop_duplicates()
to_eliminate = df.index.values[(df['ingresos'] > 300_000).to_numpy()]
df = df.loc[~df.index.isin(to_eliminate)]

#### Select final variables

In [152]:
# Keep only the modelling columns, in the order given by `final_variables`.
df = df.loc[:, final_variables]

### Generate training and execution pipelines

#### Generate data processing functions

In [153]:
def data_quality(df):
    """Return a cleaned copy of ``df``; the input frame is left untouched.

    Cleaning steps:
    - missing ``antigüedad_empleo`` values become the label ``'unknown'``;
    - missing values in every numeric column become ``0``;
    - the ``vivienda`` labels ``'ANY'``, ``'NONE'`` and ``'OTHER'`` are merged
      into ``'MORTGAGE'``;
    - the ``finalidad`` labels listed in ``rare_purposes`` are merged into
      ``'others'``.

    Parameters:
    df (pandas.DataFrame): must contain the columns ``antigüedad_empleo``,
    ``vivienda`` and ``finalidad``.

    Returns:
    pandas.DataFrame: the cleaned copy.
    """
    temp = df.copy()
    temp['antigüedad_empleo'] = temp['antigüedad_empleo'].fillna('unknown')
    numeric_columns = temp.select_dtypes('number').columns
    temp[numeric_columns] = temp[numeric_columns].fillna(0)
    temp['vivienda'] = temp['vivienda'].replace(['ANY', 'NONE', 'OTHER'], 'MORTGAGE')
    # The original list only carried the misspelling 'reneweable_energy', which
    # cannot match the correctly spelled category. Both spellings are accepted
    # so the correctly spelled label is merged as well.
    rare_purposes = ['educational', 'reneweable_energy', 'renewable_energy', 'wedding']
    temp['finalidad'] = temp['finalidad'].replace(rare_purposes, 'others')

    return temp

def generate_variables_pd(df):
    """Split a loan frame into predictors and the PD (Probability of Default) target.

    The target is 1 when ``estado`` is one of the default states listed below
    and 0 otherwise. The columns ``estado``, ``imp_amortizado`` and
    ``imp_recuperado`` are removed from the predictors. The input frame is not
    modified.

    Returns:
    tuple: ``(predictors, target)`` where ``predictors`` is a DataFrame and
    ``target`` is the Series named ``target_pd``.
    """
    default_states = ['Charged Off', 'Does not meet the credit policy. Status:Charged Off', 'Default']
    target_sources = ['estado', 'imp_amortizado', 'imp_recuperado']

    labelled = df.assign(
        target_pd=lambda frame: np.where(frame['estado'].isin(default_states), 1, 0)
    ).drop(columns=target_sources)

    # The target was appended last, so it is the final column.
    predictors = labelled.iloc[:, :-1]
    target = labelled.iloc[:, -1]
    return predictors, target

def generate_variables_ead(df):
    """Split a loan frame into predictors and the EAD (Exposure at Default) target.

    The target is the share of the principal that has not been amortized:
    ``(principal - imp_amortizado) / principal``. The columns ``estado``,
    ``imp_amortizado`` and ``imp_recuperado`` are removed from the predictors.
    The input frame is not modified.

    Returns:
    tuple: ``(predictors, target)`` where ``predictors`` is a DataFrame and
    ``target`` is the Series named ``target_ead``.
    """
    helper_columns = ['estado', 'imp_amortizado', 'imp_recuperado', 'pendiente']

    labelled = (
        df.assign(pendiente=lambda frame: frame['principal'] - frame['imp_amortizado'])
          .assign(target_ead=lambda frame: frame['pendiente'] / frame['principal'])
          .drop(columns=helper_columns)
    )

    # The target was appended last, so it is the final column.
    predictors = labelled.iloc[:, :-1]
    target = labelled.iloc[:, -1]
    return predictors, target

def generate_variables_lgd(df):
    """Split a loan frame into predictors and the LGD (Loss Given Default) target.

    The target is ``1 - imp_recuperado / (principal - imp_amortizado)``, i.e.
    the share of the outstanding amount that was not recovered. When nothing
    is outstanding the ratio is undefined and the target is set to 0.

    The columns ``estado``, ``imp_amortizado`` and ``imp_recuperado`` are
    removed from the predictors. The input frame is not modified.

    Returns:
    tuple: ``(predictors, target)`` where ``predictors`` is a DataFrame and
    ``target`` is the Series named ``target_lgd``.
    """
    temp = df.copy()
    temp['pendiente'] = temp['principal'] - temp['imp_amortizado']
    temp['target_lgd'] = 1 - (temp.imp_recuperado / temp.pendiente)
    # A zero outstanding amount gives NaN (0/0) or +/-inf (x/0). fillna alone
    # only covers the NaN case, so infinities are mapped to NaN first; a
    # non-finite target would otherwise reach model fitting and the metrics.
    temp['target_lgd'] = temp['target_lgd'].replace([np.inf, -np.inf], np.nan).fillna(0)
    temp.drop(columns=['estado', 'imp_amortizado', 'imp_recuperado', 'pendiente'], inplace=True)

    # The target was appended last, so it is the final column.
    return temp.iloc[:, :-1], temp.iloc[:, -1]

#### Prepare training dataset for each model

In [154]:
# Clean the data once and reuse the result: the cleaning step gives the same
# frame for all three targets, and each generate_variables_* helper builds its
# output from a copy of its input, so sharing the cleaned frame is safe.
df_clean = data_quality(df)

x_pd, y_pd = generate_variables_pd(df_clean)
x_ead, y_ead = generate_variables_ead(df_clean)
x_lgd, y_lgd = generate_variables_lgd(df_clean)

#### Instantiate variable transformation

In [155]:
# --- One-hot encoding -------------------------------------------------------
# Categories not seen during fit are ignored at transform time
# (handle_unknown='ignore').
var_ohe = [
    'ingresos_verificados',
    'vivienda',
    'finalidad',
    'num_cuotas',
]
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# --- Ordinal encoding -------------------------------------------------------
# Explicit category order, one list per column of `var_oe`.
var_oe = ['antigüedad_empleo', 'rating']
order_length_employment = [
    'unknown',
    '< 1 year',
    '1 year',
    '2 years',
    '3 years',
    '4 years',
    '5 years',
    '6 years',
    '7 years',
    '8 years',
    '9 years',
    '10+ years',
]
order_rating = ['A', 'B', 'C', 'D', 'E', 'F', 'G']

# Unseen categories are encoded as 12, which lies outside the codes used for
# the known categories (0-11 for employment length, 0-6 for rating).
oe = OrdinalEncoder(
    categories=[order_length_employment, order_rating],
    handle_unknown='use_encoded_value',
    unknown_value=12,
)

# --- Binarization -----------------------------------------------------------
# Values above the threshold (0) become 1, the rest become 0.
var_bin = ['num_derogatorios']
binarizer = Binarizer(threshold=0)

# --- Min-max scaling --------------------------------------------------------
vars_mmx = [
    'ingresos',
    'dti',
    'num_lineas_credito',
    'porc_uso_revolving',
    'principal',
    'tipo_interes',
    'imp_cuota',
]
mms = MinMaxScaler()

#### Generate the preprocessing pipeline

##### Generate the column transformer

In [156]:
# (transformer, columns) pairs, applied side by side on their own columns.
column_steps = [
    (ohe, var_ohe),
    (oe, var_oe),
    (binarizer, var_bin),
    (mms, vars_mmx),
]

# Columns not listed in any pair are passed through unchanged.
ct = make_column_transformer(*column_steps, remainder='passthrough')

#### Instantiate the models

##### Instantiate the algorithms

In [157]:
# Fixed seed so that re-running the notebook yields the same fitted models.
# Without it the results vary between runs: the 'saga' solver shuffles the
# data, and the gradient boosting models draw a random validation split when
# early stopping is active (the default 'auto' setting turns it on for
# training sets with more than 10,000 rows).
RANDOM_STATE = 42

# PD: L1-penalized logistic regression.
model_pd = LogisticRegression(
    C=0.25,
    penalty='l1',
    solver='saga',
    n_jobs=-1,
    random_state=RANDOM_STATE
)

# EAD: gradient boosting regressor; `scoring` is the metric monitored for
# early stopping.
model_ead = HistGradientBoostingRegressor(
    l2_regularization=0,
    max_depth=10,
    max_iter=200,
    min_samples_leaf=100,
    scoring='neg_mean_absolute_percentage_error',
    learning_rate=0.1,
    random_state=RANDOM_STATE
)

# LGD: same family as the EAD model, with L2 regularization and deeper trees.
model_lgd = HistGradientBoostingRegressor(
    l2_regularization=1,
    max_depth=20,
    max_iter=200,
    min_samples_leaf=100,
    scoring='neg_mean_absolute_percentage_error',
    learning_rate=0.1,
    random_state=RANDOM_STATE
)

##### Generate the final training pipeline

In [158]:
# One pipeline per target: the column transformer followed by the matching
# estimator. All three are built around the same `ct` instance.
training_pipe_pd, training_pipe_ead, training_pipe_lgd = (
    make_pipeline(ct, estimator)
    for estimator in (model_pd, model_ead, model_lgd)
)

##### Generate the final execution pipeline

In [159]:
# Fit a clone of each training pipeline instead of the pipeline itself.
# Pipeline.fit returns the very object it was called on, so fitting the
# training pipelines directly would make every execution_pipe_* an alias of
# its training_pipe_*: the "training" pickles would already be fitted, and the
# three pipelines would keep refitting the one column transformer they share.
# clone() builds an unfitted copy with the same parameters, nested steps
# included, so each execution pipeline owns its fitted state and the training
# pipelines stay unfitted templates for later retraining.
execution_pipe_pd = clone(training_pipe_pd).fit(x_pd, y_pd)
execution_pipe_ead = clone(training_pipe_ead).fit(x_ead, y_ead)
execution_pipe_lgd = clone(training_pipe_lgd).fit(x_lgd, y_lgd)

#### Evaluate risk scoring models

In [160]:
# Load the validation data (`os` is imported with the other imports at the
# top of the notebook).
project_path = '../'
file_name = 'validation.csv'
full_path = os.path.join(project_path, 'data/validation', file_name)
val = pd.read_csv(full_path, index_col='id_cliente').drop(columns='Unnamed: 0')

# Apply the same record filters and cleaning used for the training data.
val = val.drop_duplicates()
to_eliminate = val.loc[val.ingresos > 300_000].index.values
val = val[~val.index.isin(to_eliminate)]
val = data_quality(val)

# Observed targets, computed with the same helpers that build the training
# targets so the definitions cannot drift apart. Each helper returns
# (predictors, target); only the target is needed here.
pd_real = generate_variables_pd(val)[1].to_numpy()   # Probability of Default (PD)
ead_real = generate_variables_ead(val)[1]            # Exposure at Default (EAD)
lgd_real = generate_variables_lgd(val)[1]            # Loss Given Default (LGD)

# Predictors = modelling columns minus those only used to derive the targets.
# A separate name is used so that `final_variables` is not overwritten, which
# would break a re-run of the earlier column-selection cell.
target_sources = ['estado', 'imp_amortizado', 'imp_recuperado']
predictor_variables = [col for col in final_variables if col not in target_sources]
val = val[predictor_variables]

# Execution
scoring_pd = execution_pipe_pd.predict_proba(val)[:, 1]  # probability of class 1 (default)
ead = execution_pipe_ead.predict(val)
lgd = execution_pipe_lgd.predict(val)

# Expected Loss (EL)
principal = val['principal']
EL = pd.DataFrame({
    'principal': principal,
    'pd_real': pd_real,
    'pd': scoring_pd,
    'ead_real': ead_real,
    'ead': ead,
    'lgd_real': lgd_real,
    'lgd': lgd
})
# Bracket access is used throughout: a column called 'pd' is easy to confuse
# with the pandas module alias when read as an attribute.
print('AUC Probability of Default:', round(roc_auc_score(pd_real, EL['pd']), 2))
print('MAE Exposure at Default:', round(mean_absolute_error(ead_real, EL['ead']), 2))
print('MAE Loss Given Default:', round(mean_absolute_error(lgd_real, EL['lgd']), 2))

# EL = PD x principal x EAD x LGD, rounded to two decimals.
EL['expected_loss'] = (EL['pd'] * EL['principal'] * EL['ead'] * EL['lgd']).round(2)
EL

AUC Probability of Default: 0.7
MAE Exposure at Default: 0.23
MAE Loss Given Default: 0.36


Unnamed: 0_level_0,principal,pd_real,pd,ead_real,ead,lgd_real,lgd,expected_loss
id_cliente,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
115689874,6000.0,0,0.140361,0.519170,0.420112,1.0,0.719972,254.73
6738540,15000.0,0,0.091108,0.000000,0.070068,0.0,0.235840,22.58
81930243,20000.0,0,0.099020,0.093958,0.112924,1.0,0.363643,81.32
132881023,20000.0,0,0.083092,0.864169,0.689053,1.0,0.764504,875.43
113835738,18000.0,0,0.086231,0.000000,0.394988,0.0,0.728987,446.93
...,...,...,...,...,...,...,...,...
98127133,7100.0,0,0.115081,0.000000,0.192610,0.0,0.445963,70.18
51937821,8000.0,0,0.050883,0.000000,0.181837,0.0,0.408658,30.25
93545269,8000.0,0,0.045744,0.000000,0.195988,0.0,0.455012,32.63
85513948,12975.0,0,0.173938,0.000000,0.154522,0.0,0.374197,130.49


#### Save training and execution pipelines

##### Save the final training pipeline

In [161]:
# Destination file -> training pipeline to serialize there.
training_pipe_files = {
    '../models/training_pipe_pd.pickle': training_pipe_pd,
    '../models/training_pipe_ead.pickle': training_pipe_ead,
    '../models/training_pipe_lgd.pickle': training_pipe_lgd,
}

for pickle_path, pipeline in training_pipe_files.items():
    with open(pickle_path, mode='wb') as file:
        pickle.dump(pipeline, file)

##### Save the final execution pipeline

In [162]:
# Destination file -> execution pipeline to serialize there.
execution_pipe_files = {
    '../models/execution_pipe_pd.pickle': execution_pipe_pd,
    '../models/execution_pipe_ead.pickle': execution_pipe_ead,
    '../models/execution_pipe_lgd.pickle': execution_pipe_lgd,
}

for pickle_path, pipeline in execution_pipe_files.items():
    with open(pickle_path, mode='wb') as file:
        pickle.dump(pipeline, file)