In [101]:
import pickle

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import make_column_transformer
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Binarizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

%config IPCompleter.greedy=True

## LOAD DATA

In [102]:
# Relative path prefix used to build the data and model file paths in later cells.
path = '../../'

In [103]:
# File name of the CSV that is read into `df` below.
name_data_file = 'Loans.csv'

In [104]:
# Location of the CSV, built from the base path and the file name.
full_path = f'{path}/02_Data/01_Original/{name_data_file}'

# The first CSV column is used as the DataFrame index.
df = pd.read_csv(full_path, index_col=0)

### Final Variables Selection

In [105]:
# Columns kept for modelling. The last three are only used to build the
# targets and are dropped from the predictors by the make_variables_* helpers.
final_variables = [
    # predictors
    'ingresos_verificados',
    'vivienda',
    'finalidad',
    'num_cuotas',
    'antigüedad_empleo',
    'rating',
    'ingresos',
    'dti',
    'num_lineas_credito',
    'porc_uso_revolving',
    'principal',
    'tipo_interes',
    'imp_cuota',
    'num_derogatorios',
    # target construction only
    'estado',
    'imp_amortizado',
    'imp_recuperado',
]

### Drop rows

#### By outliers

In [106]:
# Index labels of the rows whose income is above 300000 (treated as outliers).
to_drop = df.index[(df['ingresos'] > 300000).to_numpy()].values

In [107]:
# Keep only the rows whose index label was not flagged in `to_drop`.
df = df.loc[~df.index.isin(to_drop)]

### variables selection

In [108]:
# Restrict the frame to the selected columns, in the order they are listed.
df = df.loc[:, final_variables]

##  MAKE PIPELINE

### Data Quality

In [123]:
def data_quality(temp):
    """Return a copy of ``temp`` with its missing values imputed.

    The input frame is left untouched: the imputation is done on a copy and by
    plain column assignment. The previous chained
    ``temp[col].fillna(..., inplace=True)`` form is deprecated in pandas 2.x
    and silently does nothing when copy-on-write is active.

    Parameters
    ----------
    temp : pandas.DataFrame
        Must contain the columns 'antigüedad_empleo', 'dti',
        'num_lineas_credito', 'porc_uso_revolving' and 'num_derogatorios'.

    Returns
    -------
    pandas.DataFrame
        Same columns and order as the input, with the columns above imputed.
    """
    cleaned = temp.copy()

    # NOTE: the sentinel is spelled 'anknown' (sic). It must stay in sync with
    # the first entry of `order_seniority_employment` used by the ordinal
    # encoder, so it is deliberately not "corrected" here.
    cleaned['antigüedad_empleo'] = cleaned['antigüedad_empleo'].fillna('anknown')

    # Numeric columns where a missing value is replaced by 0.
    zero_filled = ['dti', 'num_lineas_credito', 'porc_uso_revolving', 'num_derogatorios']
    cleaned[zero_filled] = cleaned[zero_filled].fillna(0)

    return cleaned

#### Functions for models

In [110]:
def make_variables_pd(df):
    """Build predictors and the binary default target for the PD model.

    Works on a copy of ``df``. The target ``target_pd`` is 1 when ``estado`` is
    one of the charged-off / default statuses listed below and 0 otherwise.
    Rare ``vivienda`` and ``finalidad`` categories are merged, and the columns
    that are only needed to build targets are removed.

    Returns
    -------
    tuple
        ``(X, y)``: every remaining column except the last one, and the last
        column (``target_pd``) as a Series.
    """
    default_states = ['Charged Off',
                      'Does not meet the credit policy. Status:Charged Off',
                      'Default']

    prepared = df.copy()

    # Target: flag loans that ended in one of the default states.
    prepared['target_pd'] = np.where(prepared['estado'].isin(default_states), 1, 0)

    # Merge infrequent categories into a single bucket.
    prepared['vivienda'] = prepared['vivienda'].replace(['ANY', 'NONE', 'OTHER'], 'MORTGAGE')
    prepared['finalidad'] = prepared['finalidad'].replace(
        ['wedding', 'educational', 'renewable_energy'], 'OTHERS')

    # Remove the columns that must not be used as predictors.
    prepared = prepared.drop(columns=['estado', 'imp_amortizado', 'imp_recuperado'])

    # The target was appended last, so it is the final column.
    predictors = prepared.iloc[:, :-1]
    target = prepared.iloc[:, -1]

    return predictors, target

In [111]:
def make_variables_ead(df):
    """Build predictors and the exposure-at-default target for the EAD model.

    The target ``target_ead`` is ``(principal - imp_amortizado) / principal``.
    Rare ``vivienda`` and ``finalidad`` categories are merged, and the helper
    column plus the target-only columns are removed. ``df`` is not modified.

    Returns
    -------
    tuple
        ``(X, y)``: every remaining column except the last one, and the last
        column (``target_ead``) as a Series.
    """
    prepared = (
        df
        # Amount still outstanding, then its share of the principal.
        .assign(remaining=lambda d: d['principal'] - d['imp_amortizado'])
        .assign(target_ead=lambda d: d['remaining'] / d['principal'])
        # Merge infrequent categories into a single bucket.
        .assign(
            vivienda=lambda d: d['vivienda'].replace(['ANY', 'NONE', 'OTHER'], 'MORTGAGE'),
            finalidad=lambda d: d['finalidad'].replace(
                ['wedding', 'educational', 'renewable_energy'], 'OTHERS'),
        )
        .drop(columns=['estado', 'imp_amortizado', 'imp_recuperado', 'remaining'])
    )

    # Predictors are all columns but the last; the target is the last column.
    predictors = prepared.iloc[:, :-1]
    target = prepared.iloc[:, -1]

    return predictors, target

In [112]:
def make_variables_lgd(df):
    """Build predictors and the loss-given-default target for the LGD model.

    The target ``target_lgd`` is ``1 - imp_recuperado / remaining`` with
    ``remaining = principal - imp_amortizado``. When ``remaining`` is 0 the
    ratio is undefined (0/0 gives NaN, x/0 gives +/-inf); in both cases the
    target is set to 0. Rare ``vivienda`` and ``finalidad`` categories are
    merged, and the helper column plus the target-only columns are removed.
    ``df`` is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'principal', 'imp_amortizado', 'imp_recuperado',
        'estado', 'vivienda' and 'finalidad'.

    Returns
    -------
    tuple
        ``(X, y)``: every remaining column except the last one, and the last
        column (``target_lgd``) as a Series.
    """
    temp = df.copy()

    temp['remaining'] = temp['principal'] - temp['imp_amortizado']

    loss_ratio = 1 - (temp['imp_recuperado'] / temp['remaining'])

    # Assign the filled result instead of calling fillna(inplace=True) on the
    # column selection: the chained in-place form is deprecated in pandas 2.x
    # and has no effect under copy-on-write. Infinite values (division by a
    # zero `remaining`) are treated like the 0/0 case, because a non-finite
    # target would make the regressor's fit fail.
    temp['target_lgd'] = loss_ratio.replace([np.inf, -np.inf], np.nan).fillna(0)

    # Merge infrequent categories into a single bucket.
    temp['vivienda'] = temp['vivienda'].replace(['ANY', 'NONE', 'OTHER'], 'MORTGAGE')
    temp['finalidad'] = temp['finalidad'].replace(
        ['wedding', 'educational', 'renewable_energy'], 'OTHERS')

    temp = temp.drop(columns=['estado', 'imp_amortizado', 'imp_recuperado', 'remaining'])

    # Predictors and target: the target was appended last.
    temp_x = temp.iloc[:, :-1]
    temp_y = temp.iloc[:, -1]

    return temp_x, temp_y

#### Make X and y dataframes

In [113]:
# Impute missing values once and derive the three (X, y) pairs from the same
# cleaned frame, instead of re-running data_quality for every model.
df_clean = data_quality(df)

x_pd, y_pd = make_variables_pd(df_clean)

x_ead, y_ead = make_variables_ead(df_clean)

x_lgd, y_lgd = make_variables_lgd(df_clean)

In [114]:
# Sanity check: count missing vs. non-missing values of num_derogatorios in the LGD predictors.
x_lgd.num_derogatorios.isna().value_counts()

False    198679
Name: num_derogatorios, dtype: int64

### Variable transformation

In [115]:
# ONE-HOT ENCODING
var_ohe = ['ingresos_verificados', 'vivienda', 'finalidad', 'num_cuotas']
# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and the old name
# was removed in 1.4. Prefer the new keyword and fall back to the old one so
# the cell runs on both old and current releases.
try:
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')


# ORDINAL ENCODING
var_oe = ['antigüedad_empleo', 'rating']

# 'anknown' (sic) is the sentinel written by data_quality for missing
# seniority; the spelling must match exactly, so it is kept as is.
order_seniority_employment = ['anknown', '< 1 year', '1 year', '2 years', '3 years', '4 years',
                              '5 years', '6 years', '7 years', '8 years', '9 years', '10+ years']

order_rating = ['A', 'B', 'C', 'D', 'E', 'F', 'G']

# Categories not listed above are encoded as 12, a code that neither list
# produces (seniority uses 0..11, rating uses 0..6).
oe = OrdinalEncoder(categories=[order_seniority_employment, order_rating],
                    handle_unknown='use_encoded_value',
                    unknown_value=12)

# BINNING
# Binarizer(threshold=0) maps values greater than 0 to 1 and the rest to 0.
var_bin = ['num_derogatorios']
bina = Binarizer(threshold=0)


# MIN-MAX SCALING
var_mms = ['ingresos', 'dti', 'num_lineas_credito', 'porc_uso_revolving',
           'principal', 'tipo_interes', 'imp_cuota']
mms = MinMaxScaler()


### Make processing pipe

#### Create the column transformer

In [116]:
# Route each group of columns to its transformer. Any column that is not
# listed in one of the groups is passed through unchanged.
ct = make_column_transformer(
    (ohe, var_ohe),    # one-hot encoded categoricals
    (oe, var_oe),      # ordinal encoded categoricals
    (bina, var_bin),   # binarized counts
    (mms, var_mms),    # min-max scaled numerics
    remainder='passthrough',
)

#### Algorithms

In [117]:
# Fixed seed so that re-running the notebook reproduces the same fitted
# models: the saga solver shuffles the data, and the gradient-boosting
# regressors use the random state for their internal validation split.
RANDOM_STATE = 42

# Probability of default: L1-penalised logistic regression.
model_pd = LogisticRegression(solver='saga',
                              n_jobs=-1,
                              C=0.25,
                              penalty='l1',
                              random_state=RANDOM_STATE)

# Exposure at default.
model_ead = HistGradientBoostingRegressor(learning_rate=0.1,
                                          max_iter=200,
                                          max_depth=10,
                                          min_samples_leaf=100,
                                          scoring='neg_mean_absolute_percentage_error',
                                          l2_regularization=0.75,
                                          random_state=RANDOM_STATE)

# Loss given default.
model_lgd = HistGradientBoostingRegressor(learning_rate=0.1,
                                          max_iter=200,
                                          max_depth=20,
                                          min_samples_leaf=100,
                                          scoring='neg_mean_absolute_percentage_error',
                                          l2_regularization=0,
                                          random_state=RANDOM_STATE)

#### Make training pipes

In [118]:
# make_pipeline stores the transformer object it is given without copying it.
# Passing the same `ct` to all three pipelines would make them share a single
# preprocessing object, so fitting one pipeline would overwrite the fitted
# state used by the others. Each pipeline therefore gets its own unfitted
# clone with identical parameters.
pipe_training_pd = make_pipeline(clone(ct), model_pd)

pipe_training_ead = make_pipeline(clone(ct), model_ead)

pipe_training_lgd = make_pipeline(clone(ct), model_lgd)

#### Save training pipe

In [119]:
path_pipe_training_pd = path + '/04_Models/pipe_training_pd.pickle'

# Serialize the (still unfitted) PD pipeline itself. The previous code passed
# the path string to pickle.dump, so the file only contained that string.
with open(path_pipe_training_pd, mode='wb') as file:
    pickle.dump(pipe_training_pd, file)

In [120]:
path_pipe_training_ead = path + '/04_Models/pipe_training_ead.pickle'

# Serialize the (still unfitted) EAD pipeline itself. The previous code passed
# the path string to pickle.dump, so the file only contained that string.
with open(path_pipe_training_ead, mode='wb') as file:
    pickle.dump(pipe_training_ead, file)

In [121]:
path_pipe_training_lgd = path + '/04_Models/pipe_training_lgd.pickle'

# Serialize the (still unfitted) LGD pipeline itself. The previous code passed
# the path string to pickle.dump, so the file only contained that string.
with open(path_pipe_training_lgd, mode='wb') as file:
    pickle.dump(pipe_training_lgd, file)

#### Fit  pipes

In [122]:
# Fit each pipeline on its own (X, y) pair and keep whatever fit() returns
# under the corresponding "execution" name.
pipe_execution_pd = pipe_training_pd.fit(X=x_pd, y=y_pd)
pipe_execution_ead = pipe_training_ead.fit(X=x_ead, y=y_ead)
pipe_execution_lgd = pipe_training_lgd.fit(X=x_lgd, y=y_lgd)



## Save execution pipes

In [124]:
# Destination of the fitted PD pipeline.
path_pipe_execution_pd = path + '/04_Models/pipe_execution_pd.pickle'

with open(path_pipe_execution_pd, 'wb') as handle:
    pickle.dump(pipe_execution_pd, handle)

In [125]:
# Destination of the fitted EAD pipeline.
path_pipe_execution_ead = path + '/04_Models/pipe_execution_ead.pickle'

with open(path_pipe_execution_ead, 'wb') as handle:
    pickle.dump(pipe_execution_ead, handle)

In [126]:
# Destination of the fitted LGD pipeline.
path_pipe_execution_lgd = path + '/04_Models/pipe_execution_lgd.pickle'

with open(path_pipe_execution_lgd, 'wb') as handle:
    pickle.dump(pipe_execution_lgd, handle)