In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv("../data/raw/loan_data.csv")
data.head()

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22.0,female,Master,71948.0,0,RENT,35000.0,PERSONAL,16.02,0.49,3.0,561,No,1
1,21.0,female,High School,12282.0,0,OWN,1000.0,EDUCATION,11.14,0.08,2.0,504,Yes,0
2,25.0,female,High School,12438.0,3,MORTGAGE,5500.0,MEDICAL,12.87,0.44,3.0,635,No,1
3,23.0,female,Bachelor,79753.0,0,RENT,35000.0,MEDICAL,15.23,0.44,2.0,675,No,1
4,24.0,male,Master,66135.0,1,RENT,35000.0,MEDICAL,14.27,0.53,4.0,586,No,1


# <span style = "color: orange"> Selección de Variables </span>

In [3]:
data.dtypes

person_age                        float64
person_gender                      object
person_education                   object
person_income                     float64
person_emp_exp                      int64
person_home_ownership              object
loan_amnt                         float64
loan_intent                        object
loan_int_rate                     float64
loan_percent_income               float64
cb_person_cred_hist_length        float64
credit_score                        int64
previous_loan_defaults_on_file     object
loan_status                         int64
dtype: object

Variable a predecir: loan_status
### Conversión de Variables

In [4]:
# Variable objetivo
data['loan_status'] = data['loan_status'].astype('category')
print(f"Loan_status: {data['loan_status'].dtype}")

Loan_status: category


In [5]:
# Variables predictoras
col_categoricas = ['person_gender', 'person_education', 'person_home_ownership', 'loan_intent', 'previous_loan_defaults_on_file']

for col in col_categoricas:
    data[col] = data[col].astype('category')
    print(f'{col}: {data[col].dtype}')

person_gender: category
person_education: category
person_home_ownership: category
loan_intent: category
previous_loan_defaults_on_file: category


In [6]:
data.dtypes

person_age                         float64
person_gender                     category
person_education                  category
person_income                      float64
person_emp_exp                       int64
person_home_ownership             category
loan_amnt                          float64
loan_intent                       category
loan_int_rate                      float64
loan_percent_income                float64
cb_person_cred_hist_length         float64
credit_score                         int64
previous_loan_defaults_on_file    category
loan_status                       category
dtype: object

In [7]:
# Definir umbral de clasificación 
valores_unicos = data.nunique()
print(valores_unicos)

person_age                           60
person_gender                         2
person_education                      5
person_income                     33989
person_emp_exp                       63
person_home_ownership                 4
loan_amnt                          4483
loan_intent                           6
loan_int_rate                      1302
loan_percent_income                  64
cb_person_cred_hist_length           29
credit_score                        340
previous_loan_defaults_on_file        2
loan_status                           2
dtype: int64


In [8]:
# Clasificar variables numéricas
col_numericas = data.select_dtypes(include = ['int64', 'float64']).columns.tolist()

In [9]:
var_continuas = []
var_discretas = []

for col in col_numericas:
    n_unicos = data[col].nunique()
    if n_unicos < 10: 
        var_discretas.append(col)
    elif n_unicos < 100: 
        var_discretas.append(col)
    else:
        var_continuas.append(col)

In [10]:
print("Variables discretas: ")
print(var_discretas)
print()

print("Variables continuas: ")
print(var_continuas)

Variables discretas: 
['person_age', 'person_emp_exp', 'loan_percent_income', 'cb_person_cred_hist_length']

Variables continuas: 
['person_income', 'loan_amnt', 'loan_int_rate', 'credit_score']


In [11]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [12]:
le = LabelEncoder()
data['loan_status'] = le.fit_transform(data['loan_status'])

In [13]:
data_encoded = pd.get_dummies(data, columns = ['person_gender', 'person_education', 'person_home_ownership', 'loan_intent', 'previous_loan_defaults_on_file'], drop_first=True)
data_encoded.head()

Unnamed: 0,person_age,person_income,person_emp_exp,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,loan_status,person_gender_male,...,person_education_Master,person_home_ownership_OTHER,person_home_ownership_OWN,person_home_ownership_RENT,loan_intent_EDUCATION,loan_intent_HOMEIMPROVEMENT,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,previous_loan_defaults_on_file_Yes
0,22.0,71948.0,0,35000.0,16.02,0.49,3.0,561,1,False,...,True,False,False,True,False,False,False,True,False,False
1,21.0,12282.0,0,1000.0,11.14,0.08,2.0,504,0,False,...,False,False,True,False,True,False,False,False,False,True
2,25.0,12438.0,3,5500.0,12.87,0.44,3.0,635,1,False,...,False,False,False,False,False,False,True,False,False,False
3,23.0,79753.0,0,35000.0,15.23,0.44,2.0,675,1,False,...,False,False,False,True,False,False,True,False,False,False
4,24.0,66135.0,1,35000.0,14.27,0.53,4.0,586,1,True,...,True,False,False,True,False,False,True,False,False,False


In [14]:
print(f"Dimensiones después del One-Hot Encoding: {data.shape}")

Dimensiones después del One-Hot Encoding: (45000, 14)


In [15]:
scaler = StandardScaler()
data_continuas = data[var_continuas]
data_continuas_scaled = scaler.fit_transform(data_continuas)
data[var_continuas] = data_continuas_scaled

In [16]:
print(f"Dimensiones después del One-Hot Encoding: {data_encoded.shape}")

Dimensiones después del One-Hot Encoding: (45000, 23)


In [17]:
data_encoded.head()

Unnamed: 0,person_age,person_income,person_emp_exp,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,loan_status,person_gender_male,...,person_education_Master,person_home_ownership_OTHER,person_home_ownership_OWN,person_home_ownership_RENT,loan_intent_EDUCATION,loan_intent_HOMEIMPROVEMENT,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,previous_loan_defaults_on_file_Yes
0,22.0,71948.0,0,35000.0,16.02,0.49,3.0,561,1,False,...,True,False,False,True,False,False,False,True,False,False
1,21.0,12282.0,0,1000.0,11.14,0.08,2.0,504,0,False,...,False,False,True,False,True,False,False,False,False,True
2,25.0,12438.0,3,5500.0,12.87,0.44,3.0,635,1,False,...,False,False,False,False,False,False,True,False,False,False
3,23.0,79753.0,0,35000.0,15.23,0.44,2.0,675,1,False,...,False,False,False,True,False,False,True,False,False,False
4,24.0,66135.0,1,35000.0,14.27,0.53,4.0,586,1,True,...,True,False,False,True,False,False,True,False,False,False


### Tratamiento de Outliers

In [18]:
per_bajo = 0.05
per_alto = 0.95

In [19]:
for col in var_continuas:
    limite_inf = data[col].quantile(per_bajo)
    limite_sup = data[col].quantile(per_alto)
    data[col] = data[col].clip(lower = limite_inf, upper = limite_sup)

# <span style="color: orange"> Crear Archivo CSV Procesado </span>

In [20]:
data_encoded.head()

Unnamed: 0,person_age,person_income,person_emp_exp,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,loan_status,person_gender_male,...,person_education_Master,person_home_ownership_OTHER,person_home_ownership_OWN,person_home_ownership_RENT,loan_intent_EDUCATION,loan_intent_HOMEIMPROVEMENT,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,previous_loan_defaults_on_file_Yes
0,22.0,71948.0,0,35000.0,16.02,0.49,3.0,561,1,False,...,True,False,False,True,False,False,False,True,False,False
1,21.0,12282.0,0,1000.0,11.14,0.08,2.0,504,0,False,...,False,False,True,False,True,False,False,False,False,True
2,25.0,12438.0,3,5500.0,12.87,0.44,3.0,635,1,False,...,False,False,False,False,False,False,True,False,False,False
3,23.0,79753.0,0,35000.0,15.23,0.44,2.0,675,1,False,...,False,False,False,True,False,False,True,False,False,False
4,24.0,66135.0,1,35000.0,14.27,0.53,4.0,586,1,True,...,True,False,False,True,False,False,True,False,False,False


In [21]:
data_encoded.to_csv("../data/processed/loan_data_prepared.csv", index = False)
print("Archivo CSV creado")

Archivo CSV creado
