In [48]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import pandas as pd

dataset = pd.read_csv('./credit_risk_dataset.csv')

In [49]:
imputer = SimpleImputer(strategy='mean')

preprocessed_data = pd.DataFrame()

for column in dataset.columns:
  if dataset[column].dtype == 'int64' or dataset[column].dtype == 'float64':
    preprocessed_data[[column]] = imputer.fit_transform(dataset[[column]])

preprocessed_data

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length
0,22.0,59000.0,123.0,35000.0,16.02,1.0,0.59,3.0
1,21.0,9600.0,5.0,1000.0,11.14,0.0,0.10,2.0
2,25.0,9600.0,1.0,5500.0,12.87,1.0,0.57,3.0
3,23.0,65500.0,4.0,35000.0,15.23,1.0,0.53,2.0
4,24.0,54400.0,8.0,35000.0,14.27,1.0,0.55,4.0
...,...,...,...,...,...,...,...,...
32576,57.0,53000.0,1.0,5800.0,13.16,0.0,0.11,30.0
32577,54.0,120000.0,4.0,17625.0,7.49,0.0,0.15,19.0
32578,65.0,76000.0,3.0,35000.0,10.99,1.0,0.46,28.0
32579,56.0,150000.0,5.0,15000.0,11.48,0.0,0.10,26.0


Se escogio el simple imputer para reemplazar posibles valores faltantes en el dataset

In [50]:
label_encoder = LabelEncoder()
preprocessed_data = pd.DataFrame()

for column in dataset.columns:
  preprocessed_data[column] = label_encoder.fit_transform(dataset[column])

preprocessed_data

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,2,2238,3,35,4,3,752,239,1,59,1,1
1,1,25,2,5,1,1,7,92,0,10,0,0
2,5,25,0,1,3,2,176,141,1,57,0,1
3,3,2539,3,4,3,2,752,217,1,53,0,0
4,4,1980,3,8,3,2,752,186,1,55,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...
32576,37,1890,0,1,4,2,188,149,0,11,0,28
32577,34,3800,0,4,4,0,579,21,0,15,0,17
32578,45,2933,3,3,2,1,752,87,1,46,0,26
32579,36,3984,0,5,4,1,516,100,0,10,0,24


Se escogio el simple imputer porque permite convertir en valores numericos todas las columnas, independientemente de su tipo, lo que ayuda a trabajar estos datos de mejor manera

In [51]:
one_hot_encoder = OneHotEncoder()
preprocessed_data = pd.DataFrame()

for column in dataset.columns:
  if dataset[column].dtype == 'object':
    encoded_data = one_hot_encoder.fit_transform(dataset[[column]]).toarray()
    for i in range(encoded_data.shape[1]):
      preprocessed_data[f'{column}_{i}'] = encoded_data[:, i]

preprocessed_data

Unnamed: 0,person_home_ownership_0,person_home_ownership_1,person_home_ownership_2,person_home_ownership_3,loan_intent_0,loan_intent_1,loan_intent_2,loan_intent_3,loan_intent_4,loan_intent_5,loan_grade_0,loan_grade_1,loan_grade_2,loan_grade_3,loan_grade_4,loan_grade_5,loan_grade_6,cb_person_default_on_file_0,cb_person_default_on_file_1
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32576,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
32577,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
32578,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
32579,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


Se eligio one hot encoder porque permite que los modelos tengan un mejor entrenamiento

In [52]:
scaler = MinMaxScaler()
preprocessed_data = pd.DataFrame()

for column in dataset.columns:
  if dataset[column].dtype == 'int64' or dataset[column].dtype == 'float64':
    preprocessed_data[[column]] = scaler.fit_transform(dataset[[column]])

preprocessed_data

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length
0,0.016129,0.009173,1.000000,1.000000,0.595506,1.0,0.710843,0.035714
1,0.008065,0.000934,0.040650,0.014493,0.321348,0.0,0.120482,0.000000
2,0.040323,0.000934,0.008130,0.144928,0.418539,1.0,0.686747,0.035714
3,0.024194,0.010257,0.032520,1.000000,0.551124,1.0,0.638554,0.000000
4,0.032258,0.008406,0.065041,1.000000,0.497191,1.0,0.662651,0.071429
...,...,...,...,...,...,...,...,...
32576,0.298387,0.008172,0.008130,0.153623,0.434831,0.0,0.132530,1.000000
32577,0.274194,0.019346,0.032520,0.496377,0.116292,0.0,0.180723,0.607143
32578,0.362903,0.012008,0.024390,1.000000,0.312921,1.0,0.554217,0.928571
32579,0.290323,0.024350,0.040650,0.420290,0.340449,0.0,0.120482,0.857143


Se escogio min max escale por que es el mas comun y facil de entender, ademas que evita que los datos se sesguen por la escala de los datos

In [53]:
scaler = StandardScaler()
preprocessed_data = pd.DataFrame()

for column in dataset.columns:
  if dataset[column].dtype == 'int64' or dataset[column].dtype == 'float64':
    preprocessed_data[[column]] = scaler.fit_transform(dataset[[column]])

preprocessed_data

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length
0,-0.903374,-0.114143,28.535538,4.019404,1.545580,1.893069,3.931411,-0.691554
1,-1.060904,-0.911147,0.050769,-1.358650,0.039595,-0.528243,-0.657458,-0.938167
2,-0.430783,-0.911147,-0.914816,-0.646849,0.573479,1.893069,3.744110,-0.691554
3,-0.745843,-0.009274,-0.190627,4.019404,1.301784,1.893069,3.369508,-0.938167
4,-0.588313,-0.188358,0.774958,4.019404,1.005524,1.893069,3.556809,-0.444942
...,...,...,...,...,...,...,...,...
32576,4.610190,-0.210945,-0.914816,-0.599395,0.662974,-0.528243,-0.563808,5.966992
32577,4.137599,0.870011,-0.190627,1.271060,-1.086807,-0.528243,-0.189207,3.254251
32578,5.870433,0.160129,-0.432024,4.019404,-0.006695,1.893069,2.713956,5.473767
32579,4.452660,1.354021,0.050769,0.855843,0.144521,-0.528243,-0.657458,4.980541


Se escogio standar scaler ya que es el mas comunmente usado y porque evita que los datos se sesguen por la escala de los datos
