In [1]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.feature_selection import SelectPercentile, chi2

In [2]:
# Directory holding the train/test CSV artifacts. It is kept in a single
# constant so that only this one line has to change when the notebook is run
# from a different machine or checkout.
ARTIFACTS_DIR = 'C:/Users/diego/OneDrive/Escritorio/mlops_projects/mlops/loan_appro_ml_service/artifacts'

# Full paths to the two splits that the next cell reads.
test_path = f'{ARTIFACTS_DIR}/test.csv'
train_path = f'{ARTIFACTS_DIR}/train.csv'

In [3]:
# Read both CSV splits into DataFrames (training split first, then test).
train_df = pd.read_csv(filepath_or_buffer=train_path)
test_df = pd.read_csv(filepath_or_buffer=test_path)

In [4]:
# Preview the first rows (at most 20) of the training split as the cell output.
train_df.head(20)

Unnamed: 0,loan_amnt,term,int_rate,installment,sub_grade,purpose,home_ownership,annual_inc,verification_status,dti,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,application_type,mort_acc,pub_rec_bankruptcies,loan_status
0,25000.0,36 months,8.39,787.92,A5,debt_consolidation,RENT,85000.0,Source Verified,11.03,9.0,0.0,17943.0,37.1,22.0,w,INDIVIDUAL,4.0,0.0,1
1,12000.0,36 months,7.89,375.43,A5,debt_consolidation,MORTGAGE,124000.0,Not Verified,10.82,14.0,0.0,23020.0,46.3,30.0,w,INDIVIDUAL,3.0,0.0,1
2,23200.0,60 months,13.99,539.71,C4,credit_card,MORTGAGE,140000.0,Source Verified,15.25,14.0,1.0,33270.0,53.5,41.0,w,INDIVIDUAL,2.0,0.0,1
3,16800.0,60 months,15.31,402.41,C2,debt_consolidation,MORTGAGE,82000.0,Source Verified,14.24,13.0,0.0,10634.0,72.3,40.0,f,INDIVIDUAL,2.0,0.0,0
4,18000.0,36 months,6.03,547.84,A1,debt_consolidation,RENT,220000.0,Source Verified,12.29,8.0,4.0,36989.0,60.5,12.0,w,INDIVIDUAL,0.0,0.0,1
5,7800.0,36 months,18.25,282.97,E1,credit_card,MORTGAGE,85000.0,Verified,25.91,23.0,1.0,11857.0,56.2,57.0,w,INDIVIDUAL,3.0,1.0,0
6,2800.0,36 months,16.78,99.53,C5,home_improvement,MORTGAGE,80000.0,Not Verified,19.2,9.0,0.0,16263.0,86.0,13.0,f,INDIVIDUAL,1.0,0.0,0
7,21325.0,60 months,15.61,514.18,D1,debt_consolidation,MORTGAGE,47444.0,Verified,15.1,8.0,0.0,22034.0,61.0,19.0,f,INDIVIDUAL,1.0,0.0,0
8,11700.0,60 months,28.99,371.31,G5,debt_consolidation,RENT,35000.0,Source Verified,20.13,12.0,0.0,15669.0,58.5,25.0,w,INDIVIDUAL,4.0,0.0,0
9,10000.0,36 months,6.97,308.64,A2,debt_consolidation,OWN,40000.0,Not Verified,18.54,14.0,0.0,11127.0,20.0,33.0,f,INDIVIDUAL,0.0,0.0,1


In [61]:
# Print the per-column dtype and non-null count of the training split;
# a non-null count below the row count marks a column with missing values.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 316824 entries, 0 to 316823
Data columns (total 20 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   loan_amnt             316824 non-null  float64
 1   term                  316824 non-null  object 
 2   int_rate              316824 non-null  float64
 3   installment           316824 non-null  float64
 4   sub_grade             316824 non-null  object 
 5   purpose               316824 non-null  object 
 6   home_ownership        316824 non-null  object 
 7   annual_inc            316824 non-null  float64
 8   verification_status   316824 non-null  object 
 9   dti                   316824 non-null  float64
 10  open_acc              316824 non-null  float64
 11  pub_rec               316824 non-null  float64
 12  revol_bal             316824 non-null  float64
 13  revol_util            316596 non-null  float64
 14  total_acc             316824 non-null  float64
 15  

In [62]:
# Columns routed through the numeric preprocessing branch.
# The order is significant: it fixes the column order of the transformed array.
numerical_columns = [
    'loan_amnt',
    'int_rate',
    'installment',
    'annual_inc',
    'dti',
    'open_acc',
    'pub_rec',
    'revol_bal',
    'total_acc',
    'revol_util',
    'mort_acc',
    'pub_rec_bankruptcies',
]

# Columns routed through the categorical (one-hot) preprocessing branch.
categorical_columns = [
    'term',
    'sub_grade',
    'purpose',
    'home_ownership',
    'verification_status',
    'initial_list_status',
    'application_type',
]

In [63]:
# Numeric branch: replace missing values with the column median, then
# standardize each column with StandardScaler's default settings.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: replace missing values with the most frequent category,
# one-hot encode into a dense array, then scale each indicator column without
# centering (with_mean=False).
cat_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        # handle_unknown="ignore": a category seen only at transform time
        # (e.g. one present in the test split but absent from the training
        # split) is encoded as all zeros instead of raising an error. The
        # encoding of the training data itself is unchanged.
        (
            "one_hot_encoder",
            OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
        ),
        ("scaler", StandardScaler(with_mean=False)),
    ]
)

# Combine both branches: each pipeline is applied to its own column list and
# the results are concatenated, numeric columns first.
preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipeline", num_pipeline, numerical_columns),
        ("cat_pipeline", cat_pipeline, categorical_columns),
    ]
)

In [64]:
# Alias the ColumnTransformer under the name the following cells use.
# This is the same object, not a copy.
preprocessing_obj = preprocessor

In [65]:
# Show the preprocessor's representation as the cell output.
preprocessing_obj 

In [66]:
# LABEL
target_columns_name = "loan_status"

In [67]:
# Split the training frame into model inputs (every column except the label)
# and the label column itself. `columns=` already names the axis to drop from,
# so no separate `axis` argument is passed.
input_feature_train_df = train_df.drop(columns=[target_columns_name])
target_feature_train_df = train_df[target_columns_name]

In [68]:
# Print the dtype and non-null count of each remaining input column,
# to check the frame after the label column was removed.
input_feature_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 316824 entries, 0 to 316823
Data columns (total 19 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   loan_amnt             316824 non-null  float64
 1   term                  316824 non-null  object 
 2   int_rate              316824 non-null  float64
 3   installment           316824 non-null  float64
 4   sub_grade             316824 non-null  object 
 5   purpose               316824 non-null  object 
 6   home_ownership        316824 non-null  object 
 7   annual_inc            316824 non-null  float64
 8   verification_status   316824 non-null  object 
 9   dti                   316824 non-null  float64
 10  open_acc              316824 non-null  float64
 11  pub_rec               316824 non-null  float64
 12  revol_bal             316824 non-null  float64
 13  revol_util            316596 non-null  float64
 14  total_acc             316824 non-null  float64
 15  

In [69]:
# Fit the preprocessor on the training features and transform them in one
# call; the result is the preprocessed feature array.
input_feature_train_arr = preprocessing_obj.fit_transform(input_feature_train_df)

In [70]:
# Display the transformed training features (numpy abbreviates the repr of
# large arrays with `...`).
input_feature_train_arr

array([[ 1.30259488, -1.17161078,  1.42061718, ...,  0.        ,
        23.33322173,  0.        ],
       [-0.25273589, -1.28337356, -0.22478919, ...,  0.        ,
        23.33322173,  0.        ],
       [ 1.08724139,  0.08013229,  0.43051727, ...,  0.        ,
        23.33322173,  0.        ],
       ...,
       [ 0.17797109, -0.31774319,  0.35077794, ...,  0.        ,
        23.33322173,  0.        ],
       [ 1.24576549,  0.78423777,  1.77236373, ...,  0.        ,
        23.33322173,  0.        ],
       [ 0.1061866 ,  0.44224368, -0.27967728, ...,  0.        ,
        23.33322173,  0.        ]])

In [71]:
# Append the label values as the last column of the preprocessed features,
# giving one array with the target in the final position.
train_labels = target_feature_train_df.to_numpy()
train_arr = np.column_stack((input_feature_train_arr, train_labels))