### Pré-processamento

In [1]:
import sys
import os
import pickle
from pathlib import Path

# Caminho da raiz do projeto
ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), ".."))
if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)

In [2]:
import pandas as pd
from src.utils.paths import data_path
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import joblib

In [3]:
csv_file = data_path("dados_novos.csv", "interim")
dados = pd.read_csv(csv_file)

### Tratamento de Outlier

Será realizado um tratamento de dados para deteccção e remoção de outliers. A técnica utilizada será o IQR (Intervalo interquartil) onde define os quartis do conjunto de dados e cria dois limites, um superior e inferior, o que tiver acima ou abaixo deles, será considerado outlier.

In [4]:
dados.head()

Unnamed: 0,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,faixa_etaria
0,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3,20-29
1,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2,20-29
2,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3,20-29
3,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2,20-29
4,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4,20-29


In [5]:
dados.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32581 entries, 0 to 32580
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_income               32581 non-null  int64  
 1   person_home_ownership       32581 non-null  object 
 2   person_emp_length           31686 non-null  float64
 3   loan_intent                 32581 non-null  object 
 4   loan_grade                  32581 non-null  object 
 5   loan_amnt                   32581 non-null  int64  
 6   loan_int_rate               29465 non-null  float64
 7   loan_status                 32581 non-null  int64  
 8   loan_percent_income         32581 non-null  float64
 9   cb_person_default_on_file   32581 non-null  object 
 10  cb_person_cred_hist_length  32581 non-null  int64  
 11  faixa_etaria                32576 non-null  object 
dtypes: float64(3), int64(4), object(5)
memory usage: 3.0+ MB


In [6]:
# Separando dados categóricos e numéricos
num_features = []
cat_features = []

for col in dados.columns:
    if ((dados[col].dtype == 'int64') | (dados[col].dtype == 'float64')) & (col != 'loan_status'):
        num_features.append(col)
    elif (dados[col].dtype == 'object') & (col != 'loan_status'):
        cat_features.append(col)

print(num_features)
print(cat_features)

['person_income', 'person_emp_length', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length']
['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file', 'faixa_etaria']


In [7]:
def remove_outliers(df, cols):
    df_clean = df.copy()
    outliers_count = {}

    for col in cols:
        Q1 = df_clean[col].quantile(0.25)
        Q3 = df_clean[col].quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR

        # Conta quantos valores estão fora do intervalo
        outliers = df_clean[(df_clean[col] < lower) | (df_clean[col] > upper)]
        outliers_count[col] = len(outliers)

        # Mantém apenas os valores dentro do intervalo
        df_clean = df_clean[(df_clean[col] >= lower) & (df_clean[col] <= upper)]

    return df_clean, outliers_count

dados_new, total_outliers = remove_outliers(dados, num_features)

In [8]:
total_outliers

{'person_income': 1484,
 'person_emp_length': 759,
 'loan_amnt': 1287,
 'loan_int_rate': 10,
 'loan_percent_income': 639,
 'cb_person_cred_hist_length': 740}

In [9]:
dados_new.head()

Unnamed: 0,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,faixa_etaria
1,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2,20-29
5,9900,OWN,2.0,VENTURE,A,2500,7.14,1,0.25,N,2,20-29
9,10000,OWN,6.0,VENTURE,D,1600,14.74,1,0.16,N,3,20-29
19,10800,MORTGAGE,8.0,EDUCATION,B,1750,10.99,1,0.16,N,2,20-29
23,10980,OWN,0.0,PERSONAL,A,1500,7.29,0,0.14,N,3,20-29


In [20]:
dados_new.columns

Index(['person_income', 'person_home_ownership', 'person_emp_length',
       'loan_intent', 'loan_grade', 'loan_amnt', 'loan_int_rate',
       'loan_status', 'loan_percent_income', 'cb_person_default_on_file',
       'cb_person_cred_hist_length', 'faixa_etaria'],
      dtype='object')

### Pipeline de Pré-processamento

Será aplicado inputação nos dados faltantes com o valor da mediana daqueles dados, essa abordagem trás uma maior segurança que a média, pois ela pode ser facilmente distorcida por valores maiores.



In [10]:
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ]
)

categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OrdinalEncoder())
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_features),
        ("cat", categorical_transformer, cat_features),
    ]
)

In [11]:
cat_features

['person_home_ownership',
 'loan_intent',
 'loan_grade',
 'cb_person_default_on_file',
 'faixa_etaria']

In [12]:
# Fazendo a separação da variável alvo das demais
X = dados_new.drop('loan_status', axis = 1)
y = dados_new['loan_status']

In [None]:
X_preprocessed = preprocessor.fit_transform(X)

In [22]:
X_preprocessed

array([[-1.8311246 ,  0.19173156, -1.53322885, ...,  1.        ,
         0.        ,  0.        ],
       [-1.81982882, -0.69478743, -1.22286587, ...,  0.        ,
         0.        ,  0.        ],
       [-1.81606356,  0.48723789, -1.40908365, ...,  3.        ,
         0.        ,  0.        ],
       ...,
       [-0.53587555, -0.10377477, -0.08486828, ...,  3.        ,
         0.        ,  1.        ],
       [ 1.19614353,  1.96476954, -0.91250289, ...,  0.        ,
         0.        ,  1.        ],
       [ 1.94919531,  0.19173156,  1.57040095, ...,  1.        ,
         0.        ,  1.        ]])

In [None]:
num_cols = num_features
cat_cols = preprocessor.named_transformers_['cat'] \
    .named_steps['encoder'] \
    .get_feature_names_out(cat_features)

all_cols = list(num_cols) + list(cat_cols)
X_transformed = pd.DataFrame(X_preprocessed.toarray() if hasattr(X_preprocessed, "toarray") else X_preprocessed,
                             columns=all_cols)

In [21]:
X_transformed.columns

Index(['person_income', 'person_emp_length', 'loan_amnt', 'loan_int_rate',
       'loan_percent_income', 'cb_person_cred_hist_length',
       'person_home_ownership', 'loan_intent', 'loan_grade',
       'cb_person_default_on_file', 'faixa_etaria'],
      dtype='object')

In [15]:
# Separação de treino e teste
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.3, random_state=42, stratify=y)

In [16]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((16843, 11), (7219, 11), (16843,), (7219,))

In [17]:
import pickle

scalers_dir = data_path("", "scalers")
scalers_dir.mkdir(parents=True, exist_ok=True)
with open(scalers_dir / "preprocessor.pkl", "wb") as f:
    pickle.dump(preprocessor, f)

In [18]:
X_train.columns

Index(['person_income', 'person_emp_length', 'loan_amnt', 'loan_int_rate',
       'loan_percent_income', 'cb_person_cred_hist_length',
       'person_home_ownership', 'loan_intent', 'loan_grade',
       'cb_person_default_on_file', 'faixa_etaria'],
      dtype='object')

In [19]:
# Salvar datasets pré-processados em "processed"
X_train.to_pickle(data_path("X_train.pkl", "processed"))
X_test.to_pickle(data_path("X_test.pkl", "processed"))
y_train.to_pickle(data_path("y_train.pkl", "processed"))
y_test.to_pickle(data_path("y_test.pkl", "processed"))