In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
credit_card = pd.read_csv('credit_card_balance.csv')
payments_train = pd.read_csv('payments_train.csv')
payments_test = pd.read_csv('payments_test.csv')

In [3]:
credit_card.shape, payments_train.shape, payments_test.shape

((3840312, 23), (128427, 49), (61503, 49))

In [4]:
credit_card.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,...,AMT_RECIVABLE,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,2562384,378907,-6,56.97,135000,0.0,877.5,0.0,877.5,1700.325,...,0.0,0.0,0.0,1,0.0,1.0,35.0,Active,0,0
1,2582071,363914,-1,63975.555,45000,2250.0,2250.0,0.0,0.0,2250.0,...,64875.555,64875.555,1.0,1,0.0,0.0,69.0,Active,0,0
2,1740877,371185,-7,31815.225,450000,0.0,0.0,0.0,0.0,2250.0,...,31460.085,31460.085,0.0,0,0.0,0.0,30.0,Active,0,0
3,1389973,337855,-4,236572.11,225000,2250.0,2250.0,0.0,0.0,11795.76,...,233048.97,233048.97,1.0,1,0.0,0.0,10.0,Active,0,0
4,1891521,126868,-1,453919.455,450000,0.0,11547.0,0.0,11547.0,22924.89,...,453919.455,453919.455,0.0,1,0.0,1.0,101.0,Active,0,0


In [5]:
payments_train.head()

Unnamed: 0,SK_ID_CURR,TARGET,CODE_GENDER,AMT_INCOME_TOTAL,AMT_ANNUITY_x,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_REGISTRATION,...,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,MONTHS_BALANCE,SK_DPD,SK_DPD_DEF,NUM_INSTALMENT_NUMBER
0,243191,0,F,171000.0,16366.5,Pensioner,Secondary / secondary special,0.035792,-23349,-3595.0,...,high,POS mobile with interest,-1694.714286,-1030.428571,51107.428571,51111.857143,-96.0,0.0,0.0,9.768237
1,111778,0,M,157500.0,23638.5,Working,Secondary / secondary special,0.010032,-10921,-4281.0,...,low_normal,POS household with interest,-1169.666667,-919.666667,-989.666667,-979.0,-56.0,0.4,0.4,4.727273
2,372147,0,M,164133.0,36787.5,Commercial associate,Secondary / secondary special,0.030755,-10703,-2618.0,...,other,POS mobile with interest,-925.714286,-510.0,103609.142857,103613.714286,-77.0,0.886792,0.0,9.768237
3,135529,0,F,405000.0,30631.5,Working,Higher education,0.010006,-16721,-1432.0,...,low_normal,POS industry with interest,-156.0,174.0,365243.0,365243.0,-56.0,0.0,0.0,3.5
4,277437,0,F,180000.0,24201.0,Commercial associate,Secondary / secondary special,0.0228,-17676,-472.0,...,other,Card X-Sell,-1199.5,-779.5,181765.0,181768.5,-67.0,0.0,0.0,6.8


In [6]:
payments_test.head()

Unnamed: 0,SK_ID_CURR,TARGET,CODE_GENDER,AMT_INCOME_TOTAL,AMT_ANNUITY_x,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_REGISTRATION,...,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,MONTHS_BALANCE,SK_DPD,SK_DPD_DEF,NUM_INSTALMENT_NUMBER
0,384575,0,M,207000.0,52641.0,Commercial associate,Secondary / secondary special,0.00963,-13297,-637.0,...,,,,,,,,,,
1,214010,0,F,247500.0,48946.5,Commercial associate,Higher education,0.006852,-14778,-1610.0,...,,,,,,,,,,
2,142232,0,F,202500.0,39109.5,Working,Secondary / secondary special,0.035792,-17907,-2507.0,...,,,,,,,-47.0,0.142857,0.0,
3,389171,0,F,247500.0,24939.0,State servant,Secondary / secondary special,0.04622,-19626,-11167.0,...,,,,,,,-66.0,73.32,0.0,6.923077
4,283617,0,M,112500.0,15862.5,Working,Secondary / secondary special,0.01885,-20327,-7299.0,...,,,,,,,,,,


# Dados Duplicados

In [39]:
print(credit_card.SK_ID_CURR.value_counts())
print(payments_train.SK_ID_CURR.value_counts())

SK_ID_CURR
186401    192
311118    178
120076    140
128827    129
191826    128
         ... 
430884      1
354403      1
327067      1
336149      1
382749      1
Name: count, Length: 103558, dtype: int64
SK_ID_CURR
243191    1
354068    1
168670    1
100897    1
167619    1
         ..
201979    1
356318    1
109269    1
327446    1
253016    1
Name: count, Length: 128427, dtype: int64


In [40]:
# Percebi que temos 103 mil IDs no conjunto credit_card, 25 mil clientes a menos
# Vamos descobrir quem são esses clientes que não pertencem ao conjunto original e adiciona-los para ampliação nos registros

In [45]:
# Supondo que `set1` e `set2` sejam os dois conjuntos de IDs
set1 = credit_card['SK_ID_CURR']  # primeiro conjunto
set2 = payments_train['SK_ID_CURR']  # segundo conjunto

# Encontrar IDs que estão em ambos os conjuntos
common_ids = set1[set1.isin(set2)]

In [48]:
# De todo o dataframe credit_card(3840312), 1474700 registros de clientes estão em ambos os dataframes
# No caso, são estes que eu preciso trabalhar
common_ids.shape

(1474700,)

In [50]:
# No entanto, não dá para acrescentar informações apenas a esses clientes, pois possuo 128 mil clientes
# Pois, a grande maioria ficaria com registros nulos, o que seria inviável
common_ids.value_counts()

SK_ID_CURR
186401    192
192917    126
449948    123
303407    123
407269    123
         ... 
432241      1
311917      1
316033      1
338359      1
382749      1
Name: count, Length: 38597, dtype: int64

In [55]:
# Vou acrescentar os clientes que não existem na base de treino e teste, assim terei um numero maior de clientes para o modelo
# Encontrar IDs que estão em set1 mas não em set2
unique_to_set1 = set1[~set1.isin(set2)]

# Se quiser, você pode exibir ou armazenar esses IDs
unique_to_set1.value_counts()

SK_ID_CURR
311118    178
120076    140
128827    129
432607    128
191826    128
         ... 
179659      1
453660      1
255124      1
256232      1
315041      1
Name: count, Length: 64961, dtype: int64

In [58]:
credit_card[credit_card['SK_ID_CURR'] == 311118].value_counts().sum()

177

In [56]:
payments_train[payments_train['SK_ID_CURR'] == 311118].value_counts()

Series([], Name: count, dtype: int64)

In [59]:
# Confirmado, estes clientes que existem no credit_card não existem no payments_train
# Portanto, posso acrescentar esses registros na base de dados original, aumentando minha lista de clientes

In [61]:
# Filtrando os registros que estão em unique_to_set1
registros_unicos_set1 = credit_card[credit_card['SK_ID_CURR'].isin(unique_to_set1)]

# Exibindo os registros
registros_unicos_set1.SK_ID_CURR.value_counts()

SK_ID_CURR
311118    178
120076    140
128827    129
432607    128
191826    128
         ... 
179659      1
453660      1
255124      1
256232      1
315041      1
Name: count, Length: 64961, dtype: int64

In [65]:
registros_unicos_set1.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,...,AMT_RECIVABLE,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,2562384,378907,-6,56.97,135000,0.0,877.5,0.0,877.5,1700.325,...,0.0,0.0,0.0,1,0.0,1.0,35.0,Active,0,0
2,1740877,371185,-7,31815.225,450000,0.0,0.0,0.0,0.0,2250.0,...,31460.085,31460.085,0.0,0,0.0,0.0,30.0,Active,0,0
4,1891521,126868,-1,453919.455,450000,0.0,11547.0,0.0,11547.0,22924.89,...,453919.455,453919.455,0.0,1,0.0,1.0,101.0,Active,0,0
5,2646502,380010,-7,82903.815,270000,0.0,0.0,0.0,0.0,4449.105,...,82773.315,82773.315,0.0,0,0.0,0.0,2.0,Active,7,0
6,1079071,171320,-6,353451.645,585000,67500.0,67500.0,0.0,0.0,14684.175,...,351881.145,351881.145,1.0,1,0.0,0.0,6.0,Active,0,0


In [63]:
registros_unicos_set1[registros_unicos_set1['SK_ID_CURR'] == 311118].value_counts().sum()

177

In [64]:
payments_train[payments_train['SK_ID_CURR'] == 311118].value_counts()

Series([], Name: count, dtype: int64)

In [66]:
registros_unicos_set1.columns

Index(['SK_ID_PREV', 'SK_ID_CURR', 'MONTHS_BALANCE', 'AMT_BALANCE',
       'AMT_CREDIT_LIMIT_ACTUAL', 'AMT_DRAWINGS_ATM_CURRENT',
       'AMT_DRAWINGS_CURRENT', 'AMT_DRAWINGS_OTHER_CURRENT',
       'AMT_DRAWINGS_POS_CURRENT', 'AMT_INST_MIN_REGULARITY',
       'AMT_PAYMENT_CURRENT', 'AMT_PAYMENT_TOTAL_CURRENT',
       'AMT_RECEIVABLE_PRINCIPAL', 'AMT_RECIVABLE', 'AMT_TOTAL_RECEIVABLE',
       'CNT_DRAWINGS_ATM_CURRENT', 'CNT_DRAWINGS_CURRENT',
       'CNT_DRAWINGS_OTHER_CURRENT', 'CNT_DRAWINGS_POS_CURRENT',
       'CNT_INSTALMENT_MATURE_CUM', 'NAME_CONTRACT_STATUS', 'SK_DPD',
       'SK_DPD_DEF'],
      dtype='object')

In [67]:
# Função para calcular a moda
def mode_func(x):
    return x.mode().iloc[0] if not x.empty else None

# Supondo que o DataFrame já esteja carregado
# payments = pd.read_csv('seu_arquivo.csv')

# Definindo o dicionário com as funções de agregação por coluna
agg_dict = {
    'SK_ID_PREV': 'first',  
    'MONTHS_BALANCE': 'mean',
    'AMT_BALANCE': 'mean',
    'AMT_CREDIT_LIMIT_ACTUAL': 'mean',
    'AMT_DRAWINGS_ATM_CURRENT': 'mean',
    'AMT_DRAWINGS_CURRENT': 'mean',
    'AMT_DRAWINGS_OTHER_CURRENT': 'mean',
    'AMT_DRAWINGS_POS_CURRENT': 'mean',
    'AMT_INST_MIN_REGULARITY': 'mean',
    'AMT_PAYMENT_CURRENT': 'mean',
    'AMT_PAYMENT_TOTAL_CURRENT': 'mean',
    'AMT_RECEIVABLE_PRINCIPAL': 'mean',
    'AMT_RECIVABLE': 'mean',
    'AMT_TOTAL_RECEIVABLE': 'mean',
    'CNT_DRAWINGS_ATM_CURRENT': 'mean',
    'CNT_DRAWINGS_CURRENT': 'mean',
    'CNT_DRAWINGS_OTHER_CURRENT': 'mean',
    'CNT_DRAWINGS_POS_CURRENT': 'mean',
    'CNT_INSTALMENT_MATURE_CUM': 'mean',
    'NAME_CONTRACT_STATUS': mode_func,  
    'SK_DPD': 'mean',
    'SK_DPD_DEF': 'mean'
}

# Aplicando a agregação por grupo (SK_ID_CURR)
credit_card_agg = registros_unicos_set1.groupby('SK_ID_CURR').agg(agg_dict).reset_index()

In [69]:
credit_card_agg.shape

(64961, 23)

In [68]:
credit_card_agg.head()

Unnamed: 0,SK_ID_CURR,SK_ID_PREV,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,...,AMT_RECIVABLE,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,100006,1489396,-3.5,0.0,270000.0,,0.0,,,0.0,...,0.0,0.0,,0.0,,,0.0,Active,0.0,0.0
1,100013,2038692,-48.5,18159.919219,131718.75,6350.0,5953.125,0.0,0.0,1454.539551,...,18101.079844,18101.079844,0.255556,0.239583,0.0,0.0,18.719101,Active,0.010417,0.010417
2,100021,2594025,-10.0,0.0,675000.0,,0.0,,,0.0,...,0.0,0.0,,0.0,,,0.0,Completed,0.0,0.0
3,100023,1499902,-7.5,0.0,135000.0,,0.0,,,0.0,...,0.0,0.0,,0.0,,,0.0,Active,0.0,0.0
4,100028,1914954,-25.0,8085.058163,225000.0,613.636364,6156.400408,0.0,6242.355,6133.363929,...,7968.609184,7968.609184,0.045455,2.387755,0.0,2.613636,19.547619,Active,0.0,0.0


In [73]:
credit_card[credit_card['SK_ID_CURR'] == 311118].describe()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,...,AMT_RECEIVABLE_PRINCIPAL,AMT_RECIVABLE,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,SK_DPD,SK_DPD_DEF
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,177.0,...,178.0,178.0,178.0,178.0,178.0,178.0,178.0,177.0,178.0,178.0
mean,2091871.0,311118.0,-46.202247,24869.501039,85449.438202,1182.640449,1182.640449,0.0,0.0,1317.489661,...,23889.907416,24705.948287,24705.948287,0.123596,0.123596,0.0,0.0,14.768362,0.0,0.0
std,503474.9,0.0,26.111097,56789.557996,75229.462349,8010.566983,8010.566983,0.0,0.0,2869.999835,...,54608.184952,56863.608006,56863.608006,0.482934,0.482934,0.0,0.0,7.064437,0.0,0.0
min,1622591.0,311118.0,-96.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-2667.285,-2667.285,0.0,0.0,0.0,0.0,1.0,0.0,0.0
25%,1622591.0,311118.0,-68.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0
50%,1622591.0,311118.0,-46.0,0.0,135000.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0
75%,2628998.0,311118.0,-24.0,0.0,157500.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.0,0.0,0.0
max,2628998.0,311118.0,-2.0,163601.505,157500.0,94500.0,94500.0,0.0,0.0,7875.0,...,156393.54,163601.505,163601.505,5.0,5.0,0.0,0.0,24.0,0.0,0.0


In [75]:
# Perfeito
credit_card_agg[credit_card_agg['SK_ID_CURR'] == 311118]

Unnamed: 0,SK_ID_CURR,SK_ID_PREV,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,...,AMT_RECIVABLE,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
38407,311118,2628998,-46.202247,24869.501039,85449.438202,1182.640449,1182.640449,0.0,0.0,1317.489661,...,24705.948287,24705.948287,0.123596,0.123596,0.0,0.0,14.768362,Active,0.0,0.0


In [76]:
# Deu certo, os nulos não são exorbitantes
credit_card_agg.isnull().sum()

SK_ID_CURR                        0
SK_ID_PREV                        0
MONTHS_BALANCE                    0
AMT_BALANCE                       0
AMT_CREDIT_LIMIT_ACTUAL           0
AMT_DRAWINGS_ATM_CURRENT      19517
AMT_DRAWINGS_CURRENT              0
AMT_DRAWINGS_OTHER_CURRENT    19517
AMT_DRAWINGS_POS_CURRENT      19517
AMT_INST_MIN_REGULARITY           0
AMT_PAYMENT_CURRENT           19608
AMT_PAYMENT_TOTAL_CURRENT         0
AMT_RECEIVABLE_PRINCIPAL          0
AMT_RECIVABLE                     0
AMT_TOTAL_RECEIVABLE              0
CNT_DRAWINGS_ATM_CURRENT      19517
CNT_DRAWINGS_CURRENT              0
CNT_DRAWINGS_OTHER_CURRENT    19517
CNT_DRAWINGS_POS_CURRENT      19517
CNT_INSTALMENT_MATURE_CUM         0
NAME_CONTRACT_STATUS              0
SK_DPD                            0
SK_DPD_DEF                        0
dtype: int64

In [83]:
payments_train.columns

Index(['SK_ID_CURR', 'TARGET', 'CODE_GENDER', 'AMT_INCOME_TOTAL',
       'AMT_ANNUITY_x', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
       'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_REGISTRATION',
       'DAYS_ID_PUBLISH', 'FLAG_EMP_PHONE', 'REG_CITY_NOT_WORK_CITY',
       'ORGANIZATION_TYPE', 'FLAG_DOCUMENT_3', 'EXT_SOURCE_MEAN',
       'CREDIT_ACTIVE', 'DAYS_CREDIT', 'CREDIT_DAY_OVERDUE',
       'DAYS_CREDIT_ENDDATE', 'DAYS_ENDDATE_FACT', 'AMT_CREDIT_SUM',
       'AMT_CREDIT_SUM_DEBT', 'AMT_CREDIT_SUM_LIMIT', 'DAYS_CREDIT_UPDATE',
       'SK_ID_PREV', 'NAME_CONTRACT_TYPE', 'AMT_ANNUITY_y', 'AMT_CREDIT_y',
       'AMT_GOODS_PRICE_y', 'HOUR_APPR_PROCESS_START',
       'NAME_CONTRACT_STATUS_x', 'DAYS_DECISION', 'CODE_REJECT_REASON',
       'NAME_GOODS_CATEGORY', 'CHANNEL_TYPE', 'SELLERPLACE_AREA',
       'NAME_SELLER_INDUSTRY', 'CNT_PAYMENT', 'NAME_YIELD_GROUP',
       'PRODUCT_COMBINATION', 'DAYS_FIRST_DUE', 'DAYS_LAST_DUE_1ST_VERSION',
       'DAYS_LAST_DUE', 'DAYS_TERMINATION

In [84]:
credit_card_agg.columns

Index(['SK_ID_CURR', 'SK_ID_PREV', 'MONTHS_BALANCE', 'AMT_BALANCE',
       'AMT_CREDIT_LIMIT_ACTUAL', 'AMT_DRAWINGS_ATM_CURRENT',
       'AMT_DRAWINGS_CURRENT', 'AMT_DRAWINGS_OTHER_CURRENT',
       'AMT_DRAWINGS_POS_CURRENT', 'AMT_INST_MIN_REGULARITY',
       'AMT_PAYMENT_CURRENT', 'AMT_PAYMENT_TOTAL_CURRENT',
       'AMT_RECEIVABLE_PRINCIPAL', 'AMT_RECIVABLE', 'AMT_TOTAL_RECEIVABLE',
       'CNT_DRAWINGS_ATM_CURRENT', 'CNT_DRAWINGS_CURRENT',
       'CNT_DRAWINGS_OTHER_CURRENT', 'CNT_DRAWINGS_POS_CURRENT',
       'CNT_INSTALMENT_MATURE_CUM', 'NAME_CONTRACT_STATUS', 'SK_DPD',
       'SK_DPD_DEF'],
      dtype='object')

Justificativas para o não uso desse dataframe:

1. Não é possível concatenar ou fazer um merge entre payments_train e credit_card devido aos erros cometidos nos processos anteriores.
2. A princípio parecia lógico agregar os valores de cada ID dos datasets e obter um unico valor baseado na média, porém, existia outras opções de trabalhar com grande base de dados, mas optei por essa.
3. Continuando o erro, no tratamento de dados nulos, optei pela a exclusão das linhas esquecendo que cada linha era um cliente, resultando no não uso do credit_card para o modelo, poderia trata-los com a média ou escolher outras opções que não fossem eliminar esses registros importantes.
4. Talvez faltou uma melhor compreensão de cada dataset e o que cada um representava para application_train, devido a maior tempo ao estudo do dataset principal, faltou um estudo mais aprofundado desses registros.
5. Com base nos fatos abordados, o processo será o uso do dataset payments_train e payments_test.