# Análise Exploratória dos Dados (EDA)

Este notebook apresenta a análise exploratória das bases de dados do desafio de análise
de crédito, com foco na compreensão das variáveis, distribuição da variável alvo e
qualidade dos dados.

Os dados utilizados neste notebook foram previamente ingeridos e armazenados em
formato Parquet pelo notebook `00_data_ingestion.ipynb`.


In [6]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "whitegrid" theme so that plots created after this call share one style.
sns.set_theme(style="whitegrid")

In [7]:
# The project root is taken to be the parent of the current working directory.
# NOTE(review): this presumably expects the kernel to start inside a sub-folder
# of the repository (e.g. a notebooks directory) - confirm before running elsewhere.
PROJECT_ROOT = Path.cwd().parent

# Folder holding the Parquet files read by the loading cell.
DATA_PROCESSED_DIR = PROJECT_ROOT.joinpath("data", "processed")

# Echo the resolved location so the reader can check it.
DATA_PROCESSED_DIR

WindowsPath('c:/Users/ggrinspun/Github/desafio-analise-de-credito/data/processed')

In [8]:
# Read the three Parquet splits from the processed-data folder, in the same
# order as before: train, test, oot.
df_train, df_test, df_oot = (
    pd.read_parquet(DATA_PROCESSED_DIR / parquet_name)
    for parquet_name in ("train.parquet", "test.parquet", "oot.parquet")
)

# Report (rows, columns) for each split; labels are padded so the shapes line up.
for split_label, split_frame in (
    ("Train:", df_train),
    ("Test :", df_test),
    ("OOT  :", df_oot),
):
    print(split_label, split_frame.shape)

Train: (120750, 151)
Test : (51751, 151)
OOT  : (91965, 150)


In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) of the training frame.
# The frame is wide, so the rendered table is abbreviated with "..." and some
# columns are not shown in the output.
df_train.describe()

Unnamed: 0,TARGET,IDADE,VAR6,VAR7,VAR11,VAR12,VAR13,VAR14,VAR15,VAR16,...,VAR41,VAR42,VAR44,VAR46,VAR47,VAR141,VAR145,VAR146,VAR147,ID
count,120750.0,107040.0,117394.0,117394.0,74488.0,65724.0,15530.0,95197.0,58269.0,44981.0,...,117811.0,107048.0,401.0,294.0,120750.0,120750.0,679.0,168.0,120750.0,120750.0
mean,0.245027,42.125255,-14.411389,-45.90348,0.235917,0.290241,0.31385,0.241245,0.185754,2.044374,...,0.631175,0.504619,1691.738429,6984.218469,0.256543,1854.833006,4018.743785,1942.649762,101.841656,165324.864199
std,0.430105,15.198476,8.995077,7.529788,0.625609,0.308937,0.241229,0.262687,0.19532,3.170869,...,0.222607,0.069614,2177.830516,4621.964093,0.406746,893.999792,3700.836248,3143.75785,0.540016,95488.44232
min,0.0,18.014,-33.521563,-72.900276,-4.0,0.0,0.0,0.0,0.0,0.083333,...,0.0,0.006658,0.0,0.0,0.0,0.0,0.0,0.0,100.0,3.0
25%,0.0,30.05725,-22.842778,-49.903564,0.0,0.055,0.056,0.105,0.078,0.25,...,0.440572,0.463579,642.0,3888.995,0.0,1513.2274,1633.195,0.0,102.0,82727.25
50%,0.0,39.867,-13.01059,-46.574908,0.0,0.159,0.318,0.14,0.112,0.833333,...,0.542298,0.496036,769.0,5386.31,0.003,1627.157652,3024.48,935.12,102.0,165298.0
75%,0.0,52.997,-6.357067,-39.023621,0.0,0.505,0.507,0.168,0.153,2.416667,...,0.898052,0.522576,1747.5,9601.39,0.421,1820.670284,5217.67,2260.125,102.0,248248.0
max,1.0,105.477,4.602823,-32.429516,4.0,1.5,1.0,1.7,1.0,15.999999,...,1.0,0.910865,17374.94,26523.92,1.0,33954.14,33954.14,17229.2,102.0,330581.0


## Análise de dados ausentes

In [19]:
# Percentage of missing values per training column, highest first.
# The mean of the boolean null mask is the fraction of nulls; * 100 turns it into a percentage.
missing_df = (
    (df_train.isna().mean() * 100)
    .sort_values(ascending=False)
    .rename("missing_pct")
    .to_frame()
)

# Show the ten columns with the most missing data.
missing_df.head(10)


Unnamed: 0,missing_pct
VAR146,99.86087
VAR4,99.832712
VAR46,99.756522
VAR45,99.756522
VAR44,99.667909
VAR43,99.667909
VAR145,99.437681
VAR38,99.363147
VAR26,99.165217
VAR27,99.13706


In [23]:
# Show the missing-percentage table through Jupyter's rich display (a bare
# trailing expression) instead of print(), which falls back to plain text.
# Long frames are still abbreviated by pandas' display options, so both the
# most-missing and least-missing ends of the sorted table stay visible.
missing_df

        missing_pct
VAR146    99.860870
VAR4      99.832712
VAR46     99.756522
VAR45     99.756522
VAR44     99.667909
...             ...
VAR141     0.000000
VAR147     0.000000
VAR148     0.000000
VAR149     0.000000
ID         0.000000

[151 rows x 1 columns]


In [24]:
# Missing-value percentage per training column together with the column dtype,
# ordered from most to least missing.
missing_info = (
    (df_train.isna().mean() * 100)
    .rename("missing_pct")
    .to_frame()
    # dtypes is indexed by column name, so it aligns row-by-row with the percentages.
    .assign(dtype=df_train.dtypes)
    .sort_values(by="missing_pct", ascending=False)
)

# Show the ten columns with the most missing data and their dtypes.
missing_info.head(10)


Unnamed: 0,missing_pct,dtype
VAR146,99.86087,float64
VAR4,99.832712,object
VAR46,99.756522,float64
VAR45,99.756522,object
VAR44,99.667909,float64
VAR43,99.667909,object
VAR145,99.437681,float64
VAR38,99.363147,object
VAR26,99.165217,float64
VAR27,99.13706,float64


In [22]:
# Percentage of missing values per column in the test split.
missing_test = (
    (df_test.isna().mean() * 100)
    .rename("missing_pct_test")
    .to_frame()
)

# Put train and test percentages side by side (joined on column name) and add
# the absolute gap between them.
missing_compare = missing_df.join(missing_test).assign(
    diff_pct=lambda pct: (pct["missing_pct_test"] - pct["missing_pct"]).abs()
)

# Columns whose missing rate differs most between train and test.
missing_compare.sort_values(by="diff_pct", ascending=False).head(10)


Unnamed: 0,missing_pct,missing_pct_test,diff_pct
VAR12,45.570186,46.012637,0.442451
VAR22,51.69441,51.313018,0.381392
VAR16,62.748654,63.119553,0.370899
VAR19,51.744099,51.38065,0.36345
VAR25,51.744099,51.38065,0.36345
VAR15,51.744099,51.38065,0.36345
VAR32,65.174327,65.469266,0.294939
VAR11,38.312215,38.034048,0.278168
VAR33,92.860455,92.672605,0.18785
VAR34,92.860455,92.672605,0.18785
