In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Use seaborn's "whitegrid" style for every plot drawn after this point.
sns.set_style("whitegrid")
# Default figure size as (width, height) for figures that do not pass their own figsize.
plt.rcParams['figure.figsize'] = (12, 6)
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

## 1. Carga de Datos

In [None]:
# Directory holding the parquet files; the relative path is resolved against
# the current working directory of the kernel.
data_path = Path("../data/parquet")
df_train = pd.read_parquet(data_path.joinpath("application_train.parquet"))

# Report the frame's (rows, columns) and preview the first rows as the cell output.
print(f"Shape: {df_train.shape}")
df_train.head()

## 2. Análisis de la Variable Target

In [None]:
# Relative frequency of each TARGET value (normalize=True yields proportions).
target_dist = df_train['TARGET'].value_counts(normalize=True)
print("Distribución del TARGET:")
print(target_dist)

# Bar chart of the absolute count per TARGET value, with horizontal tick labels.
plt.figure(figsize=(8, 5))
ax = df_train['TARGET'].value_counts().plot.bar(rot=0)
ax.set(
    title='Distribución de la Variable Target',
    xlabel='Target (0: No Default, 1: Default)',
    ylabel='Frecuencia',
)
plt.show()

## 3. Análisis de Missing Values

In [None]:
# Percentage of null entries per column, restricted to columns that have at
# least one null, ordered from most to least incomplete.
missing = (
    df_train.isna()
    .sum()
    .div(len(df_train))
    .mul(100)
    .loc[lambda pct: pct > 0]
    .sort_values(ascending=False)
)

print(f"Columnas con valores faltantes: {len(missing)}")
print("\nTop 10 columnas con más missing values:")
print(missing.head(10))

## 4. Análisis de Tipos de Datos

In [None]:
# Split the column names by dtype family: anything numpy considers numeric
# versus object/category columns.
numeric_cols = df_train.select_dtypes(include=[np.number]).columns.to_list()
categorical_cols = df_train.select_dtypes(include=['object', 'category']).columns.to_list()

# How many columns there are of each dtype.
print("Tipos de datos:")
print(df_train.dtypes.value_counts())

print(f"\nColumnas numéricas: {len(numeric_cols)}")
print(f"Columnas categóricas: {len(categorical_cols)}")

## 5. Estadísticas Descriptivas

In [None]:
# Summary statistics of the training frame, shown as the cell output. With
# default arguments pandas' describe() reports only the numeric columns of a
# mixed-dtype frame.
df_train.describe()

## 6. Análisis de Correlaciones

In [None]:
# Rank the numeric features by the absolute value of their correlation with TARGET.
if 'TARGET' in df_train.columns:
    # numeric_cols was built from the whole frame, so a numeric TARGET is part of
    # it. Leave it out: correlating TARGET with itself yields a trivial 1.0 that
    # would otherwise occupy the first slot of the ranking and of the chart.
    feature_cols = [col for col in numeric_cols if col != 'TARGET']
    correlations = (
        df_train[feature_cols]
        .corrwith(df_train['TARGET'])
        .abs()  # rank by strength only; the sign of the relationship is dropped
        .sort_values(ascending=False)
    )
    print("Top 15 features correlacionadas con TARGET:")
    print(correlations.head(15))

    plt.figure(figsize=(10, 8))
    ax = correlations.head(20).plot(kind='barh')
    # barh draws the first row at the bottom; flip the axis so the strongest
    # correlation is the top bar.
    ax.invert_yaxis()
    plt.title('Correlación Absoluta con TARGET')
    plt.xlabel('Correlación Absoluta')
    plt.tight_layout()
    plt.show()