In [None]:
import math
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from scipy.stats import gaussian_kde
from plotly.subplots import make_subplots

In [3]:
# Load the transformed finance dataset; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="../data/transformed/transformed_finance_data.csv")

In [4]:
# Print the frame summary: index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32424 entries, 0 to 32423
Data columns (total 35 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   user_id                      32424 non-null  object 
 1   age                          32424 non-null  int64  
 2   gender                       32424 non-null  object 
 3   education_level              32424 non-null  object 
 4   employment_status            32424 non-null  object 
 5   job_title                    32424 non-null  object 
 6   monthly_income_usd           32424 non-null  float64
 7   monthly_expenses_usd         32424 non-null  float64
 8   savings_usd                  32424 non-null  float64
 9   has_loan                     32424 non-null  object 
 10  loan_type                    12995 non-null  object 
 11  loan_amount_usd              32424 non-null  float64
 12  loan_term_months             32424 non-null  int64  
 13  monthly_emi_usd 

##### Análise Univariada

In [5]:
# Split the columns by dtype. `select_dtypes` keeps the frame's column order.
# "number" matches every numeric width (int32, float32, nullable ints, ...),
# not only int64/float64, so no numeric column is silently left out.
categorical_cols = df.select_dtypes(include="object").columns.tolist()
numeric_cols = df.select_dtypes(include="number").columns.tolist()

In [13]:
# Exclude columns that should not be plotted as categories. Filtering the list
# (instead of round-tripping through a set) keeps the original column order,
# so the subplot order is the same on every kernel restart.
categorical_cols = [
    col for col in categorical_cols if col not in {"user_id", "record_date"}
]

In [7]:
def plot_multiple_categorical(df, cols, cols_per_row=3):
    """Draw one bar chart per categorical column on a shared subplot grid.

    Each subplot shows how often every distinct value of the column occurs.
    Bars are labelled with the absolute count and the share of the column's
    non-null rows: ``value_counts`` drops missing values by default, so rows
    with NaN are not part of the percentages.

    Args:
        df: DataFrame holding the columns to plot.
        cols: Column names in ``df``; one subplot is drawn per name, in the
            given order.
        cols_per_row: Number of subplots per grid row (default 3).

    Raises:
        ValueError: If ``cols`` is empty or ``cols_per_row`` is not positive.
    """
    cols = list(cols)
    if not cols:
        raise ValueError("Nenhuma coluna categórica para plotar.")
    if cols_per_row < 1:
        raise ValueError("cols_per_row deve ser maior ou igual a 1.")

    rows = math.ceil(len(cols) / cols_per_row)

    # make_subplots rejects spacing above 1 / (n - 1). Keep the original
    # spacing for small grids and shrink it for large ones so the gaps never
    # use more than half of the figure.
    vspace = 0.12 if rows == 1 else min(0.12, 0.5 / (rows - 1))
    hspace = 0.08 if cols_per_row == 1 else min(0.08, 0.5 / (cols_per_row - 1))

    fig = make_subplots(
        rows=rows,
        cols=cols_per_row,
        subplot_titles=[col.replace("_", " ").title() for col in cols],
        horizontal_spacing=hspace,
        vertical_spacing=vspace,
    )

    palette = px.colors.qualitative.Bold

    for i, col in enumerate(cols):
        grid_row, grid_col = divmod(i, cols_per_row)

        # Work on the value_counts Series directly; this avoids building a
        # temporary frame whose "count" column could clash with a plotted
        # column that happens to be named "count".
        freq = df[col].value_counts()
        share = freq / freq.sum() * 100

        fig.add_trace(
            go.Bar(
                x=freq.index,
                y=freq.values,
                text=[f"{c} ({p:.1f}%)" for c, p in zip(freq.values, share.values)],
                textposition="auto",
                # Cycle through the palette when there are more columns than colours.
                marker_color=palette[i % len(palette)],
                name=col.replace("_", " ").title(),
            ),
            row=grid_row + 1,
            col=grid_col + 1,
        )

    fig.update_layout(
        height=400 * rows,
        # 700 px per grid column reproduces the original 2100 px at the default of 3.
        width=700 * cols_per_row,
        title_text="Distribuição de Variáveis Categóricas",
        title_x=0.5,
        title_font=dict(size=24),
        legend_title="Variável",
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=-0.2,
            xanchor="center",
            x=0.5,
        ),
        margin=dict(l=50, r=50, t=80, b=50),
    )

    fig.show()

In [None]:
def plot_multiple_numerical(df, cols=None, cols_per_row=3, bins=80):
    """Plot a histogram with a KDE overlay for each numeric column.

    Columns are laid out on a grid with ``cols_per_row`` subplots per row and
    each subplot title reports the column mean and median.

    The histogram is binned here with ``numpy.histogram`` rather than by
    passing raw values to ``go.Histogram(nbinsx=...)``. Plotly treats
    ``nbinsx`` only as an upper bound and chooses its own bin size, so a
    density curve scaled by an assumed ``(max - min) / bins`` width would not
    line up with the bars. Binning locally makes the bar width known exactly.

    Args:
        df: DataFrame holding the data.
        cols: Column names to plot. Defaults to every numeric column of
            ``df``. Columns with fewer than two distinct non-null values are
            skipped.
        cols_per_row: Number of subplots per grid row.
        bins: Number of equal-width histogram bins. A non-positive value lets
            NumPy choose the bin count (``bins="auto"``).

    Raises:
        ValueError: If ``cols_per_row`` is not positive or no column is left
            to plot.
    """
    if cols_per_row < 1:
        raise ValueError("cols_per_row deve ser maior ou igual a 1.")

    if cols is None:
        cols = df.select_dtypes(include=[np.number]).columns.tolist()
    # A constant column has no distribution to show and makes the KDE singular.
    cols = [c for c in cols if df[c].nunique(dropna=True) > 1]

    n = len(cols)
    if n == 0:
        raise ValueError("Nenhuma coluna numérica válida para plotar.")

    rows = math.ceil(n / cols_per_row)

    # make_subplots rejects spacing above 1 / (n - 1), and spacing equal to
    # that limit leaves zero-height plots. Cap the gaps at half of the figure.
    vspace = 0.12 if rows == 1 else min(0.12, 0.5 / (rows - 1))
    hspace = 0.07 if cols_per_row == 1 else min(0.07, 0.5 / (cols_per_row - 1))

    titles = [
        f"{c.replace('_',' ').title()}<br>Média: {df[c].mean():.2f} | Mediana: {df[c].median():.2f}"
        for c in cols
    ]
    # Pad with empty titles for the unused cells of the last grid row.
    titles += [""] * (rows * cols_per_row - len(titles))

    fig = make_subplots(
        rows=rows,
        cols=cols_per_row,
        subplot_titles=tuple(titles),
        horizontal_spacing=hspace,
        vertical_spacing=vspace,
    )

    for i, c in enumerate(cols):
        grid_row, grid_col = divmod(i, cols_per_row)

        values = df[c].dropna().to_numpy(dtype=float)
        # Neither the histogram range nor the KDE can handle +/-inf.
        values = values[np.isfinite(values)]
        if values.size == 0:
            continue

        counts, edges = np.histogram(values, bins=bins if bins > 0 else "auto")
        bin_width = edges[1] - edges[0]
        centers = (edges[:-1] + edges[1:]) / 2

        fig.add_trace(
            go.Bar(
                x=centers,
                y=counts,
                width=bin_width,
                opacity=0.75,
                name=c,
                showlegend=False,
            ),
            row=grid_row + 1,
            col=grid_col + 1,
        )

        lo, hi = values.min(), values.max()
        if values.size > 1 and lo < hi:
            kde = gaussian_kde(values)
            xs = np.linspace(lo, hi, 200)
            # density * N * bin width converts the density to expected counts
            # per bin, i.e. the same scale as the bars above.
            ys = kde(xs) * values.size * bin_width
            fig.add_trace(
                go.Scatter(x=xs, y=ys, mode="lines", name=f"{c} density", showlegend=False),
                row=grid_row + 1,
                col=grid_col + 1,
            )

    fig.update_layout(
        height=max(380 * rows, 500),
        width=360 * cols_per_row + 360,
        title_text="Distribuições de Variáveis Numéricas",
        title_x=0.5,
        margin=dict(l=40, r=40, t=70, b=40),
        # Bars already carry their exact width; do not let Plotly group or gap them.
        barmode="overlay",
        bargap=0,
    )
    fig.show()


In [14]:
# Render the bar-chart grid for the categorical columns selected earlier.
plot_multiple_categorical(df=df, cols=categorical_cols)