In [14]:
# Import the necessary libraries
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [15]:
# Read the data.
# The CSV location can be overridden with the BIKE_SHARING_CSV environment
# variable so the notebook is not tied to a single machine; when the variable
# is not set, the original local path is used unchanged.
file_path = os.environ.get(
    "BIKE_SHARING_CSV",
    'C:\\Users\\cesar\\mlops-equipo-50\\data\\interim\\bike_sharing_cleaned.csv',
)

df = pd.read_csv(file_path)

In [16]:
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,2011-01-01,1,0,1,0.0,0.0,6,0,1.0,0.24,0.2879,0.81,0.0,3,13,16
1,2011-01-01,1,0,1,1.0,0.0,6,0,1.0,0.22,0.2727,0.8,0.0,8,32,40
2,2011-01-01,1,0,1,2.0,0.0,6,0,1.0,0.22,0.2727,0.8,0.0,5,27,32
3,2011-01-01,1,0,1,3.0,0.0,6,0,1.0,0.24,0.2879,0.75,0.0,3,10,13
4,2011-01-01,1,0,1,4.0,0.0,6,0,1.0,0.24,0.2879,0.75,0.0,0,1,1


In [53]:
def df_overview(df: pd.DataFrame, round_ndigits: int = 4) -> pd.DataFrame:
    """
    Build a one-row, high-level summary of a DataFrame.

    The summary contains:
    - ``n_rows`` / ``n_cols``: dimensions of ``df``.
    - ``total_memory_mb``: deep memory usage of the frame (index included),
      in bytes / 1024**2, rounded to 6 decimals.
    - ``total_null_cells`` / ``%null_cells``: number and percentage of
      missing cells (0.0 when the frame has no cells).
    - ``n_numeric_cols`` / ``n_object_cols`` / ``n_datetime_cols``: number of
      columns whose dtype is numeric, object or datetime64 respectively.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to summarise.
    round_ndigits : int, default 4
        Number of decimals kept for ``%null_cells``.

    Returns
    -------
    pd.DataFrame
        A single-row frame with the keys listed above as columns.
    """
    n_rows, n_cols = df.shape
    total_mem_mb = df.memory_usage(deep=True).sum() / (1024 ** 2)
    null_cells = int(df.isna().sum().sum())
    total_cells = n_rows * n_cols
    pct_null_cells = (null_cells / total_cells * 100.0) if total_cells else 0.0

    # Classify columns from df.dtypes (one entry per column, by position)
    # instead of df[label]: selecting by label returns a DataFrame when
    # labels are duplicated, which would make those columns be miscounted.
    col_dtypes = list(df.dtypes)

    summary_data = {
        "n_rows": n_rows,
        "n_cols": n_cols,
        "total_memory_mb": round(total_mem_mb, 6),
        "total_null_cells": null_cells,
        "%null_cells": round(pct_null_cells, round_ndigits),
        "n_numeric_cols": sum(
            bool(pd.api.types.is_numeric_dtype(dt)) for dt in col_dtypes
        ),
        "n_object_cols": sum(
            bool(pd.api.types.is_object_dtype(dt)) for dt in col_dtypes
        ),
        "n_datetime_cols": sum(
            bool(pd.api.types.is_datetime64_any_dtype(dt)) for dt in col_dtypes
        ),
    }

    return pd.DataFrame([summary_data])

In [None]:
def df_report(
    df: pd.DataFrame,
    include_examples: bool = True,
    max_examples: int = 3,
    round_ndigits: int = 4,
) -> pd.DataFrame:
    """
    Build a per-column profiling report for a DataFrame.

    One output row is produced per column of ``df`` (in column order) with:
    - null / non-null counts and the percentage of nulls
    - dtype and deep memory usage of the column (bytes / 1024**2)
    - number and percentage of distinct non-null values, plus a coarse
      cardinality label (>= 95% "alta (casi ID)", >= 40% "media", else "baja")
    - for numeric columns: count of zeros and negatives, min, max, mean,
      std, skew and kurtosis
    - for datetime columns: min and max date
    - most frequent value, its frequency and its share of non-null values
    - optionally, up to ``max_examples`` distinct values rendered as strings

    Columns are processed by position rather than by label, so frames with
    duplicated column labels, or with a column literally named "Index"
    (which collides with the index entry of ``memory_usage``), are handled.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to profile.
    include_examples : bool, default True
        When False the ``examples`` field is left as None.
    max_examples : int, default 3
        Maximum number of example values kept per column.
    round_ndigits : int, default 4
        Decimals used for percentages and for the final rounding of the
        min/max/mean/std/skew/kurtosis/memory_mb columns.

    Returns
    -------
    pd.DataFrame
        One row per input column; an empty frame when ``df`` has no columns.
    """
    n_rows = len(df)
    # index=False yields exactly one entry per column, aligned by position.
    mem_per_col = df.memory_usage(index=False, deep=True)

    rows = []
    for pos, col in enumerate(df.columns):
        # Positional selection always returns a Series, even when the label
        # is duplicated (df[col] would return a DataFrame in that case).
        s = df.iloc[:, pos]
        dtype = s.dtype
        non_null = int(s.notna().sum())
        nulls = n_rows - non_null
        pct_null = (nulls / n_rows * 100.0) if n_rows else 0.0
        nunique = int(s.nunique(dropna=True))
        unique_ratio = (nunique / non_null * 100.0) if non_null else 0.0
        mem_mb = mem_per_col.iloc[pos] / (1024 ** 2)

        # Descriptive cardinality label
        if unique_ratio >= 95:
            cardinality = "alta (casi ID)"
        elif unique_ratio >= 40:
            cardinality = "media"
        else:
            cardinality = "baja"

        # Example values (as strings, in order of first appearance)
        examples = None
        if include_examples:
            examples = (
                s.dropna().astype(str).unique()[:max_examples].tolist()
                if non_null
                else []
            )

        # Statistics default to None and are only filled in for the
        # dtype family they apply to.
        top_val = top_freq = top_pct = None
        n_zero = n_negative = None
        min_val = max_val = mean_val = std_val = skew = kurt = None
        min_date = max_date = None

        if pd.api.types.is_numeric_dtype(s):
            n_zero = int((s == 0).sum(skipna=True))
            n_negative = int((s < 0).sum(skipna=True))
            desc = s.describe(percentiles=[])

            if not desc.empty:
                # .get() falls back to NaN when describe() does not report
                # the statistic for this column.
                min_val = desc.get("min", np.nan)
                max_val = desc.get("max", np.nan)
                mean_val = desc.get("mean", np.nan)
                std_val = desc.get("std", np.nan)

            # Silence floating-point warnings raised while computing the
            # higher moments.
            with np.errstate(all="ignore"):
                skew = s.skew(skipna=True)
                kurt = s.kurtosis(skipna=True)

        elif pd.api.types.is_datetime64_any_dtype(s):
            if non_null:
                min_date = s.min()
                max_date = s.max()

        # Mode: first entry of value_counts() and its share of non-null rows
        if non_null:
            vc = s.value_counts(dropna=True)
            if not vc.empty:
                top_val = vc.index[0]
                top_freq = int(vc.iloc[0])
                top_pct = round((top_freq / non_null) * 100.0, round_ndigits)

        rows.append({
            "column": col,
            "dtype": str(dtype),
            "n_rows": n_rows,
            "non_null": non_null,
            "nulls": nulls,
            "%nulls": round(pct_null, round_ndigits),
            "unique": nunique,
            "%unique_on_non_null": round(unique_ratio, round_ndigits),
            "cardinality": cardinality,
            "memory_mb": round(mem_mb, 6),
            "is_constant": bool(nunique == 1 and non_null > 0),
            "n_zero": n_zero,
            "n_negative": n_negative,
            "min": min_val,
            "max": max_val,
            "mean": mean_val,
            "std": std_val,
            "skew": skew,
            "kurtosis": kurt,
            "top": top_val,
            "top_freq": top_freq,
            "top_%": top_pct,
            "min_date": min_date,
            "max_date": max_date,
            "examples": examples,
        })

    report = pd.DataFrame(rows)

    def _round_if_number(value):
        # Round real numbers only; leave None/NaN and non-numeric values as is.
        if isinstance(value, (int, float, np.floating)) and pd.notna(value):
            return round(value, round_ndigits)
        return value

    for c in ("min", "max", "mean", "std", "skew", "kurtosis", "memory_mb"):
        # The columns are absent when df has no columns (empty report).
        if c in report.columns:
            report[c] = report[c].apply(_round_if_number)

    return report

In [54]:
# Build the one-row overview and the per-column report for the loaded frame.
overview_df = df_overview(df)
report_df = df_report(df)

In [55]:
# Render the one-row overview table.
display(overview_df)

Unnamed: 0,n_rows,n_cols,total_memory_mb,total_null_cells,%null_cells,n_numeric_cols,n_object_cols,n_datetime_cols
0,15033,16,2.566375,0,0.0,15,1,0


In [56]:
# Render the per-column report table.
display(report_df)

Unnamed: 0,column,dtype,n_rows,non_null,nulls,%nulls,unique,%unique_on_non_null,cardinality,memory_mb,...,mean,std,skew,kurtosis,top,top_freq,top_%,min_date,max_date,examples
0,dteday,object,15033,15033,0,0.0,730,4.856,baja,0.8459,...,,,,,2012-09-13,25,0.1663,,,"[2011-01-01, 2011-01-02, 2011-01-03]"
1,season,int64,15033,15033,0,0.0,4,0.0266,baja,0.1147,...,2.4953,1.1079,0.002,-1.3367,3,3873,25.7633,,,"[1, 2, 3]"
2,yr,int64,15033,15033,0,0.0,2,0.0133,baja,0.1147,...,0.5025,0.5,-0.01,-2.0002,1,7554,50.2495,,,"[0, 1]"
3,mnth,int64,15033,15033,0,0.0,12,0.0798,baja,0.1147,...,6.5279,3.4406,-0.0059,-1.2059,3,1302,8.6609,,,"[1, 2, 3]"
4,hr,float64,15033,15033,0,0.0,24,0.1596,baja,0.1147,...,11.5523,6.91,-0.01,-1.195,14.0,648,4.3105,,,"[0.0, 1.0, 2.0]"
5,holiday,float64,15033,15033,0,0.0,2,0.0133,baja,0.1147,...,0.029,0.1678,5.6139,29.5194,0.0,14597,97.0997,,,"[0.0, 1.0]"
6,weekday,int64,15033,15033,0,0.0,7,0.0466,baja,0.1147,...,3.0052,2.0039,-0.0044,-1.2544,6,2165,14.4016,,,"[6, 0, 1]"
7,workingday,int64,15033,15033,0,0.0,2,0.0133,baja,0.1147,...,0.6835,0.4651,-0.7891,-1.3775,1,10275,68.3496,,,"[0, 1]"
8,weathersit,float64,15033,15033,0,0.0,4,0.0266,baja,0.1147,...,1.4138,0.635,1.2713,0.4638,1.0,10016,66.6268,,,"[1.0, 2.0, 3.0]"
9,temp,float64,15033,15033,0,0.0,206,1.3703,baja,0.1147,...,3.4219,43.371,17.1901,311.9339,0.5,667,4.4369,,,"[0.24, 0.22, 0.32]"
