**Ingestão e Validação de Dados**

In [1]:
import pandas as pd
import logging
from pandera import DataFrameSchema, Column, Check

# Configuração de logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s — %(levelname)s — %(message)s"
)
logger = logging.getLogger(__name__)

# 1. Definição do schema com Pandera
schema = DataFrameSchema(
    {
        "Entity": Column(str, nullable=False),
        "Code": Column(str, nullable=True),
        "Year": Column(
            int,
            Check(lambda y: 1800 <= y <= 2022, element_wise=True),
            nullable=False
        ),
        "GDP per capita": Column(
            float,
            Check(lambda x: x >= 0, element_wise=True),
            nullable=False
        ),
        "Value of global merchandise exports as a share of GDP": Column(
            float,
            Check(lambda x: x >= 0, element_wise=True),
            nullable=True
        ),
        "Government expenditure (% of GDP)": Column(
            float,
            Check(lambda x: x >= 0, element_wise=True),
            nullable=True
        ),
        "Trade as a Share of GDP": Column(
            float,
            Check(lambda x: x >= 0, element_wise=True),
            nullable=True
        ),
        "Inflation, consumer prices (annual %)": Column(
            float,
            nullable=True
        ),
    },
    strict=True,  # não permite colunas extras
)

def load_csv(path: str) -> pd.DataFrame:
    logger.info(f"Carregando dados de {path}")
    df = pd.read_csv(path)
    logger.info(f"Dados carregados: {df.shape[0]} linhas, {df.shape[1]} colunas")
    return df

def validate_schema(df: pd.DataFrame) -> pd.DataFrame:
    logger.info("Validando esquema de dados com Pandera")
    validated = schema.validate(df, lazy=True)
    logger.info("Validação de esquema concluída sem erros")
    return validated

def handle_missing(df: pd.DataFrame) -> pd.DataFrame:
    miss_gdp = df["GDP per capita"].isna().sum()
    if miss_gdp > 0:
        logger.warning(f"Descartando {miss_gdp} registros com PIB per capita faltando")
        df = df.dropna(subset=["GDP per capita"])

    cols_to_interp = [
        "Value of global merchandise exports as a share of GDP",
        "Government expenditure (% of GDP)",
        "Trade as a Share of GDP",
        "Inflation, consumer prices (annual %)"
    ]
    df = (
        df
        .sort_values(["Entity", "Year"])
        .groupby("Entity")[cols_to_interp]
        .apply(lambda g: g.interpolate(method="linear", limit_direction="both"))
        .reset_index(level=1, drop=True)
        .join(df.drop(columns=cols_to_interp))
    )
    logger.info("Interpolação de valores faltantes concluída")
    return df

def remove_duplicates_and_outliers(df: pd.DataFrame) -> pd.DataFrame:
    before = df.shape[0]
    df = df.drop_duplicates()
    logger.info(f"Removidas {before - df.shape[0]} duplicatas exatas")

    mask = df["Inflation, consumer prices (annual %)"].between(-50, 500) | df["Inflation, consumer prices (annual %)"].isna()
    outliers = (~mask).sum()
    if outliers > 0:
        logger.warning(f"Descartando {outliers} registros com inflação fora do intervalo razoável")
        df = df[mask]
    return df

def ingest_and_validate(path: str) -> pd.DataFrame:
    df = load_csv(path)
    df = validate_schema(df)
    df = handle_missing(df)
    df = remove_duplicates_and_outliers(df)
    logger.info(f"Pipeline concluído. DataFrame final: {df.shape[0]} linhas, {df.shape[1]} colunas")
    return df

if __name__ == "__main__":
    INPUT_PATH = "data/gdp_per_capita.csv"
    df_clean = ingest_and_validate(INPUT_PATH)
    df_clean.to_parquet("data/gdp_per_capita_clean.parquet", index=False)
    logger.info("Arquivo limpo salvo como data/gdp_per_capita_clean.parquet")

top-level pandera module will be **removed in a future version of pandera**.
If you're using pandera to validate pandas objects, we highly recommend updating
your import:

```
# old import
import pandera as pa

# new import
import pandera.pandas as pa
```

If you're using pandera to validate objects from other compatible libraries
like pyspark or polars, see the supported libraries section of the documentation
for more information on how to import pandera:

https://pandera.readthedocs.io/en/stable/supported_libraries.html


```
```

2025-05-20 19:54:16,576 — INFO — Carregando dados de data/gdp_per_capita.csv
2025-05-20 19:54:16,685 — INFO — Dados carregados: 147615 linhas, 8 colunas
2025-05-20 19:54:16,686 — INFO — Validando esquema de dados com Pandera
2025-05-20 19:54:16,874 — INFO — Validação de esquema concluída sem erros
2025-05-20 19:54:17,125 — INFO — Interpolação de valores faltantes concluída
2025-05-20 19:54:17,154 — INFO — Removidas 66668 duplicatas exatas
2025-05-20 19:54:1