# PNADC Exploration Notebook

Goals: quick data preview, schema inference, and baseline EDA. This notebook uses `scripts/parse_pnadc.py` to sniff the delimiter, summarize the input file, and create a sample CSV.

In [None]:
import sys, json, os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide display defaults: plot style sheet and wider DataFrame output.
# NOTE(review): the 'seaborn-v0_8' style name needs matplotlib >= 3.6 — confirm the pinned version.
plt.style.use('seaborn-v0_8')
pd.set_option('display.max_columns', 100)  # show up to 100 columns before truncating
pd.set_option('display.width', 120)  # allow 120 characters per line of DataFrame output

def find_project_root(start: Path) -> Path:
    """Return the nearest directory at or above *start* containing scripts/parse_pnadc.py.

    The search begins at *start* and walks up through its parents. When no
    directory on that chain holds the marker file, *start* is returned as-is.
    """
    marker = Path('scripts') / 'parse_pnadc.py'
    ancestry = (start, *start.parents)
    return next(
        (folder for folder in ancestry if (folder / marker).exists()),
        start,
    )

# Resolve the project root starting from the current working directory.
ROOT = find_project_root(Path.cwd())
SCRIPTS = ROOT / 'scripts'
# Put scripts/ at the front of sys.path (only once) so parse_pnadc imports as a top-level module.
if str(SCRIPTS) not in sys.path:
    sys.path.insert(0, str(SCRIPTS))

# This import depends on the sys.path entry added just above, so it cannot move to the top of the cell.
from parse_pnadc import sniff_delimiter, summarize_file, write_sample_csv


## Locate Input File
By default, the first `PNADC_*.txt` file found in the project root is used (e.g., `PNADC_012025.txt`); if none exists, `samples/sample_pnadc.txt` is the fallback.

In [None]:
# Auto-select the input: the first PNADC_*.txt (in sorted path order) directly
# under ROOT, otherwise the tiny sample file under samples/.
candidates = sorted(p for p in ROOT.glob('PNADC_*.txt') if p.is_file())
input_path = candidates[0] if candidates else ROOT / 'samples' / 'sample_pnadc.txt'
input_path

## Summarize & Create Sample
Detect delimiter/header, count rows/columns, write a 200-row `sample.csv` for quick inspection, and save the summary to `out/summary.json`.

In [None]:
# Write a 200-row sample and a JSON summary of the input file under ROOT/out.
out_dir = ROOT / 'out'
sample_path = write_sample_csv(input_path, out_dir, sample_rows=200)
summary = summarize_file(input_path)
# Do not rely on write_sample_csv having created out_dir as a side effect:
# make sure it exists before writing summary.json (no-op when already present).
out_dir.mkdir(parents=True, exist_ok=True)
# ensure_ascii=False keeps any non-ASCII text in the summary readable in the JSON file.
(out_dir / 'summary.json').write_text(json.dumps(summary, indent=2, ensure_ascii=False), encoding='utf-8')
summary, sample_path

## Load Sample for a Quick Look

In [None]:
# Load the sample CSV at sample_path with pandas' default (comma) separator.
df_sample = pd.read_csv(sample_path)
# `display` is not imported in this notebook; it is expected to come from the IPython/Jupyter runtime.
display(df_sample.head())
# Cell output: the (rows, columns) shape and the dtypes of the first few columns.
df_sample.shape, df_sample.dtypes.head()

## Read a Larger Slice (Optional)
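Read the first 100,000 rows of the full file using the detected delimiter for wider EDA. Malformed lines are skipped; if the file has no header, placeholder names `col_1`, `col_2`, … are assigned.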

In [None]:
# Reuse what the summary detected; default to a comma-separated file with a header row.
delimiter = summary.get('delimiter', ',')
has_header = summary.get('has_header', True)
read_kwargs = {
    'sep': delimiter,
    'engine': 'python',
    'encoding': 'utf-8',
    'on_bad_lines': 'skip',  # malformed rows are dropped rather than raising
}
if not has_header:
    # No header row: take the column count from the sample and assign
    # placeholder names col_1 .. col_N.
    ncols = df_sample.shape[1]
    read_kwargs['header'] = None
    read_kwargs['names'] = [f'col_{idx}' for idx in range(1, ncols + 1)]
# Only the first 100000 rows are read to keep this exploratory load bounded.
df_small = pd.read_csv(input_path, nrows=100000, **read_kwargs)
df_small.shape, df_small.head(3)

## Basic EDA
Distributions, missingness, and value frequencies for a few columns.

In [None]:
# Prefer the larger slice when that cell has been run; otherwise use the small sample.
try:
    df = df_small
except NameError:
    df = df_sample
# Per-column summary statistics (all dtypes), one row per column, first 20 shown.
display(df.describe(include='all').transpose().head(20))
# Fraction of missing values per column, most-missing first.
nulls = (
    df.isna()
    .mean()
    .sort_values(ascending=False)
)
nulls.head(20)

In [None]:
# Histograms for the first four numeric columns, in column order.
num_cols = df.select_dtypes(include=np.number).columns[:4].tolist()
if not num_cols:
    print('No numeric columns found in the sample.')
else:
    _ = df[num_cols].hist(bins=30, figsize=(10, 6))
    plt.tight_layout()


## Next Steps
- Parse `INPUT_SNIPC_PNADC*.txt` to derive column names/types.
- Apply recoding for categorical variables (per docs).
- Build reusable loaders with schema validation (tests already in `tests/`).