# 🧪 Data utility smoke tests
This notebook exercises the helper functions in `workhealthlab.data_utils`.

In [None]:
from pathlib import Path
import shutil
import pandas as pd
import numpy as np

from workhealthlab import data_utils as du

# Build a throwaway ANALYSIS/DATASETS tree and fill it with two small Stata files.
tmp_root = Path('tests/_tmp_utils')
# Remove whatever an earlier run left behind so mkdir() below starts from scratch.
if tmp_root.exists():
    shutil.rmtree(tmp_root)
data_dir = tmp_root.joinpath('ANALYSIS', 'DATASETS')
data_dir.mkdir(parents=True)

# Five-row sample: carries the 'engagement' column plus 'age' and 'tenure'.
df_alpha = pd.DataFrame(
    {
        'engagement': np.linspace(3.2, 4.1, num=5),
        'age': [25, 31, 42, 37, 29],
        'tenure': [1, 3, 6, 4, 2],
    }
)
# Six-row sample: shares only the 'age' column with df_alpha.
df_beta = pd.DataFrame(
    {
        'resilience': np.linspace(2.4, 4.8, num=6),
        'age': [28, 33, 39, 45, 52, 60],
        'team': ['A', 'B', 'A', 'C', 'B', 'C'],
    }
)

# Write each frame to its own .dta file, omitting the index column.
for dta_name, frame in (
    ('survey_alpha.dta', df_alpha),
    ('survey_beta.dta', df_beta),
):
    frame.to_stata(data_dir / dta_name, write_index=False)

# 1) File discovery: list the file names found under the fixture directory.
files = du.discover_dta_files(data_dir)
discovered_names = [path.name for path in files]
print('Discovered files:', discovered_names)

# 2) Dataset summary: show only the 'dataset' and 'size_MB' columns.
summary = du.summarize_datasets(data_dir)
summary_view = summary[['dataset', 'size_MB']]
print(summary_view)

# 3) Single-file load: read the first discovered file and report its columns.
first_file = files[0]
loaded = du.load_dta(first_file)
loaded_columns = list(loaded.columns)
print('Loaded columns:', loaded_columns)

# 4) Variable lookup: search the fixture directory for 'engagement'.
matches = du.find_variable_across_datasets('engagement', data_dir)
matched_datasets = matches['dataset'].tolist()
print('Matched datasets:', matched_datasets)

# 5) Full pipeline: run discovery with one construct name and one predictor,
#    then print the shape of every entry in the returned mapping.
pipeline = du.discover_data(
    data_dir=data_dir,
    construct_names=['engagement'],
    predictors=['age'],
)
result_shapes = {label: table.shape for label, table in pipeline.items()}
print('Pipeline results:', result_shapes)
