In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
from scipy import stats

# Resolve every path relative to the directory the kernel was started in.
PROJECT_ROOT = Path.cwd()

# Candidate locations of the raw CSV, in order of preference.
raw_candidates = [
    PROJECT_ROOT / 'NHANES2009-2012.csv',
    PROJECT_ROOT.parent / 'NHANES2009-2012.csv',
    PROJECT_ROOT / 'noteebook' / 'NHANES2009-2012.csv',
]
DATA_RAW = next((p for p in raw_candidates if p.exists()), None)
if DATA_RAW is None:
    # Explicit raise instead of `assert`: assertions are stripped under `python -O`.
    raise FileNotFoundError(f'No existe archivo crudo en: {raw_candidates}')

# Candidate output directories, in order of preference.
out_candidates = [
    PROJECT_ROOT / 'noteebook' / 'nhanes_clean',
    PROJECT_ROOT / 'noteebook' / 'noteebook' / 'nhanes_clean',
    PROJECT_ROOT / 'nhanes_clean',
]
# Use the first candidate whose parent directory already exists. Only the
# direct parent is tested: the grandparent of the first candidate is
# PROJECT_ROOT itself, so testing it too would make the first candidate win
# unconditionally and create a spurious nested `noteebook/` tree.
OUT_DIR = next((p for p in out_candidates if p.parent.exists()), out_candidates[0])
OUT_DIR.mkdir(parents=True, exist_ok=True)

CSV_OUT = OUT_DIR / 'NHANES2009-2012_sleep_mental_clean.csv'
REPORT_MD = OUT_DIR / 'sleep_mental_clean_report.md'

# Columns of the raw file that this analysis needs.
needed_original_cols = [
    'SurveyYr', 'ID', 'Gender', 'Age', 'SleepHrsNight', 'DaysMentHlthBad'
]

# Read only the header to find out which of the needed columns are present.
# Names are compared after stripping whitespace so that a padded header such
# as ' Age' is still recognised instead of being silently dropped.
header_df = pd.read_csv(DATA_RAW, nrows=0)
available_cols = {c.strip() for c in header_df.columns}
present_original = [c for c in needed_original_cols if c in available_cols]
_wanted_cols = frozenset(present_original)


def usecols(column_name: str) -> bool:
    """Column filter for ``pd.read_csv``: keep only the needed columns.

    The comparison ignores leading/trailing whitespace in the header name.
    """
    return column_name.strip() in _wanted_cols


raw = pd.read_csv(DATA_RAW, usecols=usecols)
raw.columns = [c.strip() for c in raw.columns]

# Original column name -> standardized lower-case name used downstream.
standard_map = {
    'SurveyYr': 'surveyyr',
    'ID': 'id',
    'Gender': 'gender',
    'Age': 'age',
    'SleepHrsNight': 'sleephrsnight',
    'DaysMentHlthBad': 'daysmenthlthbad',
}

df = raw.rename(columns={k: standard_map[k] for k in present_original if k in standard_map}).copy()
df.head()


PermissionError: [Errno 13] Permission denied: '/home/camilo-pc'

In [None]:
# The two variables the whole analysis depends on. Fail fast with a clear
# message if either is absent, instead of hitting a bare KeyError further down.
key_vars = ['sleephrsnight', 'daysmenthlthbad']
missing_key_vars = [c for c in key_vars if c not in df.columns]
if missing_key_vars:
    raise KeyError(f'Faltan variables clave en los datos: {missing_key_vars}')

# Type coercion: anything that cannot be parsed as a number becomes NaN.
for col in ['age', 'sleephrsnight', 'daysmenthlthbad']:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

# De-duplicate on survey + respondent id (whichever of the two exist).
keys = [c for c in ['surveyyr', 'id'] if c in df.columns]
rows_initial = len(df)
if keys:
    # A stable sort makes `keep='first'` deterministic even when only one key
    # column is available (the default single-column sort is not stable).
    df = df.sort_values(keys, kind='mergesort').drop_duplicates(subset=keys, keep='first')
rows_after_dups = len(df)

# Plausible ranges: values outside them are set to NaN (NaN stays NaN).
# Sleep is kept in [1, 24], matching the range stated in the exported report.
df['sleephrsnight'] = df['sleephrsnight'].where(df['sleephrsnight'].between(1, 24))
df['daysmenthlthbad'] = df['daysmenthlthbad'].where(df['daysmenthlthbad'].between(0, 30))

# Winsorize sleep hours at the 1st and 99th percentiles.
ql, qh = df['sleephrsnight'].quantile([0.01, 0.99])
df['sleephrsnight'] = df['sleephrsnight'].clip(lower=ql, upper=qh)

# Completeness filter: keep only rows where both key variables are present.
before_complete = len(df)
df = df.dropna(subset=key_vars).copy()
after_complete = len(df)

rows_initial, rows_after_dups, before_complete, after_complete


In [None]:
# Normalizaciones y correlaciones

def zscore(s: pd.Series) -> pd.Series:
    """Standardize ``s`` to zero mean and unit population standard deviation.

    Args:
        s: Numeric series; NaN entries are ignored when computing the mean and
            standard deviation and stay NaN in the result.

    Returns:
        ``(s - mean) / std`` with ``ddof=0``. A constant series (std == 0)
        yields all zeros rather than the raw values, so the result is always
        centred. If the standard deviation is undefined (empty or all-NaN
        input) ``s`` is returned unchanged.
    """
    m, sd = s.mean(), s.std(ddof=0)
    if pd.isna(sd):
        return s
    # Zero spread: divide by 1 so the centred values (all zeros) are returned
    # instead of dividing by zero.
    scale = sd if sd != 0 else 1.0
    return (s - m) / scale

def minmax(s: pd.Series) -> pd.Series:
    """Rescale ``s`` linearly so its minimum maps to 0 and its maximum to 1.

    Args:
        s: Numeric series; NaN entries are ignored when computing the minimum
            and maximum and stay NaN in the result.

    Returns:
        ``(s - min) / (max - min)``. A constant series (zero range) yields all
        zeros rather than the raw values, so the result always lies in [0, 1].
        If the range is undefined (empty or all-NaN input) ``s`` is returned
        unchanged.
    """
    mn, mx = s.min(), s.max()
    den = mx - mn
    if pd.isna(den):
        return s
    # Zero range: divide by 1 so the shifted values (all zeros) are returned
    # instead of dividing by zero.
    scale = den if den != 0 else 1.0
    return (s - mn) / scale

# For each key variable that is present, add a z-score column and a min-max
# column: (source column, z-score column, min-max column).
derived_columns = (
    ('sleephrsnight', 'sleephrsnight_z', 'sleephrsnight_minmax'),
    ('daysmenthlthbad', 'daysmenthlthbad_z', 'daysmenthlthbad_minmax'),
)
for source_col, z_col, minmax_col in derived_columns:
    if source_col not in df.columns:
        continue
    df[z_col] = zscore(df[source_col])
    df[minmax_col] = minmax(df[source_col])

# Linear (Pearson) and rank-based (Spearman) correlation between the two
# key variables.
sleep_hours = df['sleephrsnight']
bad_days = df['daysmenthlthbad']
pearson_r, pearson_p = stats.pearsonr(sleep_hours, bad_days)
spearman_rho, spearman_p = stats.spearmanr(sleep_hours, bad_days)

# Displayed as the cell output: coefficients rounded to 3 decimals, raw p-values.
dict(
    pearson_r=round(float(pearson_r), 3),
    pearson_p=float(pearson_p),
    spearman_rho=round(float(spearman_rho), 3),
    spearman_p=float(spearman_p),
)


In [None]:
# Resumen de faltantes, valores únicos, constantes y outliers (Tukey)

# Missing values per column: count, total number of rows, and their ratio.
missing_table = df.isna().sum().rename('missing').to_frame()
missing_table['total'] = len(df)
missing_table['missing_rate'] = missing_table['missing'] / missing_table['total']

# Number of distinct non-null values per column.
unique_table = df.nunique().rename('n_unique').to_frame()

# Columns with at most one distinct value (potentially redundant in this
# subset); the identifier columns `id` and `surveyyr` are never reported.
constant_cols = [
    name
    for name, n_distinct in unique_table['n_unique'].items()
    if n_distinct <= 1 and name not in ('id', 'surveyyr')
]

# Outliers por regla de Tukey (Q1-1.5*IQR, Q3+1.5*IQR)

def tukey_outliers_count(s: pd.Series) -> int:
    """Count the values of ``s`` that fall outside Tukey's fences.

    The fences are ``Q1 - 1.5*IQR`` and ``Q3 + 1.5*IQR``; values exactly on a
    fence are not counted. NaN entries are discarded first, and a series with
    no remaining values gives 0.
    """
    values = s.dropna()
    if len(values) == 0:
        return 0
    q1, q3 = values.quantile([0.25, 0.75])
    iqr = q3 - q1
    within_fences = values.between(q1 - 1.5*iqr, q3 + 1.5*iqr)
    return int((~within_fences).sum())

# Tukey outlier count for each key variable that is present in the frame.
outlier_counts = {
    name: tukey_outliers_count(df[name])
    for name in ('sleephrsnight', 'daysmenthlthbad')
    if name in df.columns
}

# Cell output: previews of both tables plus the two summaries.
missing_table.head(), unique_table.head(), constant_cols, outlier_counts


In [None]:
# Export the cleaned CSV and a Markdown report of the cleaning steps.
export_cols = [
    c for c in [
        'surveyyr','id','gender','age',
        'sleephrsnight','daysmenthlthbad',
        'sleephrsnight_z','sleephrsnight_minmax',
        'daysmenthlthbad_z','daysmenthlthbad_minmax'
    ] if c in df.columns
]

df[export_cols].to_csv(CSV_OUT, index=False)


def _interpret_correlation(rho: float) -> str:
    """Build the report's interpretation line from a correlation coefficient.

    The direction comes from the sign of ``rho`` and the strength label from
    its absolute value (below 0.3 weak, below 0.5 moderate, otherwise strong),
    so the sentence always agrees with the numbers printed above it.
    """
    if pd.isna(rho):
        return '- Interpretación: no fue posible calcular la correlación (datos insuficientes o constantes).'
    if rho == 0:
        return '- Interpretación: no se observa asociación monótona entre horas de sueño y días de mala salud mental.'
    direction = 'menos' if rho < 0 else 'más'
    magnitude = abs(rho)
    strength = 'débil' if magnitude < 0.3 else 'moderado' if magnitude < 0.5 else 'fuerte'
    return f'- Interpretación: a más horas de sueño, {direction} días de mala salud mental; efecto {strength}.'


report_lines = [
    '# Limpieza de datos: Sueño ↔ Mala Salud Mental',
    '',
    '## Variables clave',
    '- SleepHrsNight: horas de sueño por noche',
    '- DaysMentHlthBad: días de mala salud mental en los últimos 30',
    '',
    '## Resumen de filas',
    f'- Filas iniciales: {rows_initial}',
    f'- Eliminadas por duplicados: {rows_initial - rows_after_dups}',
    f'- Eliminadas por faltantes en variables clave: {rows_after_dups - after_complete}',
    f'- Filas finales para análisis: {after_complete}',
    '',
    '## Tratamientos aplicados',
    '- Detección y eliminación de duplicados por `surveyyr` + `id` (si disponibles).',
    '- Rango plausible: `sleephrsnight` en [1, 24]; `daysmenthlthbad` en [0, 30].',
    '- Winsorización 1%-99% en `sleephrsnight`.',
    '- Filtro de completitud: se requieren ambas variables clave no nulas.',
    '- Normalización: z-score y min-max para ambas variables.',
    '',
    '## Correlaciones',
    f'- Pearson r: {pearson_r:.3f} (p={pearson_p:.3g})',
    f'- Spearman ρ: {spearman_rho:.3f} (p={spearman_p:.3g})',
    # Derived from the rank-based coefficient (no linearity assumption)
    # instead of being a fixed sentence that could contradict the data.
    _interpret_correlation(float(spearman_rho)),
]

REPORT_MD.write_text('\n'.join(report_lines), encoding='utf-8')

CSV_OUT, REPORT_MD, df[export_cols].shape
