In [20]:
from pathlib import Path
import pandas as pd
import numpy as np

# Input and output locations (paths are relative to the repo / working directory)
PROJECT_ROOT = Path.cwd()
CLEAN_CSV = 'nhanes_clean/NHANES2009-2012_sleep_mental_clean.csv'
OUT_DIR = PROJECT_ROOT.joinpath('sleep_mental')
# Create the output folder up front; this is a no-op when it already exists.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Read the CSV referenced by CLEAN_CSV and preview its first rows.
base = pd.read_csv(CLEAN_CSV)
base.head()


Unnamed: 0,surveyyr,id,gender,age,sleephrsnight,daysmenthlthbad,sleephrsnight_z,sleephrsnight_minmax,daysmenthlthbad_z,daysmenthlthbad_minmax
0,2009_10,51624,male,34,4.0,15.0,-2.165026,0.142857,1.359669,0.5
1,2009_10,51630,female,49,8.0,10.0,0.800356,0.714286,0.731476,0.333333
2,2009_10,51647,female,45,8.0,3.0,0.800356,0.714286,-0.147993,0.1
3,2009_10,51654,male,66,7.0,0.0,0.05901,0.571429,-0.524908,0.0
4,2009_10,51656,male,58,5.0,0.0,-1.423681,0.285714,-0.524908,0.0


In [21]:
# Utilidades comunes

def zscore(s: pd.Series) -> pd.Series:
    """Standardize a series to zero mean and unit population standard deviation.

    Parameters
    ----------
    s : pd.Series
        Numeric series; missing values are ignored when computing the
        statistics and stay missing in the result.

    Returns
    -------
    pd.Series
        ``(s - mean) / std`` with ``std`` computed using ``ddof=0``.
        If the standard deviation is undefined (empty or all-missing input)
        the series is returned unchanged. If the series is constant, every
        observed value maps to 0.0 instead of leaking the raw values into a
        column that is supposed to hold z-scores.
    """
    m, sd = s.mean(), s.std(ddof=0)
    if pd.isna(sd):
        # Nothing to scale: no observed values.
        return s
    if sd == 0:
        # Constant series: every observed value sits exactly at the mean.
        return pd.Series(0.0, index=s.index, name=s.name).where(s.notna())
    return (s - m) / sd

def minmax(s: pd.Series) -> pd.Series:
    """Rescale a series linearly to the [0, 1] interval.

    Parameters
    ----------
    s : pd.Series
        Numeric series; missing values are ignored when computing the
        minimum and maximum and stay missing in the result.

    Returns
    -------
    pd.Series
        ``(s - min) / (max - min)``. If the range is undefined (empty or
        all-missing input) the series is returned unchanged. If the series is
        constant, every observed value maps to 0.0 so the result never holds
        raw values outside [0, 1].
    """
    mn, mx = s.min(), s.max()
    den = mx - mn
    if pd.isna(den):
        # Nothing to scale: no observed values.
        return s
    if den == 0:
        # Constant series: every observed value equals the minimum.
        return pd.Series(0.0, index=s.index, name=s.name).where(s.notna())
    return (s - mn) / den

# 1..n numbering: maps each category to an integer code starting at 1

def enumerate_1_to_n(cat: pd.Series) -> pd.Series:
    """Encode the categories of a series as numbers 1..k.

    The series is cast to the pandas ``category`` dtype and each value is
    replaced by its category code plus one. For plain (non-categorical) input
    the codes follow the sorted order of the distinct values; input that is
    already categorical keeps the category order it was declared with.
    Missing values (code -1) become NaN.
    """
    raw_codes = cat.astype('category').cat.codes
    numbered = raw_codes.replace({-1: np.nan})
    if not numbered.notna().any():
        # No observed category at all: there is nothing to shift.
        return numbered
    # Shift the 0-based codes to the 1..k range.
    return numbered + 1


In [22]:
# View 1: Prediction (regression)
# Target: `daysmenthlthbad` as the dependent variable

# Build every derived column in one chain; later entries may use earlier ones.
v1 = base.copy().assign(
    # Type-1 derived features (direct transformations): hours below 7 / above 9
    sleep_deficit=lambda d: (7 - d['sleephrsnight']).clip(lower=0),
    sleep_excess=lambda d: (d['sleephrsnight'] - 9).clip(lower=0),
    # Type-2 derived feature (interaction)
    sleep_deficit_x_age=lambda d: d['sleep_deficit'] * d['age'],
    # Normalization (replaces the column of the same name loaded from the CSV)
    sleephrsnight_z=lambda d: zscore(d['sleephrsnight']),
    # Discretization: sleep-hour bins (right-closed intervals)
    sleep_cat=lambda d: pd.cut(
        d['sleephrsnight'],
        bins=[-np.inf, 5, 7, 9, np.inf],
        labels=['muy_bajo','adecuado_bajo','adecuado_alto','alto'],
    ),
    # 1..n numeric encodings
    gender_num=lambda d: enumerate_1_to_n(d['gender']),
    sleep_cat_num=lambda d: enumerate_1_to_n(d['sleep_cat']),
)

# Columns kept for modelling / regression
v1_cols = [
    'surveyyr', 'id', 'gender', 'gender_num', 'age',
    'sleephrsnight', 'sleephrsnight_z',
    'sleep_deficit', 'sleep_excess', 'sleep_deficit_x_age',
    'sleep_cat', 'sleep_cat_num',
    'daysmenthlthbad',
]

# Drop rows missing either the target or the main predictor.
v1_mineable = v1[v1_cols].dropna(subset=['daysmenthlthbad','sleephrsnight'])

out1 = OUT_DIR / 'view1_regression_sleep_mental.csv'
v1_mineable.to_csv(out1, index=False)
v1_mineable.shape, out1


((4485, 13),
 PosixPath('/workspace/noteebook/sleep_mental/view1_regression_sleep_mental.csv'))

In [23]:
# Configuración de visualización y directorios de salida
import matplotlib.pyplot as plt
import seaborn as sns

# Folder for saved figures, created on demand (no-op when it already exists).
FIGS_DIR = OUT_DIR.joinpath('figs')
FIGS_DIR.mkdir(parents=True, exist_ok=True)

# Running list of "<filename>: <caption>" entries, filled as figures are saved.
captions = []

def savefig_with_caption(fig, filename: str, caption: str):
    """Write a figure to FIGS_DIR, record its caption, and close it.

    Parameters
    ----------
    fig
        Figure object exposing ``savefig``; it is closed via ``plt.close``
        once saved.
    filename : str
        File name (joined onto ``FIGS_DIR``) to save the figure under.
    caption : str
        Caption text; stored in the module-level ``captions`` list as
        ``"<filename>: <caption>"``.
    """
    target = FIGS_DIR / filename
    fig.savefig(target, dpi=150, bbox_inches='tight')
    entry = f"{filename}: {caption}"
    captions.append(entry)
    plt.close(fig)

# Global seaborn look applied to the figures created after this call.
sns.set(context='talk', style='whitegrid')


In [24]:
# 1) Exploratory analysis: histogram, boxplot, scatter plot and correlation heatmap

# Histogram of nightly sleep hours
fig, ax = plt.subplots(figsize=(8,5))
sns.histplot(base['sleephrsnight'], kde=True, bins=24, ax=ax, color='#1f77b4')
ax.set(
    title='Histograma de horas de sueño por noche',
    xlabel='Horas de sueño',
    ylabel='Frecuencia',
)
savefig_with_caption(
    fig,
    'hist_sleephrsnight.png',
    'Distribución de horas de sueño. Se observa concentración entre 6–8 horas.',
)

# Boxplot of sleep hours split by gender
fig, ax = plt.subplots(figsize=(8,5))
sns.boxplot(data=base, x='gender', y='sleephrsnight', ax=ax)
ax.set(
    title='Horas de sueño por género (boxplot)',
    xlabel='Género',
    ylabel='Horas de sueño',
)
savefig_with_caption(
    fig,
    'box_sleep_by_gender.png',
    'Comparación por género: mediana y dispersión de horas de sueño.',
)

# Scatter plot: sleep hours vs. days of bad mental health, coloured by gender
fig, ax = plt.subplots(figsize=(8,6))
sns.scatterplot(data=base, x='sleephrsnight', y='daysmenthlthbad', hue='gender', alpha=0.5, ax=ax)
ax.set(
    title='Sueño vs Días de mala salud mental',
    xlabel='Horas de sueño por noche',
    ylabel='Días de mala salud mental (30 días)',
)
savefig_with_caption(
    fig,
    'scatter_sleep_vs_mh.png',
    'Relación negativa débil: a más sueño, menos días de mala salud mental.',
)

# Heatmap of the pairwise Pearson correlations
num_cols = ['sleephrsnight','daysmenthlthbad','age']
corr_mat = base[num_cols].corr(method='pearson')
fig, ax = plt.subplots(figsize=(6,5))
sns.heatmap(corr_mat, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set(title='Mapa de calor: correlaciones (Pearson)')
savefig_with_caption(
    fig,
    'heatmap_correlations.png',
    'Correlaciones entre sueño, mala salud mental y edad (Pearson).',
)

# Show the correlation matrix as the cell output
corr_mat


Unnamed: 0,sleephrsnight,daysmenthlthbad,age
sleephrsnight,1.0,-0.15346,-0.008907
daysmenthlthbad,-0.15346,1.0,-0.043978
age,-0.008907,-0.043978,1.0


In [25]:
# 2) Correlaciones relevantes (Pearson y Spearman)
from scipy import stats

# One record per variable pair, holding Pearson and Spearman coefficients with
# their p-values; each pair is evaluated only on rows where both are present.
corr_tbl = []
for a, b in (
    ('sleephrsnight','daysmenthlthbad'),
    ('sleephrsnight','age'),
    ('age','daysmenthlthbad'),
):
    s1, s2 = base[a], base[b]
    mask = s1.notna() & s2.notna()
    r_p, p_p = stats.pearsonr(s1[mask], s2[mask])
    r_s, p_s = stats.spearmanr(s1[mask], s2[mask])
    corr_tbl.append(dict(
        var_x=a,
        var_y=b,
        pearson_r=r_p,
        pearson_p=p_p,
        spearman_rho=r_s,
        spearman_p=p_s,
    ))

corr_df = pd.DataFrame(corr_tbl)

# Persist the correlation table
corr_path = OUT_DIR / 'correlations_sleep_mental.csv'
corr_df.to_csv(corr_path, index=False)

corr_df


Unnamed: 0,var_x,var_y,pearson_r,pearson_p,spearman_rho,spearman_p
0,sleephrsnight,daysmenthlthbad,-0.15346,4.861667e-25,-0.125227,3.8530430000000003e-17
1,sleephrsnight,age,-0.008907,0.550955,-0.016245,0.2767231
2,age,daysmenthlthbad,-0.043978,0.003221109,-0.123152,1.266344e-16


In [26]:
# 3) Modelo predictivo simple: regresión lineal para predecir daysmenthlthbad
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score

# Feature matrix: sleep hours, age and the numeric gender encoding.
m = base[['sleephrsnight','age']].assign(gender_num=enumerate_1_to_n(base['gender']))
y = base['daysmenthlthbad']

# Keep only rows where every feature and the target are present.
mask = m.notna().all(axis=1) & y.notna()
X = m[mask]
y = y[mask]

# Fixed seed so the 80/20 split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

linreg = LinearRegression()
linreg.fit(X_train, y_train)

# Evaluate on the held-out rows.
y_pred = linreg.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

results_df = pd.DataFrame({'metric': ['MAE', 'R2'], 'value': [mae, r2]})

# Persist the model metrics
model_path = OUT_DIR / 'model_results_sleep_mental.csv'
results_df.to_csv(model_path, index=False)

# Visualization: actual vs. predicted values with the identity line as reference
fig, ax = plt.subplots(figsize=(6,6))
ax.scatter(y_test, y_pred, alpha=0.5)
ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
ax.set(
    xlabel='Real (daysmenthlthbad)',
    ylabel='Predicho',
    title='Regresión lineal: real vs predicho',
)
savefig_with_caption(
    fig,
    'linreg_real_vs_pred.png',
    'Ajuste del modelo lineal simple para predecir días de mala salud mental.',
)

results_df


Unnamed: 0,metric,value
0,MAE,5.229766
1,R2,0.030009
