# 01 - EDA e Inferencia Inicial

Este notebook cubre la **Fase 4** del proyecto:
- Carga y validación de datos procesados
- KPIs base
- Cruces clave
- Pruebas estadísticas (Chi-cuadrada, Spearman)
- Modelo logístico exploratorio


## Disclaimer
> Este dataset fue levantado en **2021**, durante el punto más crítico de la pandemia por COVID-19, con una **muestra pequeña y no probabilística**.
> Este proyecto reutiliza esa base de una actividad académica para demostrar un flujo profesional de Data Analyst (limpieza, análisis, inferencia y dashboard).


In [None]:
from pathlib import Path
import subprocess
import sys
import warnings

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import Markdown, display
from scipy.stats import chi2_contingency, spearmanr
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Silence only deprecation-style noise. A blanket `filterwarnings('ignore')`
# would also hide warnings that matter for the inference cells further down
# (for instance a solver reporting that it did not converge), so every other
# warning category is left visible.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

# Show up to 200 columns when a DataFrame is rendered.
pd.set_option('display.max_columns', 200)
# Use seaborn's 'whitegrid' theme for every figure in the notebook.
sns.set_theme(style='whitegrid')


In [None]:
# Resolve the project root: the working directory, or its parent when the
# working directory has no `data/` folder but the parent does (e.g. the
# notebook was started from a subfolder).
project_root = Path.cwd()
if not (project_root / 'data').exists() and (project_root.parent / 'data').exists():
    project_root = project_root.parent

processed_csv = project_root / 'data/processed/survey_analytics.csv'

if not processed_csv.exists():
    print('No existe data/processed/survey_analytics.csv. Ejecutando limpieza...')
    # Run the cleaning script with the interpreter backing this kernel, so it
    # sees the same installed packages; a bare 'python3' may resolve to a
    # different interpreter or not exist at all. Fall back to 'python3' only
    # if the interpreter path is unavailable.
    cmd = [sys.executable or 'python3', str(project_root / 'src/data/clean_survey.py')]
    result = subprocess.run(cmd, capture_output=True, text=True, cwd=str(project_root))
    print(result.stdout)
    if result.returncode != 0:
        raise RuntimeError(f'Error en limpieza: {result.stderr}')

# The script may exit with status 0 and still not write the expected file.
if not processed_csv.exists():
    raise FileNotFoundError('No se pudo generar data/processed/survey_analytics.csv')

df = pd.read_csv(processed_csv)
print('Shape:', df.shape)
df.head()


In [None]:
# Per-column data-quality profile: dtype, missing count, missing percentage
# (two decimals) and number of distinct non-null values, with the columns that
# have the most missing data listed first.
missing_mask = df.isna()
quality_parts = {
    'dtype': df.dtypes.astype(str),
    'nulls': missing_mask.sum(),
    'null_pct': (missing_mask.mean() * 100).round(2),
    'n_unique': df.nunique(dropna=True),
}
quality = pd.DataFrame(quality_parts).sort_values('null_pct', ascending=False)

quality


In [None]:
def pct(series, value):
    """Return the share of non-null entries equal to ``value`` as a percentage.

    Missing entries are excluded from the denominator. The result is rounded
    to two decimals; NaN is returned when no non-null entries remain.
    """
    observed = series.dropna()
    if observed.empty:
        return np.nan
    matches = observed == value
    return round(matches.mean() * 100, 2)

# Headline KPIs. Every percentage uses the non-null answers of its column as
# the denominator, so the "Si" and "No / Tal vez" wellbeing shares are
# comparable. A column that is absent from `df` yields NaN for its KPIs.
_no_answers = pd.Series(dtype='object')
impact_answers = df.get('q3_impact', _no_answers)
problem_answers = df.get('q4_problems', _no_answers)
# Drop missing answers up front so the combined "No / Tal vez" share below
# uses the same denominator as `pct` does for the single-value KPIs.
wellbeing_answers = df.get('q15_wellbeing_final', _no_answers).dropna()

kpis = {
    'n_respuestas': len(df),
    'pct_impacto_negativo': pct(impact_answers, 'Negativa'),
    'pct_impacto_positivo': pct(impact_answers, 'Positiva'),
    'pct_reporta_problemas_si': pct(problem_answers, 'Si'),
    'pct_bienestar_final_si': pct(wellbeing_answers, 'Si'),
    'pct_bienestar_final_no_talvez': (
        round(wellbeing_answers.isin(['No', 'Tal vez']).mean() * 100, 2)
        if len(wellbeing_answers) > 0 else np.nan
    ),
}

# Long-format table (one row per KPI) with the columns 'kpi' and 'value'.
kpi_df = pd.DataFrame([kpis]).T.reset_index()
kpi_df.columns = ['kpi', 'value']
kpi_df


In [None]:
# Count plots for the three headline questions, one panel per column that is
# actually present in `df`.
plot_cols = ['q3_impact', 'q4_problems', 'q15_wellbeing_final']
existing = [c for c in plot_cols if c in df.columns]

# Panel titles; a column without an entry falls back to its own name.
title_map = {
    'q3_impact': 'Percepcion del impacto de la pandemia en la muestra',
    'q4_problems': 'Personas que reportan problemas durante la pandemia',
    'q15_wellbeing_final': 'Estado final de bienestar reportado',
}

if existing:
    # squeeze=False always returns a 2-D array of axes, so the single-panel
    # case needs no special handling.
    fig, axes_grid = plt.subplots(
        1, len(existing), figsize=(5 * len(existing), 4), squeeze=False
    )
    axes = list(axes_grid.ravel())

    for ax, col in zip(axes, existing):
        # Order the bars from most to least frequent answer.
        order = df[col].value_counts(dropna=False).index
        sns.countplot(data=df, x=col, order=order, ax=ax, palette='viridis')
        ax.set_title(title_map.get(col, col))
        ax.set_xlabel('')
        ax.tick_params(axis='x', rotation=25)

    plt.tight_layout()
    plt.show()
else:
    # plt.subplots(1, 0) would raise, so report the missing columns instead.
    print('No se encontraron columnas para graficar: ' + ', '.join(plot_cols))


In [None]:
# Final wellbeing broken down by gender: absolute counts with margins,
# row-normalised proportions (rounded to three decimals for display) and a
# stacked bar chart of the unrounded proportions.
crosstab_cols = ('gender', 'q15_wellbeing_final')
if all(name in df.columns for name in crosstab_cols):
    gender_values = df['gender']
    wellbeing_values = df['q15_wellbeing_final']

    ct = pd.crosstab(gender_values, wellbeing_values, margins=True)
    ct_plot = pd.crosstab(gender_values, wellbeing_values, normalize='index')
    ct_pct = ct_plot.round(3)
    display(ct)
    display(ct_pct)

    ax = ct_plot.plot(kind='bar', stacked=True, figsize=(8, 4), colormap='viridis')
    ax.set_title('Distribucion proporcional de bienestar final por genero')
    ax.set_ylabel('Proporcion')
    ax.set_xlabel('Genero')
    ax.legend(title='Bienestar final', bbox_to_anchor=(1.02, 1), loc='upper left')
    plt.tight_layout()
    plt.show()
else:
    print('No se encontraron columnas gender y q15_wellbeing_final')


In [None]:
def chi_square_test(data, col_a, col_b):
    """Run a chi-square test of independence between two columns of ``data``.

    Rows with a missing value in either column are dropped first.

    Returns a dict with the keys ``col_a``, ``col_b``, ``chi2``, ``p_value``,
    ``dof`` and ``n`` (rows used), or ``None`` when the test cannot be run:
    a column is missing, no complete rows remain, or either variable has
    fewer than two observed levels.
    """
    if col_a not in data.columns or col_b not in data.columns:
        return None

    complete_rows = data[[col_a, col_b]].dropna()
    if complete_rows.empty:
        return None

    contingency = pd.crosstab(complete_rows[col_a], complete_rows[col_b])
    n_levels_a, n_levels_b = contingency.shape
    if min(n_levels_a, n_levels_b) < 2:
        return None

    statistic, p_val, degrees, _expected = chi2_contingency(contingency)
    return dict(
        col_a=col_a,
        col_b=col_b,
        chi2=statistic,
        p_value=p_val,
        dof=degrees,
        n=len(complete_rows),
    )

# Chi-square tests of final wellbeing against perceived impact and reported
# problems. `chi_square_test` returns None for a test that cannot be run.
tests = [
    chi_square_test(df, 'q3_impact', 'q15_wellbeing_final'),
    chi_square_test(df, 'q4_problems', 'q15_wellbeing_final'),
]

if 'q14_anxiety_score' in df.columns:
    temp = df.copy()
    anxiety_score = temp['q14_anxiety_score']
    # Scores of 3 or more count as high anxiety. A missing score must stay
    # missing: `NaN >= 3` is False, so without the mask those respondents
    # would be labelled 'No alta' and enter the test as real observations.
    anxiety_label = pd.Series(
        np.where(anxiety_score >= 3, 'Alta', 'No alta'), index=temp.index
    )
    temp['anxiety_high'] = anxiety_label.where(anxiety_score.notna())
    tests.append(chi_square_test(temp, 'q4_problems', 'anxiety_high'))

# Passing the columns explicitly keeps `sort_values` working (on an empty
# frame) when no test could be run.
chi_columns = ['col_a', 'col_b', 'chi2', 'p_value', 'dof', 'n']
chi_results = pd.DataFrame(
    [t for t in tests if t is not None], columns=chi_columns
).sort_values('p_value')
chi_results


In [None]:
# Spearman rank correlations between the score columns present in `df`.
score_cols = [
    'q10_stress_score',
    'q11_optimism_score',
    'q12_control_score',
    'q13_protocols_score',
    'q14_anxiety_score',
    'q15_wellbeing_score'
]
score_cols = [c for c in score_cols if c in df.columns]

if len(score_cols) >= 2:
    corr = df[score_cols].corr(method='spearman')
    display(corr)

    plt.figure(figsize=(8, 6))
    sns.heatmap(corr, annot=True, cmap='coolwarm', center=0, fmt='.2f')
    plt.title('Mapa de correlaciones Spearman entre indicadores emocionales')
    plt.tight_layout()
    plt.show()

    # Pairwise tests on the rows where both scores are present; pairs with
    # fewer than 10 complete rows are skipped.
    pairs = []
    for i, a in enumerate(score_cols):
        for b in score_cols[i + 1:]:
            subset = df[[a, b]].dropna()
            if len(subset) >= 10:
                rho, p = spearmanr(subset[a], subset[b])
                pairs.append({'a': a, 'b': b, 'rho': rho, 'p_value': p, 'n': len(subset)})

    # Explicit columns keep `sort_values` from raising KeyError when every
    # pair was skipped and `pairs` is empty.
    spearman_pairs = pd.DataFrame(
        pairs, columns=['a', 'b', 'rho', 'p_value', 'n']
    ).sort_values('p_value')
    # A bare expression inside an `if` block is not rendered by the notebook,
    # so the table has to be displayed explicitly.
    display(spearman_pairs.head(10))
else:
    print('No hay suficientes columnas de score para Spearman')


In [None]:
# Exploratory logistic regression: predicts low final wellbeing (answer 'No'
# or 'Tal vez') from the score columns plus the categorical columns that are
# available. Runs only when the target and all five score columns exist.
required = [
    'q15_wellbeing_final',
    'q10_stress_score',
    'q11_optimism_score',
    'q12_control_score',
    'q13_protocols_score',
    'q14_anxiety_score'
]

if set(required).issubset(df.columns):
    # Respondents without a wellbeing answer have no label. Keeping them
    # would silently assign them to class 0, so they are dropped.
    model_df = df.dropna(subset=['q15_wellbeing_final']).copy()
    model_df['target_low_wellbeing'] = model_df['q15_wellbeing_final'].isin(['No', 'Tal vez']).astype(int)

    feature_cols = [
        'q10_stress_score',
        'q11_optimism_score',
        'q12_control_score',
        'q13_protocols_score',
        'q14_anxiety_score',
        'q3_impact',
        'q4_problems',
        'gender'
    ]
    feature_cols = [c for c in feature_cols if c in model_df.columns]

    X = model_df[feature_cols]
    y = model_df['target_low_wellbeing']

    # Split by actual numeric-ness rather than "dtype is not object", so that
    # string/category columns are never routed to the median imputer.
    numeric_features = [c for c in feature_cols if pd.api.types.is_numeric_dtype(model_df[c])]
    categorical_features = [c for c in feature_cols if c not in numeric_features]

    # Numeric columns: median imputation. Categorical columns: mode
    # imputation followed by one-hot encoding (unseen levels are ignored).
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', Pipeline([('imputer', SimpleImputer(strategy='median'))]), numeric_features),
            ('cat', Pipeline([
                ('imputer', SimpleImputer(strategy='most_frequent')),
                ('onehot', OneHotEncoder(handle_unknown='ignore'))
            ]), categorical_features),
        ]
    )

    model = Pipeline(
        steps=[
            ('prep', preprocessor),
            ('clf', LogisticRegression(max_iter=200, random_state=42))
        ]
    )

    # A stratified split needs both classes present with at least two rows
    # each; otherwise train_test_split raises. Report and skip in that case.
    class_counts = y.value_counts()
    if len(class_counts) < 2 or class_counts.min() < 2:
        print('No hay suficientes casos por clase para entrenar el modelo logístico exploratorio.')
    else:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.25, random_state=42, stratify=y
        )

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        print('Matriz de confusion:')
        print(confusion_matrix(y_test, y_pred))
        print('\nReporte de clasificacion:')
        print(classification_report(y_test, y_pred, digits=3))
else:
    print('No hay columnas suficientes para el modelo logístico exploratorio.')


In [None]:
# Build a Markdown summary from the results of the earlier cells. Each section
# is guarded with a `globals()` check because an earlier cell may have skipped
# the branch that defines its result.
summary_lines = []
summary_lines.append('## Conclusiones Automaticas del Notebook')
summary_lines.append('')

if 'kpi_df' in globals():
    kpi_map = dict(zip(kpi_df['kpi'], kpi_df['value']))

    def _kpi_text(key):
        """Return the KPI value as text, or 'NA' when it is absent or NaN."""
        value = kpi_map.get(key)
        return 'NA' if value is None or pd.isna(value) else f'{value}'

    summary_lines.append(f"- Respuestas analizadas: **{int(kpi_map.get('n_respuestas', 0))}**")
    summary_lines.append(f"- % impacto negativo: **{_kpi_text('pct_impacto_negativo')}%**")
    summary_lines.append(f"- % reporta problemas: **{_kpi_text('pct_reporta_problemas_si')}%**")
    summary_lines.append(f"- % bienestar final No/Tal vez: **{_kpi_text('pct_bienestar_final_no_talvez')}%**")

if 'chi_results' in globals() and not chi_results.empty:
    sig = chi_results[chi_results['p_value'] < 0.05]
    if len(sig) > 0:
        summary_lines.append('')
        summary_lines.append('- Asociaciones significativas (alpha=0.05):')
        for _, row in sig.iterrows():
            summary_lines.append(f"  - `{row['col_a']} vs {row['col_b']}` (p={row['p_value']:.6f})")
    else:
        summary_lines.append('')
        summary_lines.append('- No se detectaron asociaciones significativas en las pruebas chi-cuadrada ejecutadas.')

if 'spearman_pairs' in globals() and not spearman_pairs.empty:
    # "Strongest" means largest |rho|. The smallest p-value is not the same
    # thing when pairs are computed on different numbers of rows. Pairs whose
    # rho is undefined (NaN) are ignored.
    ranked = spearman_pairs.dropna(subset=['rho'])
    if not ranked.empty:
        top = ranked.loc[ranked['rho'].abs().idxmax()]
        summary_lines.append('')
        summary_lines.append(
            f"- Relacion ordinal mas fuerte detectada: `{top['a']} vs {top['b']}` "
            f"(rho={top['rho']:.3f}, p={top['p_value']:.2e})."
        )

summary_lines.append('')
summary_lines.append('**Nota metodologica:** resultados asociativos, no causales; base 2021 con muestra acotada.')

display(Markdown('\n'.join(summary_lines)))

