# Imputación de datos faltantes

## Importar librerías

In [1]:
import janitor
import matplotlib.pyplot as plt
import missingno
import nhanes.load
import numpy as np
import pandas as pd
import scipy.stats
import seaborn as sns
import session_info
import sklearn.compose
import sklearn.impute
import sklearn.preprocessing
import statsmodels.api as sm
import statsmodels.datasets
import statsmodels.formula.api as smf

from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import BayesianRidge, Ridge
from sklearn.neighbors import KNeighborsRegressor
from statsmodels.graphics.mosaicplot import mosaic

from utils.missing import MissingMethods

  import sre_constants


## Configurar gráficos

In [2]:
%matplotlib inline

# Default figure size for every plot in the notebook. It is written straight
# to matplotlib's rcParams because `sns.set_style(rc=...)` only applies keys
# that belong to seaborn's style definition; "figure.figsize" is not one of
# them, so passing it there is silently ignored.
plt.rcParams.update(
    {
        "figure.figsize": (8, 6)
    }
)

# One call is enough to activate the "whitegrid" style.
sns.set_style("whitegrid")

## El problema de trabajar con valores faltantes

In [3]:
# Build the air-quality table: fetch the R `airquality` dataset, convert the
# column names to snake_case, and index the rows by a real datetime.
airquality_df: pd.DataFrame = (
    sm.datasets.get_rdataset("airquality").data
    .clean_names(case_type="snake")
    # A constant `year` column is added so that year/month/day can be
    # assembled into a complete date below.
    .add_column("year", 1973)
    .assign(date=lambda frame: pd.to_datetime(frame[["year", "month", "day"]]))
    .sort_values(by="date")
    .set_index("date")
)

# Preview the first rows of the result.
airquality_df.head()

  return method(self._obj, *args, **kwargs)


Unnamed: 0_level_0,ozone,solar_r,wind,temp,month,day,year
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1973-05-01,41.0,190.0,7.4,67,5,1,1973
1973-05-02,36.0,118.0,8.0,72,5,2,1973
1973-05-03,12.0,149.0,12.6,74,5,3,1973
1973-05-04,18.0,313.0,11.5,62,5,4,1973
1973-05-05,,,14.3,56,5,5,1973


In [4]:
# Fit temperature against ozone only and display the first table of the
# summary (fit statistics, including the number of observations used).
smf.ols("temp ~ ozone", data=airquality_df).fit().summary().tables[0]

0,1,2,3
Dep. Variable:,temp,R-squared:,0.488
Model:,OLS,Adj. R-squared:,0.483
Method:,Least Squares,F-statistic:,108.5
Date:,"Tue, 25 Jun 2024",Prob (F-statistic):,2.93e-18
Time:,20:12:20,Log-Likelihood:,-386.27
No. Observations:,116,AIC:,776.5
Df Residuals:,114,BIC:,782.1
Df Model:,1,,
Covariance Type:,nonrobust,,


In [5]:
# Same model with solar radiation added as a second predictor; again only the
# first summary table is displayed. Compare its "No. Observations" with the
# single-predictor model: the two fits do not use the same number of rows.
smf.ols("temp ~ ozone + solar_r", data=airquality_df).fit().summary().tables[0]

0,1,2,3
Dep. Variable:,temp,R-squared:,0.491
Model:,OLS,Adj. R-squared:,0.481
Method:,Least Squares,F-statistic:,52.07
Date:,"Tue, 25 Jun 2024",Prob (F-statistic):,1.47e-16
Time:,20:12:22,Log-Likelihood:,-369.78
No. Observations:,111,AIC:,745.6
Df Residuals:,108,BIC:,753.7
Df Model:,2,,
Covariance Type:,nonrobust,,


## Cargar los datos de NHANES

In [6]:
# Load the 2017-2018 NHANES data and convert its column names to snake_case.
nhanes_raw_df: pd.DataFrame = nhanes.load.load_NHANES_data(
    year="2017-2018"
).clean_names(case_type="snake")

In [8]:
# Build the working NHANES table in a single pipeline: keep the variables of
# interest, give them short names, recode special answers as missing, and
# store `diabetes` as an integer column.
nhanes_df: pd.DataFrame = (
    nhanes_raw_df
    .select(
        "general_health_condition",
        "age_in_years_at_screening",
        "gender",
        "current_selfreported_height_inches",
        "current_selfreported_weight_pounds",
        "doctor_told_you_have_diabetes",
        "60_sec_pulse30_sec_pulse2",
        "total_cholesterol_mgdl",
    )
    .rename(
        columns={
            "age_in_years_at_screening": "age",
            "current_selfreported_height_inches": "height",
            "current_selfreported_weight_pounds": "weight",
            "doctor_told_you_have_diabetes": "diabetes",
            "60_sec_pulse30_sec_pulse2": "pulse",
            "total_cholesterol_mgdl": "total_cholesterol",
        }
    )
    .replace(
        {
            # 9999 and 7777 are turned into NaN for height and weight.
            # NOTE(review): presumably survey sentinel codes rather than real
            # measurements; confirm against the NHANES codebook.
            "height": {9999: np.nan, 7777: np.nan},
            "weight": {9999: np.nan, 7777: np.nan},
            # "Borderline" diabetes answers are treated as missing.
            "diabetes": {"Borderline": np.nan},
        }
    )
    # Reorder the variables through the project's `missing` accessor.
    .missing.sort_variables_by_missingness()
    # We assume the missingness is MCAR (missing completely at random), so
    # rows without a diabetes value are simply dropped.
    .dropna(subset=["diabetes"], how="any")
    # elementwise=False: the function receives the whole column at once.
    .transform_column(
        column_name="diabetes",
        function=lambda column: column.astype(int),
        elementwise=False,
    )
)