In [1]:
import pandas as pd
import pyreadr
import seaborn as sns
import sys
import os
import requests
import importlib
# Make the project root (two directories above the notebook's working
# directory) importable so that the local `src` package can be found.
project_root = os.path.abspath(os.path.join(os.getcwd(), "..", ".."))
print(project_root)
# Guard the append so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

import src.utils
import src.pandas_missing_extension

# Reload the modules so edits made to them on disk are picked up without
# restarting the kernel.
importlib.reload(src.utils)
importlib.reload(src.pandas_missing_extension)

from src.utils import make_dir_function
from src.pandas_missing_extension import MissingMethods

/home/alex/courses/data_scientist/handling_missing_data


# Configure the overall appearance of the project's graphs

In [63]:
%matplotlib inline

# Default every figure in the notebook to a 10x10 canvas.
sns.set(rc={"figure.figsize": (10, 10)})

# Apply the "whitegrid" style on top of the defaults set above.
sns.set_style("whitegrid")



# Load dataset

### Pima Indians Diabetes

In [64]:
# `data_dir` is used below as a callable that turns path segments into a
# file path (presumably rooted at the "data" directory -- confirm in src.utils).
data_dir = make_dir_function("data")
pima_indians_diabetes_file = data_dir("raw", "pima-indians-diabetes.csv")

# Column names are supplied here. header=None spells out what pandas already
# does whenever `names` is passed: the first line of the file is read as data.
diabetes_df = pd.read_csv(
    pima_indians_diabetes_file,
    header=None,
    names=[
        "Pregnancies",
        "Glucose",
        "BloodPressure",
        "SkinThickness",
        "Insulin",
        "BMI",
        "DiabetesPedigreeFunction",
        "Age",
        "Outcome",
    ],
)


### naniar (oceanbuoys, pedestrian, riskfactors)

In [65]:
# Location of the naniar example datasets; file names are appended to this URL.
base_url = "https://github.com/njtierney/naniar/raw/master/data/"
# Every dataset is stored as "<name>" + extension.
extension = ".rda"
# Names of the datasets to fetch.
datasets_names = ("oceanbuoys", "pedestrian", "riskfactors")

In [66]:
# Maps "<dataset_name>_df" to the DataFrame read from the matching .rda file.
datasets_dfs = {}

for dataset_name in datasets_names:
    dataset_file = f"{dataset_name}{extension}"
    dataset_output_file = data_dir("raw", dataset_file)
    dataset_url = f"{base_url}{dataset_file}"

    # Download the file with requests. The timeout keeps a stalled connection
    # from hanging the kernel, and a network error is handled like any other
    # failed download instead of aborting the cell immediately.
    try:
        response = requests.get(dataset_url, timeout=30)
        downloaded = response.status_code == 200
    except requests.RequestException:
        downloaded = False

    if downloaded:
        with open(dataset_output_file, "wb") as f:
            f.write(response.content)
        print(f"Descargado: {dataset_file}")
    else:
        print(f"⚠️ Error al descargar {dataset_file}")
        # A copy written by an earlier run is still usable. With no copy at
        # all, stop here with a clear message rather than letting the reader
        # below fail on a file that was never created.
        if not os.path.exists(dataset_output_file):
            raise FileNotFoundError(
                f"{dataset_file} could not be downloaded from {dataset_url} "
                f"and no local copy exists at {dataset_output_file}"
            )

    # Read the RDA file with pyreadr
    result = pyreadr.read_r(dataset_output_file)

    # Store the DataFrame in the dictionary
    datasets_dfs[f"{dataset_name}_df"] = result[dataset_name]

# Show the keys of the loaded datasets
print(datasets_dfs.keys())

Descargado: oceanbuoys.rda
Descargado: pedestrian.rda
Descargado: riskfactors.rda
dict_keys(['oceanbuoys_df', 'pedestrian_df', 'riskfactors_df'])


Incluir conjuntos de datos en nuestro ambiente local

In [67]:
# Expose every loaded DataFrame as a notebook-level variable named after its
# dictionary key (e.g. datasets_dfs["oceanbuoys_df"] -> oceanbuoys_df).
# globals() is the documented, writable module namespace; writing through
# locals() is not a supported way to create variables.
# datasets_dfs is intentionally kept so this cell can be re-run without a
# NameError.
globals().update(datasets_dfs)

In [68]:
# (rows, columns) of each loaded dataset, in the order listed.
tuple(
    frame.shape
    for frame in (oceanbuoys_df, pedestrian_df, riskfactors_df, diabetes_df)
)

((736, 8), (37700, 9), (245, 34), (768, 9))

In [69]:
# Per-column overview of riskfactors_df: non-null counts and dtypes.
riskfactors_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 245 entries, 0 to 244
Data columns (total 34 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   state             245 non-null    category
 1   sex               245 non-null    category
 2   age               245 non-null    int32   
 3   weight_lbs        235 non-null    object  
 4   height_inch       243 non-null    object  
 5   bmi               234 non-null    float64 
 6   marital           244 non-null    category
 7   pregnant          30 non-null     category
 8   children          245 non-null    int32   
 9   education         244 non-null    category
 10  employment        245 non-null    category
 11  income            245 non-null    category
 12  veteran           242 non-null    category
 13  hispanic          243 non-null    category
 14  health_general    245 non-null    category
 15  health_physical   245 non-null    int32   
 16  health_mental     245 non-

# Extend Pandas API

Create src/pandas_missing_extension.py, import it and enjoy the magic

In [70]:
# Total count of missing values in riskfactors_df, obtained through the
# `missing` accessor provided by src/pandas_missing_extension.py.
riskfactors_df.missing.number_missing()

1186

# Tabulation of missing data

basic summary

In [71]:
# Total number of cells, then (rows, columns), one per line.
print(riskfactors_df.size, riskfactors_df.shape, sep="\n")

8330
(245, 34)


Complete and missing values

In [72]:
# Missing count first, complete count second, one per line.
print(
    riskfactors_df.missing.number_missing(),
    riskfactors_df.missing.number_complete(),
    sep="\n",
)

1186
7144


### Summary by column

In [73]:
# One row per variable: number of missing values, number of cases and the
# percentage of values that are missing.
riskfactors_df.missing.missing_variable_summary()

Unnamed: 0,variable,n_missing,n_cases,pct_missing
0,state,0,245,0.0
1,sex,0,245,0.0
2,age,0,245,0.0
3,weight_lbs,10,245,4.081633
4,height_inch,2,245,0.816327
5,bmi,11,245,4.489796
6,marital,1,245,0.408163
7,pregnant,215,245,87.755102
8,children,0,245,0.0
9,education,1,245,0.408163


In [74]:
# Frequency table: how many variables (and what share of them) have each
# per-variable missing count.
riskfactors_df.missing.missing_variable_table()


Unnamed: 0,n_missing_in_variable,n_variables,pct_variables
0,0,10,29.411765
1,8,6,17.647059
2,2,4,11.764706
3,3,3,8.823529
4,1,2,5.882353
5,10,1,2.941176
6,11,1,2.941176
7,113,1,2.941176
8,128,1,2.941176
9,134,1,2.941176


In [75]:
# One row per case (row of the frame): number and percentage of missing values.
riskfactors_df.missing.missing_case_summary()


Unnamed: 0,case,n_missing,pct_missing
0,0,6,16.666667
1,1,6,16.666667
2,2,7,19.444444
3,3,12,33.333333
4,4,5,13.888889
...,...,...,...
240,240,6,16.666667
241,241,5,13.888889
242,242,3,8.333333
243,243,2,5.555556


In [76]:
# Frequency table: how many cases (and what share of them) have each
# per-case missing count.
riskfactors_df.missing.missing_case_table()

Unnamed: 0,n_missing_in_case,n_cases,pct_case
0,4,49,20.0
1,5,45,18.367347
2,7,39,15.918367
3,6,36,14.693878
4,2,31,12.653061
5,3,30,12.244898
6,1,4,1.632653
7,8,3,1.22449
8,12,3,1.22449
9,15,2,0.816327


### Intervalo de valores faltantes

In [80]:
# Missing vs. complete counts of weight_lbs, reported per span
# (span_every=50; presumably 50 consecutive rows per span -- confirm in the extension).
riskfactors_df.missing.missing_variable_span(variable="weight_lbs", span_every=50)

Unnamed: 0,span_counter,n_missing,n_complete,pct_missing,pct_complete
0,0,1,49,2.0,98.0
1,1,5,45,10.0,90.0
2,2,1,49,2.0,98.0
3,3,1,49,2.0,98.0
4,4,2,43,4.444444,95.555556


### Run length of missing values

In [81]:
# Lengths of the alternating runs of complete and missing values in weight_lbs.
riskfactors_df.missing.missing_variable_run(variable="weight_lbs")

Unnamed: 0,run_length,is_na
0,14,complete
1,1,missing
2,45,complete
3,1,missing
4,5,complete
5,1,missing
6,12,complete
7,1,missing
8,10,complete
9,2,missing
