## Importación de librerías

In [1]:
import janitor
import matplotlib.pyplot as plt
import missingno
import numpy as np
import pandas as pd
import pyreadr
import seaborn as sns
import session_info
import upsetplot

## Importar funciones personalizadas

## Configurar el aspecto general de las gráficas del proyecto

In [2]:
%matplotlib inline

sns.set(
    rc = {
        "figure.figsize": (10, 10)
    }
)

sns.set_style("whitegrid")

## Operar con valores faltantes

### Python

In [9]:
print(None or True)
print(None or False)
print(None is None)
print(None == None)
# print(None + 1) No soporta este tipo de operacion


True
False
True
True


### NumPy

In [15]:
print(np.nan or True)
print(np.nan == np.nan)
print(np.nan is np.nan)
print(np.nan / 2)
print(type(np.nan)) # np.nan es de clase float, por lo que se puede operar con el

print(np.isnan(np.nan))

nan
False
True
nan
<class 'float'>
True


### Pandas

In [17]:
test_missing_df = pd.DataFrame.from_dict(
    {
        "x": [0, 1, np.nan, np.nan, None],
        "y": [0, 1, pd.NA, np.nan, None]
    }
)

test_missing_df

Unnamed: 0,x,y
0,0.0,0.0
1,1.0,1.0
2,,
3,,
4,,


La evolución de pandas representa los NA como son.
Los dos siguientes son sinónimos.

In [18]:
test_missing_df.isnull()

Unnamed: 0,x,y
0,False,False
1,False,False
2,True,True
3,True,True
4,True,True


In [19]:
test_missing_df.isna()

Unnamed: 0,x,y
0,False,False
1,False,False
2,True,True
3,True,True
4,True,True


In [20]:
# Creando diferentes series de datos.
pd.Series([1, np.nan])

0    1.0
1    NaN
dtype: float64

In [21]:
pd.Series([pd.to_datetime("2022-01-01"), np.nan])

0   2022-01-01
1          NaT
dtype: datetime64[ns]

¡En algunas ocasiones -1 es una forma de escribir valores nulos, no se detectan así por defecto!

In [22]:
pd.Series([-1]).isnull()

0    False
dtype: bool

# Cargar los conjuntos de datos

In [52]:
raw = "../data/raw"

pima_indians_diabetes = pd.read_csv(f"{raw}/diabetes.csv")
riskfactors = pyreadr.read_r(f"{raw}/riskfactors.rda")['riskfactors']
pedestrian =  pyreadr.read_r(f"{raw}/pedestrian.rda")['pedestrian']
oceanbuoys = pyreadr.read_r(f"{raw}/oceanbuoys.rda")['oceanbuoys']

## Verificar la carga

In [53]:
pima_indians_diabetes.head()
riskfactors.head()
pedestrian.head()
oceanbuoys.head()

Unnamed: 0,year,latitude,longitude,sea_temp_c,air_temp_c,humidity,wind_ew,wind_ns
0,1997.0,0.0,-110.0,27.59,27.15,79.599998,-6.4,5.4
1,1997.0,0.0,-110.0,27.549999,27.02,75.800003,-5.3,5.3
2,1997.0,0.0,-110.0,27.57,27.0,76.5,-5.1,4.5
3,1997.0,0.0,-110.0,27.620001,26.93,76.199997,-4.9,2.5
4,1997.0,0.0,-110.0,27.65,26.84,76.400002,-3.5,4.1


## Observar el número de variables no nulas

In [54]:
pima_indians_diabetes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [56]:
pedestrian.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37700 entries, 0 to 37699
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   hourly_counts  35152 non-null  object        
 1   date_time      37700 non-null  datetime64[ns]
 2   year           37700 non-null  int32         
 3   month          37700 non-null  category      
 4   month_day      37700 non-null  int32         
 5   week_day       37700 non-null  category      
 6   hour           37700 non-null  int32         
 7   sensor_id      37700 non-null  int32         
 8   sensor_name    37700 non-null  object        
dtypes: category(2), datetime64[ns](1), int32(4), object(2)
memory usage: 1.5+ MB


In [55]:
riskfactors.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 245 entries, 0 to 244
Data columns (total 34 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   state             245 non-null    category
 1   sex               245 non-null    category
 2   age               245 non-null    int32   
 3   weight_lbs        235 non-null    object  
 4   height_inch       243 non-null    object  
 5   bmi               234 non-null    float64 
 6   marital           244 non-null    category
 7   pregnant          30 non-null     category
 8   children          245 non-null    int32   
 9   education         244 non-null    category
 10  employment        245 non-null    category
 11  income            245 non-null    category
 12  veteran           242 non-null    category
 13  hispanic          243 non-null    category
 14  health_general    245 non-null    category
 15  health_physical   245 non-null    int32   
 16  health_mental     245 non-

In [57]:
oceanbuoys.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 736 entries, 0 to 735
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   year        736 non-null    float64
 1   latitude    736 non-null    float64
 2   longitude   736 non-null    float64
 3   sea_temp_c  733 non-null    float64
 4   air_temp_c  655 non-null    float64
 5   humidity    643 non-null    float64
 6   wind_ew     736 non-null    float64
 7   wind_ns     736 non-null    float64
dtypes: float64(8)
memory usage: 46.1 KB
