In [1]:
# Data handling
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np

# Null imputation with statistical / model-based imputers
# -----------------------------------------------------------------------
from sklearn.impute import SimpleImputer
# Must come before the IterativeImputer import: the estimator is experimental
# and scikit-learn only exposes it after this enabling module is imported.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

# Visualization libraries
# -----------------------------------------------------------------------
import seaborn as sns
import matplotlib.pyplot as plt
# Configuration
# -----------------------------------------------------------------------
pd.set_option('display.max_columns', None) # show every column when displaying DataFrames

In [2]:
# Load the dataset from a CSV file (path is relative to the working directory).
df_coral = pd.read_csv('realistic_ocean_climate_dataset.csv')

In [3]:
# Display the loaded DataFrame as the cell output.
df_coral

Unnamed: 0,Date,Location,Latitude,Longitude,SST (°C),pH Level,Bleaching Severity,Species Observed,Marine Heatwave
0,2015-01-01,Red Sea,20.0248,38.4931,29.47,8.107,,106,False
1,2015-01-07,Great Barrier Reef,-18.2988,147.7782,29.65,8.004,High,116,False
2,2015-01-14,Caribbean Sea,14.9768,-75.0233,28.86,7.947,High,90,False
3,2015-01-20,Great Barrier Reef,-18.3152,147.6486,28.97,7.995,Medium,94,False
4,2015-01-27,Galápagos,-0.8805,-90.9769,28.60,7.977,,110,False
...,...,...,...,...,...,...,...,...,...
495,2023-12-04,Galápagos,-0.9177,-90.9771,30.68,7.939,Medium,108,True
496,2023-12-11,Maldives,3.1725,73.3067,31.49,7.952,Medium,81,True
497,2023-12-17,Caribbean Sea,15.0430,-75.0194,28.43,8.053,Low,91,False
498,2023-12-24,Red Sea,19.9993,38.4655,27.76,8.137,Medium,108,False


In [10]:
# Show 10 randomly drawn rows. No random_state is passed, so the selected
# rows differ from one run to the next.
df_coral.sample(10)

Unnamed: 0,Date,Location,Latitude,Longitude,SST (°C),pH Level,Bleaching Severity,Species Observed,Marine Heatwave
287,2020-03-04,Caribbean Sea,15.0141,-75.0124,30.91,8.026,,106,True
178,2018-03-18,Maldives,3.2438,73.2461,30.1,8.076,Low,124,True
426,2022-09-06,Galápagos,-0.9319,-90.9646,29.51,8.059,Medium,104,False
370,2021-09-02,Red Sea,19.9677,38.5888,26.71,8.132,High,152,False
373,2021-09-22,Hawaiian Islands,19.8137,-155.5312,30.19,7.962,Low,96,True
439,2022-11-30,Red Sea,19.9669,38.5213,28.53,8.017,,127,False
473,2023-07-12,Caribbean Sea,15.0767,-75.0268,25.94,8.045,Low,164,False
330,2020-12-13,Hawaiian Islands,19.9258,-155.5665,28.79,8.027,Medium,122,False
399,2022-03-12,Caribbean Sea,15.0535,-75.0013,27.18,8.068,Medium,122,False
385,2021-12-10,Hawaiian Islands,19.8508,-155.7093,28.07,8.114,Medium,153,False


In [11]:
# List the distinct values found in the 'Location' column.
df_coral['Location'].unique()

array(['Red Sea', 'Great Barrier Reef', 'Caribbean Sea', 'Galápagos',
       'South China Sea', 'Maldives', 'Hawaiian Islands'], dtype=object)

In [4]:
# Report of null percentages and duplicate rows for a DataFrame.
# Original note: the next step after this check is the ETL.

def nulos_duplicados(df_coral):
    """Print a data-quality report: % of nulls per column and duplicated rows.

    Parameters
    ----------
    df_coral : pandas.DataFrame
        Frame to inspect. It is only read, never modified.

    Returns
    -------
    None
        The report is written to stdout with ``print``.

    Notes
    -----
    The per-column table produced by ``Series.to_string()`` spans several
    lines. Each of those lines is prefixed with the report indentation here;
    interpolating the table directly into an indented template indents only
    its first row and leaves the remaining rows flush-left.
    """
    sangria = "    "  # indentation applied to every line of the report

    # Percentage of missing values per column: null count / row count * 100.
    porcentaje_nulos = df_coral.isna().sum() / df_coral.shape[0] * 100
    # One list entry per column so each row can be indented individually.
    tabla_nulos = porcentaje_nulos.to_string().splitlines()

    # Number of rows that are exact repeats of an earlier row.
    duplicados = df_coral.duplicated().sum()
    if duplicados == 0:
        mensaje_duplicados = "No hay duplicados"
    else:
        mensaje_duplicados = f"Hay {duplicados} duplicados"

    # Report body, one entry per output line (empty entries are spacer lines).
    lineas = [
        "===================== Informe de Datos =====================",
        "",
        "Porcentaje de Nulos por Columna:",
        "------------------------------------------------------------",
        *tabla_nulos,
        "",
        "------------------------------------------------------------",
        "Duplicados:",
        "------------------------------------------------------------",
        mensaje_duplicados,
        "",
        "============================================================",
    ]

    # Leading newline and trailing indented line keep the original framing.
    reporte = "\n" + "\n".join(sangria + linea for linea in lineas) + "\n" + sangria

    # Print the report directly
    print(reporte)

# Example usage (works with any DataFrame, e.g.):
# df_mp = pd.DataFrame(...)

# Call the function on the loaded dataset; the report is printed to stdout
nulos_duplicados(df_coral)



    
    Porcentaje de Nulos por Columna:
    ------------------------------------------------------------
    Date                   0.0
Location               0.0
Latitude               0.0
Longitude              0.0
SST (°C)               0.0
pH Level               0.0
Bleaching Severity    30.0
Species Observed       0.0
Marine Heatwave        0.0
    
    ------------------------------------------------------------
    Duplicados:
    ------------------------------------------------------------
    No hay duplicados
    
    


In [5]:
# List the distinct values found in the 'Location' column.
# NOTE(review): same expression as the earlier In [11] cell — confirm whether both are needed.
df_coral['Location'].unique()

array(['Red Sea', 'Great Barrier Reef', 'Caribbean Sea', 'Galápagos',
       'South China Sea', 'Maldives', 'Hawaiian Islands'], dtype=object)