In [1]:
# Tratamiento de datos
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np

# Imputación de nulos usando métodos avanzados estadísticos
# -----------------------------------------------------------------------
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

# Display settings
# -----------------------------------------------------------------------
pd.options.display.max_columns = None  # no column limit: show every DataFrame column when rendering

In [2]:
# Load the Galápagos microplastics CSV (semicolon-separated, UTF-8 encoded).
df_gal = pd.read_csv(
    "Proyecto-PowerBI-MOD4/files/Marine Microplastic Concentrations_GALAPAGOS.csv",
    # file format
    encoding="utf-8",
    sep=";",
    quotechar='"',
    # parser behaviour
    engine="python",
    on_bad_lines='warn',  # warn about malformed lines; 'skip' would ignore them instead
)

# Preview the first rows of the loaded frame
df_gal.head()

Unnamed: 0,OBJECTID,Date,Latitude,Longitude,Oceans,Regions,Location,Microplastics Measurement (density),Unit,Density Class Range,Concentration Class,Sampling Method,Short Reference,Long Reference,DOI,Organization,Keywords,NCEI Accession Number,NCEI Accession Link
0,6191,04/12/2001,0.07,-897.9,Pacific Ocean,Galápagos,Galápagos,0.0,pieces/m3,0-0.0005,Very Low,Neuston net,Law et al.2014,"Law, K.L, S.K. Morét-Ferguson, D.S. Goodwin, E...",https://doi.org/10.1021/es4053076,Sea Education Association,SEA,211008,https://www.ncei.noaa.gov/access/metadata/land...
1,6192,05/12/2001,-0.51,-900.8,Pacific Ocean,Galápagos,Galápagos,0.0,pieces/m3,0-0.0005,Very Low,Neuston net,Law et al.2014,"Law, K.L, S.K. Morét-Ferguson, D.S. Goodwin, E...",https://doi.org/10.1021/es4053076,Sea Education Association,SEA,211008,https://www.ncei.noaa.gov/access/metadata/land...
2,6193,08/12/2001,-0.87,-902.6,Pacific Ocean,Galápagos,Galápagos,0.0,pieces/m3,0-0.0005,Very Low,Neuston net,Law et al.2014,"Law, K.L, S.K. Morét-Ferguson, D.S. Goodwin, E...",https://doi.org/10.1021/es4053076,Sea Education Association,SEA,211008,https://www.ncei.noaa.gov/access/metadata/land...
3,6194,09/12/2001,-12.1,-904.3,Pacific Ocean,Galápagos,Galápagos,0.0,pieces/m3,0-0.0005,Very Low,Neuston net,Law et al.2014,"Law, K.L, S.K. Morét-Ferguson, D.S. Goodwin, E...",https://doi.org/10.1021/es4053076,Sea Education Association,SEA,211008,https://www.ncei.noaa.gov/access/metadata/land...
4,6195,09/12/2001,-10.7,-908.4,Pacific Ocean,Galápagos,Galápagos,0.0,pieces/m3,0-0.0005,Very Low,Neuston net,Law et al.2014,"Law, K.L, S.K. Morét-Ferguson, D.S. Goodwin, E...",https://doi.org/10.1021/es4053076,Sea Education Association,SEA,211008,https://www.ncei.noaa.gov/access/metadata/land...


In [3]:
# Report of nulls and duplicates for a DataFrame. Next step --> ETL

def nulos_duplicados(df_gal):
    """Print a plain-text report with null percentages and duplicate rows.

    Parameters
    ----------
    df_gal : pandas.DataFrame
        Frame to inspect. Any DataFrame can be passed; inside the function the
        name refers to the argument, not to the notebook-level variable.

    Returns
    -------
    None
        The report is written to stdout with ``print``.
    """
    # Percentage of null values per column: nulls / number of rows * 100
    porcentaje_nulos = df_gal.isna().sum() / df_gal.shape[0] * 100

    # Number of rows flagged by DataFrame.duplicated()
    duplicados = df_gal.duplicated().sum()
    if duplicados == 0:
        mensaje_duplicados = "No hay duplicados"
    else:
        mensaje_duplicados = f"Hay {duplicados} duplicados"

    # to_string() returns one line per column. Interpolated as-is, only the
    # first line would inherit the template's 4-space indentation and every
    # other column would be printed at column 0, so indent the continuation
    # lines explicitly to keep the table aligned.
    sangria = " " * 4
    nulos_alineados = porcentaje_nulos.to_string().replace("\n", "\n" + sangria)

    # Build the report text
    reporte = f"""
    ===================== Informe de Datos =====================
    
    Porcentaje de Nulos por Columna:
    ------------------------------------------------------------
    {nulos_alineados}
    
    ------------------------------------------------------------
    Duplicados:
    ------------------------------------------------------------
    {mensaje_duplicados}
    
    ============================================================
    """

    # Print the report directly
    print(reporte)

# Usage: pass any DataFrame to the function; it prints the report itself.

# Run the report on the Galápagos DataFrame
nulos_duplicados(df_gal)



    
    Porcentaje de Nulos por Columna:
    ------------------------------------------------------------
    OBJECTID                               0.0
Date                                   0.0
Latitude                               0.0
Longitude                              0.0
Oceans                                 0.0
Regions                                0.0
Location                               0.0
Microplastics Measurement (density)    0.0
Unit                                   0.0
Density Class Range                    0.0
Concentration Class                    0.0
Sampling Method                        0.0
Short Reference                        0.0
Long Reference                         0.0
DOI                                    0.0
Organization                           0.0
Keywords                               0.0
NCEI Accession Number                  0.0
NCEI Accession Link                    0.0
    
    ------------------------------------------------------------
 

In [4]:
# Columna de media de density

# Helper that removes the '>=' and '>' markers from a class-range label
def eliminar_menor_igual(rango):
    """Return ``rango`` without '>=' / '>' markers and without surrounding whitespace.

    Parameters
    ----------
    rango : str
        Range label, e.g. ``'>=10'`` or ``'0-0.0005'``.

    Returns
    -------
    str
        The label with every '>=' and '>' removed, then stripped.
    """
    limpio = rango
    # '>=' has to be removed before '>', otherwise a stray '=' would be left behind
    for simbolo in ('>=', '>'):
        limpio = limpio.replace(simbolo, '')
    return limpio.strip()

# Strip the '>=' / '>' markers from every value of the 'Density Class Range' column
df_gal['Density Class Range'] = df_gal['Density Class Range'].map(eliminar_menor_igual)

In [5]:
# Helper that turns a class-range label into its central numeric value
def calcular_densidad_central(rango):
    """Return the central value of a density class label.

    Parameters
    ----------
    rango : str
        Either a single number (``'200'``) or a ``'min-max'`` range
        (``'0.0005-0.005'``); spaces inside a range are ignored.

    Returns
    -------
    float
        The number itself for a single value, or the mean of both ends
        for a range.
    """
    if '-' in rango:
        # Range: drop inner spaces, split on the hyphen and average both ends
        extremos = rango.replace(' ', '').split('-')
        minimo, maximo = (float(parte) for parte in extremos)
        return (minimo + maximo) / 2

    # No hyphen: the label is a single number
    return float(rango.strip())

# Compute the central value of each 'Density Class Range' label and store it in the new 'Density Center' column
df_gal['Density Center'] = df_gal['Density Class Range'].map(calcular_densidad_central)

In [6]:
# Display the frame to check the cleaned 'Density Class Range' and the new 'Density Center' column
df_gal

Unnamed: 0,OBJECTID,Date,Latitude,Longitude,Oceans,Regions,Location,Microplastics Measurement (density),Unit,Density Class Range,Concentration Class,Sampling Method,Short Reference,Long Reference,DOI,Organization,Keywords,NCEI Accession Number,NCEI Accession Link,Density Center
0,6191,04/12/2001,0.07,-897.9,Pacific Ocean,Galápagos,Galápagos,0.0,pieces/m3,0-0.0005,Very Low,Neuston net,Law et al.2014,"Law, K.L, S.K. Morét-Ferguson, D.S. Goodwin, E...",https://doi.org/10.1021/es4053076,Sea Education Association,SEA,211008,https://www.ncei.noaa.gov/access/metadata/land...,0.00025
1,6192,05/12/2001,-0.51,-900.8,Pacific Ocean,Galápagos,Galápagos,0.0,pieces/m3,0-0.0005,Very Low,Neuston net,Law et al.2014,"Law, K.L, S.K. Morét-Ferguson, D.S. Goodwin, E...",https://doi.org/10.1021/es4053076,Sea Education Association,SEA,211008,https://www.ncei.noaa.gov/access/metadata/land...,0.00025
2,6193,08/12/2001,-0.87,-902.6,Pacific Ocean,Galápagos,Galápagos,0.0,pieces/m3,0-0.0005,Very Low,Neuston net,Law et al.2014,"Law, K.L, S.K. Morét-Ferguson, D.S. Goodwin, E...",https://doi.org/10.1021/es4053076,Sea Education Association,SEA,211008,https://www.ncei.noaa.gov/access/metadata/land...,0.00025
3,6194,09/12/2001,-12.1,-904.3,Pacific Ocean,Galápagos,Galápagos,0.0,pieces/m3,0-0.0005,Very Low,Neuston net,Law et al.2014,"Law, K.L, S.K. Morét-Ferguson, D.S. Goodwin, E...",https://doi.org/10.1021/es4053076,Sea Education Association,SEA,211008,https://www.ncei.noaa.gov/access/metadata/land...,0.00025
4,6195,09/12/2001,-10.7,-908.4,Pacific Ocean,Galápagos,Galápagos,0.0,pieces/m3,0-0.0005,Very Low,Neuston net,Law et al.2014,"Law, K.L, S.K. Morét-Ferguson, D.S. Goodwin, E...",https://doi.org/10.1021/es4053076,Sea Education Association,SEA,211008,https://www.ncei.noaa.gov/access/metadata/land...,0.00025
5,6196,10/12/2001,-11.3,-919.6,Pacific Ocean,Galápagos,Galápagos,0.0,pieces/m3,0-0.0005,Very Low,Neuston net,Law et al.2014,"Law, K.L, S.K. Morét-Ferguson, D.S. Goodwin, E...",https://doi.org/10.1021/es4053076,Sea Education Association,SEA,211008,https://www.ncei.noaa.gov/access/metadata/land...,0.00025
6,8982,24/11/2014,-0.7485,-903.131,Pacific Ocean,Galápagos,Galápagos,0.001,pieces/m3,0.0005-0.005,Low,Grab sample,Barrows et al.2018,"Barrows, A.P.W., S.E. Cathey, C.W. Petersen. 2...",https://doi.org/10.1016/j.envpol.2018.02.062,Adventure Scientist,Adventure Scientist/Citizen Science,211009,https://www.ncei.noaa.gov/access/metadata/land...,0.00275
7,9371,06/02/2015,0.2891,-905.589,Pacific Ocean,Galápagos,Galápagos,0.003,pieces/m3,0.0005-0.005,Low,Grab sample,Barrows et al.2018,"Barrows, A.P.W., S.E. Cathey, C.W. Petersen. 2...",https://doi.org/10.1016/j.envpol.2018.02.062,Adventure Scientist,Adventure Scientist/Citizen Science,211009,https://www.ncei.noaa.gov/access/metadata/land...,0.00275
8,9373,07/02/2015,0.314,-899.471,Pacific Ocean,Galápagos,Galápagos,0.002,pieces/m3,0.0005-0.005,Low,Grab sample,Barrows et al.2018,"Barrows, A.P.W., S.E. Cathey, C.W. Petersen. 2...",https://doi.org/10.1016/j.envpol.2018.02.062,Adventure Scientist,Adventure Scientist/Citizen Science,211009,https://www.ncei.noaa.gov/access/metadata/land...,0.00275
9,9806,25/04/2016,-0.57,-905.7,Pacific Ocean,Galápagos,Galápagos,0.003,pieces/m3,0.0005-0.005,Low,Grab sample,Barrows et al.2018,"Barrows, A.P.W., S.E. Cathey, C.W. Petersen. 2...",https://doi.org/10.1016/j.envpol.2018.02.062,Adventure Scientist,Adventure Scientist/Citizen Science,211009,https://www.ncei.noaa.gov/access/metadata/land...,0.00275


In [7]:
# Save the cleaned DataFrame as CSV (the row index is not written to the file)
df_gal.to_csv(path_or_buf='Galapagos.csv', index=False)