In [1]:
# Data handling
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np

# Null imputation using advanced statistical methods
# -----------------------------------------------------------------------
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

# Configuration
# -----------------------------------------------------------------------
pd.set_option('display.max_columns', None) # show every column when displaying DataFrames

In [2]:
# Load the semicolon-separated microplastics CSV into a DataFrame.
# The path is relative, so it is resolved against the current working directory.
df_aus = pd.read_csv(
    "Proyecto-PowerBI-MOD4/files/Marine Microplastic Concentrations_Gran_reef_coral.csv",
    sep=";",
    quotechar='"',
    encoding="utf-8",
    engine="python",
    on_bad_lines="warn",  # 'skip' can be used instead to ignore bad lines
)

# Preview the first rows
df_aus.head()

FileNotFoundError: [Errno 2] No such file or directory: 'Proyecto-PowerBI-MOD4/files/Marine Microplastic Concentrations_Gran_reef_coral.csv'

In [3]:
# Function that reports nulls and duplicates. Next step --> ETL

def nulos_duplicados(df_aus):
    """Print a report with the null percentage per column and the duplicate-row count.

    Args:
        df_aus: DataFrame to inspect.

    Returns:
        None. The report is written to standard output.
    """
    # Percentage of missing values in each column
    porcentaje_nulos = df_aus.isna().sum() / df_aus.shape[0] * 100

    # Number of rows that repeat an earlier row
    duplicados = df_aus.duplicated().sum()
    if duplicados == 0:
        mensaje_duplicados = "No hay duplicados"
    else:
        mensaje_duplicados = f"Hay {duplicados} duplicados"

    # to_string() spans several lines. Only the first one would inherit the
    # template's 4-space indentation, so re-indent every following line to keep
    # the whole table aligned.
    tabla_nulos = porcentaje_nulos.to_string().replace("\n", "\n    ")

    # Build the formatted report
    reporte = f"""
    ===================== Informe de Datos =====================
    
    Porcentaje de Nulos por Columna:
    ------------------------------------------------------------
    {tabla_nulos}
    
    ------------------------------------------------------------
    Duplicados:
    ------------------------------------------------------------
    {mensaje_duplicados}
    
    ============================================================
    """

    # Print the report directly
    print(reporte)

# Example usage
# df_mp = pd.DataFrame(...)

# Call the function directly
nulos_duplicados(df_aus)



    
    Porcentaje de Nulos por Columna:
    ------------------------------------------------------------
    OBJECTID                               0.0
Date                                   0.0
Latitude                               0.0
Longitude                              0.0
Oceans                                 0.0
Regions                                0.0
Location                               0.0
Microplastics Measurement (density)    0.0
Unit                                   0.0
Density Class Range                    0.0
Concentration Class                    0.0
Sampling Method                        0.0
Short Reference                        0.0
Long Reference                         0.0
DOI                                    0.0
Organization                           0.0
Keywords                               0.0
NCEI Accession Number                  0.0
NCEI Accession Link                    0.0
    
    ------------------------------------------------------------
 

In [4]:
# Density midpoint column

# Function that removes the '>=' and '>' symbols
def eliminar_menor_igual(rango):
    """Remove '>=' and '>' from a range label and trim surrounding whitespace.

    Args:
        rango: Range label such as '>=10' or '0.005-1'.

    Returns:
        The cleaned string. Values that are not strings (for example NaN or
        None for a missing cell) are returned unchanged instead of raising
        AttributeError.
    """
    # Missing cells are not strings and have no .replace(); pass them through.
    if not isinstance(rango, str):
        return rango
    # '>=' is removed before '>' so no stray '=' is left behind.
    return rango.replace('>=', '').replace('>', '').strip()

# Apply the function to strip '>=' and '>' from the 'Density Class Range' column
df_aus['Density Class Range'] = df_aus['Density Class Range'].apply(eliminar_menor_igual)

In [5]:
# Function that extracts the numeric values and computes the central value
def calcular_densidad_central(rango):
    """Return the central value of a density class label.

    Args:
        rango: Either a single number ('10') or a 'min-max' range ('0.005-1').
            Plain numbers and missing values (None / NaN) are also accepted.

    Returns:
        float: the number itself for a single value, the midpoint
        (min + max) / 2 for a range, or NaN for a missing value.

    Raises:
        ValueError: if the text is neither a number nor a two-part range.
    """
    if not isinstance(rango, str):
        # Missing cells become NaN; values that are already numeric are kept.
        return float('nan') if pd.isna(rango) else float(rango)

    # No hyphen: the label is a single number
    if '-' not in rango:
        return float(rango.strip())

    # Hyphen present: the label is a 'min-max' range
    partes = rango.replace(' ', '').split('-')
    if len(partes) != 2:
        raise ValueError(f"Expected a 'min-max' range, got {rango!r}")
    min_val, max_val = map(float, partes)
    return (min_val + max_val) / 2  # midpoint of the range

# Apply the function to the 'Density Class Range' column and create the new 'Density Center' column
df_aus['Density Center'] = df_aus['Density Class Range'].apply(calcular_densidad_central)

In [6]:
# Display the DataFrame as the cell output
df_aus

Unnamed: 0,OBJECTID,Date,Latitude,Longitude,Oceans,Regions,Location,Microplastics Measurement (density),Unit,Density Class Range,Concentration Class,Sampling Method,Short Reference,Long Reference,DOI,Organization,Keywords,NCEI Accession Number,NCEI Accession Link,Density Center
0,10262,26/07/2012,-165.766,1.457.434,Pacific Ocean,Coral Sea,Great Barrier Reef,0.023068,pieces/m3,0.005-1,Medium,Manta net,Reisser et al.2013,"Reisser, J., J. Shaw, C. Wilcox, B.D. Hardesty...",https://doi.org/10.1371/journal.pone.0080466,University of Western Australia,RV Southern Surveyor/RV Solander,252260,https://www.ncei.noaa.gov/access/metadata/land...,0.5025
1,10263,26/07/2012,-165.617,1.457.530,Pacific Ocean,Coral Sea,Great Barrier Reef,0.063437,pieces/m3,0.005-1,Medium,Manta net,Reisser et al.2013,"Reisser, J., J. Shaw, C. Wilcox, B.D. Hardesty...",https://doi.org/10.1371/journal.pone.0080466,University of Western Australia,RV Southern Surveyor/RV Solander,252260,https://www.ncei.noaa.gov/access/metadata/land...,0.5025
2,10264,26/07/2012,-165.429,1.457.642,Pacific Ocean,Coral Sea,Great Barrier Reef,0.046136,pieces/m3,0.005-1,Medium,Manta net,Reisser et al.2013,"Reisser, J., J. Shaw, C. Wilcox, B.D. Hardesty...",https://doi.org/10.1371/journal.pone.0080466,University of Western Australia,RV Southern Surveyor/RV Solander,252260,https://www.ncei.noaa.gov/access/metadata/land...,0.5025
3,10265,27/07/2012,-150.275,1.453.907,Pacific Ocean,Coral Sea,Great Barrier Reef,0.027107,pieces/m3,0.005-1,Medium,Manta net,Reisser et al.2013,"Reisser, J., J. Shaw, C. Wilcox, B.D. Hardesty...",https://doi.org/10.1371/journal.pone.0080466,University of Western Australia,RV Southern Surveyor/RV Solander,252260,https://www.ncei.noaa.gov/access/metadata/land...,0.5025
4,10266,27/07/2012,-150.086,1.453.855,Pacific Ocean,Coral Sea,Great Barrier Reef,0.060851,pieces/m3,0.005-1,Medium,Manta net,Reisser et al.2013,"Reisser, J., J. Shaw, C. Wilcox, B.D. Hardesty...",https://doi.org/10.1371/journal.pone.0080466,University of Western Australia,RV Southern Surveyor/RV Solander,252260,https://www.ncei.noaa.gov/access/metadata/land...,0.5025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,21481,22/03/2020,-202.514,1.489.329,Pacific Ocean,Coral Sea,Great Barrier Reef,0.027810,pieces/m3,0.005-1,Medium,Manta net,Carbery et al. 2022,"Carbery, M., F. Herb, J. Reynes, C. K. Pham, W...",https://doi.org/10.1016/j.marpolbul.2022.114179,"University of Newcastle, Australia",Sail and Explore Association,279321,https://www.ncei.noaa.gov/access/metadata/land...,0.5025
61,21482,23/03/2020,-202.765,1.489.170,Pacific Ocean,Coral Sea,Great Barrier Reef,0.043204,pieces/m3,0.005-1,Medium,Manta net,Carbery et al. 2022,"Carbery, M., F. Herb, J. Reynes, C. K. Pham, W...",https://doi.org/10.1016/j.marpolbul.2022.114179,"University of Newcastle, Australia",Sail and Explore Association,279321,https://www.ncei.noaa.gov/access/metadata/land...,0.5025
62,21483,23/03/2020,-202.281,1.487.833,Pacific Ocean,Coral Sea,Great Barrier Reef,0.034875,pieces/m3,0.005-1,Medium,Manta net,Carbery et al. 2022,"Carbery, M., F. Herb, J. Reynes, C. K. Pham, W...",https://doi.org/10.1016/j.marpolbul.2022.114179,"University of Newcastle, Australia",Sail and Explore Association,279321,https://www.ncei.noaa.gov/access/metadata/land...,0.5025
63,21484,24/03/2020,-203.456,1.488.388,Pacific Ocean,Coral Sea,Great Barrier Reef,0.080216,pieces/m3,0.005-1,Medium,Manta net,Carbery et al. 2022,"Carbery, M., F. Herb, J. Reynes, C. K. Pham, W...",https://doi.org/10.1016/j.marpolbul.2022.114179,"University of Newcastle, Australia",Sail and Explore Association,279321,https://www.ncei.noaa.gov/access/metadata/land...,0.5025


In [7]:
# Save the cleaned CSV (index=False leaves the row index out of the file)
df_aus.to_csv('Great_Barrier_Reef.csv', index=False)