# Análisis Exploratorio de Datos (EDA)

Exploración inicial de los datos crudos para conocer su estructura base antes de limpiarlos.

In [1]:
# Librerias
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the raw 2025 CSV into a DataFrame; the bare name on the last line
# makes the notebook render a preview of it.
file = "data/Crudos/datos_2025.csv"
df = pd.read_csv(filepath_or_buffer=file)
df


Unnamed: 0,Fecha,Hora,O3,O3 8hrs,NO2,CO,SO2,PM-10,PM-2.5,Estacion
0,2025-01-01,00:00,F.O.,,F.O.,F.O.,F.O.,F.O.,F.O.,santa
1,2025-01-01,01:00,F.O.,,F.O.,F.O.,F.O.,F.O.,F.O.,santa
2,2025-01-01,02:00,F.O.,,F.O.,F.O.,F.O.,F.O.,F.O.,santa
3,2025-01-01,03:00,F.O.,,F.O.,F.O.,F.O.,F.O.,F.O.,santa
4,2025-01-01,04:00,F.O.,,F.O.,F.O.,F.O.,F.O.,F.O.,santa
...,...,...,...,...,...,...,...,...,...,...
16195,2025-05-15,19:00,0.076,,0.012,0.58,0.004,41,20,vel
16196,2025-05-15,20:00,0.061,,0.021,0.63,0.004,46,22,vel
16197,2025-05-15,21:00,0.037,,0.035,0.70,0.001,62,24,vel
16198,2025-05-15,22:00,0.017,,0.027,0.72,0.002,53,23,vel


# Estructura del dataset:

- Fecha y Hora están en columnas separadas.

- Contaminantes registrados:

    - O3 y O3_8hrs (ozono puntual y promedio móvil)
    - NO2, CO, SO2
    - PM-10, PM-2.5

- Estacion indica el sitio de monitoreo (en la vista previa aparecen santa y vel; la lista completa de estaciones se verifica más adelante).

# Próximos pasos:

- Combinar Fecha y Hora en una sola columna DateTime.

- Convertir variables numéricas al tipo correcto (varias están como texto).

- Verificar valores nulos y tipos de datos.

- Iniciar el EDA con estadísticas descriptivas y calidad de datos.

In [3]:
# Work on an independent (deep) copy so the frame loaded from disk stays untouched.
df_clean = df.copy(deep=True)

In [4]:
# Step 1: merge the separate date and time strings into a single DateTime column.
# The explicit format makes parsing strict: any row that does not match raises.
fecha_hora = df_clean['Fecha'] + ' ' + df_clean['Hora']
df_clean['DateTime'] = pd.to_datetime(fecha_hora, format='%Y-%m-%d %H:%M')

df_clean

Unnamed: 0,Fecha,Hora,O3,O3 8hrs,NO2,CO,SO2,PM-10,PM-2.5,Estacion,DateTime
0,2025-01-01,00:00,F.O.,,F.O.,F.O.,F.O.,F.O.,F.O.,santa,2025-01-01 00:00:00
1,2025-01-01,01:00,F.O.,,F.O.,F.O.,F.O.,F.O.,F.O.,santa,2025-01-01 01:00:00
2,2025-01-01,02:00,F.O.,,F.O.,F.O.,F.O.,F.O.,F.O.,santa,2025-01-01 02:00:00
3,2025-01-01,03:00,F.O.,,F.O.,F.O.,F.O.,F.O.,F.O.,santa,2025-01-01 03:00:00
4,2025-01-01,04:00,F.O.,,F.O.,F.O.,F.O.,F.O.,F.O.,santa,2025-01-01 04:00:00
...,...,...,...,...,...,...,...,...,...,...,...
16195,2025-05-15,19:00,0.076,,0.012,0.58,0.004,41,20,vel,2025-05-15 19:00:00
16196,2025-05-15,20:00,0.061,,0.021,0.63,0.004,46,22,vel,2025-05-15 20:00:00
16197,2025-05-15,21:00,0.037,,0.035,0.70,0.001,62,24,vel,2025-05-15 21:00:00
16198,2025-05-15,22:00,0.017,,0.027,0.72,0.002,53,23,vel,2025-05-15 22:00:00


In [5]:
# Step 2: remove the original Fecha and Hora columns, now redundant with DateTime.
# The drop is done in place on df_clean.
columnas_redundantes = ['Fecha', 'Hora']
df_clean.drop(columnas_redundantes, axis='columns', inplace=True)
df_clean

Unnamed: 0,O3,O3 8hrs,NO2,CO,SO2,PM-10,PM-2.5,Estacion,DateTime
0,F.O.,,F.O.,F.O.,F.O.,F.O.,F.O.,santa,2025-01-01 00:00:00
1,F.O.,,F.O.,F.O.,F.O.,F.O.,F.O.,santa,2025-01-01 01:00:00
2,F.O.,,F.O.,F.O.,F.O.,F.O.,F.O.,santa,2025-01-01 02:00:00
3,F.O.,,F.O.,F.O.,F.O.,F.O.,F.O.,santa,2025-01-01 03:00:00
4,F.O.,,F.O.,F.O.,F.O.,F.O.,F.O.,santa,2025-01-01 04:00:00
...,...,...,...,...,...,...,...,...,...
16195,0.076,,0.012,0.58,0.004,41,20,vel,2025-05-15 19:00:00
16196,0.061,,0.021,0.63,0.004,46,22,vel,2025-05-15 20:00:00
16197,0.037,,0.035,0.70,0.001,62,24,vel,2025-05-15 21:00:00
16198,0.017,,0.027,0.72,0.002,53,23,vel,2025-05-15 22:00:00


In [6]:
# Step 3: rename columns for clarity and consistency.
# Renaming by name (instead of assigning a positional list to .columns) does not
# depend on column order or count, so the cell can be re-run safely even after
# later cells add new columns.
nuevos_nombres = {
    'O3 8hrs': 'O3_8hrs',
    'PM-10': 'PM10',
    'PM-2.5': 'PM2_5',
}
df_clean = df_clean.rename(columns=nuevos_nombres)

# Fail loudly if the resulting schema is not the one the following cells rely on.
columnas_esperadas = ['O3', 'O3_8hrs', 'NO2', 'CO', 'SO2', 'PM10', 'PM2_5', 'Estacion', 'DateTime']
faltantes = [c for c in columnas_esperadas if c not in df_clean.columns]
if faltantes:
    raise KeyError(f"Columnas faltantes tras renombrar: {faltantes}")

df_clean


Unnamed: 0,O3,O3_8hrs,NO2,CO,SO2,PM10,PM2_5,Estacion,DateTime
0,F.O.,,F.O.,F.O.,F.O.,F.O.,F.O.,santa,2025-01-01 00:00:00
1,F.O.,,F.O.,F.O.,F.O.,F.O.,F.O.,santa,2025-01-01 01:00:00
2,F.O.,,F.O.,F.O.,F.O.,F.O.,F.O.,santa,2025-01-01 02:00:00
3,F.O.,,F.O.,F.O.,F.O.,F.O.,F.O.,santa,2025-01-01 03:00:00
4,F.O.,,F.O.,F.O.,F.O.,F.O.,F.O.,santa,2025-01-01 04:00:00
...,...,...,...,...,...,...,...,...,...
16195,0.076,,0.012,0.58,0.004,41,20,vel,2025-05-15 19:00:00
16196,0.061,,0.021,0.63,0.004,46,22,vel,2025-05-15 20:00:00
16197,0.037,,0.035,0.70,0.001,62,24,vel,2025-05-15 21:00:00
16198,0.017,,0.027,0.72,0.002,53,23,vel,2025-05-15 22:00:00


In [7]:
# Step 4: cast the pollutant columns to numeric (some were read as strings).
# errors='coerce' turns any value that cannot be parsed as a number into NaN.
cols_numeric = ['O3', 'O3_8hrs', 'NO2', 'CO', 'SO2', 'PM10', 'PM2_5']
for columna in cols_numeric:
    df_clean[columna] = pd.to_numeric(df_clean[columna], errors='coerce')
df_clean.dtypes


O3                 float64
O3_8hrs            float64
NO2                float64
CO                 float64
SO2                float64
PM10               float64
PM2_5              float64
Estacion            object
DateTime    datetime64[ns]
dtype: object

In [8]:
# Unit normalization: convert gas concentrations from ppm to ppb when needed.
# Heuristic: an O3 maximum below 1 is taken to mean the values are still in ppm.
# Once converted, the maximum is normally >= 1, so re-running the cell does not
# scale the data a second time.
PPM_A_PPB = 1000
if df_clean['O3'].max() < 1:
    gases_ppb = ['O3', 'O3_8hrs', 'NO2', 'SO2']
    df_clean[gases_ppb] = df_clean[gases_ppb] * PPM_A_PPB
    print("✅ Gases convertidos de ppm a ppb.")

# CO is deliberately left untouched: it is usually reported in ppm. The previous
# version multiplied it by 100, which is neither the ppm -> ppb factor (1000) nor
# a no-op, and left CO in an undefined unit.
# PM10 and PM2.5 are already in µg/m³ and need no change.


✅ Gases convertidos de ppm a ppb.


In [9]:
# Verificacion final y valores nulos
data_types = df_clean.dtypes
missing_values = df_clean.isnull().sum()

data_types, missing_values


(O3                 float64
 O3_8hrs            float64
 NO2                float64
 CO                 float64
 SO2                float64
 PM10               float64
 PM2_5              float64
 Estacion            object
 DateTime    datetime64[ns]
 dtype: object,
 O3           1743
 O3_8hrs     16200
 NO2          2442
 CO           1976
 SO2          2152
 PM10         1956
 PM2_5        3116
 Estacion        0
 DateTime        0
 dtype: int64)

In [10]:
# Extraer componentes temporales
df_clean['Anio'] = df_clean['DateTime'].dt.year
df_clean['Mes'] = df_clean['DateTime'].dt.month
df_clean['Dia'] = df_clean['DateTime'].dt.day
df_clean['Hora'] = df_clean['DateTime'].dt.hour

# Verificar estaciones únicas
unique_stations = df_clean['Estacion'].unique()
station_counts = df_clean['Estacion'].value_counts()

unique_stations, station_counts


(array(['santa', 'bine', 'ninfas', 'utp', 'vel'], dtype=object),
 Estacion
 santa     3240
 bine      3240
 ninfas    3240
 utp       3240
 vel       3240
 Name: count, dtype: int64)

In [None]:
# Save df_clean to a new CSV file.
import os

file_path = 'data/Clean/datos_Clean_2025.csv'

# to_csv does not create missing folders, so make sure the target directory
# exists first; exist_ok=True keeps the cell safe to re-run.
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# index=False: the row index is not written to the file.
df_clean.to_csv(file_path, index=False)

print(f"Archivo guardado en: {file_path}")

Archivo guardado en: app/data/times/datos_Clean_2025.csv
