In [1]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt

# Disable pandas' chained-assignment warning for the whole notebook.
pd.options.mode.chained_assignment = None  # default='warn'

# Matplotlib 3.6 renamed the bundled seaborn style sheets to
# "seaborn-v0_8-<name>" and later releases dropped the old aliases, so use
# the new name when this installation offers it and the legacy one otherwise.
sstyle = (
    "seaborn-v0_8-poster"
    if "seaborn-v0_8-poster" in plt.style.available
    else "seaborn-poster"
)
plt.style.use(sstyle)
plt.rc('font',family = 'serif')

Data from the [Mexico City Atmospheric Monitoring System (SIMAT)](http://www.aire.cdmx.gob.mx/default.php?opc=%27aKBhnmI=%27&opcion=Zg==)

In [2]:
def get_data(year,zone):
    """Load one year of contaminant readings for one station as daily means.

    Parameters
    ----------
    year : int or str
        Interpolated into the file name ``data/contaminantes_{year}.csv``.
    zone : str
        Value of the ``id_station`` column to keep (e.g. ``"MGH"``).

    Returns
    -------
    list of pandas.DataFrame
        One frame per distinct ``id_parameter`` found for the station, in
        order of first appearance in the file.  Each frame is indexed by
        calendar day (``date``) and contains the daily mean of the numeric
        columns, with ``value`` renamed to ``"<parameter> [<unit>]"``.
        The list is empty when the station has no rows in the file.
    """
    # Lookup table: unit id -> short unit label used in the column header.
    unit_labels = (
        pd.read_csv("data/info_parameters/cat_unidades.csv",
                    encoding="latin-1")
        .set_index("id_unidad")["clave_unidad"]
        .to_dict()
    )

    # The first 10 lines of the yearly file are skipped before the CSV header.
    readings = pd.read_csv(f"data/contaminantes_{year}.csv", skiprows=10)
    # Uncomment for data diagnostics:
    #print(f"Available zones for this dataset: {readings.id_station.unique()}")

    # Built with .assign/.drop so no value is ever written into a filtered
    # slice (the pattern that triggers SettingWithCopyWarning).
    station = (
        readings[readings["id_station"] == zone]
        .drop(columns=["id_station"])
        .assign(units=lambda frame: frame["unit"].map(unit_labels))
        .drop(columns=["unit"])
    )
    if station.empty:
        return []

    # "%H" only accepts 00-23, so the " 24:00" label is rewritten to " 00:00"
    # on the same calendar date; the reading stays in that date's daily bin.
    station = station.assign(
        date=pd.to_datetime(
            station["date"].str.replace(" 24:00", " 00:00", regex=False),
            format="%d/%m/%Y %H:%M",
        )
    )

    # One daily-mean frame per contaminant, in order of first appearance.
    DF_c = []
    for c in station["id_parameter"].unique():
        rows = station[station["id_parameter"].isin([c])]
        unit = rows["units"].iloc[0]
        daily = (
            rows.set_index("date")
            .groupby(pd.Grouper(freq="D"))
            # numeric_only=True: the text columns (id_parameter, units) cannot
            # be averaged and make pandas >= 2.0 raise TypeError otherwise.
            .mean(numeric_only=True)
            .rename(columns={"value": f"{c} [{unit}]"})
        )
        DF_c.append(daily)
    return DF_c

In [3]:
# For data diagnostics: set RUN_DIAGNOSTICS to True to plot every contaminant
# series of one year/station and judge how complete each data set is.
# Kept as real (guarded) code rather than a string literal so it stays
# syntax-checked and in step with get_data.
RUN_DIAGNOSTICS = False
if RUN_DIAGNOSTICS:
    for diag_df in get_data(2020,"MGH"):
        diag_df.plot()

Based on the data diagnostics, I am going to work with the Miguel Hidalgo zone (station code `MGH`). It appears to have the most complete data set for the contaminants CO, NO, NO$_2$, NO$_x$, O$_3$, and SO$_2$ over the whole time range, 2017–2022.

In [5]:
DF_whole = get_data(2017,"MGH")
# Samples to work with (decision based on the data diagnostics): the first
# N_LEADING_FRAMES frames returned by get_data plus the last one.
# NOTE(review): presumably these are the six contaminants named in the text
# above; confirm against the column names of the selected frames.
N_LEADING_FRAMES = 5
if len(DF_whole) <= N_LEADING_FRAMES:
    # Without this check a shorter list would either raise a bare IndexError
    # or silently select the last frame twice.
    raise ValueError(
        f"Expected more than {N_LEADING_FRAMES} contaminant frames, "
        f"got {len(DF_whole)}: {[list(frame.columns) for frame in DF_whole]}"
    )
DF = DF_whole[:N_LEADING_FRAMES] + DF_whole[-1:]