In [28]:
from pathlib import Path

import pandas as pd

In [29]:
labels = {
	'stations': {
		'SE': 'sureste',
		'NE': 'noreste',
		'CE': 'centro',
		'NO': 'noroeste',
		'SO': 'suroeste',
		'NO2': ['noroeste2', 'noroeste 2'],
		'NTE': 'norte',
		'NE2': ['noreste2', 'noreste 2'],
		'SE2': ['sureste2', 'sureste 2'],
		'SO2': ['suroeste2', 'suroeste 2'],
		'SUR': 'sur',
		'NTE2': ['norte2', 'norte 2'],
		'SE3': ['sureste3', 'sureste 3'],
		'NE3': ['noreste3', 'noreste 3'],
		'NOE3': ['noroeste3', 'noroeste 3']
	},
	'contaminants': {
		'PM10': 'Partículas menores a 10 micras',
		'PM2.5': 'Partículas menores a 2.5 micras',
		'O3': 'Ozono',
		'SO2': 'Dióxido de azufre',
		'NO2': 'Dióxido de nitrógeno',
		'CO': 'Monóxido de carbono',
		'NO': 'Monóxido de nitrógeno',
		'NOX': 'Óxidos de nitrógeno'
	}
}

In [30]:
# Dataset one
df_2020_2021_all_stations = pd.read_excel(
	Path("../data/raw/DATOS HISTÓRICOS 2020_2021_TODAS ESTACIONES.xlsx"),
	sheet_name=None
)

In [31]:
# Process dataset 1
frames = []
for name, frame in df_2020_2021_all_stations.items():
	if name == 'NOROESTE3':
		continue
	for code, codename in labels['stations'].items():
		frame_copy = frame.copy()
		if isinstance(codename, list):
			if any(name.upper() == cn.upper() for cn in codename):
				frame_copy['station_code'] = code
				frames.append(frame_copy)
		else:
			if name.upper() == codename.upper():
				frame_copy['station_code'] = code
				frames.append(frame_copy)


df_2020_2021_all_stations_processed = pd.concat(frames, ignore_index=True)

In [32]:
# Dataset two
df_2022_2023_all_stations = pd.read_excel(
	Path("../data/raw/DATOS HISTÓRICOS 2022_2023_TODAS ESTACIONES.xlsx"),
	sheet_name=None
)

In [33]:
# Process dataset 2
frames = []
for name, frame in df_2022_2023_all_stations.items():
	for code, codename in labels['stations'].items():
		frame_copy = frame.copy()
		if isinstance(codename, list):
			if any(name.upper() == cn.upper() for cn in codename):
				frame_copy['station_code'] = code
				frames.append(frame_copy)
		else:
			if name.upper() == codename.upper():
				frame_copy['station_code'] = code
				frames.append(frame_copy)


df_2022_2023_all_stations_processed = pd.concat(frames, ignore_index=True)

In [34]:
# Dataset three
df_2023_2024_all_stations = pd.read_excel(
	Path("../data/raw/DATOS HISTÓRICOS 2023_2024_TODAS ESTACIONES_ITESM.xlsx"),
	sheet_name='Param_horarios_Estaciones',
	header=None
)

In [51]:
# Process dataset 3
stations_map = labels['stations']

station_name_to_code = {}
for code, names in stations_map.items():
    if isinstance(names, list):
        for name in names:
            station_name_to_code[name.upper()] = code
    else:
        station_name_to_code[names.upper()] = code

stations_row = df_2023_2024_all_stations.iloc[0, 1:].astype(str).str.strip()
vars_row = df_2023_2024_all_stations.iloc[1, 1:].astype(str).str.strip()

body = df_2023_2024_all_stations.iloc[3:].reset_index(drop=True)
dates = pd.to_datetime(body.iloc[:, 0], errors="coerce", dayfirst=True)

contaminants = list(labels["contaminants"].keys())

In [57]:
frames = []
for station in stations_row.unique():
    if pd.isna(station) or station == "nan" or station.upper() not in station_name_to_code:
        continue

    # Find which columns contain data for the current station
    station_columns = stations_row[stations_row == station].index.tolist()

    print(station, station_columns)

    station_data = {
        "station_code": station_name_to_code[station.upper()],
        "date": dates
    }

    for col_idx in station_columns:
        if col_idx >= len(vars_row):
            print(f"Warning: Column index {col_idx} is out of bounds. vars_row has {len(vars_row)} columns.")
            continue

        var_name = vars_row.iloc[col_idx]

        if var_name in contaminants:
            column_data = body.iloc[:, col_idx + 1]
            station_data[var_name] = pd.to_numeric(column_data, errors='coerce')

    if len(station_data) > 3:  # More than just station_name, station_code, date
        station_df = pd.DataFrame(station_data)
        frames.append(station_df)


# Concatenate all station dataframes
df_2023_2024_all_stations_processed = pd.concat(frames, ignore_index=True)

SURESTE [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
NORESTE [17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]
CENTRO [33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47]
NOROESTE [49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63]
SUROESTE [65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79]
NOROESTE 2 [81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95]
NORTE [97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111]
NORESTE2 [113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127]
SURESTE2 [129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143]
SUROESTE2 [145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159]
SURESTE 3 [161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175]
SUR [177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191]
NORTE 2 [193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207]
NOREST

In [59]:
# Show SE station on date 2024-01-01
df_2023_2024_all_stations_processed[
    (df_2023_2024_all_stations_processed['station_code'] == 'SE') &
    (df_2023_2024_all_stations_processed['date'] == '2024-1-1')
]

Unnamed: 0,station_code,date,NO,NO2,NOX,O3,PM10,PM2.5,SO2
8758,SE,2024-01-01,10.5,23.3,33.8,13.0,81.0,60.0,2.9


In [37]:
# Concat all dataframes
main_dataframe = pd.concat(
	[
		df_2020_2021_all_stations_processed,
		df_2022_2023_all_stations_processed,
		df_2023_2024_all_stations_processed
	],
	ignore_index=True
)

In [38]:
main_dataframe

Unnamed: 0,date,CO,NO,NO2,NOX,O3,PM10,PM2.5,PRS,RAINF,RH,SO2,SR,TOUT,WSR,WDR,station_code
0,2020-01-01 00:00:00,,,,,,66.0,54.23,,,,,0.00,,,,SE
1,2020-01-01 01:00:00,2.11,,,,19.0,57.0,,735.7,0.0,96.0,5.4,0.01,11.20,8.1,,SE
2,2020-01-01 02:00:00,2.06,,,,19.0,68.0,53.84,734.8,0.0,96.0,5.5,0.01,11.26,5.5,,SE
3,2020-01-01 03:00:00,1.96,,,,19.0,68.0,36.47,734.2,0.0,96.0,5.4,0.01,11.35,3.8,,SE
4,2020-01-01 04:00:00,1.98,,,,16.0,48.0,33.59,733.9,0.0,96.0,5.5,0.01,11.47,3.3,,SE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
659356,2024-07-31 19:00:00,,,,,,,,,,,,,,,,NOE3
659357,2024-07-31 20:00:00,,,,,,,,,,,,,,,,NOE3
659358,2024-07-31 21:00:00,,,,,,,,,,,,,,,,NOE3
659359,2024-07-31 22:00:00,,,,,,,,,,,,,,,,NOE3


In [39]:
main_dataframe.to_csv(
	Path("../data/processed/main_dataframe.csv"),
	index=False
)