# Dataset Exploration

_Note: This notebook depends on the output of the dataset-processing notebook (or script), so run that first._
_The file `data/processed/main_dataframe.csv` (relative to the repository root) must exist before running the cells below._

In [6]:
import os
import numpy as np
import pandas as pd

In [7]:
# Names of the contaminants of interest
# (presumably column names of the processed dataset — TODO confirm).
contaminants = [
    "PM10", "PM2.5", "O3", "SO2",
    "NO2", "CO", "NO", "NOX",
]

# Load the processed dataset, parsing the "date" column into datetimes.
# The path is relative, so it is resolved against the notebook's working directory.
main_dataframe_path = "../data/processed/main_dataframe.csv"
df = pd.read_csv(main_dataframe_path, parse_dates=["date"], engine="pyarrow")

In [8]:
# Some entries hold the value -9999, which should be treated as missing (NaN).
# Rebind df to the cleaned result instead of using inplace=True: the cell reads
# as an explicit "old frame -> new frame" step and does not mutate an object
# that other cells may already hold a reference to.
df = df.replace(-9999, np.nan)

In [9]:
# Build a narrower dataframe holding only the date, CO and station_code columns.
# .copy() makes it independent of df, so later edits to one do not affect the other.
co_columns = ["date", "CO", "station_code"]
new_df = df.loc[:, co_columns].copy()
new_df

Unnamed: 0,date,CO,station_code
0,2020-01-01 00:00:00,,SE
1,2020-01-01 01:00:00,2.11,SE
2,2020-01-01 02:00:00,2.06,SE
3,2020-01-01 03:00:00,1.96,SE
4,2020-01-01 04:00:00,1.98,SE
...,...,...,...
779376,2025-06-30 19:00:00,,NE3
779377,2025-06-30 20:00:00,,NE3
779378,2025-06-30 21:00:00,,NE3
779379,2025-06-30 22:00:00,,NE3


## Saving the dataset subsets

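For the per-station analysis, we will create a separate CSV file for each station code. The files are written to `data/processed/subsets/` and named `dataset_<station code in lowercase>.csv`.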

In [10]:
# Directory that receives one CSV per station; created if it does not exist yet.
output_dir = os.path.join("..", "data", "processed", "subsets")
os.makedirs(output_dir, exist_ok=True)

# Rows without a station code cannot be assigned to any file, so drop them once
# up front. Codes are handled as strings so that .lower() works on every key.
rows_with_station = df.dropna(subset=["station_code"])
station_keys = rows_with_station["station_code"].astype(str)
station_codes = station_keys.unique()

# A single groupby pass partitions the frame by station. This replaces
# re-casting the column and re-scanning the whole frame once per station.
# sort=False keeps the stations in order of first appearance.
for code, subset in rows_with_station.groupby(station_keys, sort=False):
    if "date" in subset.columns:
        # mergesort is stable: rows sharing a timestamp keep their original
        # relative order, so the exported files are reproducible across runs.
        subset = subset.sort_values("date", kind="mergesort")
    # NOTE(review): codes differing only by letter case map to the same file
    # name and would overwrite each other — confirm codes are case-unique.
    filename = f"dataset_{code.lower()}.csv"
    subset_path = os.path.join(output_dir, filename)
    subset.to_csv(subset_path, index=False)